tangled
alpha
login
or
join now
dunkirk.sh
/
dots
3
fork
atom
Kieran's opinionated (and probably slightly dumb) nix config
3
fork
atom
overview
issues
pulls
pipelines
feat: update triage agent
dunkirk.sh
5 days ago
d4501257
3804ed58
verified
This commit was signed with the committer's
known signature
.
dunkirk.sh
SSH Key Fingerprint:
SHA256:DqcG0RXYExE26KiWo3VxJnsxswN1QNfTBvB+bdSpk80=
+44
-9
1 changed file
expand all
collapse all
unified
split
modules
nixos
services
triage-agent.nix
+44
-9
modules/nixos/services/triage-agent.nix
···
32
32
INCIDENT_ID="$2"
33
33
CALLBACK_URL="$3"
34
34
35
35
+
echo "[triage] Starting triage for service=$SERVICE_NAME incident=$INCIDENT_ID"
36
36
+
35
37
# Collect filtered logs
36
38
FILTERED_LOGS=$(${triageFilterScript}/bin/triage-filter "$SERVICE_NAME" 2>&1 || true)
37
39
···
39
41
FILTERED_LOGS="No relevant log lines found in the last 10 minutes."
40
42
fi
41
43
44
44
+
LOG_LINES=$(echo "$FILTERED_LOGS" | ${pkgs.coreutils}/bin/wc -l)
45
45
+
echo "[triage] Log collection done: $LOG_LINES lines"
46
46
+
42
47
# Run Claude headless for triage
43
48
export HOME="/var/lib/triage-agent"
44
44
-
REPORT=$(${cfg.claudePath} -p \
49
49
+
echo "[triage] Invoking Claude for analysis..."
50
50
+
RAW_OUTPUT=$(${cfg.claudePath} -p \
45
51
--model sonnet \
46
52
--max-turns 10 \
47
53
--allowedTools "Bash(journalctl:*) Bash(systemctl:*) Bash(curl:*) Bash(ss:*) Bash(df:*) Bash(free:*) Bash(ps:*) Bash(cat:/etc/*) Bash(ls:*) Bash(stat:*)" \
48
48
-
"You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation. Write a concise triage report (max 500 words) in markdown with: 1) What failed 2) Likely root cause 3) Suggested fix. Do not speculate beyond what the evidence shows.
54
54
+
"You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation.
55
55
+
56
56
+
Your output MUST follow this exact format:
57
57
+
SUMMARY: <one sentence describing the root cause>
58
58
+
---
59
59
+
<full triage report in markdown, max 500 words, with: 1) What failed 2) Likely root cause 3) Suggested fix>
60
60
+
61
61
+
The SUMMARY line must be a single sentence. Do not speculate beyond what the evidence shows.
49
62
50
63
Service '$SERVICE_NAME' is down. Here are the filtered logs from the last 10 minutes:
51
64
52
65
$FILTERED_LOGS
53
66
54
54
-
Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write a triage report." 2>&1 || echo "Triage agent failed to produce a report.")
67
67
+
Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write your response in the format above." 2>&1 || echo "SUMMARY: Triage agent failed to produce a report.
68
68
+
---
69
69
+
Triage agent failed to produce a report.")
70
70
+
71
71
+
echo "[triage] Report generated: ''${#RAW_OUTPUT} chars"
72
72
+
73
73
+
# Split output into summary and full report
74
74
+
SUMMARY=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed -n 's/^SUMMARY: //p' | ${pkgs.coreutils}/bin/head -1)
75
75
+
REPORT=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed '1,/^---$/d')
76
76
+
77
77
+
# Fallback if parsing fails
78
78
+
if [ -z "$SUMMARY" ]; then
79
79
+
SUMMARY="Triage completed for $SERVICE_NAME"
80
80
+
fi
81
81
+
if [ -z "$REPORT" ]; then
82
82
+
REPORT="$RAW_OUTPUT"
83
83
+
fi
55
84
56
85
# Callback to status worker
57
57
-
${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \
86
86
+
echo "[triage] Posting report to $CALLBACK_URL"
87
87
+
CALLBACK_RESULT=$(${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \
58
88
-H "Authorization: Bearer $TRIAGE_AUTH_TOKEN" \
59
89
-H "Content-Type: application/json" \
60
60
-
-d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" '{triage_report: $report, status: "identified"}')" \
61
61
-
|| echo "Failed to post triage report to $CALLBACK_URL"
90
90
+
-d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" --arg summary "$SUMMARY" '{triage_report: $report, status: "identified", summary: $summary}')" \
91
91
+
-w "%{http_code}" -o /dev/null \
92
92
+
|| echo "FAILED")
93
93
+
echo "[triage] Callback result: $CALLBACK_RESULT"
62
94
'';
63
95
64
96
webhookServer = pkgs.writeText "triage-webhook.ts" ''
···
89
121
return new Response("missing fields", { status: 400 });
90
122
}
91
123
124
124
+
console.log(`[webhook] Received triage request: service=''${body.service_name} incident=''${body.incident_id}`);
125
125
+
92
126
// Spawn triage in background and respond immediately
93
127
Bun.spawn(["${triageRunScript}/bin/run-triage", body.service_name, String(body.incident_id), body.callback_url], {
94
128
env: { ...process.env },
95
95
-
stdout: "ignore",
96
96
-
stderr: "ignore",
129
129
+
stdout: "inherit",
130
130
+
stderr: "inherit",
97
131
});
98
132
99
133
return new Response(JSON.stringify({ ok: true, incident_id: body.incident_id }), {
···
166
200
pkgs.systemd
167
201
pkgs.coreutils
168
202
pkgs.gnugrep
203
203
+
pkgs.gnused
169
204
pkgs.curl
170
205
pkgs.jq
171
206
pkgs.nodejs_22
···
185
220
ExecStart = "${pkgs.unstable.bun}/bin/bun run ${webhookServer}";
186
221
EnvironmentFile = cfg.secretsFile;
187
222
Environment = [
188
188
-
"PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.curl pkgs.jq pkgs.nodejs_22 ]}"
223
223
+
"PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.gnused pkgs.curl pkgs.jq pkgs.nodejs_22 ]}"
189
224
];
190
225
Restart = "on-failure";
191
226
RestartSec = "10s";