tangled
alpha
login
or
join now
dunkirk.sh
/
dots
3
fork
atom
Kieran's opinionated (and probably slightly dumb) nix config
3
fork
atom
overview
issues
pulls
pipelines
feat: update triage agent
dunkirk.sh
5 days ago
d4501257
3804ed58
verified
This commit was signed with the committer's known signature.
dunkirk.sh
SSH Key Fingerprint:
SHA256:DqcG0RXYExE26KiWo3VxJnsxswN1QNfTBvB+bdSpk80=
+44
-9
1 changed file
expand all
collapse all
unified
split
modules
nixos
services
triage-agent.nix
+44
-9
modules/nixos/services/triage-agent.nix
···
32
INCIDENT_ID="$2"
33
CALLBACK_URL="$3"
34
0
0
35
# Collect filtered logs
36
FILTERED_LOGS=$(${triageFilterScript}/bin/triage-filter "$SERVICE_NAME" 2>&1 || true)
37
···
39
FILTERED_LOGS="No relevant log lines found in the last 10 minutes."
40
fi
41
0
0
0
42
# Run Claude headless for triage
43
export HOME="/var/lib/triage-agent"
44
-
REPORT=$(${cfg.claudePath} -p \
0
45
--model sonnet \
46
--max-turns 10 \
47
--allowedTools "Bash(journalctl:*) Bash(systemctl:*) Bash(curl:*) Bash(ss:*) Bash(df:*) Bash(free:*) Bash(ps:*) Bash(cat:/etc/*) Bash(ls:*) Bash(stat:*)" \
48
-
"You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation. Write a concise triage report (max 500 words) in markdown with: 1) What failed 2) Likely root cause 3) Suggested fix. Do not speculate beyond what the evidence shows.
0
0
0
0
0
0
0
49
50
Service '$SERVICE_NAME' is down. Here are the filtered logs from the last 10 minutes:
51
52
$FILTERED_LOGS
53
54
-
Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write a triage report." 2>&1 || echo "Triage agent failed to produce a report.")
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
55
56
# Callback to status worker
57
-
${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \
0
58
-H "Authorization: Bearer $TRIAGE_AUTH_TOKEN" \
59
-H "Content-Type: application/json" \
60
-
-d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" '{triage_report: $report, status: "identified"}')" \
61
-
|| echo "Failed to post triage report to $CALLBACK_URL"
0
0
62
'';
63
64
webhookServer = pkgs.writeText "triage-webhook.ts" ''
···
89
return new Response("missing fields", { status: 400 });
90
}
91
0
0
92
// Spawn triage in background and respond immediately
93
Bun.spawn(["${triageRunScript}/bin/run-triage", body.service_name, String(body.incident_id), body.callback_url], {
94
env: { ...process.env },
95
-
stdout: "ignore",
96
-
stderr: "ignore",
97
});
98
99
return new Response(JSON.stringify({ ok: true, incident_id: body.incident_id }), {
···
166
pkgs.systemd
167
pkgs.coreutils
168
pkgs.gnugrep
0
169
pkgs.curl
170
pkgs.jq
171
pkgs.nodejs_22
···
185
ExecStart = "${pkgs.unstable.bun}/bin/bun run ${webhookServer}";
186
EnvironmentFile = cfg.secretsFile;
187
Environment = [
188
-
"PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.curl pkgs.jq pkgs.nodejs_22 ]}"
189
];
190
Restart = "on-failure";
191
RestartSec = "10s";
···
32
INCIDENT_ID="$2"
33
CALLBACK_URL="$3"
34
35
+
echo "[triage] Starting triage for service=$SERVICE_NAME incident=$INCIDENT_ID"
36
+
37
# Collect filtered logs
38
FILTERED_LOGS=$(${triageFilterScript}/bin/triage-filter "$SERVICE_NAME" 2>&1 || true)
39
···
41
FILTERED_LOGS="No relevant log lines found in the last 10 minutes."
42
fi
43
44
+
LOG_LINES=$(echo "$FILTERED_LOGS" | ${pkgs.coreutils}/bin/wc -l)
45
+
echo "[triage] Log collection done: $LOG_LINES lines"
46
+
47
# Run Claude headless for triage
48
export HOME="/var/lib/triage-agent"
49
+
echo "[triage] Invoking Claude for analysis..."
50
+
RAW_OUTPUT=$(${cfg.claudePath} -p \
51
--model sonnet \
52
--max-turns 10 \
53
--allowedTools "Bash(journalctl:*) Bash(systemctl:*) Bash(curl:*) Bash(ss:*) Bash(df:*) Bash(free:*) Bash(ps:*) Bash(cat:/etc/*) Bash(ls:*) Bash(stat:*)" \
54
+
"You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation.
55
+
56
+
Your output MUST follow this exact format:
57
+
SUMMARY: <one sentence describing the root cause>
58
+
---
59
+
<full triage report in markdown, max 500 words, with: 1) What failed 2) Likely root cause 3) Suggested fix>
60
+
61
+
The SUMMARY line must be a single sentence. Do not speculate beyond what the evidence shows.
62
63
Service '$SERVICE_NAME' is down. Here are the filtered logs from the last 10 minutes:
64
65
$FILTERED_LOGS
66
67
+
Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write your response in the format above." 2>&1 || echo "SUMMARY: Triage agent failed to produce a report.
68
+
---
69
+
Triage agent failed to produce a report.")
70
+
71
+
echo "[triage] Report generated: ''${#RAW_OUTPUT} chars"
72
+
73
+
# Split output into summary and full report
74
+
SUMMARY=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed -n 's/^SUMMARY: //p' | ${pkgs.coreutils}/bin/head -1)
75
+
REPORT=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed '1,/^---$/d')
76
+
77
+
# Fallback if parsing fails
78
+
if [ -z "$SUMMARY" ]; then
79
+
SUMMARY="Triage completed for $SERVICE_NAME"
80
+
fi
81
+
if [ -z "$REPORT" ]; then
82
+
REPORT="$RAW_OUTPUT"
83
+
fi
84
85
# Callback to status worker
86
+
echo "[triage] Posting report to $CALLBACK_URL"
87
+
CALLBACK_RESULT=$(${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \
88
-H "Authorization: Bearer $TRIAGE_AUTH_TOKEN" \
89
-H "Content-Type: application/json" \
90
+
-d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" --arg summary "$SUMMARY" '{triage_report: $report, status: "identified", summary: $summary}')" \
91
+
-w "%{http_code}" -o /dev/null \
92
+
|| echo "FAILED")
93
+
echo "[triage] Callback result: $CALLBACK_RESULT"
94
'';
95
96
webhookServer = pkgs.writeText "triage-webhook.ts" ''
···
121
return new Response("missing fields", { status: 400 });
122
}
123
124
+
console.log(`[webhook] Received triage request: service=''${body.service_name} incident=''${body.incident_id}`);
125
+
126
// Spawn triage in background and respond immediately
127
Bun.spawn(["${triageRunScript}/bin/run-triage", body.service_name, String(body.incident_id), body.callback_url], {
128
env: { ...process.env },
129
+
stdout: "inherit",
130
+
stderr: "inherit",
131
});
132
133
return new Response(JSON.stringify({ ok: true, incident_id: body.incident_id }), {
···
200
pkgs.systemd
201
pkgs.coreutils
202
pkgs.gnugrep
203
+
pkgs.gnused
204
pkgs.curl
205
pkgs.jq
206
pkgs.nodejs_22
···
220
ExecStart = "${pkgs.unstable.bun}/bin/bun run ${webhookServer}";
221
EnvironmentFile = cfg.secretsFile;
222
Environment = [
223
+
"PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.gnused pkgs.curl pkgs.jq pkgs.nodejs_22 ]}"
224
];
225
Restart = "on-failure";
226
RestartSec = "10s";