Kieran's opinionated (and probably slightly dumb) Nix config

feat: update triage agent

dunkirk.sh d4501257 3804ed58

verified
+44 -9
+44 -9
modules/nixos/services/triage-agent.nix
··· 32 INCIDENT_ID="$2" 33 CALLBACK_URL="$3" 34 35 # Collect filtered logs 36 FILTERED_LOGS=$(${triageFilterScript}/bin/triage-filter "$SERVICE_NAME" 2>&1 || true) 37 ··· 39 FILTERED_LOGS="No relevant log lines found in the last 10 minutes." 40 fi 41 42 # Run Claude headless for triage 43 export HOME="/var/lib/triage-agent" 44 - REPORT=$(${cfg.claudePath} -p \ 45 --model sonnet \ 46 --max-turns 10 \ 47 --allowedTools "Bash(journalctl:*) Bash(systemctl:*) Bash(curl:*) Bash(ss:*) Bash(df:*) Bash(free:*) Bash(ps:*) Bash(cat:/etc/*) Bash(ls:*) Bash(stat:*)" \ 48 - "You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation. Write a concise triage report (max 500 words) in markdown with: 1) What failed 2) Likely root cause 3) Suggested fix. Do not speculate beyond what the evidence shows. 49 50 Service '$SERVICE_NAME' is down. Here are the filtered logs from the last 10 minutes: 51 52 $FILTERED_LOGS 53 54 - Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write a triage report." 
2>&1 || echo "Triage agent failed to produce a report.") 55 56 # Callback to status worker 57 - ${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \ 58 -H "Authorization: Bearer $TRIAGE_AUTH_TOKEN" \ 59 -H "Content-Type: application/json" \ 60 - -d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" '{triage_report: $report, status: "identified"}')" \ 61 - || echo "Failed to post triage report to $CALLBACK_URL" 62 ''; 63 64 webhookServer = pkgs.writeText "triage-webhook.ts" '' ··· 89 return new Response("missing fields", { status: 400 }); 90 } 91 92 // Spawn triage in background and respond immediately 93 Bun.spawn(["${triageRunScript}/bin/run-triage", body.service_name, String(body.incident_id), body.callback_url], { 94 env: { ...process.env }, 95 - stdout: "ignore", 96 - stderr: "ignore", 97 }); 98 99 return new Response(JSON.stringify({ ok: true, incident_id: body.incident_id }), { ··· 166 pkgs.systemd 167 pkgs.coreutils 168 pkgs.gnugrep 169 pkgs.curl 170 pkgs.jq 171 pkgs.nodejs_22 ··· 185 ExecStart = "${pkgs.unstable.bun}/bin/bun run ${webhookServer}"; 186 EnvironmentFile = cfg.secretsFile; 187 Environment = [ 188 - "PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.curl pkgs.jq pkgs.nodejs_22 ]}" 189 ]; 190 Restart = "on-failure"; 191 RestartSec = "10s";
··· 32 INCIDENT_ID="$2" 33 CALLBACK_URL="$3" 34 35 + echo "[triage] Starting triage for service=$SERVICE_NAME incident=$INCIDENT_ID" 36 + 37 # Collect filtered logs 38 FILTERED_LOGS=$(${triageFilterScript}/bin/triage-filter "$SERVICE_NAME" 2>&1 || true) 39 ··· 41 FILTERED_LOGS="No relevant log lines found in the last 10 minutes." 42 fi 43 44 + LOG_LINES=$(echo "$FILTERED_LOGS" | ${pkgs.coreutils}/bin/wc -l) 45 + echo "[triage] Log collection done: $LOG_LINES lines" 46 + 47 # Run Claude headless for triage 48 export HOME="/var/lib/triage-agent" 49 + echo "[triage] Invoking Claude for analysis..." 50 + RAW_OUTPUT=$(${cfg.claudePath} -p \ 51 --model sonnet \ 52 --max-turns 10 \ 53 --allowedTools "Bash(journalctl:*) Bash(systemctl:*) Bash(curl:*) Bash(ss:*) Bash(df:*) Bash(free:*) Bash(ps:*) Bash(cat:/etc/*) Bash(ls:*) Bash(stat:*)" \ 54 + "You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation. 55 + 56 + Your output MUST follow this exact format: 57 + SUMMARY: <one sentence describing the root cause> 58 + --- 59 + <full triage report in markdown, max 500 words, with: 1) What failed 2) Likely root cause 3) Suggested fix> 60 + 61 + The SUMMARY line must be a single sentence. Do not speculate beyond what the evidence shows. 62 63 Service '$SERVICE_NAME' is down. Here are the filtered logs from the last 10 minutes: 64 65 $FILTERED_LOGS 66 67 + Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write your response in the format above." 2>&1 || echo "SUMMARY: Triage agent failed to produce a report. 
68 + --- 69 + Triage agent failed to produce a report.") 70 + 71 + echo "[triage] Report generated: ''${#RAW_OUTPUT} chars" 72 + 73 + # Split output into summary and full report 74 + SUMMARY=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed -n 's/^SUMMARY: //p' | ${pkgs.coreutils}/bin/head -1) 75 + REPORT=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed '1,/^---$/d') 76 + 77 + # Fallback if parsing fails 78 + if [ -z "$SUMMARY" ]; then 79 + SUMMARY="Triage completed for $SERVICE_NAME" 80 + fi 81 + if [ -z "$REPORT" ]; then 82 + REPORT="$RAW_OUTPUT" 83 + fi 84 85 # Callback to status worker 86 + echo "[triage] Posting report to $CALLBACK_URL" 87 + CALLBACK_RESULT=$(${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \ 88 -H "Authorization: Bearer $TRIAGE_AUTH_TOKEN" \ 89 -H "Content-Type: application/json" \ 90 + -d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" --arg summary "$SUMMARY" '{triage_report: $report, status: "identified", summary: $summary}')" \ 91 + -w "%{http_code}" -o /dev/null \ 92 + || echo "FAILED") 93 + echo "[triage] Callback result: $CALLBACK_RESULT" 94 ''; 95 96 webhookServer = pkgs.writeText "triage-webhook.ts" '' ··· 121 return new Response("missing fields", { status: 400 }); 122 } 123 124 + console.log(`[webhook] Received triage request: service=''${body.service_name} incident=''${body.incident_id}`); 125 + 126 // Spawn triage in background and respond immediately 127 Bun.spawn(["${triageRunScript}/bin/run-triage", body.service_name, String(body.incident_id), body.callback_url], { 128 env: { ...process.env }, 129 + stdout: "inherit", 130 + stderr: "inherit", 131 }); 132 133 return new Response(JSON.stringify({ ok: true, incident_id: body.incident_id }), { ··· 200 pkgs.systemd 201 pkgs.coreutils 202 pkgs.gnugrep 203 + pkgs.gnused 204 pkgs.curl 205 pkgs.jq 206 pkgs.nodejs_22 ··· 220 ExecStart = "${pkgs.unstable.bun}/bin/bun run ${webhookServer}"; 221 EnvironmentFile = cfg.secretsFile; 222 Environment = [ 223 + 
"PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.gnused pkgs.curl pkgs.jq pkgs.nodejs_22 ]}" 224 ]; 225 Restart = "on-failure"; 226 RestartSec = "10s";