Kieran's opinionated (and probably slightly dumb) nix config

feat: update triage agent

dunkirk.sh d4501257 3804ed58

verified
+44 -9
+44 -9
modules/nixos/services/triage-agent.nix
···
32 32 INCIDENT_ID="$2"
33 33 CALLBACK_URL="$3"
34 34
35 + echo "[triage] Starting triage for service=$SERVICE_NAME incident=$INCIDENT_ID"
36 +
35 37 # Collect filtered logs
36 38 FILTERED_LOGS=$(${triageFilterScript}/bin/triage-filter "$SERVICE_NAME" 2>&1 || true)
37 39
···
39 41 FILTERED_LOGS="No relevant log lines found in the last 10 minutes."
40 42 fi
41 43
44 + LOG_LINES=$(echo "$FILTERED_LOGS" | ${pkgs.coreutils}/bin/wc -l)
45 + echo "[triage] Log collection done: $LOG_LINES lines"
46 +
42 47 # Run Claude headless for triage
43 48 export HOME="/var/lib/triage-agent"
44 - REPORT=$(${cfg.claudePath} -p \
49 + echo "[triage] Invoking Claude for analysis..."
50 + RAW_OUTPUT=$(${cfg.claudePath} -p \
45 51 --model sonnet \
46 52 --max-turns 10 \
47 53 --allowedTools "Bash(journalctl:*) Bash(systemctl:*) Bash(curl:*) Bash(ss:*) Bash(df:*) Bash(free:*) Bash(ps:*) Bash(cat:/etc/*) Bash(ls:*) Bash(stat:*)" \
48 - "You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation. Write a concise triage report (max 500 words) in markdown with: 1) What failed 2) Likely root cause 3) Suggested fix. Do not speculate beyond what the evidence shows.
54 + "You are a service triage agent investigating why a service is down. You have access to tools for deeper investigation.
55 +
56 + Your output MUST follow this exact format:
57 + SUMMARY: <one sentence describing the root cause>
58 + ---
59 + <full triage report in markdown, max 500 words, with: 1) What failed 2) Likely root cause 3) Suggested fix>
60 +
61 + The SUMMARY line must be a single sentence. Do not speculate beyond what the evidence shows.
49 62
50 63 Service '$SERVICE_NAME' is down. Here are the filtered logs from the last 10 minutes:
51 64
52 65 $FILTERED_LOGS
53 66
54 - Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write a triage report." 2>&1 || echo "Triage agent failed to produce a report.")
67 + Investigate further if the logs aren't conclusive — check disk space, memory, open ports, config files, or related service logs. Then write your response in the format above." 2>&1 || echo "SUMMARY: Triage agent failed to produce a report.
68 + ---
69 + Triage agent failed to produce a report.")
70 +
71 + echo "[triage] Report generated: ''${#RAW_OUTPUT} chars"
72 +
73 + # Split output into summary and full report
74 + SUMMARY=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed -n 's/^SUMMARY: //p' | ${pkgs.coreutils}/bin/head -1)
75 + REPORT=$(echo "$RAW_OUTPUT" | ${pkgs.gnused}/bin/sed '1,/^---$/d')
76 +
77 + # Fallback if parsing fails
78 + if [ -z "$SUMMARY" ]; then
79 + SUMMARY="Triage completed for $SERVICE_NAME"
80 + fi
81 + if [ -z "$REPORT" ]; then
82 + REPORT="$RAW_OUTPUT"
83 + fi
55 84
56 85 # Callback to status worker
57 - ${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \
86 + echo "[triage] Posting report to $CALLBACK_URL"
87 + CALLBACK_RESULT=$(${pkgs.curl}/bin/curl -sf -X PATCH "$CALLBACK_URL" \
58 88 -H "Authorization: Bearer $TRIAGE_AUTH_TOKEN" \
59 89 -H "Content-Type: application/json" \
60 - -d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" '{triage_report: $report, status: "identified"}')" \
61 - || echo "Failed to post triage report to $CALLBACK_URL"
90 + -d "$(${pkgs.jq}/bin/jq -n --arg report "$REPORT" --arg summary "$SUMMARY" '{triage_report: $report, status: "identified", summary: $summary}')" \
91 + -w "%{http_code}" -o /dev/null \
92 + || echo "FAILED")
93 + echo "[triage] Callback result: $CALLBACK_RESULT"
62 94 '';
63 95
64 96 webhookServer = pkgs.writeText "triage-webhook.ts" ''
···
89 121 return new Response("missing fields", { status: 400 });
90 122 }
91 123
124 + console.log(`[webhook] Received triage request: service=''${body.service_name} incident=''${body.incident_id}`);
125 +
92 126 // Spawn triage in background and respond immediately
93 127 Bun.spawn(["${triageRunScript}/bin/run-triage", body.service_name, String(body.incident_id), body.callback_url], {
94 128 env: { ...process.env },
95 - stdout: "ignore",
96 - stderr: "ignore",
129 + stdout: "inherit",
130 + stderr: "inherit",
97 131 });
98 132
99 133 return new Response(JSON.stringify({ ok: true, incident_id: body.incident_id }), {
···
166 200 pkgs.systemd
167 201 pkgs.coreutils
168 202 pkgs.gnugrep
203 + pkgs.gnused
169 204 pkgs.curl
170 205 pkgs.jq
171 206 pkgs.nodejs_22
···
185 220 ExecStart = "${pkgs.unstable.bun}/bin/bun run ${webhookServer}";
186 221 EnvironmentFile = cfg.secretsFile;
187 222 Environment = [
188 - "PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.curl pkgs.jq pkgs.nodejs_22 ]}"
223 + "PATH=/var/lib/triage-agent/.npm-global/bin:${lib.makeBinPath [ pkgs.systemd pkgs.coreutils pkgs.gnugrep pkgs.gnused pkgs.curl pkgs.jq pkgs.nodejs_22 ]}"
189 224 ];
190 225 Restart = "on-failure";
191 226 RestartSec = "10s";