Openstatus www.openstatus.dev

feat: incident audit logs (#1255)

* feat: incident audit logs

* chore: add missing comment

* chore: audit log positioning

* chore: rework audit log

* chore: add notification id to log

* ci: apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>

authored by

Maximilian Kaske
autofix-ci[bot]
and committed by
GitHub
be3096a2 b302a29a

+118 -34
+11 -2
apps/workflows/src/checker/alerting.ts
··· 102 102 id: `monitor:${monitorId}`, 103 103 action: "notification.sent", 104 104 targets: [{ id: monitorId, type: "monitor" }], 105 - metadata: { provider: notif.notification.provider }, 105 + metadata: { 106 + provider: notif.notification.provider, 107 + cronTimestamp, 108 + type: notifType, 109 + notificationId: notif.notification.id, 110 + }, 106 111 }); 107 112 // 108 113 } ··· 112 117 monitorId, 113 118 notificationId, 114 119 cronTimestamp, 115 - }: { monitorId: number; notificationId: number; cronTimestamp: number }) => { 120 + }: { 121 + monitorId: number; 122 + notificationId: number; 123 + cronTimestamp: number; 124 + }) => { 116 125 await db 117 126 .insert(schema.notificationTrigger) 118 127 .values({
+67 -24
apps/workflows/src/checker/index.ts
··· 4 4 import { z } from "zod"; 5 5 6 6 import { and, count, db, eq, inArray, isNull, schema } from "@openstatus/db"; 7 - import { incidentTable, workspace } from "@openstatus/db/src/schema"; 7 + import { incidentTable } from "@openstatus/db/src/schema"; 8 8 import { 9 9 monitorStatusSchema, 10 10 selectMonitorSchema, ··· 94 94 return c.json({ success: true }, 200); 95 95 } 96 96 97 + // audit log the current state of the ping 98 + 99 + switch (status) { 100 + case "active": 101 + await checkerAudit.publishAuditLog({ 102 + id: `monitor:${monitorId}`, 103 + action: "monitor.recovered", 104 + targets: [{ id: monitorId, type: "monitor" }], 105 + metadata: { 106 + region, 107 + statusCode: statusCode ?? -1, 108 + cronTimestamp, 109 + latency, 110 + }, 111 + }); 112 + break; 113 + case "degraded": 114 + await checkerAudit.publishAuditLog({ 115 + id: `monitor:${monitorId}`, 116 + action: "monitor.degraded", 117 + targets: [{ id: monitorId, type: "monitor" }], 118 + metadata: { 119 + region, 120 + statusCode: statusCode ?? -1, 121 + cronTimestamp, 122 + latency, 123 + }, 124 + }); 125 + break; 126 + case "error": 127 + await checkerAudit.publishAuditLog({ 128 + id: `monitor:${monitorId}`, 129 + action: "monitor.failed", 130 + targets: [{ id: monitorId, type: "monitor" }], 131 + metadata: { 132 + region, 133 + statusCode: statusCode ?? -1, 134 + message, 135 + cronTimestamp, 136 + latency, 137 + }, 138 + }); 139 + break; 140 + } 141 + 97 142 if (affectedRegion.count >= numberOfRegions / 2 || numberOfRegions === 1) { 98 143 switch (status) { 99 144 case "active": { ··· 130 175 // incident is already resolved 131 176 break; 132 177 } 133 - 134 178 console.log(`🤓 recovering incident ${incident.id}`); 179 + 135 180 await db 136 181 .update(incidentTable) 137 182 .set({ ··· 140 185 }) 141 186 .where(eq(incidentTable.id, incident.id)) 142 187 .run(); 188 + 189 + await checkerAudit.publishAuditLog({ 190 + id: `monitor:${monitorId}`, 191 + action: "incident.resolved", 192 + targets: [{ id: monitorId, type: "monitor" }], 193 + metadata: { cronTimestamp, incidentId: incident.id }, 194 + }); 143 195 } 144 196 145 197 await triggerNotifications({ ··· 153 205 incidentId: `${cronTimestamp}`, 154 206 }); 155 207 156 - await checkerAudit.publishAuditLog({ 157 - id: `monitor:${monitorId}`, 158 - action: "monitor.recovered", 159 - targets: [{ id: monitorId, type: "monitor" }], 160 - metadata: { region: region, statusCode: statusCode ?? -1 }, 161 - }); 162 - 163 208 break; 164 209 } 165 210 case "degraded": ··· 168 213 break; 169 214 } 170 215 console.log(`🔄 update monitorStatus ${monitor.id} status: DEGRADED`); 216 + 171 217 await db 172 218 .update(schema.monitor) 173 219 .set({ status: "degraded" }) ··· 184 230 incidentId: `${cronTimestamp}`, 185 231 }); 186 232 187 - await checkerAudit.publishAuditLog({ 188 - id: `monitor:${monitorId}`, 189 - action: "monitor.degraded", 190 - targets: [{ id: monitorId, type: "monitor" }], 191 - metadata: { region, statusCode: statusCode ?? -1 }, 192 - }); 193 233 break; 194 234 case "error": 195 235 if (monitor.status === "error") { ··· 198 238 } 199 239 200 240 console.log(`🔄 update monitorStatus ${monitor.id} status: ERROR`); 241 + 201 242 await db 202 243 .update(schema.monitor) 203 244 .set({ status: "error" }) ··· 219 260 console.log("we are already in incident"); 220 261 break; 221 262 } 222 - const newIncident = await db 263 + const [newIncident] = await db 223 264 .insert(incidentTable) 224 265 .values({ 225 266 monitorId: Number(monitorId), ··· 228 269 }) 229 270 .returning(); 230 271 231 - if (!newIncident[0].id) { 272 + if (!newIncident.id) { 232 273 return; 233 274 } 275 + 276 + await checkerAudit.publishAuditLog({ 277 + id: `monitor:${monitorId}`, 278 + action: "incident.created", 279 + targets: [{ id: monitorId, type: "monitor" }], 280 + metadata: { cronTimestamp, incidentId: newIncident.id }, 281 + }); 282 + 234 283 await triggerNotifications({ 235 284 monitorId, 236 285 statusCode, ··· 239 288 cronTimestamp, 240 289 latency, 241 290 region, 242 - incidentId: String(newIncident[0].id), 291 + incidentId: String(newIncident.id), 243 292 }); 244 293 245 294 await db 246 295 .update(schema.monitor) 247 296 .set({ status: "error" }) 248 297 .where(eq(schema.monitor.id, monitor.id)); 249 - await checkerAudit.publishAuditLog({ 250 - id: `monitor:${monitorId}`, 251 - action: "monitor.failed", 252 - targets: [{ id: monitorId, type: "monitor" }], 253 - metadata: { region, statusCode, message }, 254 - }); 255 298 } catch { 256 299 console.log("incident was already created"); 257 300 }
+18 -8
packages/tinybird/src/audit-log/action-schema.ts
··· 9 9 metadata: z.object({ 10 10 region: z.string(), 11 11 statusCode: z.number(), 12 + latency: z.number().optional(), 12 13 cronTimestamp: z.number().optional(), 13 14 }), 14 15 }); ··· 23 24 region: z.string(), 24 25 statusCode: z.number(), 25 26 cronTimestamp: z.number().optional(), 27 + latency: z.number().optional(), 26 28 }), 27 29 }); 28 30 ··· 36 38 region: z.string(), 37 39 statusCode: z.number().optional(), 38 40 message: z.string().optional(), 41 + latency: z.number().optional(), 39 42 cronTimestamp: z.number().optional(), 40 43 }), 41 44 }); ··· 46 49 */ 47 50 export const notificationSentSchema = z.object({ 48 51 action: z.literal("notification.sent"), 49 - // we could use the notificationProviderSchema for more type safety 50 - metadata: z.object({ provider: z.string() }), 52 + metadata: z.object({ 53 + // we could use the notificationProviderSchema for more type safety 54 + provider: z.string(), 55 + cronTimestamp: z.number().optional(), 56 + type: z.enum(["alert", "recovery", "degraded"]).optional(), 57 + notificationId: z.number().optional(), 58 + }), 51 59 }); 52 60 53 - // TODO: update schemas with correct metadata and description 54 - 55 61 export const incidentCreatedSchema = z.object({ 56 62 action: z.literal("incident.created"), 57 - metadata: z.object({}), // tbd 63 + metadata: z.object({ 64 + cronTimestamp: z.number().optional(), 65 + incidentId: z.number().optional(), 66 + }), 58 67 }); 59 68 60 69 export const incidentResolvedSchema = z.object({ 61 70 action: z.literal("incident.resolved"), 62 - metadata: z.object({}), // tbd 71 + metadata: z.object({ 72 + cronTimestamp: z.number().optional(), 73 + incidentId: z.number().optional(), 74 + }), 63 75 }); 64 - 65 - // ...
+22
packages/tinybird/src/audit-log/action-validation.ts
··· 1 1 import { z } from "zod"; 2 2 3 3 import { 4 + incidentCreatedSchema, 5 + incidentResolvedSchema, 4 6 monitorDegradedSchema, 5 7 monitorFailedSchema, 6 8 monitorRecoveredSchema, ··· 25 27 monitorDegradedSchema, 26 28 monitorFailedSchema, 27 29 notificationSentSchema, 30 + incidentCreatedSchema, 31 + incidentResolvedSchema, 28 32 ]), 29 33 ingestBaseEventSchema, 30 34 ) ··· 49 53 monitorRecoveredSchema.shape.metadata, 50 54 ), 51 55 }), 56 + monitorDegradedSchema.extend({ 57 + metadata: z.preprocess( 58 + (val) => JSON.parse(String(val)), 59 + monitorDegradedSchema.shape.metadata, 60 + ), 61 + }), 52 62 monitorFailedSchema.extend({ 53 63 metadata: z.preprocess( 54 64 (val) => JSON.parse(String(val)), ··· 59 69 metadata: z.preprocess( 60 70 (val) => JSON.parse(String(val)), 61 71 notificationSentSchema.shape.metadata, 72 + ), 73 + }), 74 + incidentCreatedSchema.extend({ 75 + metadata: z.preprocess( 76 + (val) => JSON.parse(String(val)), 77 + incidentCreatedSchema.shape.metadata, 78 + ), 79 + }), 80 + incidentResolvedSchema.extend({ 81 + metadata: z.preprocess( 82 + (val) => JSON.parse(String(val)), 83 + incidentResolvedSchema.shape.metadata, 62 84 ), 63 85 }), 64 86 ]),