coral.waow.tech (this repository has no description)

feat: LLM curator for named groups, haiku display, and docs overhaul

Replace per-cluster labeling with LLM-curated named groups:
- Claude Haiku 4.5 curates co-occurrence clusters into max 5 named
groups every 5 min with topic history for continuity
- Backend Group struct with case-insensitive entity matching
- POST /groups endpoint replaces /cluster-labels
- Frontend renders group pills, entity pips, and attributed haiku
- Relative trend arrows (percentile-based, not absolute thresholds)
- Fix: stats key rename crash loop (topics→groups in bridge.py)
- Fix: haiku container styling (Safari footer bleed, height collapse)
- Docs: rewrite architecture, add curator doc, update CLAUDE.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+640 -353
+2
CLAUDE.md
··· 31 - **entity graph**: nodes = named entities, edges = co-occurrence (mentioned in same post) 32 - **pheromone edges**: edge weights decay exponentially, reinforced on repeated co-occurrence 33 - **z-score trending**: entities ranked by "surprise" (sigma above baseline), not raw counts 34 - **firehose rate**: raw jetstream messages/sec displayed as health indicator 35 36 ## entity types ··· 46 - docs/02-semantic-percolation-plan.md: theoretical background + design decisions 47 - docs/03-baseline-audit.md: empirical observations from live data 48 - docs/04-architecture.md: system architecture, data flow, deployment, failure modes
··· 31 - **entity graph**: nodes = named entities, edges = co-occurrence (mentioned in same post) 32 - **pheromone edges**: edge weights decay exponentially, reinforced on repeated co-occurrence 33 - **z-score trending**: entities ranked by "surprise" (sigma above baseline), not raw counts 34 + - **LLM curator**: Claude Haiku 4.5 curates clusters into named groups every 5 min (dedup.py) 35 - **firehose rate**: raw jetstream messages/sec displayed as health indicator 36 37 ## entity types ··· 47 - docs/02-semantic-percolation-plan.md: theoretical background + design decisions 48 - docs/03-baseline-audit.md: empirical observations from live data 49 - docs/04-architecture.md: system architecture, data flow, deployment, failure modes 50 + - docs/06-llm-curator.md: LLM curator design, prompt, validation, topic continuity
+97 -63
backend/src/entity_graph.zig
··· 98 // what "spanning" means in a co-occurrence graph context 99 pub const SPANNING_THRESHOLD: f32 = 0.5; 100 101 pub const EntityId = u32; 102 pub const UserId = u32; 103 pub const Timestamp = i64; // milliseconds since epoch ··· 462 // bridge rate: EWMA of cross-cluster co-occurrences 463 bridges: BridgeCounter = .{}, 464 465 - // external cluster labels from NER bridge LLM narration 466 - // indexed by cluster root entity ID; overrides mechanical "A + B" label 467 - cluster_label_text: [MAX_ENTITIES][64]u8 = undefined, 468 - cluster_label_len: [MAX_ENTITIES]u8 = [_]u8{0} ** MAX_ENTITIES, 469 470 // cached stats 471 stats: GraphStats = .{ ··· 481 482 mutex: Mutex = .{}, 483 484 - /// set an external cluster label (from LLM narration) 485 - /// cluster_id is the entity ID of the cluster root 486 - pub fn setClusterLabel(self: *EntityGraph, cluster_id: u32, label_text: []const u8) void { 487 - if (cluster_id >= MAX_ENTITIES) return; 488 - const len: u8 = @intCast(@min(label_text.len, 64)); 489 - @memcpy(self.cluster_label_text[cluster_id][0..len], label_text[0..len]); 490 - self.cluster_label_len[cluster_id] = len; 491 } 492 493 - /// clear all external cluster labels (call before setting new batch) 494 - pub fn clearClusterLabels(self: *EntityGraph) void { 495 - @memset(&self.cluster_label_len, 0); 496 } 497 498 /// find or create user by DID hash, returns user ID ··· 968 } 969 } 970 971 - // cluster-level scores (sum of positive trends) and labels 972 var cluster_score: [MAX_ENTITIES]f32 = [_]f32{0} ** MAX_ENTITIES; 973 - var cluster_label_weight: [MAX_ENTITIES]f32 = [_]f32{0} ** MAX_ENTITIES; 974 - var cluster_label_a: [MAX_ENTITIES]EntityId = [_]EntityId{NO_ENTITY} ** MAX_ENTITIES; 975 - var cluster_label_b: [MAX_ENTITIES]EntityId = [_]EntityId{NO_ENTITY} ** MAX_ENTITIES; 976 977 for (0..self.count) |i| { 978 const entity = &self.entities[i]; ··· 985 } 986 } 987 988 - // pick strongest degree-normalized edge per cluster for a lightweight label 989 - for 
(0..self.count) |i| { 990 - const entity_a = &self.entities[i]; 991 - if (!entity_a.isActive()) continue; 992 - const root = roots[i]; 993 - if (root == NO_ENTITY) continue; 994 - for (0..entity_a.edges.count) |edge_idx| { 995 - const id_b = entity_a.edges.edges[edge_idx]; 996 - if (id_b <= i) continue; 997 - if (id_b >= self.count or !self.entities[id_b].isActive()) continue; 998 - if (roots[id_b] != root) continue; 999 - 1000 - const edge_ts = entity_a.edges.last_seen[edge_idx]; 1001 - if ((now - edge_ts) >= EDGE_DECAY_MS) continue; 1002 - 1003 - const raw_weight = self.edgeWeight(@intCast(i), id_b, now); 1004 - const ra = @max(self.entities[i].smoothed_rate, 0.001); 1005 - const rb = @max(self.entities[id_b].smoothed_rate, 0.001); 1006 - const weight = raw_weight / @sqrt(ra * rb); 1007 - if (weight > cluster_label_weight[root]) { 1008 - cluster_label_weight[root] = weight; 1009 - cluster_label_a[root] = @intCast(i); 1010 - cluster_label_b[root] = id_b; 1011 - } 1012 - } 1013 - } 1014 - 1015 jw.beginObject() catch return "{}"; 1016 jw.objectField("entities") catch return "{}"; 1017 jw.beginArray() catch return "{}"; ··· 1047 if (root != NO_ENTITY) { 1048 jw.objectField("cluster_score") catch break; 1049 jw.write(cluster_score[root]) catch break; 1050 1051 - // prefer LLM-generated label over mechanical "A + B" 1052 - jw.objectField("cluster_label") catch break; 1053 - if (self.cluster_label_len[root] > 0) { 1054 - jw.write(self.cluster_label_text[root][0..self.cluster_label_len[root]]) catch break; 1055 - } else { 1056 - const label_a = cluster_label_a[root]; 1057 - const label_b = cluster_label_b[root]; 1058 - if (label_a != NO_ENTITY and label_b != NO_ENTITY) { 1059 - var label_buf: [140]u8 = undefined; 1060 - const label_str = std.fmt.bufPrint(&label_buf, "{s} + {s}", .{ 1061 - self.entities[label_a].getText(), 1062 - self.entities[label_b].getText(), 1063 - }) catch ""; 1064 - jw.write(label_str) catch break; 1065 - } else { 1066 - jw.write("") catch break; 
1067 - } 1068 - } 1069 } 1070 jw.objectField("largest") catch break; 1071 jw.write(in_largest) catch break; 1072 jw.objectField("x") catch break; ··· 1104 1105 jw.endArray() catch return "{}"; 1106 1107 // stats object 1108 jw.objectField("stats") catch return "{}"; 1109 jw.beginObject() catch return "{}"; ··· 1127 jw.write(self.stats.bridge_rate) catch return "{}"; 1128 jw.endObject() catch return "{}"; 1129 1130 jw.endObject() catch return "{}"; 1131 1132 return w.buffered(); 1133 } 1134 1135 /// diagnostics: compute distributions for analysis
··· 98 // what "spanning" means in a co-occurrence graph context 99 pub const SPANNING_THRESHOLD: f32 = 0.5; 100 101 + // LLM-curated named groups 102 + pub const MAX_GROUPS = 5; 103 + pub const MAX_GROUP_ENTITIES = 20; 104 + 105 + pub const Group = struct { 106 + name: [64]u8 = undefined, 107 + name_len: u8 = 0, 108 + entity_texts: [MAX_GROUP_ENTITIES][64]u8 = undefined, 109 + entity_text_lens: [MAX_GROUP_ENTITIES]u8 = [_]u8{0} ** MAX_GROUP_ENTITIES, 110 + entity_count: u8 = 0, 111 + }; 112 + 113 pub const EntityId = u32; 114 pub const UserId = u32; 115 pub const Timestamp = i64; // milliseconds since epoch ··· 474 // bridge rate: EWMA of cross-cluster co-occurrences 475 bridges: BridgeCounter = .{}, 476 477 + // LLM-curated named groups (max 5, replaced atomically) 478 + groups: [MAX_GROUPS]Group = [_]Group{.{}} ** MAX_GROUPS, 479 + group_count: u8 = 0, 480 + 481 + // global haiku from LLM narration (zeitgeist summary) 482 + haiku_text: [256]u8 = undefined, 483 + haiku_len: u16 = 0, 484 485 // cached stats 486 stats: GraphStats = .{ ··· 496 497 mutex: Mutex = .{}, 498 499 + /// atomically replace all groups (from LLM curation) 500 + pub fn setGroups(self: *EntityGraph, groups: []const Group, count: u8) void { 501 + const n = @min(count, MAX_GROUPS); 502 + for (0..n) |i| { 503 + self.groups[i] = groups[i]; 504 + } 505 + self.group_count = n; 506 } 507 508 + /// clear all groups 509 + pub fn clearGroups(self: *EntityGraph) void { 510 + self.group_count = 0; 511 + } 512 + 513 + /// set global haiku (zeitgeist summary from LLM) 514 + pub fn setHaiku(self: *EntityGraph, text: []const u8) void { 515 + const len: u16 = @intCast(@min(text.len, 256)); 516 + @memcpy(self.haiku_text[0..len], text[0..len]); 517 + self.haiku_len = len; 518 } 519 520 /// find or create user by DID hash, returns user ID ··· 990 } 991 } 992 993 + // cluster-level scores (sum of positive trends) 994 var cluster_score: [MAX_ENTITIES]f32 = [_]f32{0} ** MAX_ENTITIES; 995 996 for (0..self.count) |i| 
{ 997 const entity = &self.entities[i]; ··· 1004 } 1005 } 1006 1007 jw.beginObject() catch return "{}"; 1008 jw.objectField("entities") catch return "{}"; 1009 jw.beginArray() catch return "{}"; ··· 1039 if (root != NO_ENTITY) { 1040 jw.objectField("cluster_score") catch break; 1041 jw.write(cluster_score[root]) catch break; 1042 + } 1043 1044 + // match entity text against LLM groups (case-insensitive) 1045 + const group_match = self.matchGroup(entity.getText()); 1046 + if (group_match.index) |gi| { 1047 + jw.objectField("group") catch break; 1048 + jw.write(gi) catch break; 1049 + jw.objectField("group_name") catch break; 1050 + jw.write(self.groups[gi].name[0..self.groups[gi].name_len]) catch break; 1051 } 1052 + 1053 jw.objectField("largest") catch break; 1054 jw.write(in_largest) catch break; 1055 jw.objectField("x") catch break; ··· 1087 1088 jw.endArray() catch return "{}"; 1089 1090 + // top-level groups array 1091 + jw.objectField("groups") catch return "{}"; 1092 + jw.beginArray() catch return "{}"; 1093 + for (0..self.group_count) |gi| { 1094 + const grp = &self.groups[gi]; 1095 + jw.beginObject() catch break; 1096 + jw.objectField("name") catch break; 1097 + jw.write(grp.name[0..grp.name_len]) catch break; 1098 + jw.objectField("entities") catch break; 1099 + jw.beginArray() catch break; 1100 + for (0..grp.entity_count) |ei| { 1101 + jw.write(grp.entity_texts[ei][0..grp.entity_text_lens[ei]]) catch break; 1102 + } 1103 + jw.endArray() catch break; 1104 + jw.endObject() catch break; 1105 + } 1106 + jw.endArray() catch return "{}"; 1107 + 1108 // stats object 1109 jw.objectField("stats") catch return "{}"; 1110 jw.beginObject() catch return "{}"; ··· 1128 jw.write(self.stats.bridge_rate) catch return "{}"; 1129 jw.endObject() catch return "{}"; 1130 1131 + // global haiku (zeitgeist summary) 1132 + if (self.haiku_len > 0) { 1133 + jw.objectField("haiku") catch return "{}"; 1134 + jw.write(self.haiku_text[0..self.haiku_len]) catch return "{}"; 1135 + } 
1136 + 1137 jw.endObject() catch return "{}"; 1138 1139 return w.buffered(); 1140 + } 1141 + 1142 + /// match entity text against group entity lists (case-insensitive) 1143 + const GroupMatch = struct { index: ?usize }; 1144 + fn matchGroup(self: *EntityGraph, entity_text: []const u8) GroupMatch { 1145 + var e_lower: [64]u8 = undefined; 1146 + const e_len = @min(entity_text.len, 64); 1147 + for (0..e_len) |i| { 1148 + e_lower[i] = std.ascii.toLower(entity_text[i]); 1149 + } 1150 + const e_norm = e_lower[0..e_len]; 1151 + 1152 + for (0..self.group_count) |gi| { 1153 + const grp = &self.groups[gi]; 1154 + for (0..grp.entity_count) |ei| { 1155 + const g_len = grp.entity_text_lens[ei]; 1156 + if (g_len != e_len) continue; 1157 + var g_lower: [64]u8 = undefined; 1158 + for (0..g_len) |k| { 1159 + g_lower[k] = std.ascii.toLower(grp.entity_texts[ei][k]); 1160 + } 1161 + if (std.mem.eql(u8, g_lower[0..g_len], e_norm)) { 1162 + return .{ .index = gi }; 1163 + } 1164 + } 1165 + } 1166 + return .{ .index = null }; 1167 } 1168 1169 /// diagnostics: compute distributions for analysis
+54 -22
backend/src/http.zig
··· 69 try handleEntity(request); 70 } else if (mem.eql(u8, target, "/entity-graph")) { 71 try handleEntityGraph(request); 72 - } else if (mem.eql(u8, target, "/cluster-labels")) { 73 - try handleClusterLabels(request); 74 } else if (mem.eql(u8, target, "/diagnostics")) { 75 try handleDiagnostics(request); 76 } else if (mem.eql(u8, target, "/")) { 77 - try sendJson(request, "{\"name\":\"coral\",\"endpoints\":[\"/stats\",\"/state/32\",\"/state/128\",\"/state/512\",\"/entity\",\"/entity-graph\",\"/cluster-labels\",\"/diagnostics\"]}"); 78 } else { 79 try sendNotFound(request); 80 } ··· 293 try sendJson(request, json_data); 294 } 295 296 - /// POST /cluster-labels - accept LLM-generated cluster labels from NER bridge 297 - fn handleClusterLabels(request: *http.Server.Request) !void { 298 if (request.head.method != .POST) { 299 try sendJson(request, "{\"error\":\"method not allowed\"}"); 300 return; ··· 311 defer body_collector.deinit(); 312 313 _ = body_reader.stream(&body_collector.writer, .unlimited) catch |err| { 314 - log.err("failed to read cluster-labels body: {}", .{err}); 315 try sendJson(request, "{\"error\":\"failed to read body\"}"); 316 return; 317 }; ··· 328 return; 329 } 330 331 - const labels_obj = parsed.value.object.get("labels") orelse { 332 - try sendJson(request, "{\"error\":\"missing labels object\"}"); 333 return; 334 }; 335 336 - if (labels_obj != .object) { 337 - try sendJson(request, "{\"error\":\"labels must be object\"}"); 338 return; 339 } 340 341 - // clear old labels and set new ones 342 - // labels format: {"cluster_id_str": "topic label", ...} 343 entity_graph.graph.mutex.lock(); 344 defer entity_graph.graph.mutex.unlock(); 345 346 - entity_graph.graph.clearClusterLabels(); 347 348 - var count: usize = 0; 349 - var it = labels_obj.object.iterator(); 350 - while (it.next()) |entry| { 351 - const cluster_id = std.fmt.parseInt(u32, entry.key_ptr.*, 10) catch continue; 352 - if (entry.value_ptr.* != .string) continue; 353 - 
entity_graph.graph.setClusterLabel(cluster_id, entry.value_ptr.string); 354 - count += 1; 355 } 356 357 - log.info("cluster-labels: set {d} labels", .{count}); 358 359 var buf: [64]u8 = undefined; 360 - const response = std.fmt.bufPrint(&buf, "{{\"set\":{d}}}", .{count}) catch "{\"set\":0}"; 361 try sendJson(request, response); 362 } 363
··· 69 try handleEntity(request); 70 } else if (mem.eql(u8, target, "/entity-graph")) { 71 try handleEntityGraph(request); 72 + } else if (mem.eql(u8, target, "/groups")) { 73 + try handleGroups(request); 74 } else if (mem.eql(u8, target, "/diagnostics")) { 75 try handleDiagnostics(request); 76 } else if (mem.eql(u8, target, "/")) { 77 + try sendJson(request, "{\"name\":\"coral\",\"endpoints\":[\"/stats\",\"/state/32\",\"/state/128\",\"/state/512\",\"/entity\",\"/entity-graph\",\"/groups\",\"/diagnostics\"]}"); 78 } else { 79 try sendNotFound(request); 80 } ··· 293 try sendJson(request, json_data); 294 } 295 296 + /// POST /groups - accept LLM-curated named groups from NER bridge 297 + fn handleGroups(request: *http.Server.Request) !void { 298 if (request.head.method != .POST) { 299 try sendJson(request, "{\"error\":\"method not allowed\"}"); 300 return; ··· 311 defer body_collector.deinit(); 312 313 _ = body_reader.stream(&body_collector.writer, .unlimited) catch |err| { 314 + log.err("failed to read groups body: {}", .{err}); 315 try sendJson(request, "{\"error\":\"failed to read body\"}"); 316 return; 317 }; ··· 328 return; 329 } 330 331 + const groups_arr = parsed.value.object.get("groups") orelse { 332 + try sendJson(request, "{\"error\":\"missing groups array\"}"); 333 return; 334 }; 335 336 + if (groups_arr != .array) { 337 + try sendJson(request, "{\"error\":\"groups must be array\"}"); 338 return; 339 } 340 341 + // parse groups into fixed-size structs 342 + var groups: [entity_graph.MAX_GROUPS]entity_graph.Group = [_]entity_graph.Group{.{}} ** entity_graph.MAX_GROUPS; 343 + var group_count: u8 = 0; 344 + 345 + for (groups_arr.array.items) |group_val| { 346 + if (group_count >= entity_graph.MAX_GROUPS) break; 347 + if (group_val != .object) continue; 348 + 349 + const name_val = group_val.object.get("name") orelse continue; 350 + if (name_val != .string) continue; 351 + 352 + var grp = entity_graph.Group{}; 353 + 354 + // set name 355 + const name_len: u8 
= @intCast(@min(name_val.string.len, 64)); 356 + @memcpy(grp.name[0..name_len], name_val.string[0..name_len]); 357 + grp.name_len = name_len; 358 + 359 + // set entities 360 + if (group_val.object.get("entities")) |ents_val| { 361 + if (ents_val == .array) { 362 + for (ents_val.array.items) |ent_val| { 363 + if (grp.entity_count >= entity_graph.MAX_GROUP_ENTITIES) break; 364 + if (ent_val != .string) continue; 365 + const ent_len: u8 = @intCast(@min(ent_val.string.len, 64)); 366 + @memcpy(grp.entity_texts[grp.entity_count][0..ent_len], ent_val.string[0..ent_len]); 367 + grp.entity_text_lens[grp.entity_count] = ent_len; 368 + grp.entity_count += 1; 369 + } 370 + } 371 + } 372 + 373 + groups[group_count] = grp; 374 + group_count += 1; 375 + } 376 + 377 entity_graph.graph.mutex.lock(); 378 defer entity_graph.graph.mutex.unlock(); 379 380 + entity_graph.graph.setGroups(&groups, group_count); 381 382 + // optional haiku field 383 + if (parsed.value.object.get("haiku")) |haiku_val| { 384 + if (haiku_val == .string) { 385 + entity_graph.graph.setHaiku(haiku_val.string); 386 + } 387 } 388 389 + log.info("groups: set {d} groups", .{group_count}); 390 391 var buf: [64]u8 = undefined; 392 + const response = std.fmt.bufPrint(&buf, "{{\"set\":{d}}}", .{group_count}) catch "{\"set\":0}"; 393 try sendJson(request, response); 394 } 395
+92 -84
docs/04-architecture.md
··· 8 bluesky firehose 9 | 10 v 11 - turbostream (graze.social) 12 | 13 - | wss://api.graze.social/.../turbostream 14 v 15 NER bridge (python, fly.io) 16 | 17 | POST http://coral.internal:3000/entity 18 v 19 backend (zig, fly.io) 20 | ··· 23 frontend (static, cloudflare pages) 24 ``` 25 26 - turbostream is a hydrated repeater built on bluesky's jetstream. it enriches raw firehose events with author profiles, mentions, parent posts, etc. we get ~50 messages/sec from it. 27 28 ## NER bridge 29 30 `ner/bridge.py` - python async worker on fly.io (2gb performance-1x, spaCy needs ~400mb but benefits from CPU headroom). 31 32 **what it does**: 33 - 1. connects to turbostream via websocket 34 - 2. extracts post text, author DID, URI, follower count 35 3. checks spam labels (drops labeled DIDs/URIs before NER) 36 4. runs spaCy NER (`en_core_web_sm`, only NER pipe enabled) 37 5. normalizes entities (whitespace, punctuation, NORP plural stripping) 38 6. POSTs all entities from one post together to the backend (preserves co-occurrence) 39 7. tracks firehose rate via EWMA and sends it with each POST 40 41 - **spam filtering**: subscribes to `ozone.hailey.at` label stream (CBOR-encoded). caches labels by DID/URI with expiry timestamps. seeds cache on startup via queryLabels API. label types: spam, shopping-spam, general-spam, reply-link-spam, inauth-fundraising, coordinated-abuse, men-facet-abuse, mass-follow-high/mid, elon-handle, new-acct-replies. 42 43 - **stability design**: process synchronously (no unbounded queues). spaCy has 20x headroom. POST with 2s timeout, drop on failure. no retries, no queues. 44 45 - **health check**: exposes `/health` on port 8080. returns 200 if connected to turbostream AND successful POST within 2 minutes, 503 otherwise. note: fly.io `http_service.checks` only affect routing, not machine restarts. 46 47 **watchdog**: self-terminates via `sys.exit(1)` if unhealthy, triggering fly.io's auto-restart. 
two modes: (1) no successful POST within 5 min of startup = broken from start, (2) no successful POST for 3 min after previously working = wedged state. 48 49 - **env vars**: 50 - - `TURBOSTREAM_URL` - turbostream endpoint (default: graze.social) 51 - - `ZIG_BACKEND_URL` - backend HTTP endpoint (default: localhost:3000, prod: coral.internal:3000) 52 - - `LABEL_STREAM_URL` - labeler websocket 53 - - `LABELER_ENABLED` - toggle spam filtering (default: on) 54 55 ## backend 56 ··· 65 66 **HTTP endpoints**: 67 - `POST /entity` - ingest entities from bridge 68 - `GET /stats` - lattice health, firehose rate, percolation state 69 - - `GET /entity-graph` - full co-occurrence graph (entities, edges, clusters) 70 - `GET /state/{size}` - lattice bitmap (base64) 71 - `GET /diagnostics` - distribution analysis 72 - - `GET /top-post/{id}` - top post for an entity 73 74 **entity graph model**: see `docs/02-semantic-percolation-plan.md` for design rationale. short version: entities co-occurring in the same post form edges. edges have pheromone weights that decay exponentially (10min half-life). union-find clusters active entities. trend = (smoothed_rate - baseline) / baseline. 75 76 **persistence**: SQLite at `/data/coral.db` on a fly volume. saves entities, edges, baselines, post metadata every 30s. activity buckets (in-memory mention counts) reset on restart but baselines seed recovery. WAL mode, busy_timeout=5000. 77 78 ## frontend 79 80 `site/` - static HTML/CSS/JS on cloudflare pages. 81 82 connects to backend websocket at `wss://coral.fly.dev:3001`. receives lattice bitmaps (binary, 10Hz), entity updates (JSON, 2Hz), and entity graph state (JSON, 2Hz). renders a 128x128 lattice with entity overlay, trending list, live feed, and stats. 83 84 - **rendering**: uses `requestAnimationFrame` for all canvas updates (websocket handlers set dirty flags, rAF loop renders). this prevents Chrome mobile GPU compositor from sampling canvas mid-render. 
entity graph overlay is cached on an offscreen canvas and re-rendered only when graph data changes (2Hz), then blitted every frame. 85 86 - **trending list**: clicking an entity name opens Google News search in a new tab (`https://news.google.com/search?q=ENTITY`). 87 88 - **degraded alert**: when `firehose_rate < 10/s` (normal ~50/s), shows a coral-colored banner: "turbostream may be degraded - data may be stale". the backend includes `firehose_degraded: bool` in its stats JSON. 89 90 ## deployment 91 ··· 101 102 ### 1. POST timeouts between NER bridge and backend 103 104 - **symptom**: NER bridge logs flooded with `POST exception: TimeoutError`. firehose_rate drops to ~1/s. turbostream connection is healthy, entities are extracted, but POSTs to the backend time out. 105 - 106 - **root cause**: the NER bridge was POSTing to `https://coral.fly.dev` (public proxy). fly.io's public proxy was intermittently slow or unresponsive, causing the 2s POST timeout to fire. 107 - 108 - **fix**: two changes. 109 - 110 - 1. **backend** (`main.zig`): changed HTTP server bind from `0.0.0.0` to `::` (dual-stack). fly's internal network uses IPv6, and the backend was only listening on IPv4. 111 - 112 - 2. **NER bridge** (`fly.toml`): changed `ZIG_BACKEND_URL` from `https://coral.fly.dev` to `http://coral.internal:3000`. this routes over fly's private IPv6 network between machines in the same org, bypassing the public proxy entirely. 113 - 114 - **result**: POST latency dropped from intermittent 2s+ timeouts to ~1.8ms. entity drops went from 5678/5820 to 0. 115 - 116 - ### 2. 
turbostream websocket disconnects (blocked read path) 117 - 118 - **symptoms** (appeared in sequence as we iterated on fixes): 119 - - `websocket closed` every ~44s, throughput ~12/s (client-side ping timeout) 120 - - `code=1011 reason='keepalive ping timeout'` every ~2min (server-side ping timeout) 121 - - `code=1006 reason=''` every ~60s (abnormal closure) 122 - 123 - all three were symptoms of the same root cause. 124 - 125 - **root cause**: the `websockets` library processes ping/pong control frames inside `recv()`. the original code did `async for raw in ws:` then ran NER + HTTP POST inside the loop body. while processing, the library couldn't call `recv()`, so it couldn't respond to pings. 126 - 127 - this explains the "looks good for 2-5 minutes then degrades" pattern: processing latency accumulates until keepalive timeouts hit. toggling client-side pings changed which side timed out first, but didn't fix the underlying problem. even `run_in_executor()` for spaCy didn't help — `await`ing the result still blocked the read loop. 128 - 129 - **fix**: decouple the websocket reader from processing. two concurrent asyncio tasks: 130 - 131 - ```python 132 - queue = asyncio.Queue(maxsize=200) # ~4 seconds buffer at 50 msgs/sec 133 134 - async def reader(): 135 - """only recv() + enqueue. keeps pong responsive.""" 136 - async for raw in ws: 137 - stats["messages"] += 1 138 - try: 139 - queue.put_nowait(raw) 140 - except asyncio.QueueFull: 141 - stats["queue_drops"] += 1 142 143 - async def worker(): 144 - """consume from queue, do NER + POST.""" 145 - while True: 146 - raw = await queue.get() 147 - # JSON parse → extract_post → spam check → NER (in executor) → POST 148 - ``` 149 150 - the reader stays in `recv()` so websockets can handle ping/pong at all times. bounded queue with drop policy prevents unbounded backlog. client-side pings disabled (turbostream handles keepalive server-side). 
151 152 - **result**: zero disconnects, 0 drops, queue steady at 4-8/200, latency 3-10ms. previously dying every 60 seconds. 153 154 - see `docs/05-websocket-stability.md` for the full debugging timeline. 155 156 ### 3. fly.io API instability 157 158 - **symptom**: `fly machine restart` and `fly deploy` commands fail with DNS resolution errors (`lookup api.machines.dev: no such host`) or EOF errors on the release API. 159 160 - **root cause**: fly.io's own API and metrics infrastructure has intermittent DNS/connectivity issues. not specific to coral. 161 162 - **mitigation**: retry. deploys usually succeed on second attempt. the `fly deploy` command is idempotent - re-running it after a failed release API call will push the already-built image. 163 164 - ### general lessons 165 166 - - **use fly internal networking** for app-to-app communication. `coral.internal:3000` over IPv6 is dramatically more reliable than routing through the public proxy at `coral.fly.dev`. 167 - - **decouple websocket reads from processing**. `websockets` handles ping/pong inside `recv()`. if your loop body does slow work (NER, HTTP), use a reader task + queue + worker task so `recv()` is never stalled. `run_in_executor()` alone isn't enough — `await`ing it still blocks the read loop. 168 - - **bind to `::` not `0.0.0.0`** if your fly app needs to accept connections from other fly apps on the internal network. 169 - - **the firehose_rate EWMA** (`alpha=0.3`, updated every 2s) is a good health signal. threshold of 10/s (vs normal ~50/s) catches real degradation without false positives. 170 171 - ### 4. NER bridge stuck in timeout loop 172 173 - **symptom**: NER bridge logs flooded with `POST exception: TimeoutError` and `timed out during opening handshake`. entity graph empties out (all entities decay below threshold). manual restart fixes it. 174 175 - **root cause**: transient network issues can cause both turbostream websocket and backend HTTP to time out simultaneously. 
the bridge enters a reconnect loop but can't recover without restart. 176 177 - **fix**: added watchdog task that self-terminates via `sys.exit(1)` when unhealthy, triggering fly.io's auto-restart. two modes: (1) no successful POST within 5 min of startup = broken from start, (2) no successful POST for 3 min after previously working = wedged state. note: fly.io `http_service.checks` only affect routing decisions, not machine restarts—the app must self-exit to trigger restart. 178 179 - ### 5. entity graph JSON buffer overflow 180 181 - **symptom**: `/entity-graph` endpoint returns `{}`. websocket graph messages have empty entities array. stats show active entities exist but they're not serialized. 182 183 - **root cause**: the Zig `toJson()` function used a 256KB fixed buffer. with 272+ active entities (each with edges), serialization exceeded the buffer. the `catch` clause returned `"{}"` on overflow, silently dropping all data including stats. 184 185 - **fix**: bumped JSON serialization buffers from 256KB to 1MB in `ws_server.zig` and `http.zig`. 186 187 - ### 6. Chrome mobile canvas flashing 188 189 - **symptom**: entity graph overlay flashes/flickers on Chrome Android. works fine on Safari iOS and desktop Chrome. 190 191 - **root cause**: canvas renders fired directly from websocket `onmessage` handlers at 10Hz. Chrome Android's GPU compositor can sample the canvas buffer asynchronously — catching intermediate frames where the canvas has been cleared but the overlay hasn't been drawn yet. Safari's compositor waits for the JS task to complete. 192 193 - **fix**: defer all rendering to `requestAnimationFrame`. websocket handlers set dirty flags, rAF loop renders once per display frame. rAF callbacks execute in the browser's rendering phase, synchronized with the compositor. also caches graph overlay on offscreen canvas (render at 2Hz, blit at 10Hz).
··· 8 bluesky firehose 9 | 10 v 11 + jetstream2 (multiple instances) 12 | 13 + | wss://jetstream2.us-east.bsky.network/subscribe 14 v 15 NER bridge (python, fly.io) 16 | 17 | POST http://coral.internal:3000/entity 18 + | POST http://coral.internal:3000/groups (every 5 min, LLM-curated) 19 v 20 backend (zig, fly.io) 21 | ··· 24 frontend (static, cloudflare pages) 25 ``` 26 27 + jetstream2 is bluesky's official firehose consumer. the NER bridge connects to independent relay instances (firehose.stream, jetstream2.us-east.bsky.network, etc.) with automatic fallback. we get ~50 messages/sec. 28 29 ## NER bridge 30 31 `ner/bridge.py` - python async worker on fly.io (2gb performance-1x, spaCy needs ~400mb but benefits from CPU headroom). 32 33 **what it does**: 34 + 1. connects to jetstream2 via websocket (multi-instance fallback with cursor rewind) 35 + 2. extracts post text from commit records 36 3. checks spam labels (drops labeled DIDs/URIs before NER) 37 4. runs spaCy NER (`en_core_web_sm`, only NER pipe enabled) 38 5. normalizes entities (whitespace, punctuation, NORP plural stripping) 39 6. POSTs all entities from one post together to the backend (preserves co-occurrence) 40 7. tracks firehose rate via EWMA and sends it with each POST 41 42 + **reader/worker architecture**: the websocket reader is decoupled from processing via a bounded `asyncio.Queue(maxsize=200)`. the reader task only calls `recv()` and enqueues, keeping ping/pong responsive. worker tasks dequeue and do NER + POST. see `docs/05-websocket-stability.md` for the debugging story. 43 44 + **spam filtering**: subscribes to `ozone.hailey.at` label stream (CBOR-encoded). caches labels by DID/URI with expiry timestamps. seeds cache on startup via queryLabels API. 45 46 + **stability design**: process synchronously (no unbounded queues). spaCy has 20x headroom. POST with 2s timeout, drop on failure. no retries, no queues. 
47 48 **watchdog**: self-terminates via `sys.exit(1)` if unhealthy, triggering fly.io's auto-restart. two modes: (1) no successful POST within 5 min of startup = broken from start, (2) no successful POST for 3 min after previously working = wedged state. 49 50 + ## LLM curator 51 + 52 + `ner/dedup.py` - Claude Haiku 4.5 curates co-occurrence clusters into named groups every 5 minutes. see `docs/06-llm-curator.md` for full details. 53 + 54 + **what it does**: 55 + 1. fetches the entity graph from the backend (`GET /entity-graph`) 56 + 2. extracts co-occurrence clusters (union-find clusters with trending entities) 57 + 3. builds a prompt with cluster entities, trend values, mention counts, and topic history 58 + 4. calls Claude Haiku 4.5 via `messages.parse()` with structured output (Pydantic models) 59 + 5. validates LLM output against the actual entity graph (code-based, not LLM) 60 + 6. POSTs validated groups + haiku to backend (`POST /groups`) 61 + 62 + **output**: up to 5 named groups (e.g., "Jesse Jackson Tribute", "Iran Nuclear Talks") with entity lists, plus a haiku about current events. 63 + 64 + **topic continuity**: tracks topic history via Jaccard similarity matching. the LLM sees which topics are active and for how long, encouraging label reuse for continuing stories. history TTL is 1 hour. 65 66 ## backend 67 ··· 76 77 **HTTP endpoints**: 78 - `POST /entity` - ingest entities from bridge 79 + - `POST /groups` - receive LLM-curated groups + haiku from curator 80 - `GET /stats` - lattice health, firehose rate, percolation state 81 + - `GET /entity-graph` - full co-occurrence graph (entities, edges, clusters, groups) 82 - `GET /state/{size}` - lattice bitmap (base64) 83 - `GET /diagnostics` - distribution analysis 84 85 **entity graph model**: see `docs/02-semantic-percolation-plan.md` for design rationale. short version: entities co-occurring in the same post form edges. edges have pheromone weights that decay exponentially (10min half-life). 
union-find clusters active entities. trend = (smoothed_rate - baseline) / baseline. 86 87 + **groups**: the backend stores up to 5 `Group` structs, each with a name and up to 20 entity text references. when serializing the entity graph to JSON, each entity is matched against groups via case-insensitive text comparison (`matchGroup()`), adding `group` (index) and `group_name` fields. 88 + 89 **persistence**: SQLite at `/data/coral.db` on a fly volume. saves entities, edges, baselines, post metadata every 30s. activity buckets (in-memory mention counts) reset on restart but baselines seed recovery. WAL mode, busy_timeout=5000. 90 91 + **entity graph JSON** (broadcast via websocket): 92 + ```json 93 + { 94 + "entities": [ 95 + { 96 + "id": 0, 97 + "text": "Trump", 98 + "label": "PERSON", 99 + "rate": 0.17, 100 + "count": 52, 101 + "trend": -0.019, 102 + "baseline": 0.175, 103 + "cluster": 123, 104 + "cluster_score": 0.45, 105 + "group": 0, 106 + "group_name": "US Political Commentary", 107 + "largest": false, 108 + "x": 45, 109 + "y": 78, 110 + "edges": [{"t": 5, "a": 12.5, "w": 0.67}] 111 + } 112 + ], 113 + "groups": [ 114 + {"name": "US Political Commentary", "entities": ["Trump", "Congress", "MAGA"]} 115 + ], 116 + "haiku": "spring wind carries news\nacross the digital plain—\nold voices persist" 117 + } 118 + ``` 119 + 120 ## frontend 121 122 `site/` - static HTML/CSS/JS on cloudflare pages. 123 124 connects to backend websocket at `wss://coral.fly.dev:3001`. receives lattice bitmaps (binary, 10Hz), entity updates (JSON, 2Hz), and entity graph state (JSON, 2Hz). renders a 128x128 lattice with entity overlay, trending list, live feed, and stats. 125 126 + **rendering**: uses `requestAnimationFrame` for all canvas updates (websocket handlers set dirty flags, rAF loop renders). entity graph overlay is cached on an offscreen canvas and re-rendered only when graph data changes (2Hz), then blitted every frame. 
127 128 + **trending list**: entities ranked by `trend * log(1 + count)` with EMA smoothing. hysteresis thresholds for enter/exit prevent flickering. trend arrows are relative (percentile-based against the max trend in the current list), not absolute thresholds. clicking an entity name opens Google News search. 129 130 + **group pills**: LLM-curated groups render as colored pills above the trending list (shapes: ●, ✦, ▲, ◇, ✚, ■). entities belonging to a group get a colored left border and shape pip. groups update every ~5 minutes when the curator runs. 131 + 132 + **haiku display**: attributed quote from Claude Haiku 4.5, centered below the live feed. timestamp visible on hover. updates every ~5 minutes with each curator cycle. 133 + 134 + **degraded alert**: when `firehose_rate < 10/s` (normal ~50/s), shows a coral-colored banner. the backend includes `firehose_degraded: bool` in its stats JSON. 135 136 ## deployment 137 ··· 147 148 ### 1. POST timeouts between NER bridge and backend 149 150 + **symptom**: NER bridge logs flooded with `POST exception: TimeoutError`. firehose_rate drops to ~1/s. 151 152 + **root cause**: the NER bridge was POSTing to `https://coral.fly.dev` (public proxy). fly.io's public proxy was intermittently slow. 153 154 + **fix**: changed to `http://coral.internal:3000` (fly's private IPv6 network). POST latency dropped from 2s+ timeouts to ~1.8ms. 155 156 + ### 2. websocket disconnects (blocked read path) 157 158 + **root cause**: the `websockets` library processes ping/pong inside `recv()`. the old code blocked the read loop with NER + POST work, so pongs were never sent. 159 160 + **fix**: reader/worker split with bounded queue. see `docs/05-websocket-stability.md` for full details. 161 162 ### 3. fly.io API instability 163 164 + **symptom**: `fly deploy` fails with DNS errors. 165 166 + **mitigation**: retry. deploys are idempotent. 167 168 + ### 4. 
NER bridge stuck in timeout loop 169 170 + **symptom**: bridge enters reconnect loop, can't recover. 171 172 + **fix**: watchdog self-terminates via `sys.exit(1)` after 3-5 min of no successful POSTs, triggering fly.io auto-restart. 173 174 + ### 5. entity graph JSON buffer overflow 175 176 + **symptom**: `/entity-graph` returns `{}`. 177 178 + **root cause**: 256KB fixed buffer exceeded with 272+ entities. 179 180 + **fix**: bumped to 1MB. 181 182 + ### 6. Chrome mobile canvas flashing 183 184 + **root cause**: canvas renders from websocket handlers; Chrome Android's GPU compositor samples mid-render. 185 186 + **fix**: all rendering deferred to `requestAnimationFrame`. 187 188 + ### 7. stats key rename crash loop 189 190 + **symptom**: NER bridge crash-loops, never lives long enough for curator to run. 191 192 + **root cause**: during the narrator→curator refactor, a stats key was renamed from `'topics'` to `'groups'` in `dedup.py` but a logging line in `bridge.py` still referenced `ns['topics']`. the `KeyError` killed the worker every 1000 messages. 193 194 + **lesson**: when renaming dict keys, grep all consumers — not just the module where the key is defined. 195 + 196 + ### general lessons 197 198 + - **use fly internal networking** for app-to-app communication. `coral.internal:3000` over IPv6 is dramatically more reliable than the public proxy. 199 + - **decouple websocket reads from processing**. use a reader task + queue + worker task so `recv()` is never stalled. 200 + - **bind to `::` not `0.0.0.0`** for fly internal network access. 201 + - **grep all consumers when renaming keys/fields** — the compiler won't catch dict key mismatches in Python.
+156
docs/06-llm-curator.md
···
··· 1 + # LLM curator 2 + 3 + ## overview 4 + 5 + the curator (`ner/dedup.py`) uses Claude Haiku 4.5 to transform raw co-occurrence clusters into human-readable named groups. it runs every 5 minutes as an async task alongside the NER bridge. 6 + 7 + the LLM's job is **not** to label every cluster. it's to identify the 3-5 real stories, merge duplicates, and drop garbage. this is a filter, not a summarizer. 8 + 9 + ## data flow 10 + 11 + ``` 12 + backend (GET /entity-graph) 13 + | 14 + v 15 + extract clusters (union-find, trending entities only) 16 + | 17 + v 18 + build prompt (clusters + trend data + topic history) 19 + | 20 + v 21 + Claude Haiku 4.5 (messages.parse, structured output) 22 + | 23 + v 24 + code-based validation (entity text must exist in graph) 25 + | 26 + v 27 + backend (POST /groups) → broadcast to frontend via websocket 28 + ``` 29 + 30 + ## structured output 31 + 32 + uses `messages.parse()` with Pydantic models for type-safe extraction: 33 + 34 + ```python 35 + class NamedGroup(BaseModel): 36 + name: str # 2-4 word topic name 37 + entities: list[str] # exact entity text from clusters 38 + 39 + class CuratorOutput(BaseModel): 40 + groups: list[NamedGroup] # max 5 41 + haiku: str # 5-7-5 syllable haiku 42 + ``` 43 + 44 + the API returns parsed Pydantic objects directly — no JSON parsing or regex needed. 45 + 46 + ## prompt design 47 + 48 + the system prompt instructs the LLM to: 49 + - identify coherent stories from the cluster data 50 + - merge clusters about the same story 51 + - drop incoherent or uninteresting clusters 52 + - return at most 5 groups (quality over quantity) 53 + - use specific names ("Trump Budget Fight" not "Politics") 54 + - reuse or refine recent topic names when the story continues 55 + - list exact entity text from the clusters for each group 56 + - write a haiku (5-7-5 syllables) about the real-world events 57 + 58 + the user message includes: 59 + 1. 
**topic history** — recent groups with duration and status (active/ended) 60 + 2. **cluster data** — entity text, NER label, trend score, mention count per cluster 61 + 62 + ## topic continuity 63 + 64 + the curator tracks a `topic_history` list of `TopicRecord` objects: 65 + 66 + ```python 67 + @dataclass 68 + class TopicRecord: 69 + label: str # group name 70 + entities: set[str] # member entity texts 71 + first_seen: float # unix timestamp 72 + last_seen: float # unix timestamp 73 + times_labeled: int # how many cycles this topic appeared 74 + ``` 75 + 76 + each cycle: 77 + 1. new groups are matched against history via **Jaccard similarity** on entity sets 78 + 2. matching groups (overlap >= 0.3) update the existing record 79 + 3. non-matching groups create new records 80 + 4. records older than 1 hour (`HISTORY_TTL`) are pruned 81 + 82 + the LLM sees this history formatted as: 83 + ``` 84 + Recent topic history (reuse labels for continuing topics): 85 + - "Jesse Jackson Tribute" — active for 23m (Jesse Jackson, Chicago, Fred Hampton) 86 + - "Iran Nuclear Talks" — ended 8m ago, lasted 45m (Geneva, Iranian, diplomacy) 87 + ``` 88 + 89 + this encourages label stability — the LLM reuses names for continuing stories rather than inventing new ones each cycle. 90 + 91 + ## validation 92 + 93 + all LLM output goes through code-based validation before reaching the backend: 94 + 95 + 1. **entity existence check**: each entity text in a group must exist in the current entity graph. entities the LLM hallucinated or misspelled are dropped. 96 + 2. **group filtering**: groups with zero valid entities after filtering are dropped entirely. 97 + 3. **max groups**: capped at 5 regardless of what the LLM returns. 98 + 4. **fallback**: if the LLM returns 0 valid groups, the previous groups are kept. this prevents the UI from going blank on a bad LLM response. 
99 + 100 + ## backend integration 101 + 102 + ### Group struct (entity_graph.zig) 103 + 104 + ```zig 105 + pub const MAX_GROUPS = 5; 106 + pub const MAX_GROUP_ENTITIES = 20; 107 + 108 + pub const Group = struct { 109 + name: [64]u8, 110 + name_len: u8, 111 + entity_texts: [MAX_GROUP_ENTITIES][64]u8, 112 + entity_text_lens: [MAX_GROUP_ENTITIES]u8, 113 + entity_count: u8, 114 + }; 115 + ``` 116 + 117 + ### entity-to-group matching 118 + 119 + `matchGroup()` does case-insensitive text comparison between each entity's text and the entity lists in all active groups. matched entities get `group` (index) and `group_name` fields in the JSON output. 120 + 121 + ### POST /groups endpoint (http.zig) 122 + 123 + accepts: 124 + ```json 125 + { 126 + "groups": [ 127 + {"name": "Topic Name", "entities": ["Entity A", "Entity B"]}, 128 + ], 129 + "haiku": "five syllable line\nseven syllable line here\nfive syllables end" 130 + } 131 + ``` 132 + 133 + constructs `Group` structs and calls `entity_graph.graph.setGroups()`. the haiku is stored and included in every subsequent websocket broadcast. 134 + 135 + ## frontend rendering 136 + 137 + - **group pills**: colored pills above the trending list, one per group, with distinct shapes (●, ✦, ▲, ◇, ✚, ■) and neon colors 138 + - **entity pips**: trending entities matched to a group show the group's shape and color 139 + - **tooltip**: hovering an entity in a group shows the group name 140 + - **haiku**: displayed as a centered blockquote attributed to "claude haiku 4.5" with a hover-visible timestamp 141 + 142 + groups render directly from the websocket data — no grow/decay lifecycle. they update atomically every ~5 minutes when the curator runs. 
143 + 144 + ## parameters 145 + 146 + | parameter | value | notes | 147 + |-----------|-------|-------| 148 + | `NARRATOR_INTERVAL` | 300s | curator cycle frequency | 149 + | `MAX_GROUPS` | 5 | max groups per cycle | 150 + | `MAX_CLUSTERS` | 10 | max clusters sent to LLM | 151 + | `MIN_CLUSTER_SIZE` | 2 | min trending entities per cluster | 152 + | `HISTORY_TTL` | 3600s | topic history expiry | 153 + | `MAX_HISTORY` | 8 | max historical topics shown to LLM | 154 + | `OVERLAP_THRESHOLD` | 0.3 | Jaccard threshold for topic matching | 155 + | `temperature` | 0.3 | LLM temperature (lower = more precise) | 156 + | model | `claude-haiku-4-5-20251001` | fast, cheap, structured output support |
+1 -1
ner/bridge.py
··· 542 if narrator is not None: 543 ns = narrator.stats 544 narrator_info = ( 545 - f" topics={ns['topics']}" 546 f" clusters={ns['clusters_seen']}" 547 f" narrator_calls={ns['calls']}" 548 )
··· 542 if narrator is not None: 543 ns = narrator.stats 544 narrator_info = ( 545 + f" groups={ns['groups']}" 546 f" clusters={ns['clusters_seen']}" 547 f" narrator_calls={ns['calls']}" 548 )
+99 -65
ner/dedup.py
··· 1 """ 2 - LLM-powered cluster narration via Claude Haiku. 3 4 Periodically fetches the entity graph from the backend, extracts 5 - trending clusters, and asks Haiku to generate short topic labels. 6 - These labels describe what a cluster of co-occurring entities is about 7 - (e.g., {Trump, Congress, Budget} -> "Trump Budget Fight"). 8 9 - The narrator maintains a rolling history of recent topics so the LLM 10 can see what it labeled before, how long topics have persisted, and 11 what's new vs continuing. This produces more stable, contextual labels. 12 13 - The labels are POSTed back to the backend, which serves them in the 14 - entity-graph websocket data (overriding the mechanical "A + B" label). 15 16 Runs as an async task alongside the bridge. 17 """ ··· 25 from pydantic import BaseModel 26 27 NARRATOR_ENABLED = os.getenv("NARRATOR_ENABLED", "1") != "0" 28 - NARRATOR_INTERVAL = int(os.getenv("NARRATOR_INTERVAL", "60")) 29 MIN_CLUSTER_SIZE = 2 # need at least 2 entities to narrate 30 MAX_CLUSTERS = 10 # cap clusters sent to LLM 31 MAX_HISTORY = 8 # max past topics shown to LLM 32 - HISTORY_TTL = 30 * 60 # prune topics older than 30 minutes 33 OVERLAP_THRESHOLD = 0.3 # Jaccard similarity to match a cluster to history 34 35 36 - class TopicLabel(BaseModel): 37 - cluster_id: int 38 - label: str 39 40 41 - class TopicLabels(BaseModel): 42 - topics: list[TopicLabel] 43 44 45 @dataclass ··· 52 53 54 SYSTEM_PROMPT = """\ 55 - You are analyzing clusters of co-occurring named entities trending on social media. 56 - Each cluster contains entities that frequently appear together in recent posts. 57 - For each cluster, provide a short (2-4 word) topic label that describes what 58 - the cluster is about. 
59 60 - Rules: 61 - - Be specific: "Trump Budget Fight" not "Politics" 62 - - Don't just concatenate entity names: {Lakers, LeBron, NBA} -> "Lakers Game" 63 - - Use natural language: "SpaceX Launch" not "SPACEX/LAUNCH" 64 - - If a cluster's entities don't form a coherent topic, label it "Unclear" 65 - - Preserve proper nouns and capitalization 66 - - When a cluster matches a recent topic (shown below), prefer reusing or 67 - refining that label rather than inventing a new one — unless the entities 68 - have changed significantly""" 69 70 71 def _jaccard(a: set, b: set) -> float: ··· 87 def __init__(self, backend_url: str, call_interval: int = NARRATOR_INTERVAL): 88 self.backend_url = backend_url 89 self.call_interval = call_interval 90 - self.topic_labels: dict[int, str] = {} # cluster_id -> label 91 self.topic_history: list[TopicRecord] = [] 92 self.stats = { 93 "calls": 0, 94 "errors": 0, 95 - "topics": 0, 96 "clusters_seen": 0, 97 } 98 ··· 109 print(f"narrator: fetch error: {e}", flush=True) 110 return None 111 112 - async def _post_labels(self, labels: dict[int, str]) -> None: 113 try: 114 async with aiohttp.ClientSession() as session: 115 async with session.post( 116 - f"{self.backend_url}/cluster-labels", 117 - json={"labels": {str(k): v for k, v in labels.items()}}, 118 timeout=aiohttp.ClientTimeout(total=5), 119 ) as resp: 120 if resp.status != 200: ··· 149 return best 150 return None 151 152 - def _update_history( 153 - self, clusters: dict[int, list[dict]], labels: dict[int, str] 154 - ) -> None: 155 """update topic history with current cycle's results.""" 156 now = time() 157 158 - for cid, label in labels.items(): 159 - if label == "Unclear": 160 - continue 161 - entities = clusters.get(cid, []) 162 - entity_names = {e["text"] for e in entities} 163 if not entity_names: 164 continue 165 166 if existing := self._match_history(entity_names): 167 - # update existing topic 168 - existing.label = label 169 existing.last_seen = now 170 existing.entities = 
entity_names 171 existing.times_labeled += 1 172 else: 173 - # new topic 174 self.topic_history.append( 175 TopicRecord( 176 - label=label, 177 entities=entity_names, 178 first_seen=now, 179 last_seen=now, ··· 219 if not clusters: 220 return 221 222 # sort by total cluster trend descending, take top N 223 sorted_clusters = sorted( 224 clusters.items(), ··· 245 if history_context: 246 user_content = ( 247 f"{history_context}\n\n" 248 - f"Label these trending clusters:\n\n{entities_text}" 249 ) 250 else: 251 - user_content = f"Label these trending clusters:\n\n{entities_text}" 252 253 try: 254 import anthropic ··· 256 client = anthropic.AsyncAnthropic() 257 response = await client.messages.parse( 258 model="claude-haiku-4-5-20251001", 259 - max_tokens=512, 260 system=SYSTEM_PROMPT, 261 - output_format=TopicLabels, 262 messages=[ 263 { 264 "role": "user", ··· 268 ) 269 270 result = response.parsed_output 271 - new_labels = {t.cluster_id: t.label for t in result.topics} 272 273 - # only keep labels for clusters that actually exist 274 - active_ids = set(clusters.keys()) 275 - self.topic_labels = { 276 - cid: label 277 - for cid, label in new_labels.items() 278 - if cid in active_ids 279 - } 280 281 # update history with this cycle's results 282 - self._update_history(clusters, self.topic_labels) 283 284 - self.stats["topics"] = len(self.topic_labels) 285 self.stats["calls"] += 1 286 287 - labels_str = ", ".join( 288 - f"{cid}={label!r}" for cid, label in self.topic_labels.items() 289 ) 290 history_size = len(self.topic_history) 291 print( 292 - f"narrator: {len(self.topic_labels)} topics from " 293 f"{len(sorted_clusters)} clusters, " 294 - f"{history_size} in history ({labels_str})", 295 flush=True, 296 ) 297 298 - # push labels to backend 299 - if self.topic_labels: 300 - await self._post_labels(self.topic_labels) 301 302 except ImportError: 303 print("narrator: anthropic package not available", flush=True)
··· 1 """ 2 + LLM-powered cluster curation via Claude Haiku. 3 4 Periodically fetches the entity graph from the backend, extracts 5 + trending clusters, and asks Haiku to curate them into named groups. 6 + The LLM identifies coherent news stories from co-occurrence clusters, 7 + merges duplicates, drops garbage, and returns at most 5 named groups. 8 9 + The curator maintains a rolling history of recent topics so the LLM 10 can see what it labeled before, how long topics have persisted, and 11 what's new vs continuing. This produces more stable, contextual labels. 12 13 + The groups are POSTed back to the backend, which serves them in the 14 + entity-graph websocket data for frontend rendering. 15 16 Runs as an async task alongside the bridge. 17 """ ··· 25 from pydantic import BaseModel 26 27 NARRATOR_ENABLED = os.getenv("NARRATOR_ENABLED", "1") != "0" 28 + NARRATOR_INTERVAL = int(os.getenv("NARRATOR_INTERVAL", "300")) 29 MIN_CLUSTER_SIZE = 2 # need at least 2 entities to narrate 30 MAX_CLUSTERS = 10 # cap clusters sent to LLM 31 MAX_HISTORY = 8 # max past topics shown to LLM 32 + HISTORY_TTL = 60 * 60 # prune topics older than 1 hour 33 OVERLAP_THRESHOLD = 0.3 # Jaccard similarity to match a cluster to history 34 + MAX_GROUPS = 5 # max groups returned to backend 35 36 37 + class NamedGroup(BaseModel): 38 + name: str 39 + entities: list[str] 40 41 42 + class CuratorOutput(BaseModel): 43 + groups: list[NamedGroup] 44 + haiku: str 45 46 47 @dataclass ··· 54 55 56 SYSTEM_PROMPT = """\ 57 + You are curating trending topics from a real-time entity co-occurrence graph. 58 + Each cluster below contains entities that frequently appear together in 59 + social media posts. Your job is to identify the real stories. 60 61 + For each coherent cluster, provide a short (2-4 word) topic name. 62 + You may merge clusters that are about the same story. 63 + Drop clusters that are incoherent or uninteresting. 64 + Return at most 5 groups — quality over quantity. 
65 + 66 + Be specific: "Trump Budget Fight" not "Politics". 67 + Use natural language: "SpaceX Launch" not "SPACEX/LAUNCH". 68 + Preserve proper nouns and capitalization. 69 + When a group matches a recent topic (shown below), prefer reusing or 70 + refining that name rather than inventing a new one — unless the entities 71 + have changed significantly. 72 + 73 + For each group, list the entity names (exact text from the clusters) that 74 + belong to it. 75 + 76 + Also write a haiku about the real-world events in these clusters. 77 + STRICT RULES for the haiku: 78 + - Line 1: exactly 5 syllables 79 + - Line 2: exactly 7 syllables 80 + - Line 3: exactly 5 syllables 81 + - Count syllables carefully before writing each line 82 + - Write in the style of Bashō — concrete, grounded in the physical world 83 + - Reference specific people, places, or events from the data 84 + - The haiku is about the world, not about the internet""" 85 86 87 def _jaccard(a: set, b: set) -> float: ··· 103 def __init__(self, backend_url: str, call_interval: int = NARRATOR_INTERVAL): 104 self.backend_url = backend_url 105 self.call_interval = call_interval 106 + self.current_groups: list[dict] = [] # [{name, entities}, ...] 
107 self.topic_history: list[TopicRecord] = [] 108 self.stats = { 109 "calls": 0, 110 "errors": 0, 111 + "groups": 0, 112 "clusters_seen": 0, 113 } 114 ··· 125 print(f"narrator: fetch error: {e}", flush=True) 126 return None 127 128 + async def _post_groups(self, groups: list[dict], haiku: str = "") -> None: 129 try: 130 + payload: dict = {"groups": groups} 131 + if haiku: 132 + payload["haiku"] = haiku 133 async with aiohttp.ClientSession() as session: 134 async with session.post( 135 + f"{self.backend_url}/groups", 136 + json=payload, 137 timeout=aiohttp.ClientTimeout(total=5), 138 ) as resp: 139 if resp.status != 200: ··· 168 return best 169 return None 170 171 + def _update_history(self, groups: list[dict]) -> None: 172 """update topic history with current cycle's results.""" 173 now = time() 174 175 + for group in groups: 176 + name = group["name"] 177 + entity_names = set(group["entities"]) 178 if not entity_names: 179 continue 180 181 if existing := self._match_history(entity_names): 182 + existing.label = name 183 existing.last_seen = now 184 existing.entities = entity_names 185 existing.times_labeled += 1 186 else: 187 self.topic_history.append( 188 TopicRecord( 189 + label=name, 190 entities=entity_names, 191 first_seen=now, 192 last_seen=now, ··· 232 if not clusters: 233 return 234 235 + # build set of all entity texts in the graph (for validation) 236 + all_entity_texts = { 237 + e["text"] for e in graph.get("entities", []) 238 + } 239 + 240 # sort by total cluster trend descending, take top N 241 sorted_clusters = sorted( 242 clusters.items(), ··· 263 if history_context: 264 user_content = ( 265 f"{history_context}\n\n" 266 + f"Curate these trending clusters into named groups:\n\n{entities_text}" 267 ) 268 else: 269 + user_content = f"Curate these trending clusters into named groups:\n\n{entities_text}" 270 271 try: 272 import anthropic ··· 274 client = anthropic.AsyncAnthropic() 275 response = await client.messages.parse( 276 
model="claude-haiku-4-5-20251001", 277 + max_tokens=1024, 278 + temperature=0.3, 279 system=SYSTEM_PROMPT, 280 + output_format=CuratorOutput, 281 messages=[ 282 { 283 "role": "user", ··· 287 ) 288 289 result = response.parsed_output 290 + haiku = result.haiku.strip() if result.haiku else "" 291 292 + # code-based validation (not LLM) 293 + validated_groups = [] 294 + for group in result.groups[:MAX_GROUPS]: 295 + # validate each entity text exists in the graph 296 + valid_entities = [ 297 + e for e in group.entities if e in all_entity_texts 298 + ] 299 + if valid_entities: 300 + validated_groups.append( 301 + {"name": group.name, "entities": valid_entities} 302 + ) 303 + 304 + # if LLM returned 0 valid groups, keep previous groups 305 + if not validated_groups: 306 + if self.current_groups: 307 + print("narrator: 0 valid groups, keeping previous", flush=True) 308 + return 309 + else: 310 + self.current_groups = validated_groups 311 312 # update history with this cycle's results 313 + self._update_history(self.current_groups) 314 315 + self.stats["groups"] = len(self.current_groups) 316 self.stats["calls"] += 1 317 318 + groups_str = ", ".join( 319 + f"{g['name']!r} ({len(g['entities'])} ents)" 320 + for g in self.current_groups 321 ) 322 history_size = len(self.topic_history) 323 + haiku_preview = haiku.replace("\n", " / ") if haiku else "(none)" 324 print( 325 + f"narrator: {len(self.current_groups)} groups from " 326 f"{len(sorted_clusters)} clusters, " 327 + f"{history_size} in history ({groups_str}) " 328 + f"haiku: {haiku_preview}", 329 flush=True, 330 ) 331 332 + # push groups + haiku to backend 333 + if self.current_groups: 334 + await self._post_groups(self.current_groups, haiku) 335 336 except ImportError: 337 print("narrator: anthropic package not available", flush=True)
+70 -117
site/grid.js
··· 598 <span class="tooltip-key">connections</span> 599 </div> 600 </div> 601 - ${entity.cluster_label ? `<div class="tooltip-cluster">cluster: ${entity.cluster_label}</div>` : ''} 602 `; 603 604 // position near cursor but not under it ··· 890 scheduleRender(mainGridSize); 891 // accumulate trending scores (DOM renders on its own timer) 892 updateTrendingScores(data.data.entities || []); 893 syncSelectedEntity(); 894 if (searchState.open) updateSearchResults(); 895 } else { ··· 1643 const GRID_SIZE = 128; 1644 1645 // render trending entities from entity graph, prioritizing spike vs baseline 1646 - function trendIndicator(trendVal) { 1647 const pct = `${(trendVal * 100).toFixed(0)}% above baseline`; 1648 - if (trendVal >= 0.3) return { text: '↑↑', color: 'var(--percolating)', title: pct }; 1649 - if (trendVal >= 0.1) return { text: '↑', color: 'var(--percolating)', title: pct }; 1650 - if (trendVal >= 0.02) return { text: '↑', color: 'var(--text-dim)', title: pct }; 1651 - if (trendVal <= -0.1) return { text: '↓', color: 'var(--text-muted)', title: `${(trendVal * 100).toFixed(0)}% below baseline` }; 1652 - return { text: '·', color: 'var(--text-muted)', title: 'stable' }; 1653 } 1654 1655 // cluster visual identity: shapes chosen for maximum distinctness at small sizes 1656 const CLUSTER_SHAPES = ['●', '✦', '▲', '◇', '✚', '■']; 1657 const CLUSTER_COLORS = [ 1658 - { css: 'var(--teal)', bg: 'rgba(125, 211, 192, 0.10)', border: 'rgba(125, 211, 192, 0.25)' }, 1659 - { css: 'var(--lavender)', bg: 'rgba(196, 181, 253, 0.10)', border: 'rgba(196, 181, 253, 0.25)' }, 1660 - { css: 'var(--coral)', bg: 'rgba(252, 165, 165, 0.10)', border: 'rgba(252, 165, 165, 0.25)' }, 1661 - { css: 'var(--sky)', bg: 'rgba(147, 197, 253, 0.10)', border: 'rgba(147, 197, 253, 0.25)' }, 1662 - { css: 'var(--mint)', bg: 'rgba(134, 239, 172, 0.10)', border: 'rgba(134, 239, 172, 0.25)' }, 1663 - { css: 'var(--indigo)', bg: 'rgba(165, 180, 252, 0.10)', border: 'rgba(165, 180, 252, 0.25)' }, 1664 
]; 1665 - const MAX_CLUSTERS = 6; 1666 1667 // --- trending list: dam pattern --- 1668 // scores accumulate silently on every websocket message (500ms), ··· 1678 let latestTrending = []; 1679 let trendingRenderTimer = null; 1680 1681 - // --- cluster pills: much slower cadence than the entity list --- 1682 - // clusters represent themes (minutes-scale), not individual signals (seconds-scale). 1683 - // we accumulate cluster observations silently, then refresh pills on a slow timer. 1684 - // pills have their own hysteresis: must be seen N times to appear, absent N times to vanish. 1685 - const CLUSTER_REFRESH_INTERVAL = 30000; // ms between pill updates 1686 - const CLUSTER_APPEAR_THRESHOLD = 20; // must be seen in 20+ score cycles (~10s of a 30s window) 1687 - const CLUSTER_REMOVE_THRESHOLD = 3; // must be absent 3+ pill cycles to lose a pill 1688 - const MAX_PILLS = 3; // at most 3 topic pills — only the dominant themes 1689 - 1690 - // persistent cluster style assignments: cid -> styleIndex 1691 - const clusterStyleMap = new Map(); 1692 1693 - // cluster observation counts: cid -> { label, seen (score cycles), missed (pill cycles) } 1694 - const clusterObservations = new Map(); 1695 - 1696 - // the frozen cluster snapshot used by the entity list renderer 1697 - let frozenClusters = new Map(); // cid -> { label, index } 1698 - let clusterPillTimer = null; 1699 - 1700 - // called on every score update (every 500ms) to tally cluster observations 1701 - function observeClusters(trendingEntities) { 1702 - const seen = new Map(); // cid -> { label, maxScore } 1703 - for (const e of trendingEntities) { 1704 - const cid = e.cluster || 0; 1705 - const clabel = e.cluster_label || ''; 1706 - if (!cid || !clabel || clabel === 'Unclear') continue; 1707 - if (!seen.has(cid)) { 1708 - seen.set(cid, { label: clabel, maxScore: e._score }); 1709 - } 1710 - } 1711 - for (const [cid, { label }] of seen) { 1712 - const obs = clusterObservations.get(cid) || { label, seen: 0, missed: 
0 }; 1713 - obs.label = label; 1714 - obs.seen++; 1715 - obs.missed = 0; // reset miss counter when seen 1716 - clusterObservations.set(cid, obs); 1717 - } 1718 - } 1719 - 1720 - // called on the slow timer to rebuild the frozen cluster snapshot 1721 - function refreshClusterPills() { 1722 - // collect candidates that met the observation threshold, sorted by strength 1723 - const candidates = []; 1724 - for (const [cid, obs] of clusterObservations) { 1725 - if (obs.seen >= CLUSTER_APPEAR_THRESHOLD) { 1726 - candidates.push({ cid, label: obs.label, seen: obs.seen }); 1727 - obs.seen = 0; // reset — must earn it again next cycle 1728 - } else { 1729 - obs.missed++; 1730 - if (obs.missed >= CLUSTER_REMOVE_THRESHOLD) { 1731 - clusterObservations.delete(cid); 1732 - clusterStyleMap.delete(cid); 1733 - } 1734 - } 1735 - } 1736 - 1737 - // only the top MAX_PILLS by observation count 1738 - candidates.sort((a, b) => b.seen - a.seen); 1739 - const promoted = new Map(); 1740 - for (const c of candidates.slice(0, MAX_PILLS)) { 1741 - let styleIdx = clusterStyleMap.get(c.cid); 1742 - if (styleIdx === undefined) { 1743 - const usedIndices = new Set(clusterStyleMap.values()); 1744 - const available = []; 1745 - for (let i = 0; i < MAX_CLUSTERS; i++) { 1746 - if (!usedIndices.has(i)) available.push(i); 1747 - } 1748 - if (available.length > 0) { 1749 - styleIdx = available[Math.floor(Math.random() * available.length)]; 1750 - clusterStyleMap.set(c.cid, styleIdx); 1751 - } 1752 - } 1753 - if (styleIdx !== undefined) { 1754 - promoted.set(c.cid, { label: c.label, index: styleIdx }); 1755 - } 1756 - } 1757 - 1758 - if (promoted.size < 2) promoted.clear(); 1759 - frozenClusters = promoted; 1760 } 1761 1762 // accumulate scores from each websocket message — no DOM work here ··· 1795 // store snapshot for next render 1796 latestTrending = top; 1797 1798 - // accumulate cluster observations (silent — no DOM work) 1799 - observeClusters(top); 1800 - 1801 - // start timers on first data 
1802 if (!trendingRenderTimer) { 1803 renderTrendingSnapshot(); // render immediately on first data 1804 trendingRenderTimer = setInterval(renderTrendingSnapshot, TRENDING_RENDER_INTERVAL); 1805 - refreshClusterPills(); // initial pill snapshot 1806 - clusterPillTimer = setInterval(refreshClusterPills, CLUSTER_REFRESH_INTERVAL); 1807 } 1808 } 1809 ··· 1814 1815 const top = latestTrending; 1816 1817 - // use the frozen cluster snapshot (updated on its own slow timer) 1818 - const clusterMap = frozenClusters; 1819 - 1820 - // render cluster pills above the list 1821 const section = container.parentElement; 1822 let pillsContainer = section.querySelector('.cluster-pills'); 1823 - if (clusterMap.size > 0) { 1824 if (!pillsContainer) { 1825 pillsContainer = document.createElement('div'); 1826 pillsContainer.className = 'cluster-pills'; 1827 section.insertBefore(pillsContainer, container); 1828 } 1829 pillsContainer.innerHTML = ''; 1830 - const sorted = [...clusterMap.entries()].sort((a, b) => a[1].index - b[1].index); 1831 - for (const [, cl] of sorted) { 1832 const pill = document.createElement('span'); 1833 pill.className = 'cluster-pill'; 1834 - const palette = CLUSTER_COLORS[cl.index % CLUSTER_COLORS.length]; 1835 pill.style.background = palette.bg; 1836 pill.style.borderColor = palette.border; 1837 pill.style.color = palette.css; 1838 - pill.innerHTML = `<span class="cluster-pill-shape">${CLUSTER_SHAPES[cl.index]}</span>${cl.label}`; 1839 pillsContainer.appendChild(pill); 1840 } 1841 } else if (pillsContainer) { ··· 1861 }); 1862 1863 // update or create rows 1864 top.forEach((entity, i) => { 1865 const entityId = String(entity.id); 1866 let row = container.querySelector(`.entity-row[data-entity-id="${entityId}"]`); ··· 1875 row.classList.remove('entity-row-exit', 'entity-row-enter'); 1876 } 1877 1878 - const cid = entity.cluster || 0; 1879 - const cl = clusterMap.get(cid); 1880 - if (cl) { 1881 - const palette = CLUSTER_COLORS[cl.index % CLUSTER_COLORS.length]; 
1882 row.style.borderLeft = `2px solid ${palette.border}`; 1883 } else { 1884 row.style.borderLeft = '2px solid transparent'; ··· 1886 1887 row.innerHTML = ''; 1888 1889 - if (cl) { 1890 const pip = document.createElement('span'); 1891 pip.className = 'cluster-pip'; 1892 - pip.textContent = CLUSTER_SHAPES[cl.index]; 1893 - pip.style.color = CLUSTER_COLORS[cl.index % CLUSTER_COLORS.length].css; 1894 - pip.title = cl.label; 1895 row.appendChild(pip); 1896 } 1897 ··· 1905 text.href = `https://news.google.com/search?q=${encodeURIComponent(entity.text || '')}`; 1906 text.target = '_blank'; 1907 text.rel = 'noopener'; 1908 - if (cl) text.title = cl.label; 1909 1910 const countEl = document.createElement('span'); 1911 countEl.className = 'entity-count'; 1912 countEl.textContent = entity._count; 1913 countEl.title = `${entity._count} mentions in last 5 min`; 1914 1915 - const ti = trendIndicator(entity._trend); 1916 const trendEl = document.createElement('span'); 1917 trendEl.className = 'entity-trend'; 1918 trendEl.textContent = ti.text;
··· 598 <span class="tooltip-key">connections</span> 599 </div> 600 </div> 601 + ${entity.group_name ? `<div class="tooltip-cluster">topic: ${entity.group_name}</div>` : ''} 602 `; 603 604 // position near cursor but not under it ··· 890 scheduleRender(mainGridSize); 891 // accumulate trending scores (DOM renders on its own timer) 892 updateTrendingScores(data.data.entities || []); 893 + // update groups from backend (LLM-curated, stable for ~5 min) 894 + if (data.data.groups) updateGroups(data.data.groups); 895 + // update haiku if present 896 + if (data.data.haiku) updateHaiku(data.data.haiku); 897 syncSelectedEntity(); 898 if (searchState.open) updateSearchResults(); 899 } else { ··· 1647 const GRID_SIZE = 128; 1648 1649 // render trending entities from entity graph, prioritizing spike vs baseline 1650 + function trendIndicator(trendVal, maxTrend) { 1651 const pct = `${(trendVal * 100).toFixed(0)}% above baseline`; 1652 + if (trendVal <= 0) return { text: '·', color: 'var(--text-muted)', title: 'stable' }; 1653 + const ratio = maxTrend > 0 ? 
trendVal / maxTrend : 0; 1654 + if (ratio >= 0.6) return { text: '↑↑', color: 'var(--percolating)', title: pct }; 1655 + if (ratio >= 0.3) return { text: '↑', color: 'var(--percolating)', title: pct }; 1656 + return { text: '↑', color: 'var(--text-dim)', title: pct }; 1657 } 1658 1659 // cluster visual identity: shapes chosen for maximum distinctness at small sizes 1660 const CLUSTER_SHAPES = ['●', '✦', '▲', '◇', '✚', '■']; 1661 const CLUSTER_COLORS = [ 1662 + { css: 'var(--neon-pink)', bg: 'rgba(255, 45, 120, 0.15)', border: 'rgba(255, 45, 120, 0.40)' }, 1663 + { css: 'var(--neon-cyan)', bg: 'rgba(0, 240, 255, 0.12)', border: 'rgba(0, 240, 255, 0.35)' }, 1664 + { css: 'var(--neon-lime)', bg: 'rgba(57, 255, 20, 0.12)', border: 'rgba(57, 255, 20, 0.35)' }, 1665 + { css: 'var(--neon-amber)', bg: 'rgba(255, 183, 0, 0.14)', border: 'rgba(255, 183, 0, 0.38)' }, 1666 + { css: 'var(--neon-violet)', bg: 'rgba(191, 64, 255, 0.14)', border: 'rgba(191, 64, 255, 0.38)' }, 1667 + { css: 'var(--neon-blue)', bg: 'rgba(77, 139, 255, 0.14)', border: 'rgba(77, 139, 255, 0.38)' }, 1668 ]; 1669 + 1670 1671 // --- trending list: dam pattern --- 1672 // scores accumulate silently on every websocket message (500ms), ··· 1682 let latestTrending = []; 1683 let trendingRenderTimer = null; 1684 1685 + // --- LLM-curated groups: direct rendering from backend --- 1686 + // groups are stable for ~5 min (LLM cadence), no presence tracking needed 1687 + let currentGroups = []; // [{name, styleIndex}] 1688 1689 + function updateGroups(groups) { 1690 + if (!groups || !groups.length) { currentGroups = []; return; } 1691 + currentGroups = groups.map((g, i) => ({ 1692 + name: g.name, 1693 + styleIndex: i % CLUSTER_COLORS.length, 1694 + })); 1695 } 1696 1697 // accumulate scores from each websocket message — no DOM work here ··· 1730 // store snapshot for next render 1731 latestTrending = top; 1732 1733 + // start render timer on first data 1734 if (!trendingRenderTimer) { 1735 
renderTrendingSnapshot(); // render immediately on first data 1736 trendingRenderTimer = setInterval(renderTrendingSnapshot, TRENDING_RENDER_INTERVAL); 1737 + } 1738 + } 1739 + 1740 + // update the haiku display (called when websocket data contains a haiku) 1741 + let currentHaiku = ''; 1742 + function updateHaiku(haiku) { 1743 + if (!haiku || haiku === currentHaiku) return; 1744 + currentHaiku = haiku; 1745 + const container = document.getElementById('haiku-container'); 1746 + const textEl = document.getElementById('haiku-text'); 1747 + const timeEl = document.getElementById('haiku-time'); 1748 + if (!container || !textEl) return; 1749 + textEl.textContent = '\u201C' + haiku + '\u201D'; 1750 + container.hidden = false; 1751 + 1752 + // set human-readable timestamp 1753 + if (timeEl) { 1754 + const now = new Date(); 1755 + const hours = now.getHours(); 1756 + const minutes = now.getMinutes().toString().padStart(2, '0'); 1757 + const ampm = hours >= 12 ? 'pm' : 'am'; 1758 + const h12 = hours % 12 || 12; 1759 + const timeStr = `${h12}:${minutes} ${ampm}`; 1760 + timeEl.textContent = timeStr; 1761 + timeEl.title = now.toLocaleString(); 1762 + timeEl.dateTime = now.toISOString(); 1763 } 1764 } 1765 ··· 1770 1771 const top = latestTrending; 1772 1773 + // render group pills above the list (directly from LLM-curated groups) 1774 const section = container.parentElement; 1775 let pillsContainer = section.querySelector('.cluster-pills'); 1776 + if (currentGroups.length > 0) { 1777 if (!pillsContainer) { 1778 pillsContainer = document.createElement('div'); 1779 pillsContainer.className = 'cluster-pills'; 1780 section.insertBefore(pillsContainer, container); 1781 } 1782 pillsContainer.innerHTML = ''; 1783 + for (const grp of currentGroups) { 1784 const pill = document.createElement('span'); 1785 pill.className = 'cluster-pill'; 1786 + const palette = CLUSTER_COLORS[grp.styleIndex % CLUSTER_COLORS.length]; 1787 pill.style.background = palette.bg; 1788 pill.style.borderColor = 
palette.border; 1789 pill.style.color = palette.css; 1790 + pill.innerHTML = `<span class="cluster-pill-shape">${CLUSTER_SHAPES[grp.styleIndex]}</span>${grp.name}`; 1791 pillsContainer.appendChild(pill); 1792 } 1793 } else if (pillsContainer) { ··· 1813 }); 1814 1815 // update or create rows 1816 + const maxTrend = top.reduce((m, e) => Math.max(m, e._trend || 0), 0); 1817 top.forEach((entity, i) => { 1818 const entityId = String(entity.id); 1819 let row = container.querySelector(`.entity-row[data-entity-id="${entityId}"]`); ··· 1828 row.classList.remove('entity-row-exit', 'entity-row-enter'); 1829 } 1830 1831 + const groupIdx = entity.group; 1832 + const grp = groupIdx != null ? currentGroups[groupIdx] : null; 1833 + if (grp) { 1834 + const palette = CLUSTER_COLORS[grp.styleIndex % CLUSTER_COLORS.length]; 1835 row.style.borderLeft = `2px solid ${palette.border}`; 1836 } else { 1837 row.style.borderLeft = '2px solid transparent'; ··· 1839 1840 row.innerHTML = ''; 1841 1842 + if (grp) { 1843 const pip = document.createElement('span'); 1844 pip.className = 'cluster-pip'; 1845 + pip.textContent = CLUSTER_SHAPES[grp.styleIndex]; 1846 + pip.style.color = CLUSTER_COLORS[grp.styleIndex % CLUSTER_COLORS.length].css; 1847 + pip.title = grp.name; 1848 row.appendChild(pip); 1849 } 1850 ··· 1858 text.href = `https://news.google.com/search?q=${encodeURIComponent(entity.text || '')}`; 1859 text.target = '_blank'; 1860 text.rel = 'noopener'; 1861 + if (grp) text.title = grp.name; 1862 1863 const countEl = document.createElement('span'); 1864 countEl.className = 'entity-count'; 1865 countEl.textContent = entity._count; 1866 countEl.title = `${entity._count} mentions in last 5 min`; 1867 1868 + const ti = trendIndicator(entity._trend, maxTrend); 1869 const trendEl = document.createElement('span'); 1870 trendEl.className = 'entity-trend'; 1871 trendEl.textContent = ti.text;
+10
site/index.html
··· 85 </div> 86 </div> 87 88 <footer> 89 <div class="footer-byline">by <a href="https://bsky.app/profile/zzstoatzz.io" target="_blank">nate</a> <span class="footer-sep">·</span> inspired by <a href="https://hailey.at/posts/3mcy5b5gfi222" target="_blank">hailey</a></div> 90 <div class="footer-src"><a href="https://tangled.sh/@zzstoatzz.io/coral" target="_blank">[src]</a></div>
··· 85 </div> 86 </div> 87 88 + <div class="haiku-container" id="haiku-container" hidden> 89 + <blockquote class="haiku-quote"> 90 + <div class="haiku-text" id="haiku-text"></div> 91 + <div class="haiku-footer"> 92 + <cite class="haiku-attr">&mdash; claude haiku 4.5</cite> 93 + <time class="haiku-time" id="haiku-time" title=""></time> 94 + </div> 95 + </blockquote> 96 + </div> 97 + 98 <footer> 99 <div class="footer-byline">by <a href="https://bsky.app/profile/zzstoatzz.io" target="_blank">nate</a> <span class="footer-sep">·</span> inspired by <a href="https://hailey.at/posts/3mcy5b5gfi222" target="_blank">hailey</a></div> 100 <div class="footer-src"><a href="https://tangled.sh/@zzstoatzz.io/coral" target="_blank">[src]</a></div>
+59 -1
site/style.css
··· 18 --sky: #93c5fd; 19 --mint: #86efac; 20 --indigo: #a5b4fc; 21 --percolating: #4ade80; 22 --trending: #fbbf24; 23 ··· 632 633 /* footer */ 634 footer { 635 - padding-top: 1rem; 636 border-top: 1px solid var(--border); 637 text-align: center; 638 display: flex; ··· 675 676 .footer-src a:hover { 677 color: var(--text-dim); 678 } 679 680 /* mobile: stack vertically */
··· 18 --sky: #93c5fd; 19 --mint: #86efac; 20 --indigo: #a5b4fc; 21 + 22 + /* cyberpunk cluster palette */ 23 + --neon-pink: #ff2d78; 24 + --neon-cyan: #00f0ff; 25 + --neon-lime: #39ff14; 26 + --neon-amber: #ffb700; 27 + --neon-violet: #bf40ff; 28 + --neon-blue: #4d8bff; 29 --percolating: #4ade80; 30 --trending: #fbbf24; 31 ··· 640 641 /* footer */ 642 footer { 643 + padding-top: 0.5rem; 644 border-top: 1px solid var(--border); 645 text-align: center; 646 display: flex; ··· 683 684 .footer-src a:hover { 685 color: var(--text-dim); 686 + } 687 + 688 + /* haiku display — proper attributed quote */ 689 + .haiku-container { 690 + display: flex; 691 + align-items: center; 692 + justify-content: center; 693 + padding: 1.5rem 0; 694 + margin-bottom: 0.5rem; 695 + border-top: 1px solid var(--border); 696 + } 697 + 698 + .haiku-quote { 699 + text-align: center; 700 + margin: 0; 701 + padding: 0; 702 + border: none; 703 + } 704 + 705 + .haiku-text { 706 + font-size: 11px; 707 + line-height: 1.8; 708 + color: var(--text-dim); 709 + font-style: italic; 710 + white-space: pre-line; 711 + } 712 + 713 + .haiku-footer { 714 + display: flex; 715 + align-items: baseline; 716 + justify-content: center; 717 + gap: 0.4rem; 718 + margin-top: 0.6rem; 719 + } 720 + 721 + .haiku-attr { 722 + font-size: 9px; 723 + color: var(--text-muted); 724 + font-style: normal; 725 + } 726 + 727 + .haiku-time { 728 + font-size: 8px; 729 + color: transparent; 730 + transition: color 0.2s ease; 731 + font-style: normal; 732 + } 733 + 734 + .haiku-container:hover .haiku-time { 735 + color: var(--text-muted); 736 } 737 738 /* mobile: stack vertically */