this repo has no description coral.waow.tech

feat: LLM curator for named groups, haiku display, and docs overhaul

Replace per-cluster labeling with LLM-curated named groups:
- Claude Haiku 4.5 curates co-occurrence clusters into max 5 named
groups every 5 min with topic history for continuity
- Backend Group struct with case-insensitive entity matching
- POST /groups endpoint replaces /cluster-labels
- Frontend renders group pills, entity pips, and attributed haiku
- Relative trend arrows (percentile-based, not absolute thresholds)
- Fix: stats key rename crash loop (topics→groups in bridge.py)
- Fix: haiku container styling (Safari footer bleed, height collapse)
- Docs: rewrite architecture, add curator doc, update CLAUDE.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+640 -353
+2
CLAUDE.md
··· 31 31 - **entity graph**: nodes = named entities, edges = co-occurrence (mentioned in same post) 32 32 - **pheromone edges**: edge weights decay exponentially, reinforced on repeated co-occurrence 33 33 - **z-score trending**: entities ranked by "surprise" (sigma above baseline), not raw counts 34 + - **LLM curator**: Claude Haiku 4.5 curates clusters into named groups every 5 min (dedup.py) 34 35 - **firehose rate**: raw jetstream messages/sec displayed as health indicator 35 36 36 37 ## entity types ··· 46 47 - docs/02-semantic-percolation-plan.md: theoretical background + design decisions 47 48 - docs/03-baseline-audit.md: empirical observations from live data 48 49 - docs/04-architecture.md: system architecture, data flow, deployment, failure modes 50 + - docs/06-llm-curator.md: LLM curator design, prompt, validation, topic continuity
+97 -63
backend/src/entity_graph.zig
··· 98 98 // what "spanning" means in a co-occurrence graph context 99 99 pub const SPANNING_THRESHOLD: f32 = 0.5; 100 100 101 + // LLM-curated named groups 102 + pub const MAX_GROUPS = 5; 103 + pub const MAX_GROUP_ENTITIES = 20; 104 + 105 + pub const Group = struct { 106 + name: [64]u8 = undefined, 107 + name_len: u8 = 0, 108 + entity_texts: [MAX_GROUP_ENTITIES][64]u8 = undefined, 109 + entity_text_lens: [MAX_GROUP_ENTITIES]u8 = [_]u8{0} ** MAX_GROUP_ENTITIES, 110 + entity_count: u8 = 0, 111 + }; 112 + 101 113 pub const EntityId = u32; 102 114 pub const UserId = u32; 103 115 pub const Timestamp = i64; // milliseconds since epoch ··· 462 474 // bridge rate: EWMA of cross-cluster co-occurrences 463 475 bridges: BridgeCounter = .{}, 464 476 465 - // external cluster labels from NER bridge LLM narration 466 - // indexed by cluster root entity ID; overrides mechanical "A + B" label 467 - cluster_label_text: [MAX_ENTITIES][64]u8 = undefined, 468 - cluster_label_len: [MAX_ENTITIES]u8 = [_]u8{0} ** MAX_ENTITIES, 477 + // LLM-curated named groups (max 5, replaced atomically) 478 + groups: [MAX_GROUPS]Group = [_]Group{.{}} ** MAX_GROUPS, 479 + group_count: u8 = 0, 480 + 481 + // global haiku from LLM narration (zeitgeist summary) 482 + haiku_text: [256]u8 = undefined, 483 + haiku_len: u16 = 0, 469 484 470 485 // cached stats 471 486 stats: GraphStats = .{ ··· 481 496 482 497 mutex: Mutex = .{}, 483 498 484 - /// set an external cluster label (from LLM narration) 485 - /// cluster_id is the entity ID of the cluster root 486 - pub fn setClusterLabel(self: *EntityGraph, cluster_id: u32, label_text: []const u8) void { 487 - if (cluster_id >= MAX_ENTITIES) return; 488 - const len: u8 = @intCast(@min(label_text.len, 64)); 489 - @memcpy(self.cluster_label_text[cluster_id][0..len], label_text[0..len]); 490 - self.cluster_label_len[cluster_id] = len; 499 + /// atomically replace all groups (from LLM curation) 500 + pub fn setGroups(self: *EntityGraph, groups: []const Group, count: u8) void { 501 + const n = @min(count, MAX_GROUPS); 502 + for (0..n) |i| { 503 + self.groups[i] = groups[i]; 504 + } 505 + self.group_count = n; 491 506 } 492 507 493 - /// clear all external cluster labels (call before setting new batch) 494 - pub fn clearClusterLabels(self: *EntityGraph) void { 495 - @memset(&self.cluster_label_len, 0); 508 + /// clear all groups 509 + pub fn clearGroups(self: *EntityGraph) void { 510 + self.group_count = 0; 511 + } 512 + 513 + /// set global haiku (zeitgeist summary from LLM) 514 + pub fn setHaiku(self: *EntityGraph, text: []const u8) void { 515 + const len: u16 = @intCast(@min(text.len, 256)); 516 + @memcpy(self.haiku_text[0..len], text[0..len]); 517 + self.haiku_len = len; 496 518 } 497 519 498 520 /// find or create user by DID hash, returns user ID ··· 968 990 } 969 991 } 970 992 971 - // cluster-level scores (sum of positive trends) and labels 993 + // cluster-level scores (sum of positive trends) 972 994 var cluster_score: [MAX_ENTITIES]f32 = [_]f32{0} ** MAX_ENTITIES; 973 - var cluster_label_weight: [MAX_ENTITIES]f32 = [_]f32{0} ** MAX_ENTITIES; 974 - var cluster_label_a: [MAX_ENTITIES]EntityId = [_]EntityId{NO_ENTITY} ** MAX_ENTITIES; 975 - var cluster_label_b: [MAX_ENTITIES]EntityId = [_]EntityId{NO_ENTITY} ** MAX_ENTITIES; 976 995 977 996 for (0..self.count) |i| { 978 997 const entity = &self.entities[i]; ··· 985 1004 } 986 1005 } 987 1006 988 - // pick strongest degree-normalized edge per cluster for a lightweight label 989 - for (0..self.count) |i| { 990 - const entity_a = &self.entities[i]; 991 - if (!entity_a.isActive()) continue; 992 - const root = roots[i]; 993 - if (root == NO_ENTITY) continue; 994 - for (0..entity_a.edges.count) |edge_idx| { 995 - const id_b = entity_a.edges.edges[edge_idx]; 996 - if (id_b <= i) continue; 997 - if (id_b >= self.count or !self.entities[id_b].isActive()) continue; 998 - if (roots[id_b] != root) continue; 999 - 1000 - const edge_ts = entity_a.edges.last_seen[edge_idx]; 1001 - if ((now - edge_ts) >= EDGE_DECAY_MS) continue; 1002 - 1003 - const raw_weight = self.edgeWeight(@intCast(i), id_b, now); 1004 - const ra = @max(self.entities[i].smoothed_rate, 0.001); 1005 - const rb = @max(self.entities[id_b].smoothed_rate, 0.001); 1006 - const weight = raw_weight / @sqrt(ra * rb); 1007 - if (weight > cluster_label_weight[root]) { 1008 - cluster_label_weight[root] = weight; 1009 - cluster_label_a[root] = @intCast(i); 1010 - cluster_label_b[root] = id_b; 1011 - } 1012 - } 1013 - } 1014 - 1015 1007 jw.beginObject() catch return "{}"; 1016 1008 jw.objectField("entities") catch return "{}"; 1017 1009 jw.beginArray() catch return "{}"; ··· 1047 1039 if (root != NO_ENTITY) { 1048 1040 jw.objectField("cluster_score") catch break; 1049 1041 jw.write(cluster_score[root]) catch break; 1042 + } 1050 1043 1051 - // prefer LLM-generated label over mechanical "A + B" 1052 - jw.objectField("cluster_label") catch break; 1053 - if (self.cluster_label_len[root] > 0) { 1054 - jw.write(self.cluster_label_text[root][0..self.cluster_label_len[root]]) catch break; 1055 - } else { 1056 - const label_a = cluster_label_a[root]; 1057 - const label_b = cluster_label_b[root]; 1058 - if (label_a != NO_ENTITY and label_b != NO_ENTITY) { 1059 - var label_buf: [140]u8 = undefined; 1060 - const label_str = std.fmt.bufPrint(&label_buf, "{s} + {s}", .{ 1061 - self.entities[label_a].getText(), 1062 - self.entities[label_b].getText(), 1063 - }) catch ""; 1064 - jw.write(label_str) catch break; 1065 - } else { 1066 - jw.write("") catch break; 1067 - } 1068 - } 1044 + // match entity text against LLM groups (case-insensitive) 1045 + const group_match = self.matchGroup(entity.getText()); 1046 + if (group_match.index) |gi| { 1047 + jw.objectField("group") catch break; 1048 + jw.write(gi) catch break; 1049 + jw.objectField("group_name") catch break; 1050 + jw.write(self.groups[gi].name[0..self.groups[gi].name_len]) catch break; 1069 1051 } 1052 + 1070 1053 jw.objectField("largest") catch break; 1071 1054 jw.write(in_largest) catch break; 1072 1055 jw.objectField("x") catch break; ··· 1104 1087 1105 1088 jw.endArray() catch return "{}"; 1106 1089 1090 + // top-level groups array 1091 + jw.objectField("groups") catch return "{}"; 1092 + jw.beginArray() catch return "{}"; 1093 + for (0..self.group_count) |gi| { 1094 + const grp = &self.groups[gi]; 1095 + jw.beginObject() catch break; 1096 + jw.objectField("name") catch break; 1097 + jw.write(grp.name[0..grp.name_len]) catch break; 1098 + jw.objectField("entities") catch break; 1099 + jw.beginArray() catch break; 1100 + for (0..grp.entity_count) |ei| { 1101 + jw.write(grp.entity_texts[ei][0..grp.entity_text_lens[ei]]) catch break; 1102 + } 1103 + jw.endArray() catch break; 1104 + jw.endObject() catch break; 1105 + } 1106 + jw.endArray() catch return "{}"; 1107 + 1107 1108 // stats object 1108 1109 jw.objectField("stats") catch return "{}"; 1109 1110 jw.beginObject() catch return "{}"; ··· 1127 1128 jw.write(self.stats.bridge_rate) catch return "{}"; 1128 1129 jw.endObject() catch return "{}"; 1129 1130 1131 + // global haiku (zeitgeist summary) 1132 + if (self.haiku_len > 0) { 1133 + jw.objectField("haiku") catch return "{}"; 1134 + jw.write(self.haiku_text[0..self.haiku_len]) catch return "{}"; 1135 + } 1136 + 1130 1137 jw.endObject() catch return "{}"; 1131 1138 1132 1139 return w.buffered(); 1140 + } 1141 + 1142 + /// match entity text against group entity lists (case-insensitive) 1143 + const GroupMatch = struct { index: ?usize }; 1144 + fn matchGroup(self: *EntityGraph, entity_text: []const u8) GroupMatch { 1145 + var e_lower: [64]u8 = undefined; 1146 + const e_len = @min(entity_text.len, 64); 1147 + for (0..e_len) |i| { 1148 + e_lower[i] = std.ascii.toLower(entity_text[i]); 1149 + } 1150 + const e_norm = e_lower[0..e_len]; 1151 + 1152 + for (0..self.group_count) |gi| { 1153 + const grp = &self.groups[gi]; 1154 + for (0..grp.entity_count) |ei| { 1155 + const g_len = grp.entity_text_lens[ei]; 1156 + if (g_len != e_len) continue; 1157 + var g_lower: [64]u8 = undefined; 1158 + for (0..g_len) |k| { 1159 + g_lower[k] = std.ascii.toLower(grp.entity_texts[ei][k]); 1160 + } 1161 + if (std.mem.eql(u8, g_lower[0..g_len], e_norm)) { 1162 + return .{ .index = gi }; 1163 + } 1164 + } 1165 + } 1166 + return .{ .index = null }; 1133 1167 } 1134 1168 1135 1169 /// diagnostics: compute distributions for analysis
+54 -22
backend/src/http.zig
··· 69 69 try handleEntity(request); 70 70 } else if (mem.eql(u8, target, "/entity-graph")) { 71 71 try handleEntityGraph(request); 72 - } else if (mem.eql(u8, target, "/cluster-labels")) { 73 - try handleClusterLabels(request); 72 + } else if (mem.eql(u8, target, "/groups")) { 73 + try handleGroups(request); 74 74 } else if (mem.eql(u8, target, "/diagnostics")) { 75 75 try handleDiagnostics(request); 76 76 } else if (mem.eql(u8, target, "/")) { 77 - try sendJson(request, "{\"name\":\"coral\",\"endpoints\":[\"/stats\",\"/state/32\",\"/state/128\",\"/state/512\",\"/entity\",\"/entity-graph\",\"/cluster-labels\",\"/diagnostics\"]}"); 77 + try sendJson(request, "{\"name\":\"coral\",\"endpoints\":[\"/stats\",\"/state/32\",\"/state/128\",\"/state/512\",\"/entity\",\"/entity-graph\",\"/groups\",\"/diagnostics\"]}"); 78 78 } else { 79 79 try sendNotFound(request); 80 80 } ··· 293 293 try sendJson(request, json_data); 294 294 } 295 295 296 - /// POST /cluster-labels - accept LLM-generated cluster labels from NER bridge 297 - fn handleClusterLabels(request: *http.Server.Request) !void { 296 + /// POST /groups - accept LLM-curated named groups from NER bridge 297 + fn handleGroups(request: *http.Server.Request) !void { 298 298 if (request.head.method != .POST) { 299 299 try sendJson(request, "{\"error\":\"method not allowed\"}"); 300 300 return; ··· 311 311 defer body_collector.deinit(); 312 312 313 313 _ = body_reader.stream(&body_collector.writer, .unlimited) catch |err| { 314 - log.err("failed to read cluster-labels body: {}", .{err}); 314 + log.err("failed to read groups body: {}", .{err}); 315 315 try sendJson(request, "{\"error\":\"failed to read body\"}"); 316 316 return; 317 317 }; ··· 328 328 return; 329 329 } 330 330 331 - const labels_obj = parsed.value.object.get("labels") orelse { 332 - try sendJson(request, "{\"error\":\"missing labels object\"}"); 331 + const groups_arr = parsed.value.object.get("groups") orelse { 332 + try sendJson(request, "{\"error\":\"missing groups array\"}"); 333 333 return; 334 334 }; 335 335 336 - if (labels_obj != .object) { 337 - try sendJson(request, "{\"error\":\"labels must be object\"}"); 336 + if (groups_arr != .array) { 337 + try sendJson(request, "{\"error\":\"groups must be array\"}"); 338 338 return; 339 339 } 340 340 341 - // clear old labels and set new ones 342 - // labels format: {"cluster_id_str": "topic label", ...} 341 + // parse groups into fixed-size structs 342 + var groups: [entity_graph.MAX_GROUPS]entity_graph.Group = [_]entity_graph.Group{.{}} ** entity_graph.MAX_GROUPS; 343 + var group_count: u8 = 0; 344 + 345 + for (groups_arr.array.items) |group_val| { 346 + if (group_count >= entity_graph.MAX_GROUPS) break; 347 + if (group_val != .object) continue; 348 + 349 + const name_val = group_val.object.get("name") orelse continue; 350 + if (name_val != .string) continue; 351 + 352 + var grp = entity_graph.Group{}; 353 + 354 + // set name 355 + const name_len: u8 = @intCast(@min(name_val.string.len, 64)); 356 + @memcpy(grp.name[0..name_len], name_val.string[0..name_len]); 357 + grp.name_len = name_len; 358 + 359 + // set entities 360 + if (group_val.object.get("entities")) |ents_val| { 361 + if (ents_val == .array) { 362 + for (ents_val.array.items) |ent_val| { 363 + if (grp.entity_count >= entity_graph.MAX_GROUP_ENTITIES) break; 364 + if (ent_val != .string) continue; 365 + const ent_len: u8 = @intCast(@min(ent_val.string.len, 64)); 366 + @memcpy(grp.entity_texts[grp.entity_count][0..ent_len], ent_val.string[0..ent_len]); 367 + grp.entity_text_lens[grp.entity_count] = ent_len; 368 + grp.entity_count += 1; 369 + } 370 + } 371 + } 372 + 373 + groups[group_count] = grp; 374 + group_count += 1; 375 + } 376 + 343 377 entity_graph.graph.mutex.lock(); 344 378 defer entity_graph.graph.mutex.unlock(); 345 379 346 - entity_graph.graph.clearClusterLabels(); 380 + entity_graph.graph.setGroups(&groups, group_count); 347 381 348 - var count: usize = 0; 349 - var it = labels_obj.object.iterator(); 350 - while (it.next()) |entry| { 351 - const cluster_id = std.fmt.parseInt(u32, entry.key_ptr.*, 10) catch continue; 352 - if (entry.value_ptr.* != .string) continue; 353 - entity_graph.graph.setClusterLabel(cluster_id, entry.value_ptr.string); 354 - count += 1; 382 + // optional haiku field 383 + if (parsed.value.object.get("haiku")) |haiku_val| { 384 + if (haiku_val == .string) { 385 + entity_graph.graph.setHaiku(haiku_val.string); 386 + } 355 387 } 356 388 357 - log.info("cluster-labels: set {d} labels", .{count}); 389 + log.info("groups: set {d} groups", .{group_count}); 358 390 359 391 var buf: [64]u8 = undefined; 360 - const response = std.fmt.bufPrint(&buf, "{{\"set\":{d}}}", .{count}) catch "{\"set\":0}"; 392 + const response = std.fmt.bufPrint(&buf, "{{\"set\":{d}}}", .{group_count}) catch "{\"set\":0}"; 361 393 try sendJson(request, response); 362 394 } 363 395
+92 -84
docs/04-architecture.md
··· 8 8 bluesky firehose 9 9 | 10 10 v 11 - turbostream (graze.social) 11 + jetstream2 (multiple instances) 12 12 | 13 - | wss://api.graze.social/.../turbostream 13 + | wss://jetstream2.us-east.bsky.network/subscribe 14 14 v 15 15 NER bridge (python, fly.io) 16 16 | 17 17 | POST http://coral.internal:3000/entity 18 + | POST http://coral.internal:3000/groups (every 5 min, LLM-curated) 18 19 v 19 20 backend (zig, fly.io) 20 21 | ··· 23 24 frontend (static, cloudflare pages) 24 25 ``` 25 26 26 - turbostream is a hydrated repeater built on bluesky's jetstream. it enriches raw firehose events with author profiles, mentions, parent posts, etc. we get ~50 messages/sec from it. 27 + jetstream2 is bluesky's official firehose consumer. the NER bridge connects to independent relay instances (firehose.stream, jetstream2.us-east.bsky.network, etc.) with automatic fallback. we get ~50 messages/sec. 27 28 28 29 ## NER bridge 29 30 30 31 `ner/bridge.py` - python async worker on fly.io (2gb performance-1x, spaCy needs ~400mb but benefits from CPU headroom). 31 32 32 33 **what it does**: 33 - 1. connects to turbostream via websocket 34 - 2. extracts post text, author DID, URI, follower count 34 + 1. connects to jetstream2 via websocket (multi-instance fallback with cursor rewind) 35 + 2. extracts post text from commit records 35 36 3. checks spam labels (drops labeled DIDs/URIs before NER) 36 37 4. runs spaCy NER (`en_core_web_sm`, only NER pipe enabled) 37 38 5. normalizes entities (whitespace, punctuation, NORP plural stripping) 38 39 6. POSTs all entities from one post together to the backend (preserves co-occurrence) 39 40 7. tracks firehose rate via EWMA and sends it with each POST 40 41 41 - **spam filtering**: subscribes to `ozone.hailey.at` label stream (CBOR-encoded). caches labels by DID/URI with expiry timestamps. seeds cache on startup via queryLabels API. label types: spam, shopping-spam, general-spam, reply-link-spam, inauth-fundraising, coordinated-abuse, men-facet-abuse, mass-follow-high/mid, elon-handle, new-acct-replies. 42 + **reader/worker architecture**: the websocket reader is decoupled from processing via a bounded `asyncio.Queue(maxsize=200)`. the reader task only calls `recv()` and enqueues, keeping ping/pong responsive. worker tasks dequeue and do NER + POST. see `docs/05-websocket-stability.md` for the debugging story. 42 43 43 - **stability design**: process synchronously (no unbounded queues). spaCy has 20x headroom. POST with 2s timeout, drop on failure. no retries, no queues. 44 + **spam filtering**: subscribes to `ozone.hailey.at` label stream (CBOR-encoded). caches labels by DID/URI with expiry timestamps. seeds cache on startup via queryLabels API. 44 45 45 - **health check**: exposes `/health` on port 8080. returns 200 if connected to turbostream AND successful POST within 2 minutes, 503 otherwise. note: fly.io `http_service.checks` only affect routing, not machine restarts. 46 + **stability design**: process synchronously (no unbounded queues). spaCy has 20x headroom. POST with 2s timeout, drop on failure. no retries, no queues. 46 47 47 48 **watchdog**: self-terminates via `sys.exit(1)` if unhealthy, triggering fly.io's auto-restart. two modes: (1) no successful POST within 5 min of startup = broken from start, (2) no successful POST for 3 min after previously working = wedged state. 48 49 49 - **env vars**: 50 - - `TURBOSTREAM_URL` - turbostream endpoint (default: graze.social) 51 - - `ZIG_BACKEND_URL` - backend HTTP endpoint (default: localhost:3000, prod: coral.internal:3000) 52 - - `LABEL_STREAM_URL` - labeler websocket 53 - - `LABELER_ENABLED` - toggle spam filtering (default: on) 50 + ## LLM curator 51 + 52 + `ner/dedup.py` - Claude Haiku 4.5 curates co-occurrence clusters into named groups every 5 minutes. see `docs/06-llm-curator.md` for full details. 53 + 54 + **what it does**: 55 + 1. fetches the entity graph from the backend (`GET /entity-graph`) 56 + 2. extracts co-occurrence clusters (union-find clusters with trending entities) 57 + 3. builds a prompt with cluster entities, trend values, mention counts, and topic history 58 + 4. calls Claude Haiku 4.5 via `messages.parse()` with structured output (Pydantic models) 59 + 5. validates LLM output against the actual entity graph (code-based, not LLM) 60 + 6. POSTs validated groups + haiku to backend (`POST /groups`) 61 + 62 + **output**: up to 5 named groups (e.g., "Jesse Jackson Tribute", "Iran Nuclear Talks") with entity lists, plus a haiku about current events. 63 + 64 + **topic continuity**: tracks topic history via Jaccard similarity matching. the LLM sees which topics are active and for how long, encouraging label reuse for continuing stories. history TTL is 1 hour. 54 65 55 66 ## backend 56 67 ··· 65 76 66 77 **HTTP endpoints**: 67 78 - `POST /entity` - ingest entities from bridge 79 + - `POST /groups` - receive LLM-curated groups + haiku from curator 68 80 - `GET /stats` - lattice health, firehose rate, percolation state 69 - - `GET /entity-graph` - full co-occurrence graph (entities, edges, clusters) 81 + - `GET /entity-graph` - full co-occurrence graph (entities, edges, clusters, groups) 70 82 - `GET /state/{size}` - lattice bitmap (base64) 71 83 - `GET /diagnostics` - distribution analysis 72 - - `GET /top-post/{id}` - top post for an entity 73 84 74 85 **entity graph model**: see `docs/02-semantic-percolation-plan.md` for design rationale. short version: entities co-occurring in the same post form edges. edges have pheromone weights that decay exponentially (10min half-life). union-find clusters active entities. trend = (smoothed_rate - baseline) / baseline. 75 86 87 + **groups**: the backend stores up to 5 `Group` structs, each with a name and up to 20 entity text references. when serializing the entity graph to JSON, each entity is matched against groups via case-insensitive text comparison (`matchGroup()`), adding `group` (index) and `group_name` fields. 88 + 76 89 **persistence**: SQLite at `/data/coral.db` on a fly volume. saves entities, edges, baselines, post metadata every 30s. activity buckets (in-memory mention counts) reset on restart but baselines seed recovery. WAL mode, busy_timeout=5000. 77 90 91 + **entity graph JSON** (broadcast via websocket): 92 + ```json 93 + { 94 + "entities": [ 95 + { 96 + "id": 0, 97 + "text": "Trump", 98 + "label": "PERSON", 99 + "rate": 0.17, 100 + "count": 52, 101 + "trend": -0.019, 102 + "baseline": 0.175, 103 + "cluster": 123, 104 + "cluster_score": 0.45, 105 + "group": 0, 106 + "group_name": "US Political Commentary", 107 + "largest": false, 108 + "x": 45, 109 + "y": 78, 110 + "edges": [{"t": 5, "a": 12.5, "w": 0.67}] 111 + } 112 + ], 113 + "groups": [ 114 + {"name": "US Political Commentary", "entities": ["Trump", "Congress", "MAGA"]} 115 + ], 116 + "haiku": "spring wind carries news\nacross the digital plain—\nold voices persist" 117 + } 118 + ``` 119 + 78 120 ## frontend 79 121 80 122 `site/` - static HTML/CSS/JS on cloudflare pages. 81 123 82 124 connects to backend websocket at `wss://coral.fly.dev:3001`. receives lattice bitmaps (binary, 10Hz), entity updates (JSON, 2Hz), and entity graph state (JSON, 2Hz). renders a 128x128 lattice with entity overlay, trending list, live feed, and stats. 83 125 84 - **rendering**: uses `requestAnimationFrame` for all canvas updates (websocket handlers set dirty flags, rAF loop renders). this prevents Chrome mobile GPU compositor from sampling canvas mid-render. entity graph overlay is cached on an offscreen canvas and re-rendered only when graph data changes (2Hz), then blitted every frame. 126 + **rendering**: uses `requestAnimationFrame` for all canvas updates (websocket handlers set dirty flags, rAF loop renders). entity graph overlay is cached on an offscreen canvas and re-rendered only when graph data changes (2Hz), then blitted every frame. 85 127 86 - **trending list**: clicking an entity name opens Google News search in a new tab (`https://news.google.com/search?q=ENTITY`). 128 + **trending list**: entities ranked by `trend * log(1 + count)` with EMA smoothing. hysteresis thresholds for enter/exit prevent flickering. trend arrows are relative (percentile-based against the max trend in the current list), not absolute thresholds. clicking an entity name opens Google News search. 87 129 88 - **degraded alert**: when `firehose_rate < 10/s` (normal ~50/s), shows a coral-colored banner: "turbostream may be degraded - data may be stale". the backend includes `firehose_degraded: bool` in its stats JSON. 130 + **group pills**: LLM-curated groups render as colored pills above the trending list (shapes: ●, ✦, ▲, ◇, ✚, ■). entities belonging to a group get a colored left border and shape pip. groups update every ~5 minutes when the curator runs. 131 + 132 + **haiku display**: attributed quote from Claude Haiku 4.5, centered below the live feed. timestamp visible on hover. updates every ~5 minutes with each curator cycle. 133 + 134 + **degraded alert**: when `firehose_rate < 10/s` (normal ~50/s), shows a coral-colored banner. the backend includes `firehose_degraded: bool` in its stats JSON. 89 135 90 136 ## deployment 91 137 ··· 101 147 102 148 ### 1. POST timeouts between NER bridge and backend 103 149 104 - **symptom**: NER bridge logs flooded with `POST exception: TimeoutError`. firehose_rate drops to ~1/s. turbostream connection is healthy, entities are extracted, but POSTs to the backend time out. 105 - 106 - **root cause**: the NER bridge was POSTing to `https://coral.fly.dev` (public proxy). fly.io's public proxy was intermittently slow or unresponsive, causing the 2s POST timeout to fire. 107 - 108 - **fix**: two changes. 109 - 110 - 1. **backend** (`main.zig`): changed HTTP server bind from `0.0.0.0` to `::` (dual-stack). fly's internal network uses IPv6, and the backend was only listening on IPv4. 111 - 112 - 2. **NER bridge** (`fly.toml`): changed `ZIG_BACKEND_URL` from `https://coral.fly.dev` to `http://coral.internal:3000`. this routes over fly's private IPv6 network between machines in the same org, bypassing the public proxy entirely. 113 - 114 - **result**: POST latency dropped from intermittent 2s+ timeouts to ~1.8ms. entity drops went from 5678/5820 to 0. 115 - 116 - ### 2. turbostream websocket disconnects (blocked read path) 117 - 118 - **symptoms** (appeared in sequence as we iterated on fixes): 119 - - `websocket closed` every ~44s, throughput ~12/s (client-side ping timeout) 120 - - `code=1011 reason='keepalive ping timeout'` every ~2min (server-side ping timeout) 121 - - `code=1006 reason=''` every ~60s (abnormal closure) 122 - 123 - all three were symptoms of the same root cause. 124 - 125 - **root cause**: the `websockets` library processes ping/pong control frames inside `recv()`. the original code did `async for raw in ws:` then ran NER + HTTP POST inside the loop body. while processing, the library couldn't call `recv()`, so it couldn't respond to pings. 126 - 127 - this explains the "looks good for 2-5 minutes then degrades" pattern: processing latency accumulates until keepalive timeouts hit. toggling client-side pings changed which side timed out first, but didn't fix the underlying problem. even `run_in_executor()` for spaCy didn't help — `await`ing the result still blocked the read loop. 128 - 129 - **fix**: decouple the websocket reader from processing. two concurrent asyncio tasks: 130 - 131 - ```python 132 - queue = asyncio.Queue(maxsize=200) # ~4 seconds buffer at 50 msgs/sec 150 + **symptom**: NER bridge logs flooded with `POST exception: TimeoutError`. firehose_rate drops to ~1/s. 133 151 134 - async def reader(): 135 - """only recv() + enqueue. keeps pong responsive.""" 136 - async for raw in ws: 137 - stats["messages"] += 1 138 - try: 139 - queue.put_nowait(raw) 140 - except asyncio.QueueFull: 141 - stats["queue_drops"] += 1 152 + **root cause**: the NER bridge was POSTing to `https://coral.fly.dev` (public proxy). fly.io's public proxy was intermittently slow. 142 153 143 - async def worker(): 144 - """consume from queue, do NER + POST.""" 145 - while True: 146 - raw = await queue.get() 147 - # JSON parse → extract_post → spam check → NER (in executor) → POST 148 - ``` 154 + **fix**: changed to `http://coral.internal:3000` (fly's private IPv6 network). POST latency dropped from 2s+ timeouts to ~1.8ms. 149 155 150 - the reader stays in `recv()` so websockets can handle ping/pong at all times. bounded queue with drop policy prevents unbounded backlog. client-side pings disabled (turbostream handles keepalive server-side). 156 + ### 2. websocket disconnects (blocked read path) 151 157 152 - **result**: zero disconnects, 0 drops, queue steady at 4-8/200, latency 3-10ms. previously dying every 60 seconds. 158 + **root cause**: the `websockets` library processes ping/pong inside `recv()`. the old code blocked the read loop with NER + POST work, so pongs were never sent. 153 159 154 - see `docs/05-websocket-stability.md` for the full debugging timeline. 160 + **fix**: reader/worker split with bounded queue. see `docs/05-websocket-stability.md` for full details. 155 161 156 162 ### 3. fly.io API instability 157 163 158 - **symptom**: `fly machine restart` and `fly deploy` commands fail with DNS resolution errors (`lookup api.machines.dev: no such host`) or EOF errors on the release API. 164 + **symptom**: `fly deploy` fails with DNS errors. 159 165 160 - **root cause**: fly.io's own API and metrics infrastructure has intermittent DNS/connectivity issues. not specific to coral. 166 + **mitigation**: retry. deploys are idempotent. 161 167 162 - **mitigation**: retry. deploys usually succeed on second attempt. the `fly deploy` command is idempotent - re-running it after a failed release API call will push the already-built image. 168 + ### 4. NER bridge stuck in timeout loop 163 169 164 - ### general lessons 170 + **symptom**: bridge enters reconnect loop, can't recover. 165 171 166 - - **use fly internal networking** for app-to-app communication. `coral.internal:3000` over IPv6 is dramatically more reliable than routing through the public proxy at `coral.fly.dev`. 167 - - **decouple websocket reads from processing**. `websockets` handles ping/pong inside `recv()`. if your loop body does slow work (NER, HTTP), use a reader task + queue + worker task so `recv()` is never stalled. `run_in_executor()` alone isn't enough — `await`ing it still blocks the read loop. 168 - - **bind to `::` not `0.0.0.0`** if your fly app needs to accept connections from other fly apps on the internal network. 169 - - **the firehose_rate EWMA** (`alpha=0.3`, updated every 2s) is a good health signal. threshold of 10/s (vs normal ~50/s) catches real degradation without false positives. 172 + **fix**: watchdog self-terminates via `sys.exit(1)` after 3-5 min of no successful POSTs, triggering fly.io auto-restart. 170 173 171 - ### 4. NER bridge stuck in timeout loop 174 + ### 5. entity graph JSON buffer overflow 172 175 173 - **symptom**: NER bridge logs flooded with `POST exception: TimeoutError` and `timed out during opening handshake`. entity graph empties out (all entities decay below threshold). manual restart fixes it. 176 + **symptom**: `/entity-graph` returns `{}`. 174 177 175 - **root cause**: transient network issues can cause both turbostream websocket and backend HTTP to time out simultaneously. the bridge enters a reconnect loop but can't recover without restart. 178 + **root cause**: 256KB fixed buffer exceeded with 272+ entities. 176 179 177 - **fix**: added watchdog task that self-terminates via `sys.exit(1)` when unhealthy, triggering fly.io's auto-restart. two modes: (1) no successful POST within 5 min of startup = broken from start, (2) no successful POST for 3 min after previously working = wedged state. note: fly.io `http_service.checks` only affect routing decisions, not machine restarts—the app must self-exit to trigger restart. 180 + **fix**: bumped to 1MB. 178 181 179 - ### 5. entity graph JSON buffer overflow 182 + ### 6. Chrome mobile canvas flashing 180 183 181 - **symptom**: `/entity-graph` endpoint returns `{}`. websocket graph messages have empty entities array. stats show active entities exist but they're not serialized. 184 + **root cause**: canvas renders from websocket handlers; Chrome Android's GPU compositor samples mid-render. 182 185 183 - **root cause**: the Zig `toJson()` function used a 256KB fixed buffer. with 272+ active entities (each with edges), serialization exceeded the buffer. the `catch` clause returned `"{}"` on overflow, silently dropping all data including stats. 186 + **fix**: all rendering deferred to `requestAnimationFrame`. 184 187 185 - **fix**: bumped JSON serialization buffers from 256KB to 1MB in `ws_server.zig` and `http.zig`. 188 + ### 7. stats key rename crash loop 186 189 187 - ### 6. Chrome mobile canvas flashing 190 + **symptom**: NER bridge crash-loops, never lives long enough for curator to run. 188 191 189 - **symptom**: entity graph overlay flashes/flickers on Chrome Android. works fine on Safari iOS and desktop Chrome. 192 + **root cause**: during the narrator→curator refactor, a stats key was renamed from `'topics'` to `'groups'` in `dedup.py` but a logging line in `bridge.py` still referenced `ns['topics']`. the `KeyError` killed the worker every 1000 messages. 190 193 191 - **root cause**: canvas renders fired directly from websocket `onmessage` handlers at 10Hz. Chrome Android's GPU compositor can sample the canvas buffer asynchronously — catching intermediate frames where the canvas has been cleared but the overlay hasn't been drawn yet. Safari's compositor waits for the JS task to complete. 194 + **lesson**: when renaming dict keys, grep all consumers — not just the module where the key is defined. 195 + 196 + ### general lessons 192 197 193 - **fix**: defer all rendering to `requestAnimationFrame`. websocket handlers set dirty flags, rAF loop renders once per display frame. rAF callbacks execute in the browser's rendering phase, synchronized with the compositor. also caches graph overlay on offscreen canvas (render at 2Hz, blit at 10Hz). 198 + - **use fly internal networking** for app-to-app communication. `coral.internal:3000` over IPv6 is dramatically more reliable than the public proxy. 199 + - **decouple websocket reads from processing**. use a reader task + queue + worker task so `recv()` is never stalled. 200 + - **bind to `::` not `0.0.0.0`** for fly internal network access. 201 + - **grep all consumers when renaming keys/fields** — the compiler won't catch dict key mismatches in Python.
+156
docs/06-llm-curator.md
··· 1 + # LLM curator 2 + 3 + ## overview 4 + 5 + the curator (`ner/dedup.py`) uses Claude Haiku 4.5 to transform raw co-occurrence clusters into human-readable named groups. it runs every 5 minutes as an async task alongside the NER bridge. 6 + 7 + the LLM's job is **not** to label every cluster. it's to identify the 3-5 real stories, merge duplicates, and drop garbage. this is a filter, not a summarizer. 8 + 9 + ## data flow 10 + 11 + ``` 12 + backend (GET /entity-graph) 13 + | 14 + v 15 + extract clusters (union-find, trending entities only) 16 + | 17 + v 18 + build prompt (clusters + trend data + topic history) 19 + | 20 + v 21 + Claude Haiku 4.5 (messages.parse, structured output) 22 + | 23 + v 24 + code-based validation (entity text must exist in graph) 25 + | 26 + v 27 + backend (POST /groups) → broadcast to frontend via websocket 28 + ``` 29 + 30 + ## structured output 31 + 32 + uses `messages.parse()` with Pydantic models for type-safe extraction: 33 + 34 + ```python 35 + class NamedGroup(BaseModel): 36 + name: str # 2-4 word topic name 37 + entities: list[str] # exact entity text from clusters 38 + 39 + class CuratorOutput(BaseModel): 40 + groups: list[NamedGroup] # max 5 41 + haiku: str # 5-7-5 syllable haiku 42 + ``` 43 + 44 + the API returns parsed Pydantic objects directly — no JSON parsing or regex needed. 45 + 46 + ## prompt design 47 + 48 + the system prompt instructs the LLM to: 49 + - identify coherent stories from the cluster data 50 + - merge clusters about the same story 51 + - drop incoherent or uninteresting clusters 52 + - return at most 5 groups (quality over quantity) 53 + - use specific names ("Trump Budget Fight" not "Politics") 54 + - reuse or refine recent topic names when the story continues 55 + - list exact entity text from the clusters for each group 56 + - write a haiku (5-7-5 syllables) about the real-world events 57 + 58 + the user message includes: 59 + 1. **topic history** — recent groups with duration and status (active/ended) 60 + 2. **cluster data** — entity text, NER label, trend score, mention count per cluster 61 + 62 + ## topic continuity 63 + 64 + the curator tracks a `topic_history` list of `TopicRecord` objects: 65 + 66 + ```python 67 + @dataclass 68 + class TopicRecord: 69 + label: str # group name 70 + entities: set[str] # member entity texts 71 + first_seen: float # unix timestamp 72 + last_seen: float # unix timestamp 73 + times_labeled: int # how many cycles this topic appeared 74 + ``` 75 + 76 + each cycle: 77 + 1. new groups are matched against history via **Jaccard similarity** on entity sets 78 + 2. matching groups (overlap >= 0.3) update the existing record 79 + 3. non-matching groups create new records 80 + 4. records older than 1 hour (`HISTORY_TTL`) are pruned 81 + 82 + the LLM sees this history formatted as: 83 + ``` 84 + Recent topic history (reuse labels for continuing topics): 85 + - "Jesse Jackson Tribute" — active for 23m (Jesse Jackson, Chicago, Fred Hampton) 86 + - "Iran Nuclear Talks" — ended 8m ago, lasted 45m (Geneva, Iranian, diplomacy) 87 + ``` 88 + 89 + this encourages label stability — the LLM reuses names for continuing stories rather than inventing new ones each cycle. 90 + 91 + ## validation 92 + 93 + all LLM output goes through code-based validation before reaching the backend: 94 + 95 + 1. **entity existence check**: each entity text in a group must exist in the current entity graph. entities the LLM hallucinated or misspelled are dropped. 96 + 2. **group filtering**: groups with zero valid entities after filtering are dropped entirely. 97 + 3. **max groups**: capped at 5 regardless of what the LLM returns. 98 + 4. **fallback**: if the LLM returns 0 valid groups, the previous groups are kept. this prevents the UI from going blank on a bad LLM response. 99 + 100 + ## backend integration 101 + 102 + ### Group struct (entity_graph.zig) 103 + 104 + ```zig 105 + pub const MAX_GROUPS = 5; 106 + pub const MAX_GROUP_ENTITIES = 20; 107 + 108 + pub const Group = struct { 109 + name: [64]u8, 110 + name_len: u8, 111 + entity_texts: [MAX_GROUP_ENTITIES][64]u8, 112 + entity_text_lens: [MAX_GROUP_ENTITIES]u8, 113 + entity_count: u8, 114 + }; 115 + ``` 116 + 117 + ### entity-to-group matching 118 + 119 + `matchGroup()` does case-insensitive text comparison between each entity's text and the entity lists in all active groups. matched entities get `group` (index) and `group_name` fields in the JSON output. 120 + 121 + ### POST /groups endpoint (http.zig) 122 + 123 + accepts: 124 + ```json 125 + { 126 + "groups": [ 127 + {"name": "Topic Name", "entities": ["Entity A", "Entity B"]}, 128 + ], 129 + "haiku": "five syllable line\nseven syllable line here\nfive syllables end" 130 + } 131 + ``` 132 + 133 + constructs `Group` structs and calls `entity_graph.graph.setGroups()`. the haiku is stored and included in every subsequent websocket broadcast. 134 + 135 + ## frontend rendering 136 + 137 + - **group pills**: colored pills above the trending list, one per group, with distinct shapes (●, ✦, ▲, ◇, ✚, ■) and neon colors 138 + - **entity pips**: trending entities matched to a group show the group's shape and color 139 + - **tooltip**: hovering an entity in a group shows the group name 140 + - **haiku**: displayed as a centered blockquote attributed to "claude haiku 4.5" with a hover-visible timestamp 141 + 142 + groups render directly from the websocket data — no grow/decay lifecycle. they update atomically every ~5 minutes when the curator runs. 143 + 144 + ## parameters 145 + 146 + | parameter | value | notes | 147 + |-----------|-------|-------| 148 + | `NARRATOR_INTERVAL` | 300s | curator cycle frequency | 149 + | `MAX_GROUPS` | 5 | max groups per cycle | 150 + | `MAX_CLUSTERS` | 15 | max clusters sent to LLM | 151 + | `MIN_CLUSTER_SIZE` | 2 | min trending entities per cluster | 152 + | `HISTORY_TTL` | 3600s | topic history expiry | 153 + | `MAX_HISTORY` | 10 | max historical topics shown to LLM | 154 + | `OVERLAP_THRESHOLD` | 0.3 | Jaccard threshold for topic matching | 155 + | `temperature` | 0.3 | LLM temperature (lower = more precise) | 156 + | model | `claude-haiku-4-5-20251001` | fast, cheap, structured output support |
+1 -1
ner/bridge.py
··· 542 542 if narrator is not None: 543 543 ns = narrator.stats 544 544 narrator_info = ( 545 - f" topics={ns['topics']}" 545 + f" groups={ns['groups']}" 546 546 f" clusters={ns['clusters_seen']}" 547 547 f" narrator_calls={ns['calls']}" 548 548 )
+99 -65
ner/dedup.py
··· 1 1 """ 2 - LLM-powered cluster narration via Claude Haiku. 2 + LLM-powered cluster curation via Claude Haiku. 3 3 4 4 Periodically fetches the entity graph from the backend, extracts 5 - trending clusters, and asks Haiku to generate short topic labels. 6 - These labels describe what a cluster of co-occurring entities is about 7 - (e.g., {Trump, Congress, Budget} -> "Trump Budget Fight"). 5 + trending clusters, and asks Haiku to curate them into named groups. 6 + The LLM identifies coherent news stories from co-occurrence clusters, 7 + merges duplicates, drops garbage, and returns at most 5 named groups. 8 8 9 - The narrator maintains a rolling history of recent topics so the LLM 9 + The curator maintains a rolling history of recent topics so the LLM 10 10 can see what it labeled before, how long topics have persisted, and 11 11 what's new vs continuing. This produces more stable, contextual labels. 12 12 13 - The labels are POSTed back to the backend, which serves them in the 14 - entity-graph websocket data (overriding the mechanical "A + B" label). 13 + The groups are POSTed back to the backend, which serves them in the 14 + entity-graph websocket data for frontend rendering. 15 15 16 16 Runs as an async task alongside the bridge. 17 17 """ ··· 25 25 from pydantic import BaseModel 26 26 27 27 NARRATOR_ENABLED = os.getenv("NARRATOR_ENABLED", "1") != "0" 28 - NARRATOR_INTERVAL = int(os.getenv("NARRATOR_INTERVAL", "60")) 28 + NARRATOR_INTERVAL = int(os.getenv("NARRATOR_INTERVAL", "300")) 29 29 MIN_CLUSTER_SIZE = 2 # need at least 2 entities to narrate 30 30 MAX_CLUSTERS = 10 # cap clusters sent to LLM 31 31 MAX_HISTORY = 8 # max past topics shown to LLM 32 - HISTORY_TTL = 30 * 60 # prune topics older than 30 minutes 32 + HISTORY_TTL = 60 * 60 # prune topics older than 1 hour 33 33 OVERLAP_THRESHOLD = 0.3 # Jaccard similarity to match a cluster to history 34 + MAX_GROUPS = 5 # max groups returned to backend 34 35 35 36 36 - class TopicLabel(BaseModel): 37 - cluster_id: int 38 - label: str 37 + class NamedGroup(BaseModel): 38 + name: str 39 + entities: list[str] 39 40 40 41 41 - class TopicLabels(BaseModel): 42 - topics: list[TopicLabel] 42 + class CuratorOutput(BaseModel): 43 + groups: list[NamedGroup] 44 + haiku: str 43 45 44 46 45 47 @dataclass ··· 52 54 53 55 54 56 SYSTEM_PROMPT = """\ 55 - You are analyzing clusters of co-occurring named entities trending on social media. 56 - Each cluster contains entities that frequently appear together in recent posts. 57 - For each cluster, provide a short (2-4 word) topic label that describes what 58 - the cluster is about. 57 + You are curating trending topics from a real-time entity co-occurrence graph. 58 + Each cluster below contains entities that frequently appear together in 59 + social media posts. Your job is to identify the real stories. 59 60 60 - Rules: 61 - - Be specific: "Trump Budget Fight" not "Politics" 62 - - Don't just concatenate entity names: {Lakers, LeBron, NBA} -> "Lakers Game" 63 - - Use natural language: "SpaceX Launch" not "SPACEX/LAUNCH" 64 - - If a cluster's entities don't form a coherent topic, label it "Unclear" 65 - - Preserve proper nouns and capitalization 66 - - When a cluster matches a recent topic (shown below), prefer reusing or 67 - refining that label rather than inventing a new one — unless the entities 68 - have changed significantly""" 61 + For each coherent cluster, provide a short (2-4 word) topic name. 62 + You may merge clusters that are about the same story. 63 + Drop clusters that are incoherent or uninteresting. 64 + Return at most 5 groups — quality over quantity. 65 + 66 + Be specific: "Trump Budget Fight" not "Politics". 67 + Use natural language: "SpaceX Launch" not "SPACEX/LAUNCH". 68 + Preserve proper nouns and capitalization. 69 + When a group matches a recent topic (shown below), prefer reusing or 70 + refining that name rather than inventing a new one — unless the entities 71 + have changed significantly. 72 + 73 + For each group, list the entity names (exact text from the clusters) that 74 + belong to it. 75 + 76 + Also write a haiku about the real-world events in these clusters. 77 + STRICT RULES for the haiku: 78 + - Line 1: exactly 5 syllables 79 + - Line 2: exactly 7 syllables 80 + - Line 3: exactly 5 syllables 81 + - Count syllables carefully before writing each line 82 + - Write in the style of Bashō — concrete, grounded in the physical world 83 + - Reference specific people, places, or events from the data 84 + - The haiku is about the world, not about the internet""" 69 85 70 86 71 87 def _jaccard(a: set, b: set) -> float: ··· 87 103 def __init__(self, backend_url: str, call_interval: int = NARRATOR_INTERVAL): 88 104 self.backend_url = backend_url 89 105 self.call_interval = call_interval 90 - self.topic_labels: dict[int, str] = {} # cluster_id -> label 106 + self.current_groups: list[dict] = [] # [{name, entities}, ...] 91 107 self.topic_history: list[TopicRecord] = [] 92 108 self.stats = { 93 109 "calls": 0, 94 110 "errors": 0, 95 - "topics": 0, 111 + "groups": 0, 96 112 "clusters_seen": 0, 97 113 } 98 114 ··· 109 125 print(f"narrator: fetch error: {e}", flush=True) 110 126 return None 111 127 112 - async def _post_labels(self, labels: dict[int, str]) -> None: 128 + async def _post_groups(self, groups: list[dict], haiku: str = "") -> None: 113 129 try: 130 + payload: dict = {"groups": groups} 131 + if haiku: 132 + payload["haiku"] = haiku 114 133 async with aiohttp.ClientSession() as session: 115 134 async with session.post( 116 - f"{self.backend_url}/cluster-labels", 117 - json={"labels": {str(k): v for k, v in labels.items()}}, 135 + f"{self.backend_url}/groups", 136 + json=payload, 118 137 timeout=aiohttp.ClientTimeout(total=5), 119 138 ) as resp: 120 139 if resp.status != 200: ··· 149 168 return best 150 169 return None 151 170 152 - def _update_history( 153 - self, clusters: dict[int, list[dict]], labels: dict[int, str] 154 - ) -> None: 171 + def _update_history(self, groups: list[dict]) -> None: 155 172 """update topic history with current cycle's results.""" 156 173 now = time() 157 174 158 - for cid, label in labels.items(): 159 - if label == "Unclear": 160 - continue 161 - entities = clusters.get(cid, []) 162 - entity_names = {e["text"] for e in entities} 175 + for group in groups: 176 + name = group["name"] 177 + entity_names = set(group["entities"]) 163 178 if not entity_names: 164 179 continue 165 180 166 181 if existing := self._match_history(entity_names): 167 - # update existing topic 168 - existing.label = label 182 + existing.label = name 169 183 existing.last_seen = now 170 184 existing.entities = entity_names 171 185 existing.times_labeled += 1 172 186 else: 173 - # new topic 174 187 self.topic_history.append( 175 188 TopicRecord( 176 - label=label, 189 + label=name, 177 190 entities=entity_names, 178 191 first_seen=now, 179 192 last_seen=now, ··· 219 232 if not clusters: 220 233 return 221 234 235 + # build set of all entity texts in the graph (for validation) 236 + all_entity_texts = { 237 + e["text"] for e in graph.get("entities", []) 238 + } 239 + 222 240 # sort by total cluster trend descending, take top N 223 241 sorted_clusters = sorted( 224 242 clusters.items(), ··· 245 263 if history_context: 246 264 user_content = ( 247 265 f"{history_context}\n\n" 248 - f"Label these trending clusters:\n\n{entities_text}" 266 + f"Curate these trending clusters into named groups:\n\n{entities_text}" 249 267 ) 250 268 else: 251 - user_content = f"Label these trending clusters:\n\n{entities_text}" 269 + user_content = f"Curate these trending clusters into named groups:\n\n{entities_text}" 252 270 253 271 try: 254 272 import anthropic ··· 256 274 client = anthropic.AsyncAnthropic() 257 275 response = await client.messages.parse( 258 276 model="claude-haiku-4-5-20251001", 259 - max_tokens=512, 277 + max_tokens=1024, 278 + temperature=0.3, 260 279 system=SYSTEM_PROMPT, 261 - output_format=TopicLabels, 280 + output_format=CuratorOutput, 262 281 messages=[ 263 282 { 264 283 "role": "user", ··· 268 287 ) 269 288 270 289 result = response.parsed_output 271 - new_labels = {t.cluster_id: t.label for t in result.topics} 290 + haiku = result.haiku.strip() if result.haiku else "" 272 291 273 - # only keep labels for clusters that actually exist 274 - active_ids = set(clusters.keys()) 275 - self.topic_labels = { 276 - cid: label 277 - for cid, label in new_labels.items() 278 - if cid in active_ids 279 - } 292 + # code-based validation (not LLM) 293 + validated_groups = [] 294 + for group in result.groups[:MAX_GROUPS]: 295 + # validate each entity text exists in the graph 296 + valid_entities = [ 297 + e for e in group.entities if e in all_entity_texts 298 + ] 299 + if valid_entities: 300 + validated_groups.append( 301 + {"name": group.name, "entities": valid_entities} 302 + ) 303 + 304 + # if LLM returned 0 valid groups, keep previous groups 305 + if not validated_groups: 306 + if self.current_groups: 307 + print("narrator: 0 valid groups, keeping previous", flush=True) 308 + return 309 + else: 310 + self.current_groups = validated_groups 280 311 281 312 # update history with this cycle's results 282 - self._update_history(clusters, self.topic_labels) 313 + self._update_history(self.current_groups) 283 314 284 - self.stats["topics"] = len(self.topic_labels) 315 + self.stats["groups"] = len(self.current_groups) 285 316 self.stats["calls"] += 1 286 317 287 - labels_str = ", ".join( 288 - f"{cid}={label!r}" for cid, label in self.topic_labels.items() 318 + groups_str = ", ".join( 319 + f"{g['name']!r} ({len(g['entities'])} ents)" 320 + for g in self.current_groups 289 321 ) 290 322 history_size = len(self.topic_history) 323 + haiku_preview = haiku.replace("\n", " / ") if haiku else "(none)" 291 324 print( 292 - f"narrator: {len(self.topic_labels)} topics from " 325 + f"narrator: {len(self.current_groups)} groups from " 293 326 f"{len(sorted_clusters)} clusters, " 294 - f"{history_size} in history ({labels_str})", 327 + f"{history_size} in history ({groups_str}) " 328 + f"haiku: {haiku_preview}", 295 329 flush=True, 296 330 ) 297 331 298 - # push labels to backend 299 - if self.topic_labels: 300 - await self._post_labels(self.topic_labels) 332 + # push groups + haiku to backend 333 + if self.current_groups: 334 + await self._post_groups(self.current_groups, haiku) 301 335 302 336 except ImportError: 303 337 print("narrator: anthropic package not available", flush=True)
+70 -117
site/grid.js
··· 598 598 <span class="tooltip-key">connections</span> 599 599 </div> 600 600 </div> 601 - ${entity.cluster_label ? `<div class="tooltip-cluster">cluster: ${entity.cluster_label}</div>` : ''} 601 + ${entity.group_name ? `<div class="tooltip-cluster">topic: ${entity.group_name}</div>` : ''} 602 602 `; 603 603 604 604 // position near cursor but not under it ··· 890 890 scheduleRender(mainGridSize); 891 891 // accumulate trending scores (DOM renders on its own timer) 892 892 updateTrendingScores(data.data.entities || []); 893 + // update groups from backend (LLM-curated, stable for ~5 min) 894 + if (data.data.groups) updateGroups(data.data.groups); 895 + // update haiku if present 896 + if (data.data.haiku) updateHaiku(data.data.haiku); 893 897 syncSelectedEntity(); 894 898 if (searchState.open) updateSearchResults(); 895 899 } else { ··· 1643 1647 const GRID_SIZE = 128; 1644 1648 1645 1649 // render trending entities from entity graph, prioritizing spike vs baseline 1646 - function trendIndicator(trendVal) { 1650 + function trendIndicator(trendVal, maxTrend) { 1647 1651 const pct = `${(trendVal * 100).toFixed(0)}% above baseline`; 1648 - if (trendVal >= 0.3) return { text: '↑↑', color: 'var(--percolating)', title: pct }; 1649 - if (trendVal >= 0.1) return { text: '↑', color: 'var(--percolating)', title: pct }; 1650 - if (trendVal >= 0.02) return { text: '↑', color: 'var(--text-dim)', title: pct }; 1651 - if (trendVal <= -0.1) return { text: '↓', color: 'var(--text-muted)', title: `${(trendVal * 100).toFixed(0)}% below baseline` }; 1652 - return { text: '·', color: 'var(--text-muted)', title: 'stable' }; 1652 + if (trendVal <= 0) return { text: '·', color: 'var(--text-muted)', title: 'stable' }; 1653 + const ratio = maxTrend > 0 ? trendVal / maxTrend : 0; 1654 + if (ratio >= 0.6) return { text: '↑↑', color: 'var(--percolating)', title: pct }; 1655 + if (ratio >= 0.3) return { text: '↑', color: 'var(--percolating)', title: pct }; 1656 + return { text: '↑', color: 'var(--text-dim)', title: pct }; 1653 1657 } 1654 1658 1655 1659 // cluster visual identity: shapes chosen for maximum distinctness at small sizes 1656 1660 const CLUSTER_SHAPES = ['●', '✦', '▲', '◇', '✚', '■']; 1657 1661 const CLUSTER_COLORS = [ 1658 - { css: 'var(--teal)', bg: 'rgba(125, 211, 192, 0.10)', border: 'rgba(125, 211, 192, 0.25)' }, 1659 - { css: 'var(--lavender)', bg: 'rgba(196, 181, 253, 0.10)', border: 'rgba(196, 181, 253, 0.25)' }, 1660 - { css: 'var(--coral)', bg: 'rgba(252, 165, 165, 0.10)', border: 'rgba(252, 165, 165, 0.25)' }, 1661 - { css: 'var(--sky)', bg: 'rgba(147, 197, 253, 0.10)', border: 'rgba(147, 197, 253, 0.25)' }, 1662 - { css: 'var(--mint)', bg: 'rgba(134, 239, 172, 0.10)', border: 'rgba(134, 239, 172, 0.25)' }, 1663 - { css: 'var(--indigo)', bg: 'rgba(165, 180, 252, 0.10)', border: 'rgba(165, 180, 252, 0.25)' }, 1662 + { css: 'var(--neon-pink)', bg: 'rgba(255, 45, 120, 0.15)', border: 'rgba(255, 45, 120, 0.40)' }, 1663 + { css: 'var(--neon-cyan)', bg: 'rgba(0, 240, 255, 0.12)', border: 'rgba(0, 240, 255, 0.35)' }, 1664 + { css: 'var(--neon-lime)', bg: 'rgba(57, 255, 20, 0.12)', border: 'rgba(57, 255, 20, 0.35)' }, 1665 + { css: 'var(--neon-amber)', bg: 'rgba(255, 183, 0, 0.14)', border: 'rgba(255, 183, 0, 0.38)' }, 1666 + { css: 'var(--neon-violet)', bg: 'rgba(191, 64, 255, 0.14)', border: 'rgba(191, 64, 255, 0.38)' }, 1667 + { css: 'var(--neon-blue)', bg: 'rgba(77, 139, 255, 0.14)', border: 'rgba(77, 139, 255, 0.38)' }, 1664 1668 ]; 1665 - const MAX_CLUSTERS = 6; 1669 + 1666 1670 1667 1671 // --- trending list: dam pattern --- 1668 1672 // scores accumulate silently on every websocket message (500ms), ··· 1678 1682 let latestTrending = []; 1679 1683 let trendingRenderTimer = null; 1680 1684 1681 - // --- cluster pills: much slower cadence than the entity list --- 1682 - // clusters represent themes (minutes-scale), not individual signals (seconds-scale). 1683 - // we accumulate cluster observations silently, then refresh pills on a slow timer. 1684 - // pills have their own hysteresis: must be seen N times to appear, absent N times to vanish. 1685 - const CLUSTER_REFRESH_INTERVAL = 30000; // ms between pill updates 1686 - const CLUSTER_APPEAR_THRESHOLD = 20; // must be seen in 20+ score cycles (~10s of a 30s window) 1687 - const CLUSTER_REMOVE_THRESHOLD = 3; // must be absent 3+ pill cycles to lose a pill 1688 - const MAX_PILLS = 3; // at most 3 topic pills — only the dominant themes 1689 - 1690 - // persistent cluster style assignments: cid -> styleIndex 1691 - const clusterStyleMap = new Map(); 1685 + // --- LLM-curated groups: direct rendering from backend --- 1686 + // groups are stable for ~5 min (LLM cadence), no presence tracking needed 1687 + let currentGroups = []; // [{name, styleIndex}] 1692 1688 1693 - // cluster observation counts: cid -> { label, seen (score cycles), missed (pill cycles) } 1694 - const clusterObservations = new Map(); 1695 - 1696 - // the frozen cluster snapshot used by the entity list renderer 1697 - let frozenClusters = new Map(); // cid -> { label, index } 1698 - let clusterPillTimer = null; 1699 - 1700 - // called on every score update (every 500ms) to tally cluster observations 1701 - function observeClusters(trendingEntities) { 1702 - const seen = new Map(); // cid -> { label, maxScore } 1703 - for (const e of trendingEntities) { 1704 - const cid = e.cluster || 0; 1705 - const clabel = e.cluster_label || ''; 1706 - if (!cid || !clabel || clabel === 'Unclear') continue; 1707 - if (!seen.has(cid)) { 1708 - seen.set(cid, { label: clabel, maxScore: e._score }); 1709 - } 1710 - } 1711 - for (const [cid, { label }] of seen) { 1712 - const obs = clusterObservations.get(cid) || { label, seen: 0, missed: 0 }; 1713 - obs.label = label; 1714 - obs.seen++; 1715 - obs.missed = 0; // reset miss counter when seen 1716 - clusterObservations.set(cid, obs); 1717 - } 1718 - } 1719 - 1720 - // called on the slow timer to rebuild the frozen cluster snapshot 1721 - function refreshClusterPills() { 1722 - // collect candidates that met the observation threshold, sorted by strength 1723 - const candidates = []; 1724 - for (const [cid, obs] of clusterObservations) { 1725 - if (obs.seen >= CLUSTER_APPEAR_THRESHOLD) { 1726 - candidates.push({ cid, label: obs.label, seen: obs.seen }); 1727 - obs.seen = 0; // reset — must earn it again next cycle 1728 - } else { 1729 - obs.missed++; 1730 - if (obs.missed >= CLUSTER_REMOVE_THRESHOLD) { 1731 - clusterObservations.delete(cid); 1732 - clusterStyleMap.delete(cid); 1733 - } 1734 - } 1735 - } 1736 - 1737 - // only the top MAX_PILLS by observation count 1738 - candidates.sort((a, b) => b.seen - a.seen); 1739 - const promoted = new Map(); 1740 - for (const c of candidates.slice(0, MAX_PILLS)) { 1741 - let styleIdx = clusterStyleMap.get(c.cid); 1742 - if (styleIdx === undefined) { 1743 - const usedIndices = new Set(clusterStyleMap.values()); 1744 - const available = []; 1745 - for (let i = 0; i < MAX_CLUSTERS; i++) { 1746 - if (!usedIndices.has(i)) available.push(i); 1747 - } 1748 - if (available.length > 0) { 1749 - styleIdx = available[Math.floor(Math.random() * available.length)]; 1750 - clusterStyleMap.set(c.cid, styleIdx); 1751 - } 1752 - } 1753 - if (styleIdx !== undefined) { 1754 - promoted.set(c.cid, { label: c.label, index: styleIdx }); 1755 - } 1756 - } 1757 - 1758 - if (promoted.size < 2) promoted.clear(); 1759 - frozenClusters = promoted; 1689 + function updateGroups(groups) { 1690 + if (!groups || !groups.length) { currentGroups = []; return; } 1691 + currentGroups = groups.map((g, i) => ({ 1692 + name: g.name, 1693 + styleIndex: i % CLUSTER_COLORS.length, 1694 + })); 1760 1695 } 1761 1696 1762 1697 // accumulate scores from each websocket message — no DOM work here ··· 1795 1730 // store snapshot for next render 1796 1731 latestTrending = top; 1797 1732 1798 - // accumulate cluster observations (silent — no DOM work) 1799 - observeClusters(top); 1800 - 1801 - // start timers on first data 1733 + // start render timer on first data 1802 1734 if (!trendingRenderTimer) { 1803 1735 renderTrendingSnapshot(); // render immediately on first data 1804 1736 trendingRenderTimer = setInterval(renderTrendingSnapshot, TRENDING_RENDER_INTERVAL); 1805 - refreshClusterPills(); // initial pill snapshot 1806 - clusterPillTimer = setInterval(refreshClusterPills, CLUSTER_REFRESH_INTERVAL); 1737 + } 1738 + } 1739 + 1740 + // update the haiku display (called when websocket data contains a haiku) 1741 + let currentHaiku = ''; 1742 + function updateHaiku(haiku) { 1743 + if (!haiku || haiku === currentHaiku) return; 1744 + currentHaiku = haiku; 1745 + const container = document.getElementById('haiku-container'); 1746 + const textEl = document.getElementById('haiku-text'); 1747 + const timeEl = document.getElementById('haiku-time'); 1748 + if (!container || !textEl) return; 1749 + textEl.textContent = '\u201C' + haiku + '\u201D'; 1750 + container.hidden = false; 1751 + 1752 + // set human-readable timestamp 1753 + if (timeEl) { 1754 + const now = new Date(); 1755 + const hours = now.getHours(); 1756 + const minutes = now.getMinutes().toString().padStart(2, '0'); 1757 + const ampm = hours >= 12 ? 'pm' : 'am'; 1758 + const h12 = hours % 12 || 12; 1759 + const timeStr = `${h12}:${minutes} ${ampm}`; 1760 + timeEl.textContent = timeStr; 1761 + timeEl.title = now.toLocaleString(); 1762 + timeEl.dateTime = now.toISOString(); 1807 1763 } 1808 1764 } 1809 1765 ··· 1814 1770 1815 1771 const top = latestTrending; 1816 1772 1817 - // use the frozen cluster snapshot (updated on its own slow timer) 1818 - const clusterMap = frozenClusters; 1819 - 1820 - // render cluster pills above the list 1773 + // render group pills above the list (directly from LLM-curated groups) 1821 1774 const section = container.parentElement; 1822 1775 let pillsContainer = section.querySelector('.cluster-pills'); 1823 - if (clusterMap.size > 0) { 1776 + if (currentGroups.length > 0) { 1824 1777 if (!pillsContainer) { 1825 1778 pillsContainer = document.createElement('div'); 1826 1779 pillsContainer.className = 'cluster-pills'; 1827 1780 section.insertBefore(pillsContainer, container); 1828 1781 } 1829 1782 pillsContainer.innerHTML = ''; 1830 - const sorted = [...clusterMap.entries()].sort((a, b) => a[1].index - b[1].index); 1831 - for (const [, cl] of sorted) { 1783 + for (const grp of currentGroups) { 1832 1784 const pill = document.createElement('span'); 1833 1785 pill.className = 'cluster-pill'; 1834 - const palette = CLUSTER_COLORS[cl.index % CLUSTER_COLORS.length]; 1786 + const palette = CLUSTER_COLORS[grp.styleIndex % CLUSTER_COLORS.length]; 1835 1787 pill.style.background = palette.bg; 1836 1788 pill.style.borderColor = palette.border; 1837 1789 pill.style.color = palette.css; 1838 - pill.innerHTML = `<span class="cluster-pill-shape">${CLUSTER_SHAPES[cl.index]}</span>${cl.label}`; 1790 + pill.innerHTML = `<span class="cluster-pill-shape">${CLUSTER_SHAPES[grp.styleIndex]}</span>${grp.name}`; 1839 1791 pillsContainer.appendChild(pill); 1840 1792 } 1841 1793 } else if (pillsContainer) { ··· 1861 1813 }); 1862 1814 1863 1815 // update or create rows 1816 + const maxTrend = top.reduce((m, e) => Math.max(m, e._trend || 0), 0); 1864 1817 top.forEach((entity, i) => { 1865 1818 const entityId = String(entity.id); 1866 1819 let row = container.querySelector(`.entity-row[data-entity-id="${entityId}"]`); ··· 1875 1828 row.classList.remove('entity-row-exit', 'entity-row-enter'); 1876 1829 } 1877 1830 1878 - const cid = entity.cluster || 0; 1879 - const cl = clusterMap.get(cid); 1880 - if (cl) { 1881 - const palette = CLUSTER_COLORS[cl.index % CLUSTER_COLORS.length]; 1831 + const groupIdx = entity.group; 1832 + const grp = groupIdx != null ? currentGroups[groupIdx] : null; 1833 + if (grp) { 1834 + const palette = CLUSTER_COLORS[grp.styleIndex % CLUSTER_COLORS.length]; 1882 1835 row.style.borderLeft = `2px solid ${palette.border}`; 1883 1836 } else { 1884 1837 row.style.borderLeft = '2px solid transparent'; ··· 1886 1839 1887 1840 row.innerHTML = ''; 1888 1841 1889 - if (cl) { 1842 + if (grp) { 1890 1843 const pip = document.createElement('span'); 1891 1844 pip.className = 'cluster-pip'; 1892 - pip.textContent = CLUSTER_SHAPES[cl.index]; 1893 - pip.style.color = CLUSTER_COLORS[cl.index % CLUSTER_COLORS.length].css; 1894 - pip.title = cl.label; 1845 + pip.textContent = CLUSTER_SHAPES[grp.styleIndex]; 1846 + pip.style.color = CLUSTER_COLORS[grp.styleIndex % CLUSTER_COLORS.length].css; 1847 + pip.title = grp.name; 1895 1848 row.appendChild(pip); 1896 1849 } 1897 1850 ··· 1905 1858 text.href = `https://news.google.com/search?q=${encodeURIComponent(entity.text || '')}`; 1906 1859 text.target = '_blank'; 1907 1860 text.rel = 'noopener'; 1908 - if (cl) text.title = cl.label; 1861 + if (grp) text.title = grp.name; 1909 1862 1910 1863 const countEl = document.createElement('span'); 1911 1864 countEl.className = 'entity-count'; 1912 1865 countEl.textContent = entity._count; 1913 1866 countEl.title = `${entity._count} mentions in last 5 min`; 1914 1867 1915 - const ti = trendIndicator(entity._trend); 1868 + const ti = trendIndicator(entity._trend, maxTrend); 1916 1869 const trendEl = document.createElement('span'); 1917 1870 trendEl.className = 'entity-trend'; 1918 1871 trendEl.textContent = ti.text;
+10
site/index.html
··· 85 85 </div> 86 86 </div> 87 87 88 + <div class="haiku-container" id="haiku-container" hidden> 89 + <blockquote class="haiku-quote"> 90 + <div class="haiku-text" id="haiku-text"></div> 91 + <div class="haiku-footer"> 92 + <cite class="haiku-attr">&mdash; claude haiku 4.5</cite> 93 + <time class="haiku-time" id="haiku-time" title=""></time> 94 + </div> 95 + </blockquote> 96 + </div> 97 + 88 98 <footer> 89 99 <div class="footer-byline">by <a href="https://bsky.app/profile/zzstoatzz.io" target="_blank">nate</a> <span class="footer-sep">·</span> inspired by <a href="https://hailey.at/posts/3mcy5b5gfi222" target="_blank">hailey</a></div> 90 100 <div class="footer-src"><a href="https://tangled.sh/@zzstoatzz.io/coral" target="_blank">[src]</a></div>
+59 -1
site/style.css
··· 18 18 --sky: #93c5fd; 19 19 --mint: #86efac; 20 20 --indigo: #a5b4fc; 21 + 22 + /* cyberpunk cluster palette */ 23 + --neon-pink: #ff2d78; 24 + --neon-cyan: #00f0ff; 25 + --neon-lime: #39ff14; 26 + --neon-amber: #ffb700; 27 + --neon-violet: #bf40ff; 28 + --neon-blue: #4d8bff; 21 29 --percolating: #4ade80; 22 30 --trending: #fbbf24; 23 31 ··· 632 640 633 641 /* footer */ 634 642 footer { 635 - padding-top: 1rem; 643 + padding-top: 0.5rem; 636 644 border-top: 1px solid var(--border); 637 645 text-align: center; 638 646 display: flex; ··· 675 683 676 684 .footer-src a:hover { 677 685 color: var(--text-dim); 686 + } 687 + 688 + /* haiku display — proper attributed quote */ 689 + .haiku-container { 690 + display: flex; 691 + align-items: center; 692 + justify-content: center; 693 + padding: 1.5rem 0; 694 + margin-bottom: 0.5rem; 695 + border-top: 1px solid var(--border); 696 + } 697 + 698 + .haiku-quote { 699 + text-align: center; 700 + margin: 0; 701 + padding: 0; 702 + border: none; 703 + } 704 + 705 + .haiku-text { 706 + font-size: 11px; 707 + line-height: 1.8; 708 + color: var(--text-dim); 709 + font-style: italic; 710 + white-space: pre-line; 711 + } 712 + 713 + .haiku-footer { 714 + display: flex; 715 + align-items: baseline; 716 + justify-content: center; 717 + gap: 0.4rem; 718 + margin-top: 0.6rem; 719 + } 720 + 721 + .haiku-attr { 722 + font-size: 9px; 723 + color: var(--text-muted); 724 + font-style: normal; 725 + } 726 + 727 + .haiku-time { 728 + font-size: 8px; 729 + color: transparent; 730 + transition: color 0.2s ease; 731 + font-style: normal; 732 + } 733 + 734 + .haiku-container:hover .haiku-time { 735 + color: var(--text-muted); 678 736 } 679 737 680 738 /* mobile: stack vertically */