search for standard sites pub-search.waow.tech
search zig blog atproto

fix: resolve intermittent 1-3s search stalls caused by otel mutex contention

root cause: otel-zig's BatchSpanProcessor held its mutex during HTTP
export (1-2s TLS round-trip), blocking any thread calling span.end().
fixed upstream in otel-zig, propagated through logfire-zig.

other fixes included in this commit:
- add publication_uri index (600ms -> <1ms for base_path joins)
- increase HTTP_BUF_SIZE 8KB -> 64KB (fewer flush syscalls)
- add mmap + cache_size pragmas on sqlite read connection
- add search.iterate.* spans for iteration visibility
- add queue_ms tracking + socket timeout 30s -> 5s
- document the full investigation in docs/performance-saga.md
- add zig fetch --save tip to CLAUDE.md

result: consistent 75-112ms with full logfire observability

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+209 -21
+3
CLAUDE.md
··· 29 29 - from `tap/` directory: `just check` (status), `just turbo` (catch-up), `just normal` (steady state) 30 30 - see `docs/tap.md` for memory tuning and debugging 31 31 32 + ## zig dependencies 33 + - update a dependency hash: `zig fetch --save <url>` (fetches and updates build.zig.zon automatically) 34 + 32 35 ## common tasks 33 36 - check indexing: `curl -s https://leaflet-search-backend.fly.dev/api/dashboard | jq`
+1 -1
backend/build.zig.zon
··· 22 22 }, 23 23 .logfire = .{ 24 24 .url = "https://tangled.sh/zzstoatzz.io/logfire-zig/archive/main", 25 - .hash = "logfire_zig-0.1.0-x2yDLgdwAABPUnnE_Smk0Rjrf93qyC5vcJXzvieuNFWI", 25 + .hash = "logfire_zig-0.1.0-x2yDLgdwAADOXAZLNQJ8FUH5v1vfFwe5CApJtQ7c_pZd", 26 26 }, 27 27 }, 28 28 .paths = .{
+7
backend/src/db/LocalDb.zig
··· 85 85 return err; 86 86 }; 87 87 _ = self.read_conn.?.exec("PRAGMA busy_timeout=1000", .{}) catch {}; 88 + // mmap for fast reads — avoids pread() syscalls, uses OS page cache directly 89 + _ = self.read_conn.?.exec("PRAGMA mmap_size=268435456", .{}) catch {}; 90 + // 20MB page cache — keeps FTS5 index pages in memory across queries 91 + _ = self.read_conn.?.exec("PRAGMA cache_size=-20000", .{}) catch {}; 88 92 89 93 // check integrity - if corrupt, delete and recreate 90 94 if (!self.checkIntegrity()) { ··· 201 205 202 206 // index for tag queries 203 207 c.exec("CREATE INDEX IF NOT EXISTS idx_document_tags_tag ON document_tags(tag)", .{}) catch {}; 208 + 209 + // index for base_path join (documents → publications via publication_uri) 210 + c.exec("CREATE INDEX IF NOT EXISTS idx_documents_publication_uri ON documents(publication_uri)", .{}) catch {}; 204 211 205 212 // sync metadata table 206 213 c.exec(
+3 -2
backend/src/main.zig
··· 9 9 const ingest = @import("ingest.zig"); 10 10 11 11 const MAX_HTTP_WORKERS = 16; 12 - const SOCKET_TIMEOUT_SECS = 30; 12 + const SOCKET_TIMEOUT_SECS = 5; 13 13 14 14 pub fn main() !void { 15 15 var gpa = std.heap.GeneralPurposeAllocator(.{}){}; ··· 63 63 logfire.warn("failed to set socket timeout: {}", .{err}); 64 64 }; 65 65 66 - pool.spawn(server.handleConnection, .{conn}) catch |err| { 66 + const accepted_at = std.time.microTimestamp(); 67 + pool.spawn(server.handleConnection, .{ conn, accepted_at }) catch |err| { 67 68 logfire.err("pool spawn error: {}", .{err}); 68 69 conn.stream.close(); 69 70 };
+32 -11
backend/src/search.zig
··· 446 446 , .{fts_query}); 447 447 defer rows.deinit(); 448 448 449 - while (rows.next()) |row| { 450 - const doc = Doc.fromLocalRow(row); 451 - const uri_dupe = try alloc.dupe(u8, doc.uri); 452 - try seen_uris.put(uri_dupe, {}); 453 - try jw.write(doc.toJson()); 449 + { 450 + const iter_span = logfire.span("search.iterate.docs_fts", .{}); 451 + defer iter_span.end(); 452 + var doc_count: u32 = 0; 453 + while (rows.next()) |row| { 454 + const doc = Doc.fromLocalRow(row); 455 + const uri_dupe = try alloc.dupe(u8, doc.uri); 456 + try seen_uris.put(uri_dupe, {}); 457 + try jw.write(doc.toJson()); 458 + doc_count += 1; 459 + } 460 + logfire.info("search.iterate.docs_fts rows={d}", .{doc_count}); 454 461 } 455 462 456 463 // base_path search ··· 466 473 , .{fts_query}); 467 474 defer bp_rows.deinit(); 468 475 469 - while (bp_rows.next()) |row| { 470 - const doc = Doc.fromLocalRow(row); 471 - if (!seen_uris.contains(doc.uri)) { 472 - try jw.write(doc.toJson()); 476 + { 477 + const iter_span = logfire.span("search.iterate.base_path", .{}); 478 + defer iter_span.end(); 479 + var bp_count: u32 = 0; 480 + while (bp_rows.next()) |row| { 481 + const doc = Doc.fromLocalRow(row); 482 + if (!seen_uris.contains(doc.uri)) { 483 + try jw.write(doc.toJson()); 484 + } 485 + bp_count += 1; 473 486 } 487 + logfire.info("search.iterate.base_path rows={d}", .{bp_count}); 474 488 } 475 489 476 490 // publication search ··· 485 499 , .{fts_query}); 486 500 defer pub_rows.deinit(); 487 501 488 - while (pub_rows.next()) |row| { 489 - try jw.write(Pub.fromLocalRow(row).toJson()); 502 + { 503 + const iter_span = logfire.span("search.iterate.pubs_fts", .{}); 504 + defer iter_span.end(); 505 + var pub_count: u32 = 0; 506 + while (pub_rows.next()) |row| { 507 + try jw.write(Pub.fromLocalRow(row).toJson()); 508 + pub_count += 1; 509 + } 510 + logfire.info("search.iterate.pubs_fts rows={d}", .{pub_count}); 490 511 } 491 512 } 492 513
+8 -7
backend/src/server.zig
··· 11 11 const search = @import("search.zig"); 12 12 const dashboard = @import("dashboard.zig"); 13 13 14 - const HTTP_BUF_SIZE = 8192; 14 + const HTTP_BUF_SIZE = 65536; 15 15 const QUERY_PARAM_BUF_SIZE = 64; 16 16 17 - pub fn handleConnection(conn: net.Server.Connection) void { 17 + pub fn handleConnection(conn: net.Server.Connection, accepted_at: i64) void { 18 18 defer conn.stream.close(); 19 + 20 + const queue_us = std.time.microTimestamp() - accepted_at; 21 + if (queue_us > 100_000) { // > 100ms 22 + logfire.warn("http.queue slow: {d}ms", .{@divTrunc(queue_us, 1000)}); 23 + } 19 24 20 25 var read_buffer: [HTTP_BUF_SIZE]u8 = undefined; 21 26 var write_buffer: [HTTP_BUF_SIZE]u8 = undefined; ··· 36 41 const recv_us = std.time.microTimestamp() - recv_start; 37 42 const target = request.head.target; 38 43 39 - // log slow receives — this captures time waiting for the request to arrive 40 - if (recv_us > 100_000) { // > 100ms 41 - logfire.warn("http.receive slow: {d}ms | {s}", .{ @divTrunc(recv_us, 1000), target }); 42 - } 43 - 44 44 const req_span = logfire.span("http.request", .{ 45 45 .target = target, 46 + .queue_ms = @divTrunc(queue_us, 1000), 46 47 .receive_ms = @divTrunc(recv_us, 1000), 47 48 }); 48 49
+155
docs/performance-saga.md
··· 1 + # performance saga (feb 2026) 2 + 3 + ## what happened 4 + 5 + attempted to add a vector similarity search feature using voyage-3 embeddings (1024 dims) stored in turso with a DiskANN index. the embedding model change had a different shape than what was stored, turso performance degraded badly, and multiple attempts to back out the changes failed to restore performance. 6 + 7 + ## the problems we found and fixed 8 + 9 + ### 1. vector similarity saturating turso 10 + 11 + the `/similar` endpoint was doing brute-force `vector_distance_cos` queries on turso. each call took 10-15 seconds and blocked turso for all other queries. this meant keyword search (which falls back to turso when local isn't ready) took 30-57 seconds. 12 + 13 + **fix**: neutered `/similar` to return `[]` immediately. removed all vector similarity code from search.zig (~150 lines). commit `d179d20`. 14 + 15 + **result**: search dropped from 19.6s avg / 185s max → sub-second. 16 + 17 + ### 2. sqlite read/write lock contention 18 + 19 + a single sqlite connection (`conn`) was shared between search queries and sync writes, protected by a mutex. during sync batch writes, search queries blocked waiting for the lock. 20 + 21 + **fix**: added a separate read-only connection (`read_conn`) opened with `ReadOnly` flags. search queries use `read_conn` (no mutex needed). writes still use `conn` with mutex. WAL mode guarantees readers never block on writers. commit `1268461`. 22 + 23 + **result**: `db.local.query` consistently 0-0.4ms. 24 + 25 + ### 3. missing instrumentation 26 + 27 + had no visibility into the HTTP connection lifecycle. couldn't tell where time was spent between client request and server processing. 28 + 29 + **fix**: added `http.request` span wrapping the full request lifecycle, `queue_ms` tracking pool queueing time, `receive_ms` tracking `receiveHead()` latency, and slow-receive warnings. 30 + 31 + ### 4. 
missing publication_uri index 32 + 33 + the base_path FTS query joins `documents → publications → publications_fts`. without an index on `documents.publication_uri`, sqlite did a full table scan (5752 rows) for each publication match. for "test" (many matches), this caused ~600ms delays during iteration. 34 + 35 + **fix**: added `CREATE INDEX IF NOT EXISTS idx_documents_publication_uri ON documents(publication_uri)` in LocalDb.zig createSchema(). 36 + 37 + **result**: "test" query dropped from ~1.5s → ~100ms. 38 + 39 + ### 5. HTTP write buffer too small 40 + 41 + `HTTP_BUF_SIZE` was 8192 (8KB). for large responses (32KB for "test"), zig's HTTP server needed multiple flush cycles to write the response, each involving a syscall. 42 + 43 + **fix**: increased `HTTP_BUF_SIZE` to 65536 (64KB). a 32KB response now fits in a single flush. 44 + 45 + ### 6. sqlite read connection tuning 46 + 47 + added `PRAGMA mmap_size=268435456` (256MB) and `PRAGMA cache_size=-20000` (20MB) on the read connection so FTS5 index pages stay in memory. 48 + 49 + ## what we know now (instrumented) 50 + 51 + added `search.iterate.*` spans around each iteration loop in `searchLocal()`. this revealed: 52 + 53 + server-side processing is fast: 54 + - `db.local.query` (prepare): 0.2ms per query 55 + - `search.iterate.docs_fts` (40 rows + snippet + JSON): **2ms** 56 + - `search.iterate.base_path`: 0.3ms 57 + - `search.iterate.pubs_fts`: 0.3ms 58 + - total actual work: **~3ms** 59 + 60 + but `http.search` intermittently shows 1-3 seconds. the gap occurs between two consecutive lines of code (after `local.query()` returns, before the iteration span starts). there is literally nothing in between — no I/O, no allocations, no function calls. 61 + 62 + ### what we ruled out 63 + 64 + - **snippet() overhead**: replaced with `substr(d.content, 1, 200)` — spikes still occurred. reverted. snippet is not the cause. 
65 + - **iteration overhead**: spans prove all 3 iteration loops complete in ~3ms total 66 + - **logfire span.end() blocking**: logfire-zig uses `BatchSpanProcessor` (async, 500ms interval). span.end() just queues — no synchronous I/O. 67 + - **query preparation**: db.local.query spans show 0.2ms consistently 68 + 69 + ### what it actually is 70 + 71 + **CPU starvation on shared-cpu-1x Fly VM.** the thread gets preempted by the hypervisor for 1-2 seconds between two consecutive lines of code. the VM runs: 72 + - 16 HTTP worker threads 73 + - sync thread (turso → local batch writes) 74 + - tap consumer thread (firehose processing) 75 + - stats buffer thread (periodic turso flush) 76 + - activity tracker thread 77 + - BatchSpanProcessor thread (logfire export every 500ms, involves TLS + protobuf) 78 + 79 + on a shared CPU with fractional allocation, all these threads compete. the BatchSpanProcessor's 500ms TLS flush is particularly suspect — TLS is CPU-intensive. 80 + 81 + ## possible next steps 82 + 83 + 1. **upgrade VM**: move to `performance-1x` (dedicated CPU) — this is the real fix 84 + 2. **reduce thread contention**: lower HTTP workers from 16, or reduce logfire export frequency 85 + 3. 
**reduce logfire spans in hot path**: fewer spans = less BatchSpanProcessor work = less CPU contention 86 + 87 + ## architecture (current) 88 + 89 + ``` 90 + client → fly proxy (TLS termination) → app (port 3000) 91 + ├── HTTP thread pool (16 workers) 92 + ├── local SQLite (read_conn for search, conn+mutex for writes) 93 + ├── turso client (fallback for unsupported queries) 94 + ├── sync thread (turso → local, periodic) 95 + ├── tap consumer (firehose → turso) 96 + ├── stats buffer (periodic flush to turso) 97 + └── activity tracker 98 + ``` 99 + 100 + search path: `handleSearch` → `searchLocal` (3 FTS queries on read_conn, ~3ms total) → `sendJson` → `respond()` + `flush()` 101 + 102 + hot path timing (when not CPU-starved): ~3-6ms end to end 103 + hot path timing (when CPU-starved): 1-3 seconds (thread preempted between any two operations) 104 + 105 + ## resolution: otel-zig BatchSpanProcessor mutex contention 106 + 107 + the "CPU starvation" theory above was wrong. Fly CPU utilization was at ~0-5% during the stalls — the CPU was idle, not saturated. the thread wasn't being preempted by the hypervisor; it was **blocked on a mutex**. 108 + 109 + ### root cause 110 + 111 + otel-zig's `BatchSpanProcessor` (`src/sdk/trace/batch_span_processor.zig`) has a background thread that periodically exports queued spans to the logfire backend via OTLP/HTTP. the `exportThreadFn` called `exportBatchLocked()` which did the full HTTP export (TLS handshake + protobuf serialization + POST) **while holding the mutex**. the same mutex is acquired by `onEnd()`, which is called by every `span.end()`. 112 + 113 + so every 500ms, the background thread would: 114 + 1. lock mutex 115 + 2. serialize all queued spans to protobuf 116 + 3. POST to logfire via HTTPS (1-2 seconds on a fly VM) 117 + 4. unlock mutex 118 + 119 + any application thread calling `span.end()` during step 2-3 would block for the entire duration of the HTTP export. 
this explains the 1-3 second stalls appearing between arbitrary consecutive lines of code — the stall was in the `defer span.end()` of whichever logfire span happened to coincide with a batch flush. 120 + 121 + ### how we confirmed it 122 + 123 + 1. added `search.iterate.*` spans — proved all actual search work takes ~3ms 124 + 2. the 1-3s gap appeared between `db.local.query` span ending and the next span starting (i.e., inside `span.end()`) 125 + 3. disabled logfire entirely (`logfire.configure()` commented out) — stalls disappeared completely, all requests 75-110ms 126 + 4. Fly CPU graph showed ~0-5% utilization during stalls — ruled out CPU contention, pointed to lock contention 127 + 128 + ### the fix 129 + 130 + in `otel-zig/src/sdk/trace/batch_span_processor.zig`, changed `exportThreadFn` to drain the queue under the lock, then release the lock before doing the HTTP export. same pattern that `forceFlush()` already used correctly. 131 + 132 + before: 133 + ``` 134 + exportThreadFn: lock → export (HTTP POST, 1-2s) → unlock 135 + onEnd: lock (BLOCKED for 1-2s) → enqueue → unlock 136 + ``` 137 + 138 + after: 139 + ``` 140 + exportThreadFn: lock → drain queue → unlock → export (HTTP POST) → done 141 + onEnd: lock (instant) → enqueue → unlock 142 + ``` 143 + 144 + changes pushed through the dependency chain: 145 + - `otel-zig` (zzstoatzz/otel-zig trunk): fix in `batch_span_processor.zig` 146 + - `logfire-zig` (zzstoatzz.io/logfire-zig main): bumped otel-zig hash 147 + - `leaflet-search` backend: bumped logfire-zig hash, re-enabled logfire 148 + 149 + ### final results 150 + 151 + with logfire fully enabled and the fix applied: 152 + - hello (21KB response): **75-112ms** consistently, 10/10 requests 153 + - test (32KB response): **76-110ms** consistently, 10/10 requests 154 + - zero multi-second stalls 155 + - full observability retained