search for standard sites pub-search.waow.tech
search zig blog atproto

fix: handle quoted phrases and literal OR in search queries

buildFtsQuery was treating embedded quotes as separators and a literal OR as an ordinary search term, causing:
- `bertha OR burton` → `bertha OR OR OR burton*` (0 results)
- `python "machine learning"` → `python OR machine OR learning*` (no phrase match)

rewrite tokenizer to recognize quoted phrases, bare words, and literal OR.
add search syntax tooltip to frontend and docs/search-syntax.md reference.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+272 -62
+132 -58
backend/src/server/search.zig
··· 1148 1148 }; 1149 1149 } 1150 1150 1151 - /// Build FTS5 query with OR between terms: "cat dog" -> "cat OR dog*" 1152 - /// Uses OR for better recall with BM25 ranking (more matches = higher score) 1153 - /// Quoted queries are passed through as phrase matches: "exact phrase" -> "exact phrase" 1151 + /// Build FTS5 query from user input. 1152 + /// - bare words are OR'd together, prefix `*` on last word 1153 + /// - quoted phrases (`"..."`) are passed through for exact phrase matching 1154 + /// - literal `OR` (case-sensitive) is recognized as operator, not a search term 1155 + /// - unclosed quotes are treated as phrases with synthetic closing quote 1154 1156 /// Separators match FTS5 unicode61 tokenizer: any non-alphanumeric character 1155 1157 pub fn buildFtsQuery(alloc: Allocator, query: []const u8) ![]const u8 { 1156 1158 if (query.len == 0) return ""; ··· 1164 1166 1165 1167 const trimmed = query[start..end]; 1166 1168 1167 - // quoted phrase: pass through to FTS5 for exact phrase matching 1168 - if (trimmed.len >= 2 and trimmed[0] == '"' and trimmed[trimmed.len - 1] == '"') { 1169 - return try alloc.dupe(u8, trimmed); 1170 - } 1169 + // tokenize into phrases and words 1170 + const TokenKind = enum { word, phrase }; 1171 + const Token = struct { kind: TokenKind, text: []const u8 }; 1171 1172 1172 - // count words and total length 1173 - // match FTS5 unicode61 tokenizer: non-alphanumeric = separator 1174 - var word_count: usize = 0; 1175 - var total_word_len: usize = 0; 1176 - var in_word = false; 1177 - for (trimmed) |c| { 1178 - const is_alnum = (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9'); 1179 - if (!is_alnum) { 1180 - in_word = false; 1181 - } else { 1182 - if (!in_word) word_count += 1; 1183 - in_word = true; 1184 - total_word_len += 1; 1185 - } 1186 - } 1173 + var tokens: std.ArrayList(Token) = .empty; 1174 + defer tokens.deinit(alloc); 1187 1175 1188 - if (word_count == 0) return ""; 1176 + var i: usize = 0; 1177 + 
while (i < trimmed.len) { 1178 + if (trimmed[i] == '"') { 1179 + // quoted phrase: scan to closing quote or end 1180 + i += 1; // skip opening quote 1181 + const inner_start = i; 1182 + while (i < trimmed.len and trimmed[i] != '"') : (i += 1) {} 1183 + const inner_end = i; 1184 + if (i < trimmed.len) i += 1; // skip closing quote 1189 1185 1190 - // single word: just add prefix wildcard 1191 - if (word_count == 1) { 1192 - const buf = try alloc.alloc(u8, total_word_len + 1); 1193 - var pos: usize = 0; 1194 - for (trimmed) |c| { 1195 - const is_alnum = (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9'); 1196 - if (is_alnum) { 1197 - buf[pos] = c; 1198 - pos += 1; 1186 + // only emit if inner text has alphanumeric content 1187 + const inner = trimmed[inner_start..inner_end]; 1188 + for (inner) |c| { 1189 + if (isAlnum(c)) { 1190 + try tokens.append(alloc, .{ .kind = .phrase, .text = inner }); 1191 + break; 1192 + } 1199 1193 } 1194 + } else if (isAlnum(trimmed[i])) { 1195 + // bare word: scan alphanumeric run 1196 + const word_start = i; 1197 + while (i < trimmed.len and isAlnum(trimmed[i])) : (i += 1) {} 1198 + const word = trimmed[word_start..i]; 1199 + // skip literal "OR" — it's an operator, not a search term 1200 + if (!std.mem.eql(u8, word, "OR")) { 1201 + try tokens.append(alloc, .{ .kind = .word, .text = word }); 1202 + } 1203 + } else { 1204 + i += 1; // skip separator 1200 1205 } 1201 - buf[pos] = '*'; 1202 - return buf; 1203 1206 } 1204 1207 1205 - // multiple words: join with " OR ", prefix on last 1206 - // size = word chars + (n-1) * 4 for " OR " + 1 for "*" 1207 - const buf_len = total_word_len + (word_count - 1) * 4 + 1; 1208 - const buf = try alloc.alloc(u8, buf_len); 1208 + if (tokens.items.len == 0) return ""; 1209 1209 1210 - var pos: usize = 0; 1211 - var current_word: usize = 0; 1212 - in_word = false; 1210 + // build output: join with " OR ", prefix * on last token if it's a bare word 1211 + var out: std.ArrayList(u8) 
= .empty; 1212 + errdefer out.deinit(alloc); 1213 1213 1214 - for (trimmed) |c| { 1215 - const is_alnum = (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9'); 1216 - if (!is_alnum) { 1217 - if (in_word) { 1218 - // end of word - add " OR " if not last 1219 - current_word += 1; 1220 - if (current_word < word_count) { 1221 - @memcpy(buf[pos .. pos + 4], " OR "); 1222 - pos += 4; 1214 + for (tokens.items, 0..) |token, idx| { 1215 + if (idx > 0) { 1216 + try out.appendSlice(alloc, " OR "); 1217 + } 1218 + switch (token.kind) { 1219 + .word => { 1220 + try out.appendSlice(alloc, token.text); 1221 + if (idx == tokens.items.len - 1) { 1222 + try out.append(alloc, '*'); 1223 1223 } 1224 - } 1225 - in_word = false; 1226 - } else { 1227 - buf[pos] = c; 1228 - pos += 1; 1229 - in_word = true; 1224 + }, 1225 + .phrase => { 1226 + try out.append(alloc, '"'); 1227 + try out.appendSlice(alloc, token.text); 1228 + try out.append(alloc, '"'); 1229 + }, 1230 1230 } 1231 1231 } 1232 - buf[pos] = '*'; 1233 - return buf; 1232 + 1233 + return try out.toOwnedSlice(alloc); 1234 + } 1235 + 1236 + fn isAlnum(c: u8) bool { 1237 + return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9'); 1234 1238 } 1235 1239 1236 1240 // --- tests --- ··· 1292 1296 defer std.testing.allocator.free(result); 1293 1297 try std.testing.expectEqualStrings("don OR t OR stop OR now*", result); 1294 1298 } 1299 + 1300 + test "buildFtsQuery: embedded quoted phrase" { 1301 + const result = try buildFtsQuery(std.testing.allocator, "python \"machine learning\" tutorial"); 1302 + defer std.testing.allocator.free(result); 1303 + try std.testing.expectEqualStrings("python OR \"machine learning\" OR tutorial*", result); 1304 + } 1305 + 1306 + test "buildFtsQuery: quoted phrase at start" { 1307 + const result = try buildFtsQuery(std.testing.allocator, "\"exact phrase\" python"); 1308 + defer std.testing.allocator.free(result); 1309 + try 
std.testing.expectEqualStrings("\"exact phrase\" OR python*", result); 1310 + } 1311 + 1312 + test "buildFtsQuery: quoted phrase at end" { 1313 + const result = try buildFtsQuery(std.testing.allocator, "python \"machine learning\""); 1314 + defer std.testing.allocator.free(result); 1315 + try std.testing.expectEqualStrings("python OR \"machine learning\"", result); 1316 + } 1317 + 1318 + test "buildFtsQuery: literal OR passthrough" { 1319 + const result = try buildFtsQuery(std.testing.allocator, "bertha OR burton"); 1320 + defer std.testing.allocator.free(result); 1321 + try std.testing.expectEqualStrings("bertha OR burton*", result); 1322 + } 1323 + 1324 + test "buildFtsQuery: multiple ORs" { 1325 + const result = try buildFtsQuery(std.testing.allocator, "cat OR dog OR fish"); 1326 + defer std.testing.allocator.free(result); 1327 + try std.testing.expectEqualStrings("cat OR dog OR fish*", result); 1328 + } 1329 + 1330 + test "buildFtsQuery: OR at start ignored" { 1331 + const result = try buildFtsQuery(std.testing.allocator, "OR cat dog"); 1332 + defer std.testing.allocator.free(result); 1333 + try std.testing.expectEqualStrings("cat OR dog*", result); 1334 + } 1335 + 1336 + test "buildFtsQuery: OR at end ignored" { 1337 + const result = try buildFtsQuery(std.testing.allocator, "cat dog OR"); 1338 + defer std.testing.allocator.free(result); 1339 + try std.testing.expectEqualStrings("cat OR dog*", result); 1340 + } 1341 + 1342 + test "buildFtsQuery: only OR" { 1343 + const result = try buildFtsQuery(std.testing.allocator, "OR"); 1344 + try std.testing.expectEqualStrings("", result); 1345 + } 1346 + 1347 + test "buildFtsQuery: unclosed quote" { 1348 + const result = try buildFtsQuery(std.testing.allocator, "\"hello world"); 1349 + defer std.testing.allocator.free(result); 1350 + try std.testing.expectEqualStrings("\"hello world\"", result); 1351 + } 1352 + 1353 + test "buildFtsQuery: empty quotes" { 1354 + const result = try buildFtsQuery(std.testing.allocator, 
"\"\""); 1355 + try std.testing.expectEqualStrings("", result); 1356 + } 1357 + 1358 + test "buildFtsQuery: empty quotes with word" { 1359 + const result = try buildFtsQuery(std.testing.allocator, "\"\" hello"); 1360 + defer std.testing.allocator.free(result); 1361 + try std.testing.expectEqualStrings("hello*", result); 1362 + } 1363 + 1364 + test "buildFtsQuery: mixed quotes and OR" { 1365 + const result = try buildFtsQuery(std.testing.allocator, "\"exact phrase\" OR python"); 1366 + defer std.testing.allocator.free(result); 1367 + try std.testing.expectEqualStrings("\"exact phrase\" OR python*", result); 1368 + }
+1
docs/README.md
··· 70 70 71 71 ## further reading 72 72 73 + - [search-syntax.md](search-syntax.md) — query syntax reference (quotes, OR, filters, modes) 73 74 - [search-architecture.md](search-architecture.md) — FTS5 details, scaling considerations, future options 74 75 - [content-extraction.md](content-extraction.md) — how content is extracted from each platform 75 76 - [api.md](api.md) — API endpoint reference
+3 -2
docs/search-architecture.md
··· 26 26 results with snippet() 27 27 ``` 28 28 29 - key decisions: 29 + key decisions (see [search-syntax.md](search-syntax.md) for the user-facing reference): 30 30 - **OR between terms** for better recall (deliberate, see commit 35ad4b5) 31 - - **prefix match on last word** for type-ahead feel 31 + - **quoted phrases** passed through to FTS5 for exact matching 32 + - **prefix match on last word** for type-ahead feel (bare words only, not phrases) 32 33 - **unicode61 tokenizer** splits on non-alphanumeric (we match this in buildFtsQuery) 33 34 - **recency decay** boosts recent docs: `ORDER BY rank + (days_old / 30)` 34 35
+68
docs/search-syntax.md
··· 1 + # search syntax 2 + 3 + a reference for the query syntax at [pub-search.waow.tech](https://pub-search.waow.tech). 4 + 5 + ## basics 6 + 7 + terms are OR'd together — a query matches documents containing *any* of the words. the last word gets prefix matching for a type-ahead feel. 8 + 9 + | you type | what runs | why | 10 + |----------|-----------|-----| 11 + | `cat dog` | `cat OR dog*` | matches docs with "cat" or "dog" (or "dogs", "dogma", etc.) | 12 + | `crypto` | `crypto*` | prefix match: finds "crypto", "cryptocurrency", etc. | 13 + 14 + ## quoted phrases 15 + 16 + wrap words in double quotes for exact phrase matching — FTS5 requires the words to appear adjacent and in order. 17 + 18 + | you type | what runs | 19 + |----------|-----------| 20 + | `"machine learning"` | `"machine learning"` | 21 + | `python "machine learning" tutorial` | `python OR "machine learning" OR tutorial*` | 22 + | `"exact phrase" python` | `"exact phrase" OR python*` | 23 + 24 + the last token only gets a prefix `*` if it's a bare word — phrases are never prefix-expanded. 25 + 26 + unclosed quotes are treated as phrases: `"hello world` → `"hello world"`. 27 + 28 + ## explicit OR 29 + 30 + `OR` (uppercase, case-sensitive) between terms is recognized as an operator rather than a search term. this means you can write natural boolean queries without them getting mangled. 31 + 32 + | you type | what runs | 33 + |----------|-----------| 34 + | `bertha OR burton` | `bertha OR burton*` | 35 + | `cat OR dog OR fish` | `cat OR dog OR fish*` | 36 + 37 + `OR` at the start or end of a query is ignored — only `OR` between terms matters. 
38 + 39 + ## filters 40 + 41 + beyond the query text, you can filter results by: 42 + 43 + - **platform**: leaflet, pckt, offprint, greengale, whitewind, other 44 + - **tag**: click any tag in the results to filter by it 45 + - **date**: today, this week, this month, this year 46 + 47 + filters combine with the search query — e.g., searching `python` with the `leaflet` platform filter returns only leaflet posts matching "python". 48 + 49 + ## search modes 50 + 51 + three modes are available via the toggle below the search box: 52 + 53 + - **keyword** (default): SQLite FTS5 full-text search with BM25 ranking + recency boost. fastest (~9ms). 54 + - **semantic**: vector similarity via Voyage AI embeddings + turbopuffer. finds conceptually similar content even without shared words (~345ms). 55 + - **hybrid**: runs both keyword and semantic in parallel, merges via reciprocal rank fusion. best quality, slightly slower (~360ms). 56 + 57 + ## ranking 58 + 59 + keyword results are ranked by `BM25 + recency`: 60 + - BM25 scores term frequency and document length (standard IR ranking) 61 + - recency adds a small boost for newer documents: `rank + (days_old / 30)` 62 + 63 + ## tokenization 64 + 65 + the FTS5 unicode61 tokenizer treats any non-alphanumeric character as a separator. this means: 66 + - `crypto-casino` → matches "crypto" and "casino" separately 67 + - `don't` → matches "don" and "t" 68 + - `foo.bar` → matches "foo" and "bar"
+68 -2
site/index.html
··· 125 125 .search-box { 126 126 display: flex; 127 127 gap: 0.5rem; 128 - margin-bottom: 1.5rem; 128 + margin-bottom: 0.5rem; 129 + } 130 + 131 + .syntax-help-btn { 132 + padding: 0.5rem 0.5rem; 133 + font-family: monospace; 134 + font-size: 11px; 135 + background: var(--bg-subtle); 136 + border: 1px solid var(--border-focus); 137 + color: var(--text-dim); 138 + cursor: pointer; 139 + flex-shrink: 0; 140 + } 141 + 142 + .syntax-help-btn:hover { color: var(--text-secondary); } 143 + .syntax-help-btn.active { color: #1B7340; border-color: #1B7340; } 144 + 145 + .syntax-help { 146 + display: none; 147 + background: var(--tooltip-bg); 148 + border: 1px solid var(--border); 149 + padding: 0.5rem 0.75rem; 150 + margin-bottom: 1rem; 151 + font-size: 11px; 152 + line-height: 1.8; 129 153 } 154 + 155 + .syntax-help.open { display: block; } 156 + 157 + .syntax-help code { 158 + color: #1B7340; 159 + font-family: monospace; 160 + } 161 + 162 + .syntax-help .syntax-row { 163 + display: flex; 164 + gap: 1rem; 165 + } 166 + 167 + .syntax-help .syntax-example { min-width: 110px; } 168 + .syntax-help .syntax-desc { color: var(--text-secondary); } 130 169 131 170 input[type="text"] { 132 171 flex: 1; ··· 676 715 width: 100%; 677 716 } 678 717 679 - .search-box button { 718 + .search-box #search-btn { 680 719 width: 100%; 720 + } 721 + 722 + .search-box .syntax-help-btn { 723 + width: auto; 681 724 } 682 725 683 726 /* result card mobile tweaks */ ··· 753 796 <div class="search-box"> 754 797 <input type="text" id="query" placeholder="search content..." 
autofocus> 755 798 <button id="search-btn">search</button> 799 + <button class="syntax-help-btn" id="syntax-help-btn" title="search syntax">[?]</button> 800 + </div> 801 + 802 + <div class="syntax-help" id="syntax-help"> 803 + <div class="syntax-row"><span class="syntax-example"><code>cat dog</code></span><span class="syntax-desc">OR between terms</span></div> 804 + <div class="syntax-row"><span class="syntax-example"><code>"exact phrase"</code></span><span class="syntax-desc">phrase match</span></div> 805 + <div class="syntax-row"><span class="syntax-example"><code>pyth</code></span><span class="syntax-desc">prefix on last word</span></div> 806 + <div class="syntax-row"><span class="syntax-example"><code>a OR b</code></span><span class="syntax-desc">explicit OR</span></div> 756 807 </div> 757 808 758 809 <div id="mode-toggle" class="mode-toggle"></div> ··· 808 859 if (currentTheme === 'system') applyTheme('system'); 809 860 }); 810 861 renderThemeToggle(); 862 + 863 + // syntax help toggle 864 + const syntaxHelpBtn = document.getElementById('syntax-help-btn'); 865 + const syntaxHelp = document.getElementById('syntax-help'); 866 + syntaxHelpBtn.addEventListener('click', function(e) { 867 + e.stopPropagation(); 868 + syntaxHelp.classList.toggle('open'); 869 + syntaxHelpBtn.classList.toggle('active'); 870 + }); 871 + document.addEventListener('click', function(e) { 872 + if (!syntaxHelp.contains(e.target) && e.target !== syntaxHelpBtn) { 873 + syntaxHelp.classList.remove('open'); 874 + syntaxHelpBtn.classList.remove('active'); 875 + } 876 + }); 811 877 812 878 const queryInput = document.getElementById('query'); 813 879 const searchBtn = document.getElementById('search-btn');