search for standard sites pub-search.waow.tech
search zig blog atproto

fix: improve semantic search quality + revert mode toggle UI

- embedder: skip docs with content < 50 chars or test titles
- searchSemantic: over-fetch 40, filter dist > 0.5 + empty titles, cap at 20
- frontend: remove mode toggle (keep backend support for when quality is ready)
- scripts: add cleanup-vector-index to purge junk vectors from tpuf

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+210 -221
+4 -1
backend/src/ingest/embedder.zig
··· 26 26 const DocsNeedingEmbeddings = zql.Query( 27 27 \\SELECT uri, title, content, did, created_at, rkey, 28 28 \\ base_path, has_publication, platform, COALESCE(path, '') as path 29 - \\FROM documents WHERE embedded_at IS NULL LIMIT :limit 29 + \\FROM documents WHERE embedded_at IS NULL 30 + \\ AND LENGTH(content) > 50 31 + \\ AND title NOT IN ('test', 'testing', 'Test', 'Testing', 'Untitled') 32 + \\LIMIT :limit 30 33 ); 31 34 32 35 /// Start the embedder background worker
+10 -3
backend/src/search.zig
··· 681 681 }; 682 682 defer alloc.free(vector); 683 683 684 - // ANN query 685 - const results = tpuf.query(alloc, vector, 20) catch |err| { 684 + // ANN query — over-fetch to allow filtering 685 + const results = tpuf.query(alloc, vector, 40) catch |err| { 686 686 logfire.warn("search.semantic: tpuf query failed: {}", .{err}); 687 687 return try alloc.dupe(u8, "{\"error\":\"vector search failed\"}"); 688 688 }; ··· 701 701 alloc.free(results); 702 702 } 703 703 704 - // serialize results, post-filtering by platform if set 704 + // serialize results, filtering by distance + platform, capped at 20 705 705 var output: std.Io.Writer.Allocating = .init(alloc); 706 706 errdefer output.deinit(); 707 707 708 708 var jw: json.Stringify = .{ .writer = &output.writer }; 709 709 try jw.beginArray(); 710 + var count: usize = 0; 710 711 for (results) |r| { 712 + if (count >= 20) break; 713 + // skip results with high cosine distance (low similarity) 714 + if (r.dist > 0.5) continue; 715 + // skip documents with empty/test titles 716 + if (r.title.len == 0) continue; 711 717 if (platform_filter) |pf| { 712 718 if (!std.mem.eql(u8, r.platform, pf)) continue; 713 719 } 720 + count += 1; 714 721 try jw.write(SearchResultJson{ 715 722 .type = if (r.has_publication) "article" else "looseleaf", 716 723 .uri = r.uri,
+135
scripts/cleanup-vector-index
#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["httpx", "pydantic-settings"]
# ///
"""Delete junk vectors from turbopuffer.

Finds documents with short content or test titles in turso,
hashes their URIs, and deletes the corresponding vectors from tpuf.
Afterwards the `embedded_at` timestamp of every affected document is reset
to NULL in turso.
"""

import hashlib
import os

import httpx
from pydantic_settings import BaseSettings, SettingsConfigDict

# Number of vector IDs sent per turbopuffer delete request.
# (The original author notes tpuf accepts up to 1000 per request; 100 keeps
# each request small.)
DELETE_BATCH_SIZE = 100

# How many junk documents are listed before the preview is elided.
PREVIEW_LIMIT = 20

# Junk = already embedded AND (short content OR a test-like title).
JUNK_DOCS_SQL = """
SELECT uri, title, LENGTH(content) as content_len
FROM documents
WHERE embedded_at IS NOT NULL
AND (
    LENGTH(content) <= 50
    OR LOWER(title) IN ('test', 'testing', 'untitled', 'test test', 'hello world')
    OR LOWER(title) LIKE 'test %'
    OR LOWER(title) LIKE '% test'
)
ORDER BY content_len ASC
"""


class Settings(BaseSettings):
    """Credentials and endpoints, loaded from the environment / ENV_FILE."""

    model_config = SettingsConfigDict(
        env_file=os.environ.get("ENV_FILE", ".env"), extra="ignore"
    )
    turso_url: str
    turso_token: str
    turbopuffer_api_key: str
    turbopuffer_namespace: str = "leaflet-search"

    @property
    def turso_host(self) -> str:
        """Return `turso_url` without a leading `libsql://` scheme."""
        url = self.turso_url
        if url.startswith("libsql://"):
            url = url[len("libsql://"):]
        return url


def turso_query(settings: Settings, sql: str, args: list | None = None):
    """Run one SQL statement through turso's `/v2/pipeline` HTTP endpoint.

    Args:
        settings: connection settings (host + bearer token).
        sql: the statement to execute.
        args: optional positional arguments; each one is sent as a text value.

    Returns:
        The decoded JSON body of the pipeline response.

    Raises:
        httpx.HTTPStatusError: on a non-2xx HTTP response.
        RuntimeError: if any pipeline step is reported as an error.
    """
    stmt: dict = {"sql": sql}
    if args:
        stmt["args"] = [{"type": "text", "value": str(a)} for a in args]

    response = httpx.post(
        f"https://{settings.turso_host}/v2/pipeline",
        headers={
            "Authorization": f"Bearer {settings.turso_token}",
            "Content-Type": "application/json",
        },
        json={
            "requests": [
                {"type": "execute", "stmt": stmt},
                {"type": "close"},
            ],
        },
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # NOTE(review): per the Hrana-over-HTTP protocol a failed statement is
    # reported as a {"type": "error", ...} entry inside a 200 response, so
    # raise_for_status() alone would let it pass silently — confirm shape.
    for step in payload.get("results", []):
        if isinstance(step, dict) and step.get("type") == "error":
            detail = step.get("error")
            message = detail.get("message", detail) if isinstance(detail, dict) else detail
            raise RuntimeError(f"turso query failed: {message}")
    return payload


def cell_value(cell):
    """Unwrap one result cell.

    Cells may arrive as `{"type": ..., "value": ...}` dicts or as bare values.
    A dict without a "value" key (presumably how NULL is encoded) yields None
    instead of raising KeyError.
    """
    if isinstance(cell, dict):
        return cell.get("value")
    return cell


def hash_id(uri: str) -> str:
    """Match tpuf.zig hashId: first 32 hex chars of SHA256."""
    return hashlib.sha256(uri.encode()).hexdigest()[:32]


def tpuf_delete(settings: Settings, ids: list[str]):
    """Delete vectors by ID from turbopuffer.

    Returns the decoded JSON response; raises httpx.HTTPStatusError on a
    non-2xx status.
    """
    url = f"https://api.turbopuffer.com/v2/namespaces/{settings.turbopuffer_namespace}"
    response = httpx.post(
        url,
        headers={
            "Authorization": f"Bearer {settings.turbopuffer_api_key}",
            "Content-Type": "application/json",
        },
        json={"deletes": ids},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()


def main() -> None:
    """Find junk documents, delete their vectors, then reset embedded_at."""
    settings = Settings()  # type: ignore

    # find junk docs: short content OR test titles
    result = turso_query(settings, JUNK_DOCS_SQL)
    rows = result["results"][0]["response"]["result"]["rows"]

    print(f"found {len(rows)} junk documents with vectors")

    if not rows:
        print("nothing to clean up")
        return

    # Decode every row once: (uri, title, content_len); any field may be None.
    docs = [tuple(cell_value(cell) for cell in row[:3]) for row in rows]

    # show what we'll delete
    for uri, title, content_len in docs[:PREVIEW_LIMIT]:
        # str() keeps the width format working for None as well as str/int.
        print(f" [{str(content_len):>5} chars] {title!r:40s} {(uri or '')[:60]}")

    if len(docs) > PREVIEW_LIMIT:
        print(f" ... and {len(docs) - PREVIEW_LIMIT} more")

    # compute tpuf IDs (rows without a uri cannot be hashed, so skip them)
    uris = [uri for uri, _title, _content_len in docs if uri]
    tpuf_ids = [hash_id(uri) for uri in uris]

    print(f"\ndeleting {len(tpuf_ids)} vectors from turbopuffer...")

    # batch delete
    for start in range(0, len(tpuf_ids), DELETE_BATCH_SIZE):
        batch = tpuf_ids[start : start + DELETE_BATCH_SIZE]
        tpuf_delete(settings, batch)
        print(f" deleted batch {start // DELETE_BATCH_SIZE + 1} ({len(batch)} vectors)")

    print("done")

    # Reset embedded_at so the column reflects that no vector exists any more.
    # NOTE(review): embedded_at = NULL makes a row eligible for embedding
    # again, so this relies entirely on the embedder's own filter to keep the
    # doc out. This script's title matching (case-insensitive, 'test %',
    # '% test', 'hello world', ...) is broader than the filter added to
    # embedder.zig in this commit, so some of these docs may be re-embedded —
    # confirm that is intended or align the two filters.
    print("\nclearing embedded_at on junk docs...")
    for uri in uris:
        turso_query(settings, "UPDATE documents SET embedded_at = NULL WHERE uri = ?", [uri])

    print(f"cleared {len(uris)} embedded_at timestamps")


if __name__ == "__main__":
    main()
+61 -217
site/index.html
··· 182 182 color: #d4956a; 183 183 } 184 184 185 - .mode-toggle { 186 - display: flex; 187 - gap: 0.5rem; 188 - margin-bottom: 1rem; 189 - } 190 - 191 - .mode-option { 192 - font-size: 11px; 193 - padding: 3px 8px; 194 - background: #151515; 195 - border: 1px solid #252525; 196 - border-radius: 3px; 197 - cursor: pointer; 198 - color: #777; 199 - } 200 - 201 - .mode-option:hover { 202 - background: #1a1a1a; 203 - border-color: #333; 204 - color: #aaa; 205 - } 206 - 207 - .mode-option.active { 208 - background: rgba(27, 115, 64, 0.2); 209 - border-color: #1B7340; 210 - color: #2a9d5c; 211 - } 212 - 213 - .source-badge { 214 - font-size: 9px; 215 - padding: 1px 5px; 216 - border-radius: 3px; 217 - margin-left: 6px; 218 - text-transform: lowercase; 219 - } 220 - 221 - .source-badge.keyword { 222 - background: rgba(64, 115, 180, 0.2); 223 - color: #6a9fd4; 224 - } 225 - 226 - .source-badge.semantic { 227 - background: rgba(140, 80, 200, 0.2); 228 - color: #b08ae0; 229 - } 230 - 231 185 .status { 232 186 padding: 1rem; 233 187 text-align: center; ··· 488 442 } 489 443 490 444 /* ensure minimum 44px touch targets */ 491 - .tag, .platform-option, .mode-option, .suggestion, input.tag-input { 445 + .tag, .platform-option, .suggestion, input.tag-input { 492 446 min-height: 44px; 493 447 display: inline-flex; 494 448 align-items: center; ··· 566 520 567 521 /* ensure touch targets on tablets too */ 568 522 @media (hover: none) and (pointer: coarse) { 569 - .tag, .platform-option, .mode-option, .suggestion, .related-item, input.tag-input { 523 + .tag, .platform-option, .suggestion, .related-item, input.tag-input { 570 524 min-height: 44px; 571 525 display: inline-flex; 572 526 align-items: center; ··· 582 536 <input type="text" id="query" placeholder="search content..." 
autofocus> 583 537 <button id="search-btn">search</button> 584 538 </div> 585 - 586 - <div id="mode-toggle" class="mode-toggle"></div> 587 539 588 540 <div id="suggestions"></div> 589 541 ··· 616 568 const activeFilterDiv = document.getElementById('active-filter'); 617 569 const suggestionsDiv = document.getElementById('suggestions'); 618 570 const platformFilterDiv = document.getElementById('platform-filter'); 619 - const modeToggleDiv = document.getElementById('mode-toggle'); 620 - 621 571 let currentTag = null; 622 572 let currentPlatform = null; 623 - let currentMode = 'keyword'; 624 573 let allTags = []; 625 574 let popularSearches = []; 626 575 const authorCache = new Map(); 627 - 628 - function renderModeToggle() { 629 - const modes = [ 630 - { id: 'keyword', label: 'keyword' }, 631 - { id: 'semantic', label: 'semantic' }, 632 - { id: 'hybrid', label: 'hybrid' }, 633 - ]; 634 - modeToggleDiv.innerHTML = modes.map(m => ` 635 - <span class="mode-option${currentMode === m.id ? ' active' : ''}" onclick="setMode('${m.id}')">${m.label}</span> 636 - `).join(''); 637 - } 638 - 639 - function setMode(mode) { 640 - currentMode = mode; 641 - renderModeToggle(); 642 - // hide tag filter in semantic/hybrid since tpuf doesn't store tags 643 - tagsDiv.style.display = (mode === 'keyword') ? 
'' : 'none'; 644 - if (mode !== 'keyword') { 645 - currentTag = null; 646 - renderActiveFilter(); 647 - } 648 - if (queryInput.value.trim() || currentTag || currentPlatform) { 649 - doSearch(); 650 - } 651 - } 652 - 653 576 async function search(query, tag = null, platform = null) { 654 577 if (!query.trim() && !tag && !platform) return; 655 578 656 579 searchBtn.disabled = true; 657 580 resultsDiv.innerHTML = `<div class="status">searching...</div>`; 658 581 659 - try { 660 - if (currentMode === 'hybrid') { 661 - await searchHybrid(query, tag, platform); 662 - } else if (currentMode === 'semantic') { 663 - await searchSingle(query, platform, 'semantic'); 664 - } else { 665 - await searchSingle(query, platform, 'keyword', tag); 666 - } 667 - } catch (err) { 668 - resultsDiv.innerHTML = `<div class="status error">error: ${err.message}</div>`; 669 - } finally { 670 - searchBtn.disabled = false; 671 - } 672 - } 673 - 674 - async function searchSingle(query, platform, mode, tag = null) { 675 582 let searchUrl = `${API_URL}/search?q=${encodeURIComponent(query || '')}`; 676 583 if (tag) searchUrl += `&tag=${encodeURIComponent(tag)}`; 677 584 if (platform) searchUrl += `&platform=${encodeURIComponent(platform)}`; 678 - if (mode === 'semantic') searchUrl += '&mode=semantic'; 679 - 680 - const res = await fetch(searchUrl); 681 - const rawText = await res.text(); 682 - let results; 683 585 684 586 try { 685 - results = JSON.parse(rawText); 686 - } catch (parseErr) { 687 - resultsDiv.innerHTML = `<div class="status error">JSON parse error<pre style="text-align:left;font-size:0.7rem;overflow:auto;max-height:200px">${escapeHtml(rawText)}</pre></div>`; 688 - return; 689 - } 587 + const res = await fetch(searchUrl); 588 + const rawText = await res.text(); 589 + let results; 690 590 691 - if (results.error) { 692 - resultsDiv.innerHTML = `<div class="status error">${results.error}</div>`; 693 - return; 694 - } 695 - 696 - if (results.length === 0) { 697 - resultsDiv.innerHTML = ` 
698 - <div class="empty-state"> 699 - <p>no results${query ? ` for ${formatQueryForDisplay(query)}` : ''}${tag ? ` in #${escapeHtml(tag)}` : ''}${platform ? ` on ${escapeHtml(platform)}` : ''}</p> 700 - </div> 701 - `; 702 - statsDiv.textContent = ''; 703 - return; 704 - } 705 - 706 - renderResults(results, query); 707 - resolveAuthors(results); 708 - 709 - if (results.length > 0 && results[0].uri) { 710 - loadRelated(results[0]); 711 - } 712 - } 713 - 714 - async function searchHybrid(query, tag, platform) { 715 - let kwUrl = `${API_URL}/search?q=${encodeURIComponent(query || '')}`; 716 - if (tag) kwUrl += `&tag=${encodeURIComponent(tag)}`; 717 - if (platform) kwUrl += `&platform=${encodeURIComponent(platform)}`; 718 - 719 - let semUrl = `${API_URL}/search?q=${encodeURIComponent(query || '')}&mode=semantic`; 720 - if (platform) semUrl += `&platform=${encodeURIComponent(platform)}`; 591 + try { 592 + results = JSON.parse(rawText); 593 + } catch (parseErr) { 594 + resultsDiv.innerHTML = `<div class="status error">JSON parse error<pre style="text-align:left;font-size:0.7rem;overflow:auto;max-height:200px">${escapeHtml(rawText)}</pre></div>`; 595 + return; 596 + } 721 597 722 - const kwPromise = fetch(kwUrl).then(r => r.json()); 723 - const semPromise = fetch(semUrl).then(r => r.json()); 598 + if (results.error) { 599 + resultsDiv.innerHTML = `<div class="status error">${results.error}</div>`; 600 + return; 601 + } 724 602 725 - // show keyword results as soon as they arrive 726 - const kwResults = await kwPromise; 727 - if (kwResults.error) { 728 - resultsDiv.innerHTML = `<div class="status error">${kwResults.error}</div>`; 729 - return; 730 - } 603 + if (results.length === 0) { 604 + resultsDiv.innerHTML = ` 605 + <div class="empty-state"> 606 + <p>no results${query ? ` for ${formatQueryForDisplay(query)}` : ''}${tag ? ` in #${escapeHtml(tag)}` : ''}${platform ? 
` on ${escapeHtml(platform)}` : ''}</p> 607 + </div> 608 + `; 609 + statsDiv.textContent = ''; 610 + return; 611 + } 731 612 732 - // tag each result with source 733 - const tagged = (kwResults || []).map(r => ({ ...r, _source: 'keyword' })); 734 - renderResults(tagged, query, true); 735 - resolveAuthors(tagged); 613 + let html = ''; 614 + for (const doc of results) { 615 + const entityType = doc.type || 'article'; 616 + const plat = doc.platform || 'leaflet'; 617 + const docUrl = buildDocUrl(doc, entityType, plat); 618 + const platformConfig = PLATFORM_CONFIG[plat]; 619 + const platformBadge = platformConfig 620 + ? `<span class="platform-badge">${escapeHtml(platformConfig.label)}</span>` 621 + : ''; 622 + const date = doc.createdAt ? new Date(doc.createdAt).toLocaleDateString() : ''; 623 + const platformHome = getPlatformHome(plat, doc.basePath); 736 624 737 - // append semantic results when ready 738 - try { 739 - const semResults = await semPromise; 740 - if (!semResults.error && semResults.length > 0) { 741 - const seenUris = new Set(tagged.map(r => r.uri)); 742 - const unique = semResults.filter(r => !seenUris.has(r.uri)).map(r => ({ ...r, _source: 'semantic' })); 743 - if (unique.length > 0) { 744 - const merged = [...tagged, ...unique]; 745 - renderResults(merged, query, true); 746 - resolveAuthors(unique); 747 - } 625 + html += ` 626 + <div class="result"> 627 + <div class="result-title"> 628 + <span class="entity-type ${entityType}">${entityType}</span>${platformBadge} 629 + ${docUrl 630 + ? `<a href="${docUrl}" target="_blank">${escapeHtml(doc.title || 'Untitled')}</a>` 631 + : escapeHtml(doc.title || 'Untitled')} 632 + </div> 633 + <div class="result-snippet">${highlightTerms(doc.snippet, query)}</div> 634 + <div class="result-meta" ${doc.did ? `data-did="${escapeHtml(doc.did)}"` : ''}> 635 + ${date ? `${date} | ` : ''}<span class="author-name"></span>${platformHome.url 636 + ? 
`<a href="${platformHome.url}" target="_blank">${platformHome.label}</a>` 637 + : platformHome.label} 638 + </div> 639 + </div> 640 + `; 748 641 } 749 - } catch (e) { 750 - // semantic failed silently — keyword results already shown 751 - } 752 642 753 - if (tagged.length > 0 && tagged[0].uri) { 754 - loadRelated(tagged[0]); 755 - } 756 - } 643 + resultsDiv.innerHTML = html; 644 + statsDiv.textContent = `${results.length} result${results.length === 1 ? '' : 's'}`; 645 + resolveAuthors(results); 757 646 758 - function renderResults(results, query, showSource = false) { 759 - if (results.length === 0) { 760 - resultsDiv.innerHTML = ` 761 - <div class="empty-state"> 762 - <p>no results${query ? ` for ${formatQueryForDisplay(query)}` : ''}</p> 763 - </div> 764 - `; 765 - statsDiv.textContent = ''; 766 - return; 767 - } 647 + if (results.length > 0 && results[0].uri) { 648 + loadRelated(results[0]); 649 + } 768 650 769 - let html = ''; 770 - for (const doc of results) { 771 - const entityType = doc.type || 'article'; 772 - const plat = doc.platform || 'leaflet'; 773 - const docUrl = buildDocUrl(doc, entityType, plat); 774 - const platformConfig = PLATFORM_CONFIG[plat]; 775 - const platformBadge = platformConfig 776 - ? `<span class="platform-badge">${escapeHtml(platformConfig.label)}</span>` 777 - : ''; 778 - const sourceBadge = showSource && doc._source 779 - ? `<span class="source-badge ${doc._source}">${doc._source}</span>` 780 - : ''; 781 - const date = doc.createdAt ? new Date(doc.createdAt).toLocaleDateString() : ''; 782 - const platformHome = getPlatformHome(plat, doc.basePath); 783 - 784 - html += ` 785 - <div class="result"> 786 - <div class="result-title"> 787 - <span class="entity-type ${entityType}">${entityType}</span>${platformBadge} 788 - ${docUrl 789 - ? `<a href="${docUrl}" target="_blank">${escapeHtml(doc.title || 'Untitled')}</a>` 790 - : escapeHtml(doc.title || 'Untitled')}${sourceBadge} 791 - </div> 792 - ${doc.snippet ? 
`<div class="result-snippet">${highlightTerms(doc.snippet, query)}</div>` : ''} 793 - <div class="result-meta" ${doc.did ? `data-did="${escapeHtml(doc.did)}"` : ''}> 794 - ${date ? `${date} | ` : ''}<span class="author-name"></span>${platformHome.url 795 - ? `<a href="${platformHome.url}" target="_blank">${platformHome.label}</a>` 796 - : platformHome.label} 797 - </div> 798 - </div> 799 - `; 651 + } catch (err) { 652 + resultsDiv.innerHTML = `<div class="status error">error: ${err.message}</div>`; 653 + } finally { 654 + searchBtn.disabled = false; 800 655 } 801 - 802 - resultsDiv.innerHTML = html; 803 - statsDiv.textContent = `${results.length} result${results.length === 1 ? '' : 's'}`; 804 656 } 805 657 806 658 async function resolveAuthors(results) { ··· 981 833 if (q) params.set('q', q); 982 834 if (currentTag) params.set('tag', currentTag); 983 835 if (currentPlatform) params.set('platform', currentPlatform); 984 - if (currentMode !== 'keyword') params.set('mode', currentMode); 985 836 const url = params.toString() ? `?${params}` : '/'; 986 837 history.pushState(null, '', url); 987 838 } ··· 1157 1008 queryInput.value = params.get('q') || ''; 1158 1009 currentTag = params.get('tag') || null; 1159 1010 currentPlatform = params.get('platform') || null; 1160 - currentMode = params.get('mode') || 'keyword'; 1161 1011 renderActiveFilter(); 1162 1012 renderTags(); 1163 1013 renderPlatformFilter(); 1164 - renderModeToggle(); 1165 - tagsDiv.style.display = (currentMode === 'keyword') ? 
'' : 'none'; 1166 1014 if (queryInput.value || currentTag || currentPlatform) search(queryInput.value, currentTag, currentPlatform); 1167 1015 }); 1168 1016 ··· 1171 1019 const initialQuery = initialParams.get('q'); 1172 1020 const initialTag = initialParams.get('tag'); 1173 1021 const initialPlatform = initialParams.get('platform'); 1174 - const initialMode = initialParams.get('mode'); 1175 1022 if (initialQuery) queryInput.value = initialQuery; 1176 1023 if (initialTag) currentTag = initialTag; 1177 1024 if (initialPlatform) currentPlatform = initialPlatform; 1178 - if (initialMode) currentMode = initialMode; 1179 1025 renderActiveFilter(); 1180 1026 renderPlatformFilter(); 1181 - renderModeToggle(); 1182 - tagsDiv.style.display = (currentMode === 'keyword') ? '' : 'none'; 1183 1027 1184 1028 if (initialQuery || initialTag || initialPlatform) { 1185 1029 search(initialQuery || '', initialTag, initialPlatform);