search for standard sites pub-search.waow.tech
search zig blog atproto
at multi-platform-schema 346 lines 11 kB view raw
const std = @import("std");
const json = std.json;
const Allocator = std.mem.Allocator;
const zql = @import("zql");
const db = @import("db/mod.zig");
const stats = @import("stats.zig");

/// JSON output shape shared by every kind of search result.
/// Field names are the keys emitted in the response, so they stay camelCase.
const SearchResultJson = struct {
    type: []const u8,
    uri: []const u8,
    did: []const u8,
    title: []const u8,
    snippet: []const u8,
    createdAt: []const u8 = "",
    rkey: []const u8,
    basePath: []const u8,
};

/// Document search result (internal).
/// Column order in `fromRow` must match the SELECT lists of the document
/// queries below (`DocsByTag`, `DocsByFtsAndTag`, `DocsByFts`) and the
/// inline query in `findSimilar`.
const Doc = struct {
    uri: []const u8,
    did: []const u8,
    title: []const u8,
    snippet: []const u8,
    createdAt: []const u8,
    rkey: []const u8,
    basePath: []const u8,
    hasPublication: bool,

    /// Build a `Doc` from a result row by column position.
    fn fromRow(row: db.Row) Doc {
        return .{
            .uri = row.text(0),
            .did = row.text(1),
            .title = row.text(2),
            .snippet = row.text(3),
            .createdAt = row.text(4),
            .rkey = row.text(5),
            .basePath = row.text(6),
            .hasPublication = row.int(7) != 0,
        };
    }

    /// Convert to the JSON shape; documents attached to a publication are
    /// reported as "article", all others as "looseleaf".
    fn toJson(self: Doc) SearchResultJson {
        return .{
            .type = if (self.hasPublication) "article" else "looseleaf",
            .uri = self.uri,
            .did = self.did,
            .title = self.title,
            .snippet = self.snippet,
            .createdAt = self.createdAt,
            .rkey = self.rkey,
            .basePath = self.basePath,
        };
    }
};

/// Newest documents carrying a given tag (no text query, empty snippet).
/// `base_path` is coalesced because the publication join is a LEFT JOIN,
/// matching the query used by `findSimilar`.
const DocsByTag = zql.Query(
    \\SELECT d.uri, d.did, d.title, '' as snippet,
    \\  d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path,
    \\  CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
    \\FROM documents d
    \\LEFT JOIN publications p ON d.publication_uri = p.uri
    \\JOIN document_tags dt ON d.uri = dt.document_uri
    \\WHERE dt.tag = :tag
    \\ORDER BY d.created_at DESC LIMIT 40
);

/// Full-text document matches restricted to a tag, ordered by FTS rank.
const DocsByFtsAndTag = zql.Query(
    \\SELECT f.uri, d.did, d.title,
    \\  snippet(documents_fts, 2, '', '', '...', 32) as snippet,
    \\  d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path,
    \\  CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
    \\FROM documents_fts f
    \\JOIN documents d ON f.uri = d.uri
    \\LEFT JOIN publications p ON d.publication_uri = p.uri
    \\JOIN document_tags dt ON d.uri = dt.document_uri
    \\WHERE documents_fts MATCH :query AND dt.tag = :tag
    \\ORDER BY rank LIMIT 40
);

/// Full-text document matches, ordered by FTS rank.
const DocsByFts = zql.Query(
    \\SELECT f.uri, d.did, d.title,
    \\  snippet(documents_fts, 2, '', '', '...', 32) as snippet,
    \\  d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path,
    \\  CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
    \\FROM documents_fts f
    \\JOIN documents d ON f.uri = d.uri
    \\LEFT JOIN publications p ON d.publication_uri = p.uri
    \\WHERE documents_fts MATCH :query
    \\ORDER BY rank LIMIT 40
);

/// Publication search result (internal).
/// Column order in `fromRow` must match the SELECT list of `PubSearch`.
const Pub = struct {
    uri: []const u8,
    did: []const u8,
    name: []const u8,
    snippet: []const u8,
    rkey: []const u8,
    basePath: []const u8,

    /// Build a `Pub` from a result row by column position.
    fn fromRow(row: db.Row) Pub {
        return .{
            .uri = row.text(0),
            .did = row.text(1),
            .name = row.text(2),
            .snippet = row.text(3),
            .rkey = row.text(4),
            .basePath = row.text(5),
        };
    }

    /// Convert to the JSON shape; the publication name becomes `title` and
    /// `createdAt` keeps its default empty value.
    fn toJson(self: Pub) SearchResultJson {
        return .{
            .type = "publication",
            .uri = self.uri,
            .did = self.did,
            .title = self.name,
            .snippet = self.snippet,
            .rkey = self.rkey,
            .basePath = self.basePath,
        };
    }
};

/// Full-text publication matches, ordered by FTS rank.
const PubSearch = zql.Query(
    \\SELECT f.uri, p.did, p.name,
    \\  snippet(publications_fts, 2, '', '', '...', 32) as snippet,
    \\  p.rkey, p.base_path
    \\FROM publications_fts f
    \\JOIN publications p ON f.uri = p.uri
    \\WHERE publications_fts MATCH :query
    \\ORDER BY rank LIMIT 10
);

/// Search documents (and, when no tag filter is given, publications) and
/// return the results as a JSON array.
///
/// - `query`: raw user query; normalized through `buildFtsQuery`.
/// - `tag_filter`: when set, only documents carrying that tag are returned
///   and publications are skipped.
///
/// Caller owns the returned slice (allocated with `alloc`).
/// Returns `error.NotInitialized` when no database client is available.
/// A failed database query is treated as "no results" for that section
/// rather than failing the whole search.
pub fn search(alloc: Allocator, query: []const u8, tag_filter: ?[]const u8) ![]const u8 {
    const c = db.getClient() orelse return error.NotInitialized;

    // The FTS query is only needed for the duration of this call. Freeing is
    // safe even when buildFtsQuery returned its static empty string, because
    // Allocator.free ignores zero-length slices.
    const fts_query = try buildFtsQuery(alloc, query);
    defer alloc.free(fts_query);

    var output: std.Io.Writer.Allocating = .init(alloc);
    errdefer output.deinit();

    var jw: json.Stringify = .{ .writer = &output.writer };
    try jw.beginArray();

    // Decide on the normalized query, not the raw one: a query made only of
    // spaces/dots has no search terms and must behave like an empty query.
    const has_terms = fts_query.len != 0;

    // search documents
    var doc_result = if (!has_terms and tag_filter != null)
        c.query(DocsByTag.positional, DocsByTag.bind(.{ .tag = tag_filter.? })) catch null
    else if (tag_filter) |tag|
        c.query(DocsByFtsAndTag.positional, DocsByFtsAndTag.bind(.{ .query = fts_query, .tag = tag })) catch null
    else
        c.query(DocsByFts.positional, DocsByFts.bind(.{ .query = fts_query })) catch null;

    if (doc_result) |*res| {
        defer res.deinit();
        for (res.rows) |row| try jw.write(Doc.fromRow(row).toJson());
    }

    // publications are excluded when filtering by tag (tags only apply to documents)
    if (tag_filter == null) {
        var pub_result = c.query(
            PubSearch.positional,
            PubSearch.bind(.{ .query = fts_query }),
        ) catch null;

        if (pub_result) |*res| {
            defer res.deinit();
            for (res.rows) |row| try jw.write(Pub.fromRow(row).toJson());
        }
    }

    try jw.endArray();
    return try output.toOwnedSlice();
}

/// Find documents similar to a given document using vector similarity.
/// Uses brute-force cosine distance with caching (cache is keyed on the
/// number of embedded documents, so it is invalidated when that count changes).
///
/// Returns a JSON array; caller owns the returned slice (allocated with `alloc`).
/// Returns `error.NotInitialized` without a database client and
/// `error.QueryFailed` when the embedded-document count cannot be read.
/// A failure of the similarity query itself yields "[]" (and is not cached).
pub fn findSimilar(alloc: Allocator, uri: []const u8, limit: usize) ![]const u8 {
    const c = db.getClient() orelse return error.NotInitialized;

    // get current doc count (for cache invalidation)
    const doc_count = getEmbeddedDocCount(c) orelse return error.QueryFailed;

    // check cache
    if (try getCachedSimilar(alloc, c, uri, doc_count)) |cached| {
        stats.recordCacheHit();
        return cached;
    }
    stats.recordCacheMiss();

    // cache miss - compute similarity
    var output: std.Io.Writer.Allocating = .init(alloc);
    errdefer output.deinit();

    // 20 bytes hold the decimal form of any 64-bit usize, so formatting
    // cannot run out of space.
    var limit_buf: [20]u8 = undefined;
    const limit_str = std.fmt.bufPrint(&limit_buf, "{d}", .{limit}) catch unreachable;

    // brute-force cosine similarity search (no vector index needed)
    var res = c.query(
        \\SELECT d2.uri, d2.did, d2.title, '' as snippet,
        \\  d2.created_at, d2.rkey, COALESCE(p.base_path, '') as base_path,
        \\  CASE WHEN d2.publication_uri != '' THEN 1 ELSE 0 END as has_publication
        \\FROM documents d1, documents d2
        \\LEFT JOIN publications p ON d2.publication_uri = p.uri
        \\WHERE d1.uri = ?
        \\  AND d2.uri != d1.uri
        \\  AND d1.embedding IS NOT NULL
        \\  AND d2.embedding IS NOT NULL
        \\ORDER BY vector_distance_cos(d1.embedding, d2.embedding)
        \\LIMIT ?
    , &.{ uri, limit_str }) catch {
        try output.writer.writeAll("[]");
        return try output.toOwnedSlice();
    };
    defer res.deinit();

    var jw: json.Stringify = .{ .writer = &output.writer };
    try jw.beginArray();
    for (res.rows) |row| try jw.write(Doc.fromRow(row).toJson());
    try jw.endArray();

    const results = try output.toOwnedSlice();

    // cache the results (best effort; a failed write only costs a recompute)
    cacheSimilarResults(c, uri, results, doc_count);

    return results;
}

/// Number of documents that have an embedding, or null if the query fails
/// or returns no rows.
fn getEmbeddedDocCount(c: *db.Client) ?i64 {
    var res = c.query("SELECT COUNT(*) FROM documents WHERE embedding IS NOT NULL", &.{}) catch return null;
    defer res.deinit();
    if (res.rows.len == 0) return null;
    return res.rows[0].int(0);
}

/// Look up cached similarity results for `uri` computed at `current_doc_count`.
///
/// Returns a copy owned by the caller (allocated with `alloc`), or null on a
/// cache miss or when the lookup query fails. Out-of-memory while copying is
/// propagated instead of being reported as a miss.
fn getCachedSimilar(alloc: Allocator, c: *db.Client, uri: []const u8, current_doc_count: i64) Allocator.Error!?[]const u8 {
    // 20 bytes fit any i64 in decimal, including the sign.
    var count_buf: [20]u8 = undefined;
    const count_str = std.fmt.bufPrint(&count_buf, "{d}", .{current_doc_count}) catch return null;

    var res = c.query(
        "SELECT results FROM similarity_cache WHERE source_uri = ? AND doc_count = ?",
        &.{ uri, count_str },
    ) catch return null;
    defer res.deinit();

    if (res.rows.len == 0) return null;
    // Copy out of the result set: `res` is released when this function returns.
    return try alloc.dupe(u8, res.rows[0].text(0));
}

/// Store similarity results for `uri` in the cache (best effort).
/// Failures never reach the caller; a failed write is logged as a warning.
fn cacheSimilarResults(c: *db.Client, uri: []const u8, results: []const u8, doc_count: i64) void {
    var count_buf: [20]u8 = undefined;
    const count_str = std.fmt.bufPrint(&count_buf, "{d}", .{doc_count}) catch return;

    var ts_buf: [20]u8 = undefined;
    const ts_str = std.fmt.bufPrint(&ts_buf, "{d}", .{std.time.timestamp()}) catch return;

    c.exec(
        "INSERT OR REPLACE INTO similarity_cache (source_uri, results, doc_count, computed_at) VALUES (?, ?, ?, ?)",
        &.{ uri, results, count_str, ts_str },
    ) catch |err| std.log.warn("similarity cache write failed: {s}", .{@errorName(err)});
}

/// Characters that separate search terms in a raw query.
const word_separators = " .";

/// Joiner placed between terms of a multi-word query.
const or_joiner = " OR ";

/// Build FTS5 query with OR between terms: "cat dog" -> "cat OR dog*"
/// Uses OR for better recall with BM25 ranking (more matches = higher score).
/// The last term always gets a prefix wildcard; a single word becomes "word*".
/// Quoted queries are passed through as phrase matches: "exact phrase" -> "exact phrase"
///
/// Only spaces are trimmed from the ends; spaces and dots separate terms.
/// Returns an empty slice when the query contains no terms.
///
/// Ownership: a non-empty result is allocated with `alloc` and owned by the
/// caller. The empty result is a static string; passing it to `alloc.free`
/// is still fine because freeing a zero-length slice is a no-op.
///
/// NOTE(review): terms are copied verbatim, so FTS5 syntax characters inside
/// a term (e.g. an unbalanced quote) reach the MATCH expression unescaped —
/// confirm callers are happy with such queries failing to match.
pub fn buildFtsQuery(alloc: Allocator, query: []const u8) ![]const u8 {
    // normalize: trim surrounding spaces
    const trimmed = std.mem.trim(u8, query, " ");
    if (trimmed.len == 0) return "";

    // quoted phrase: pass through to FTS5 for exact phrase matching
    if (trimmed.len >= 2 and trimmed[0] == '"' and trimmed[trimmed.len - 1] == '"') {
        return try alloc.dupe(u8, trimmed);
    }

    // first pass: count terms and their total length to size the buffer exactly
    var word_count: usize = 0;
    var total_word_len: usize = 0;
    var sizing = std.mem.tokenizeAny(u8, trimmed, word_separators);
    while (sizing.next()) |word| {
        word_count += 1;
        total_word_len += word.len;
    }
    if (word_count == 0) return "";

    // size = word chars + (n-1) joiners + 1 for the trailing "*"
    const buf = try alloc.alloc(u8, total_word_len + (word_count - 1) * or_joiner.len + 1);

    // second pass: write terms separated by " OR "
    var pos: usize = 0;
    var words = std.mem.tokenizeAny(u8, trimmed, word_separators);
    while (words.next()) |word| {
        // tokens are never empty, so pos != 0 means "not the first term"
        if (pos != 0) {
            @memcpy(buf[pos..][0..or_joiner.len], or_joiner);
            pos += or_joiner.len;
        }
        @memcpy(buf[pos..][0..word.len], word);
        pos += word.len;
    }
    buf[pos] = '*';
    return buf;
}