search for standard sites
pub-search.waow.tech
search
zig
blog
atproto
1const std = @import("std");
2const json = std.json;
3const Allocator = std.mem.Allocator;
4const zql = @import("zql");
5const db = @import("db/mod.zig");
6const stats = @import("stats.zig");
7
// JSON output type for search results
// Serialized with json.Stringify; field names become the JSON object keys.
const SearchResultJson = struct {
    /// "article" or "looseleaf" (set by Doc.toJson) or "publication" (Pub.toJson)
    type: []const u8,
    /// record URI (presumably an AT-URI given the atproto context — confirm)
    uri: []const u8,
    /// DID of the owning repo/author
    did: []const u8,
    title: []const u8,
    /// FTS snippet or "" for queries that don't compute one
    snippet: []const u8,
    /// defaults to "" — publication results never set it (see Pub.toJson)
    createdAt: []const u8 = "",
    rkey: []const u8,
    basePath: []const u8,
};
19
/// Document search result (internal)
const Doc = struct {
    uri: []const u8,
    did: []const u8,
    title: []const u8,
    snippet: []const u8,
    createdAt: []const u8,
    rkey: []const u8,
    basePath: []const u8,
    hasPublication: bool,

    /// Build a Doc from one result row. Column order is fixed by every
    /// document query in this file: uri, did, title, snippet, created_at,
    /// rkey, base_path, has_publication (0/1).
    fn fromRow(row: db.Row) Doc {
        return Doc{
            .uri = row.text(0),
            .did = row.text(1),
            .title = row.text(2),
            .snippet = row.text(3),
            .createdAt = row.text(4),
            .rkey = row.text(5),
            .basePath = row.text(6),
            .hasPublication = row.int(7) != 0,
        };
    }

    /// Convert to the public JSON shape. Documents belonging to a
    /// publication are labeled "article", standalone ones "looseleaf".
    fn toJson(self: Doc) SearchResultJson {
        const kind: []const u8 = if (self.hasPublication) "article" else "looseleaf";
        return SearchResultJson{
            .type = kind,
            .uri = self.uri,
            .did = self.did,
            .title = self.title,
            .snippet = self.snippet,
            .createdAt = self.createdAt,
            .rkey = self.rkey,
            .basePath = self.basePath,
        };
    }
};
57
/// Tag-only browse: newest 40 documents carrying :tag (no FTS match).
/// base_path is COALESCE'd because the LEFT JOIN yields NULL for documents
/// without a publication — matches the findSimilar query's handling.
const DocsByTag = zql.Query(
    \\SELECT d.uri, d.did, d.title, '' as snippet,
    \\ d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path,
    \\ CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
    \\FROM documents d
    \\LEFT JOIN publications p ON d.publication_uri = p.uri
    \\JOIN document_tags dt ON d.uri = dt.document_uri
    \\WHERE dt.tag = :tag
    \\ORDER BY d.created_at DESC LIMIT 40
);
68
/// Full-text search restricted to documents carrying :tag; top 40 by rank.
/// base_path is COALESCE'd: the LEFT JOIN yields NULL for documents without
/// a publication — matches the findSimilar query's handling.
const DocsByFtsAndTag = zql.Query(
    \\SELECT f.uri, d.did, d.title,
    \\ snippet(documents_fts, 2, '', '', '...', 32) as snippet,
    \\ d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path,
    \\ CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
    \\FROM documents_fts f
    \\JOIN documents d ON f.uri = d.uri
    \\LEFT JOIN publications p ON d.publication_uri = p.uri
    \\JOIN document_tags dt ON d.uri = dt.document_uri
    \\WHERE documents_fts MATCH :query AND dt.tag = :tag
    \\ORDER BY rank LIMIT 40
);
81
/// Full-text search over all documents; top 40 by rank.
/// base_path is COALESCE'd: the LEFT JOIN yields NULL for documents without
/// a publication — matches the findSimilar query's handling.
const DocsByFts = zql.Query(
    \\SELECT f.uri, d.did, d.title,
    \\ snippet(documents_fts, 2, '', '', '...', 32) as snippet,
    \\ d.created_at, d.rkey, COALESCE(p.base_path, '') as base_path,
    \\ CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
    \\FROM documents_fts f
    \\JOIN documents d ON f.uri = d.uri
    \\LEFT JOIN publications p ON d.publication_uri = p.uri
    \\WHERE documents_fts MATCH :query
    \\ORDER BY rank LIMIT 40
);
93
/// Publication search result (internal)
const Pub = struct {
    uri: []const u8,
    did: []const u8,
    name: []const u8,
    snippet: []const u8,
    rkey: []const u8,
    basePath: []const u8,

    /// Build a Pub from one PubSearch row. Column order: uri, did, name,
    /// snippet, rkey, base_path.
    fn fromRow(row: db.Row) Pub {
        return Pub{
            .uri = row.text(0),
            .did = row.text(1),
            .name = row.text(2),
            .snippet = row.text(3),
            .rkey = row.text(4),
            .basePath = row.text(5),
        };
    }

    /// Convert to the public JSON shape; publications have no createdAt,
    /// so SearchResultJson's "" default applies.
    fn toJson(self: Pub) SearchResultJson {
        return SearchResultJson{
            .type = "publication",
            .uri = self.uri,
            .did = self.did,
            .title = self.name,
            .snippet = self.snippet,
            .rkey = self.rkey,
            .basePath = self.basePath,
        };
    }
};
126
/// Full-text search over publications; top 10 by FTS5 rank.
/// Inner JOIN (unlike the document queries), so base_path comes from a
/// matching publications row — assumed non-NULL in schema; confirm.
const PubSearch = zql.Query(
    \\SELECT f.uri, p.did, p.name,
    \\ snippet(publications_fts, 2, '', '', '...', 32) as snippet,
    \\ p.rkey, p.base_path
    \\FROM publications_fts f
    \\JOIN publications p ON f.uri = p.uri
    \\WHERE publications_fts MATCH :query
    \\ORDER BY rank LIMIT 10
);
136
/// Run a combined document + publication search and return a JSON array.
/// Caller owns the returned slice. Query failures degrade to missing
/// results rather than errors (best-effort, matching the `catch null`s).
/// NOTE(review): the buildFtsQuery result is never freed here — presumably
/// `alloc` is an arena; confirm at the call site.
pub fn search(alloc: Allocator, query: []const u8, tag_filter: ?[]const u8) ![]const u8 {
    const client = db.getClient() orelse return error.NotInitialized;

    var out: std.Io.Writer.Allocating = .init(alloc);
    errdefer out.deinit();

    var stringify: json.Stringify = .{ .writer = &out.writer };
    try stringify.beginArray();

    const fts = try buildFtsQuery(alloc, query);

    // pick the document query: tag-only browse, tag-scoped FTS, or plain FTS
    var docs = blk: {
        if (tag_filter) |tag| {
            if (query.len == 0) {
                break :blk client.query(DocsByTag.positional, DocsByTag.bind(.{ .tag = tag })) catch null;
            }
            break :blk client.query(DocsByFtsAndTag.positional, DocsByFtsAndTag.bind(.{ .query = fts, .tag = tag })) catch null;
        }
        break :blk client.query(DocsByFts.positional, DocsByFts.bind(.{ .query = fts })) catch null;
    };

    if (docs) |*result| {
        defer result.deinit();
        for (result.rows) |row| try stringify.write(Doc.fromRow(row).toJson());
    }

    // publications are excluded when filtering by tag (tags only apply to documents)
    if (tag_filter == null) {
        var pubs = client.query(
            PubSearch.positional,
            PubSearch.bind(.{ .query = fts }),
        ) catch null;

        if (pubs) |*result| {
            defer result.deinit();
            for (result.rows) |row| try stringify.write(Pub.fromRow(row).toJson());
        }
    }

    try stringify.endArray();
    return try out.toOwnedSlice();
}
177
/// Find documents similar to a given document using vector similarity
/// Uses brute-force cosine distance with caching (cache invalidated when doc count changes)
/// Returns a JSON array; caller owns the slice. Query failures degrade to "[]".
pub fn findSimilar(alloc: Allocator, uri: []const u8, limit: usize) ![]const u8 {
    const c = db.getClient() orelse return error.NotInitialized;

    // get current doc count (for cache invalidation)
    const doc_count = getEmbeddedDocCount(c) orelse return error.QueryFailed;

    // check cache
    if (getCachedSimilar(alloc, c, uri, doc_count)) |cached| {
        stats.recordCacheHit();
        return cached;
    }
    stats.recordCacheMiss();

    // cache miss - compute similarity
    var output: std.Io.Writer.Allocating = .init(alloc);
    errdefer output.deinit();

    // 20 bytes fits any u64 in decimal. (Fix: the previous [8]u8 buffer made
    // bufPrint fail for limit >= 100_000_000, silently clamping to "5".)
    var limit_buf: [20]u8 = undefined;
    const limit_str = std.fmt.bufPrint(&limit_buf, "{d}", .{limit}) catch "5";

    // brute-force cosine similarity search (no vector index needed)
    // NOTE(review): vector_distance_cos is a DB-side function (libsql vector
    // extension, presumably) — if unavailable the catch below returns "[]".
    var res = c.query(
        \\SELECT d2.uri, d2.did, d2.title, '' as snippet,
        \\ d2.created_at, d2.rkey, COALESCE(p.base_path, '') as base_path,
        \\ CASE WHEN d2.publication_uri != '' THEN 1 ELSE 0 END as has_publication
        \\FROM documents d1, documents d2
        \\LEFT JOIN publications p ON d2.publication_uri = p.uri
        \\WHERE d1.uri = ?
        \\ AND d2.uri != d1.uri
        \\ AND d1.embedding IS NOT NULL
        \\ AND d2.embedding IS NOT NULL
        \\ORDER BY vector_distance_cos(d1.embedding, d2.embedding)
        \\LIMIT ?
    , &.{ uri, limit_str }) catch {
        try output.writer.writeAll("[]");
        return try output.toOwnedSlice();
    };
    defer res.deinit();

    var jw: json.Stringify = .{ .writer = &output.writer };
    try jw.beginArray();
    for (res.rows) |row| try jw.write(Doc.fromRow(row).toJson());
    try jw.endArray();

    const results = try output.toOwnedSlice();

    // cache the results (fire and forget)
    cacheSimilarResults(c, uri, results, doc_count);

    return results;
}
231
/// Count documents that have an embedding; null on query failure or an
/// empty result set. Used by findSimilar as the cache-invalidation key.
fn getEmbeddedDocCount(c: *db.Client) ?i64 {
    var result = c.query("SELECT COUNT(*) FROM documents WHERE embedding IS NOT NULL", &.{}) catch return null;
    defer result.deinit();
    return if (result.rows.len > 0) result.rows[0].int(0) else null;
}
238
/// Look up a cached similarity result for `uri`, valid only while the
/// embedded-doc count matches. Returns a caller-owned copy, or null on
/// miss, query failure, or OOM (best-effort by design).
fn getCachedSimilar(alloc: Allocator, c: *db.Client, uri: []const u8, current_doc_count: i64) ?[]const u8 {
    var num_buf: [20]u8 = undefined;
    const count = std.fmt.bufPrint(&num_buf, "{d}", .{current_doc_count}) catch return null;

    var result = c.query(
        "SELECT results FROM similarity_cache WHERE source_uri = ? AND doc_count = ?",
        &.{ uri, count },
    ) catch return null;
    defer result.deinit();

    if (result.rows.len > 0) {
        return alloc.dupe(u8, result.rows[0].text(0)) catch null;
    }
    return null;
}
252
/// Best-effort upsert of the similarity cache row for `uri`.
/// Never returns an error so callers can fire-and-forget; failures are
/// logged at debug level instead of being silently dropped.
fn cacheSimilarResults(c: *db.Client, uri: []const u8, results: []const u8, doc_count: i64) void {
    var count_buf: [20]u8 = undefined;
    const count_str = std.fmt.bufPrint(&count_buf, "{d}", .{doc_count}) catch return;

    var ts_buf: [20]u8 = undefined;
    const ts_str = std.fmt.bufPrint(&ts_buf, "{d}", .{std.time.timestamp()}) catch return;

    c.exec(
        "INSERT OR REPLACE INTO similarity_cache (source_uri, results, doc_count, computed_at) VALUES (?, ?, ?, ?)",
        &.{ uri, results, count_str, ts_str },
    ) catch |err| {
        // fix: was a bare `catch {}` — a persistently failing cache write was invisible
        std.log.debug("similarity cache write failed for {s}: {s}", .{ uri, @errorName(err) });
    };
}
265
/// Build FTS5 query with OR between terms: "cat dog" -> "cat OR dog*"
/// Uses OR for better recall with BM25 ranking (more matches = higher score)
/// Quoted queries are passed through as phrase matches: "exact phrase" -> "exact phrase"
/// Returns a static "" for blank/separator-only input; every other result
/// is allocated and owned by the caller.
pub fn buildFtsQuery(alloc: Allocator, query: []const u8) ![]const u8 {
    // normalize: strip surrounding spaces (only ' ', same as the original contract)
    const trimmed = std.mem.trim(u8, query, " ");
    if (trimmed.len == 0) return "";

    // quoted phrase: pass through to FTS5 for exact phrase matching
    if (trimmed.len >= 2 and trimmed[0] == '"' and trimmed[trimmed.len - 1] == '"') {
        return try alloc.dupe(u8, trimmed);
    }

    // pass 1: measure. A "word" is a maximal run of chars that are
    // neither ' ' nor '.' — exactly what tokenizeAny yields.
    var word_count: usize = 0;
    var char_count: usize = 0;
    var it = std.mem.tokenizeAny(u8, trimmed, " .");
    while (it.next()) |word| {
        word_count += 1;
        char_count += word.len;
    }
    if (word_count == 0) return ""; // input was all separators

    // pass 2: emit "<w1> OR <w2> OR ... <wN>*" into an exactly-sized buffer:
    // word chars + 4 bytes of " OR " between words + 1 trailing '*'
    const out = try alloc.alloc(u8, char_count + (word_count - 1) * 4 + 1);
    var pos: usize = 0;
    var first = true;
    it.reset();
    while (it.next()) |word| {
        if (!first) {
            @memcpy(out[pos..][0..4], " OR ");
            pos += 4;
        }
        @memcpy(out[pos..][0..word.len], word);
        pos += word.len;
        first = false;
    }
    out[pos] = '*'; // prefix wildcard on the final term only
    return out;
}