backend/src/search.zig at multi-platform-schema

zzstoatzz.io / leaflet-search
fork atom
search for standard sites pub-search.waow.tech
search zig blog atproto
fork atom
leaflet-search / backend / src / search.zig
at multi-platform-schema 346 lines 11 kB view raw
wrap content
zzstoatzz.io add similarity cache, cache stats, loading indicator, and planning doc 2mo ago
e264ee4e
  1const std = @import("std");
  2const json = std.json;
  3const Allocator = std.mem.Allocator;
  4const zql = @import("zql");
  5const db = @import("db/mod.zig");
  6const stats = @import("stats.zig");
  7
  8// JSON output type for search results
  9const SearchResultJson = struct {
 10    type: []const u8,
 11    uri: []const u8,
 12    did: []const u8,
 13    title: []const u8,
 14    snippet: []const u8,
 15    createdAt: []const u8 = "",
 16    rkey: []const u8,
 17    basePath: []const u8,
 18};
 19
 20/// Document search result (internal)
 21const Doc = struct {
 22    uri: []const u8,
 23    did: []const u8,
 24    title: []const u8,
 25    snippet: []const u8,
 26    createdAt: []const u8,
 27    rkey: []const u8,
 28    basePath: []const u8,
 29    hasPublication: bool,
 30
 31    fn fromRow(row: db.Row) Doc {
 32        return .{
 33            .uri = row.text(0),
 34            .did = row.text(1),
 35            .title = row.text(2),
 36            .snippet = row.text(3),
 37            .createdAt = row.text(4),
 38            .rkey = row.text(5),
 39            .basePath = row.text(6),
 40            .hasPublication = row.int(7) != 0,
 41        };
 42    }
 43
 44    fn toJson(self: Doc) SearchResultJson {
 45        return .{
 46            .type = if (self.hasPublication) "article" else "looseleaf",
 47            .uri = self.uri,
 48            .did = self.did,
 49            .title = self.title,
 50            .snippet = self.snippet,
 51            .createdAt = self.createdAt,
 52            .rkey = self.rkey,
 53            .basePath = self.basePath,
 54        };
 55    }
 56};
 57
 58const DocsByTag = zql.Query(
 59    \\SELECT d.uri, d.did, d.title, '' as snippet,
 60    \\  d.created_at, d.rkey, p.base_path,
 61    \\  CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
 62    \\FROM documents d
 63    \\LEFT JOIN publications p ON d.publication_uri = p.uri
 64    \\JOIN document_tags dt ON d.uri = dt.document_uri
 65    \\WHERE dt.tag = :tag
 66    \\ORDER BY d.created_at DESC LIMIT 40
 67);
 68
 69const DocsByFtsAndTag = zql.Query(
 70    \\SELECT f.uri, d.did, d.title,
 71    \\  snippet(documents_fts, 2, '', '', '...', 32) as snippet,
 72    \\  d.created_at, d.rkey, p.base_path,
 73    \\  CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
 74    \\FROM documents_fts f
 75    \\JOIN documents d ON f.uri = d.uri
 76    \\LEFT JOIN publications p ON d.publication_uri = p.uri
 77    \\JOIN document_tags dt ON d.uri = dt.document_uri
 78    \\WHERE documents_fts MATCH :query AND dt.tag = :tag
 79    \\ORDER BY rank LIMIT 40
 80);
 81
 82const DocsByFts = zql.Query(
 83    \\SELECT f.uri, d.did, d.title,
 84    \\  snippet(documents_fts, 2, '', '', '...', 32) as snippet,
 85    \\  d.created_at, d.rkey, p.base_path,
 86    \\  CASE WHEN d.publication_uri != '' THEN 1 ELSE 0 END as has_publication
 87    \\FROM documents_fts f
 88    \\JOIN documents d ON f.uri = d.uri
 89    \\LEFT JOIN publications p ON d.publication_uri = p.uri
 90    \\WHERE documents_fts MATCH :query
 91    \\ORDER BY rank LIMIT 40
 92);
 93
 94/// Publication search result (internal)
 95const Pub = struct {
 96    uri: []const u8,
 97    did: []const u8,
 98    name: []const u8,
 99    snippet: []const u8,
100    rkey: []const u8,
101    basePath: []const u8,
102
103    fn fromRow(row: db.Row) Pub {
104        return .{
105            .uri = row.text(0),
106            .did = row.text(1),
107            .name = row.text(2),
108            .snippet = row.text(3),
109            .rkey = row.text(4),
110            .basePath = row.text(5),
111        };
112    }
113
114    fn toJson(self: Pub) SearchResultJson {
115        return .{
116            .type = "publication",
117            .uri = self.uri,
118            .did = self.did,
119            .title = self.name,
120            .snippet = self.snippet,
121            .rkey = self.rkey,
122            .basePath = self.basePath,
123        };
124    }
125};
126
/// Publications matching the FTS5 expression `:query`, best rank first, at
/// most 10 rows. Column order must stay in sync with `Pub.fromRow`.
const PubSearch = zql.Query(
    \\SELECT f.uri, p.did, p.name,
    \\  snippet(publications_fts, 2, '', '', '...', 32) as snippet,
    \\  p.rkey, p.base_path
    \\FROM publications_fts f
    \\JOIN publications p ON f.uri = p.uri
    \\WHERE publications_fts MATCH :query
    \\ORDER BY rank LIMIT 10
);
136
/// Run a full-text search and return the hits as a JSON array.
///
/// `query` is turned into an FTS5 expression by `buildFtsQuery`. When
/// `tag_filter` is set, only documents carrying that tag are returned and
/// publications are skipped (tags only apply to documents); a blank `query`
/// combined with a tag lists the newest documents for that tag.
///
/// A database query that fails contributes no rows rather than failing the
/// whole request. Returns `error.NotInitialized` when no database client is
/// available. Caller owns the returned slice.
pub fn search(alloc: Allocator, query: []const u8, tag_filter: ?[]const u8) ![]const u8 {
    const c = db.getClient() orelse return error.NotInitialized;

    const fts_query = try buildFtsQuery(alloc, query);
    // The expression is only needed while the queries run. `buildFtsQuery`
    // may hand back a static "" for blank input; freeing a zero-length slice
    // is a no-op, so this is safe for both cases.
    defer alloc.free(fts_query);

    var output: std.Io.Writer.Allocating = .init(alloc);
    errdefer output.deinit();

    var jw: json.Stringify = .{ .writer = &output.writer };
    try jw.beginArray();

    // Nothing to match and nothing to filter by: skip the database entirely.
    if (fts_query.len == 0 and tag_filter == null) {
        try jw.endArray();
        return try output.toOwnedSlice();
    }

    // search documents (best effort: a failed query yields no rows)
    var doc_result = if (tag_filter) |tag|
        (if (fts_query.len == 0)
            // blank query: list by tag only instead of sending an empty MATCH
            c.query(DocsByTag.positional, DocsByTag.bind(.{ .tag = tag })) catch null
        else
            c.query(DocsByFtsAndTag.positional, DocsByFtsAndTag.bind(.{ .query = fts_query, .tag = tag })) catch null)
    else
        c.query(DocsByFts.positional, DocsByFts.bind(.{ .query = fts_query })) catch null;

    if (doc_result) |*res| {
        defer res.deinit();
        for (res.rows) |row| try jw.write(Doc.fromRow(row).toJson());
    }

    // publications are excluded when filtering by tag (tags only apply to documents)
    if (tag_filter == null) {
        var pub_result = c.query(
            PubSearch.positional,
            PubSearch.bind(.{ .query = fts_query }),
        ) catch null;

        if (pub_result) |*res| {
            defer res.deinit();
            for (res.rows) |row| try jw.write(Pub.fromRow(row).toJson());
        }
    }

    try jw.endArray();
    return try output.toOwnedSlice();
}
177
/// Find documents similar to `uri` and return them as a JSON array.
///
/// Similarity is a brute-force cosine-distance scan over every document that
/// has an embedding (no vector index needed). Results are cached per source
/// URI together with the number of embedded documents at the time, so a cached
/// entry stops matching as soon as that count changes.
///
/// Returns `error.NotInitialized` when no database client is available and
/// `error.QueryFailed` when the embedded-document count cannot be read. A
/// failure of the similarity query itself is logged and reported as an empty
/// array. Caller owns the returned slice.
pub fn findSimilar(alloc: Allocator, uri: []const u8, limit: usize) ![]const u8 {
    const c = db.getClient() orelse return error.NotInitialized;

    // get current doc count (for cache invalidation)
    const doc_count = getEmbeddedDocCount(c) orelse return error.QueryFailed;

    // check cache
    // NOTE(review): the cache lookup does not take `limit` into account, so a
    // result computed for one limit is served for any other until the
    // embedded-document count changes. Fixing that needs a cache schema change.
    if (getCachedSimilar(alloc, c, uri, doc_count)) |cached| {
        stats.recordCacheHit();
        return cached;
    }
    stats.recordCacheMiss();

    // A 64-bit usize has at most 20 decimal digits, so formatting cannot run
    // out of space (an 8-byte buffer would silently fall back for large limits).
    var limit_buf: [20]u8 = undefined;
    const limit_str = std.fmt.bufPrint(&limit_buf, "{d}", .{limit}) catch unreachable;

    // cache miss - brute-force cosine similarity search
    var res = c.query(
        \\SELECT d2.uri, d2.did, d2.title, '' as snippet,
        \\  d2.created_at, d2.rkey, COALESCE(p.base_path, '') as base_path,
        \\  CASE WHEN d2.publication_uri != '' THEN 1 ELSE 0 END as has_publication
        \\FROM documents d1, documents d2
        \\LEFT JOIN publications p ON d2.publication_uri = p.uri
        \\WHERE d1.uri = ?
        \\  AND d2.uri != d1.uri
        \\  AND d1.embedding IS NOT NULL
        \\  AND d2.embedding IS NOT NULL
        \\ORDER BY vector_distance_cos(d1.embedding, d2.embedding)
        \\LIMIT ?
    , &.{ uri, limit_str }) catch |err| {
        // Best effort: report "no similar documents" and do not cache it.
        std.log.warn("findSimilar: similarity query failed: {s}", .{@errorName(err)});
        return try alloc.dupe(u8, "[]");
    };
    defer res.deinit();

    var output: std.Io.Writer.Allocating = .init(alloc);
    errdefer output.deinit();

    var jw: json.Stringify = .{ .writer = &output.writer };
    try jw.beginArray();
    for (res.rows) |row| try jw.write(Doc.fromRow(row).toJson());
    try jw.endArray();

    const results = try output.toOwnedSlice();

    // cache the results (fire and forget)
    cacheSimilarResults(c, uri, results, doc_count);

    return results;
}
231
/// Count the documents that have an embedding.
/// Returns null when the query fails or yields no row.
fn getEmbeddedDocCount(c: *db.Client) ?i64 {
    const sql = "SELECT COUNT(*) FROM documents WHERE embedding IS NOT NULL";

    var counted = c.query(sql, &.{}) catch return null;
    defer counted.deinit();

    if (counted.rows.len > 0) return counted.rows[0].int(0);
    return null;
}
238
/// Look up cached similarity results for `uri`.
///
/// Only an entry stored with the same embedded-document count is accepted.
/// Returns a copy allocated with `alloc` (caller owns it), or null on a miss,
/// a failed query, or a failed allocation.
fn getCachedSimilar(alloc: Allocator, c: *db.Client, uri: []const u8, current_doc_count: i64) ?[]const u8 {
    // the count is passed to the query as decimal text
    var digits: [20]u8 = undefined;
    const doc_count_param = std.fmt.bufPrint(&digits, "{d}", .{current_doc_count}) catch return null;

    const sql = "SELECT results FROM similarity_cache WHERE source_uri = ? AND doc_count = ?";
    var hit = c.query(sql, &.{ uri, doc_count_param }) catch return null;
    defer hit.deinit();

    if (hit.rows.len > 0) {
        // copy out of the result set before it is released
        return alloc.dupe(u8, hit.rows[0].text(0)) catch null;
    }
    return null;
}
252
/// Store similarity `results` for `uri`, replacing any previous entry.
///
/// The entry records the embedded-document count it was computed against and
/// the current Unix timestamp. Caching is best effort: a failure is logged and
/// otherwise ignored so it can never fail the request that produced the
/// results.
fn cacheSimilarResults(c: *db.Client, uri: []const u8, results: []const u8, doc_count: i64) void {
    // numeric values are passed to the statement as decimal text
    var count_buf: [20]u8 = undefined;
    const count_str = std.fmt.bufPrint(&count_buf, "{d}", .{doc_count}) catch return;

    var ts_buf: [20]u8 = undefined;
    const ts_str = std.fmt.bufPrint(&ts_buf, "{d}", .{std.time.timestamp()}) catch return;

    c.exec(
        "INSERT OR REPLACE INTO similarity_cache (source_uri, results, doc_count, computed_at) VALUES (?, ?, ?, ?)",
        &.{ uri, results, count_str, ts_str },
    ) catch |err| {
        std.log.warn("cacheSimilarResults: cache write failed: {s}", .{@errorName(err)});
    };
}
265
/// Separator written between the terms of a generated FTS5 expression.
const fts_or_separator = " OR ";

/// Words FTS5 parses as operators when they appear unquoted (upper case only).
const fts_operator_words = [_][]const u8{ "AND", "OR", "NOT" };

/// True for bytes FTS5 accepts inside an unquoted bareword: ASCII letters and
/// digits, underscore, and any byte of a non-ASCII (multi-byte UTF-8) character.
fn isFtsBarewordByte(c: u8) bool {
    return std.ascii.isAlphanumeric(c) or c == '_' or c >= 0x80;
}

/// True when `term` would be read as an FTS5 operator unless it is quoted.
fn isFtsOperatorWord(term: []const u8) bool {
    for (fts_operator_words) |word| {
        if (std.mem.eql(u8, term, word)) return true;
    }
    return false;
}

/// Copy `bytes` into `dest` at `pos` and return the position just past them.
fn appendFtsBytes(dest: []u8, pos: usize, bytes: []const u8) usize {
    @memcpy(dest[pos..][0..bytes.len], bytes);
    return pos + bytes.len;
}

/// Yields the maximal runs of bareword bytes in `text`; everything else
/// (spaces, dots, quotes, other punctuation) acts as a separator.
const FtsTermIterator = struct {
    text: []const u8,
    index: usize = 0,

    fn next(self: *FtsTermIterator) ?[]const u8 {
        while (self.index < self.text.len and !isFtsBarewordByte(self.text[self.index])) self.index += 1;
        if (self.index == self.text.len) return null;

        const start = self.index;
        while (self.index < self.text.len and isFtsBarewordByte(self.text[self.index])) self.index += 1;
        return self.text[start..self.index];
    }
};

/// Build an FTS5 query with OR between terms: "cat dog" -> "cat OR dog*".
///
/// OR is used for better recall with BM25 ranking (more matches = higher
/// score); the last term gets a prefix wildcard so partially typed words match.
///
/// - Surrounding whitespace is ignored; blank input yields "".
/// - A query wrapped in double quotes is passed through unchanged as a phrase
///   match: "\"exact phrase\"" -> "\"exact phrase\"".
/// - Any byte that cannot appear in an FTS5 bareword separates terms, so
///   punctuation never reaches the FTS5 parser: "waow.tech" -> "waow OR tech*",
///   "foo-bar" -> "foo OR bar*".
/// - The terms AND, OR and NOT are quoted so they are searched for instead of
///   being parsed as operators.
///
/// Non-empty results are allocated with `alloc` and owned by the caller. The
/// empty result is a static "" and must not be assumed to be heap memory
/// (passing it to `alloc.free` is harmless because zero-length frees are
/// no-ops).
pub fn buildFtsQuery(alloc: Allocator, query: []const u8) ![]const u8 {
    const trimmed = std.mem.trim(u8, query, &std.ascii.whitespace);
    if (trimmed.len == 0) return "";

    // quoted phrase: pass through to FTS5 for exact phrase matching
    if (trimmed.len >= 2 and trimmed[0] == '"' and trimmed[trimmed.len - 1] == '"') {
        return try alloc.dupe(u8, trimmed);
    }

    // first pass: count terms and the exact output size
    var term_count: usize = 0;
    var out_len: usize = 0;
    var terms: FtsTermIterator = .{ .text = trimmed };
    while (terms.next()) |term| {
        if (term_count > 0) out_len += fts_or_separator.len;
        out_len += term.len;
        if (isFtsOperatorWord(term)) out_len += 2; // surrounding quotes
        term_count += 1;
    }
    if (term_count == 0) return "";
    out_len += 1; // trailing '*'

    // second pass: write the terms joined by " OR ", prefix wildcard on the last
    const out = try alloc.alloc(u8, out_len);
    var pos: usize = 0;
    terms = .{ .text = trimmed };
    while (terms.next()) |term| {
        if (pos > 0) pos = appendFtsBytes(out, pos, fts_or_separator);
        const needs_quotes = isFtsOperatorWord(term);
        if (needs_quotes) pos = appendFtsBytes(out, pos, "\"");
        pos = appendFtsBytes(out, pos, term);
        if (needs_quotes) pos = appendFtsBytes(out, pos, "\"");
    }
    out[pos] = '*';
    std.debug.assert(pos + 1 == out_len);
    return out;
}