search for standard sites pub-search.waow.tech
search zig blog atproto

fix: stop filtering bridgy-fed standard.site documents

The blanket bridgy-fed filter was added because we couldn't build links
for those documents (their base_path was empty). Now that the indexer
resolves base_path from HTTP site URLs in publication_uri, bridgy-fed
documents can get working links like any other standard.site content.

Removes isBridgyFed, resolvePdsIsBridgy, and PdsCache (no longer needed).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+5 -88
+5 -88
backend/src/ingest/tap.zig
··· 121 121 } 122 122 }; 123 123 124 - /// Cache of DID → is_bridgy_fed results from PLC directory lookups. 125 - /// Single-threaded (owned by processWorker), no sync needed. 126 - const PdsCache = std.StringHashMap(bool); 127 - 128 124 fn processWorker(queue: *ProcessQueue) void { 129 125 logfire.info("tap: process worker started", .{}); 130 - var pds_cache = PdsCache.init(queue.allocator); 131 - defer { 132 - var it = pds_cache.iterator(); 133 - while (it.next()) |entry| queue.allocator.free(entry.key_ptr.*); 134 - pds_cache.deinit(); 135 - } 136 126 while (queue.pop()) |data| { 137 127 defer queue.allocator.free(data); 138 - processMessage(queue.allocator, data, &pds_cache) catch |err| { 128 + processMessage(queue.allocator, data) catch |err| { 139 129 logfire.err("message processing error: {}", .{err}); 140 130 }; 141 131 queue.mutex.lock(); ··· 302 292 base_path: ?[]const u8 = null, 303 293 }; 304 294 305 - /// Check if a DID is hosted on brid.gy (bridged Mastodon/ActivityPub/Ghost content). 306 - /// Results are cached for the lifetime of the worker thread. 307 - /// Fails open: on HTTP/parse errors, returns false (allow through). 308 - fn isBridgyFed(allocator: Allocator, did: []const u8, cache: *PdsCache) bool { 309 - if (cache.get(did)) |is_bridgy| return is_bridgy; 310 - 311 - const result = resolvePdsIsBridgy(allocator, did); 312 - // cache with duped key (cache outlives the parsed message) 313 - const key = allocator.dupe(u8, did) catch return false; 314 - cache.put(key, result) catch { 315 - allocator.free(key); 316 - return result; 317 - }; 318 - if (result) { 319 - logfire.info("tap: blocked bridgy fed DID: {s}", .{did}); 320 - } 321 - return result; 322 - } 323 - 324 - /// HTTP GET plc.directory/{did}, check if PDS serviceEndpoint contains "brid.gy". 
325 - fn resolvePdsIsBridgy(allocator: Allocator, did: []const u8) bool { 326 - const http = std.http; 327 - 328 - var url_buf: [256]u8 = undefined; 329 - const url = std.fmt.bufPrint(&url_buf, "https://plc.directory/{s}", .{did}) catch return false; 330 - 331 - var client: http.Client = .{ .allocator = allocator }; 332 - defer client.deinit(); 333 - 334 - var response_body: std.Io.Writer.Allocating = .init(allocator); 335 - defer response_body.deinit(); 336 - 337 - const res = client.fetch(.{ 338 - .location = .{ .url = url }, 339 - .method = .GET, 340 - .response_writer = &response_body.writer, 341 - }) catch |err| { 342 - logfire.warn("tap: PLC lookup failed for {s}: {}", .{ did, err }); 343 - return false; 344 - }; 345 - 346 - if (res.status != .ok) { 347 - logfire.warn("tap: PLC lookup {s} returned {}", .{ did, res.status }); 348 - return false; 349 - } 350 - 351 - const body = response_body.toOwnedSlice() catch return false; 352 - defer allocator.free(body); 353 - 354 - const parsed = json.parseFromSlice(json.Value, allocator, body, .{}) catch return false; 355 - defer parsed.deinit(); 356 - 357 - // look for service[].serviceEndpoint where type == "AtprotoPersonalDataServer" 358 - const services = parsed.value.object.get("service") orelse return false; 359 - if (services != .array) return false; 360 - 361 - for (services.array.items) |svc| { 362 - if (svc != .object) continue; 363 - const svc_type = svc.object.get("type") orelse continue; 364 - if (svc_type != .string) continue; 365 - if (!mem.eql(u8, svc_type.string, "AtprotoPersonalDataServer")) continue; 366 - const endpoint = svc.object.get("serviceEndpoint") orelse continue; 367 - if (endpoint != .string) continue; 368 - if (mem.indexOf(u8, endpoint.string, "brid.gy") != null) return true; 369 - } 370 - 371 - return false; 372 - } 373 - 374 - fn processMessage(allocator: Allocator, payload: []const u8, pds_cache: *PdsCache) !void { 295 + fn processMessage(allocator: Allocator, payload: []const u8) !void 
{ 375 296 const parsed = json.parseFromSlice(json.Value, allocator, payload, .{}) catch { 376 297 logfire.err("tap: JSON parse failed, first 100 bytes: {s}", .{payload[0..@min(payload.len, 100)]}); 377 298 return; ··· 398 319 return; 399 320 }; 400 321 401 - // skip bridgy fed content (bridged Mastodon/ActivityPub/Ghost posts) 402 - if (isDocumentCollection(rec.collection) or isPublicationCollection(rec.collection)) { 403 - if (isBridgyFed(allocator, did.raw, pds_cache)) { 404 - logfire.span("tap.dropped", .{ .reason = "bridgy_fed", .collection = rec.collection }).end(); 405 - return; 406 - } 407 - } 322 + // note: bridgy fed content is no longer filtered — the indexer's HTTP site URL 323 + // fallback resolves base_path from the publication's "site" field, so we can 324 + // build working links for bridged standard.site documents. 408 325 409 326 // build AT-URI string (no allocation - uses stack buffer) 410 327 var uri_buf: [256]u8 = undefined;