atproto relay implementation in zig zlay.waow.tech

fix: dedup migration queue to prevent unbounded memory growth

the migration queue had no dedup — unlike the resolve queue which uses
queued_set. every message from a DID with a host mismatch would dupe
the DID string from c_allocator and append to the queue. with thousands
of mismatched DIDs producing messages faster than the 4 resolver threads
could drain, the queue + duped strings grew without bound (~430 MiB/hr).

- add migration_pending set to dedup migration queue entries
- on confirmed migration: remove from pending (allow re-evaluation)
- on rejected migration: leave in pending (suppress re-queueing)
- evictKey (#identity events) clears pending for that DID
- add relay_validator_migration_pending prometheus metric
- convert validator cache and DID cache to proper LRU (from lru.zig)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+395 -176
+1
build.zig
··· 51 51 // tests 52 52 const test_step = b.step("test", "run unit tests"); 53 53 const test_files = .{ 54 + "src/lru.zig", 54 55 "src/api.zig", 55 56 "src/broadcaster.zig", 56 57 "src/validator.zig",
+7 -2
src/broadcaster.zig
··· 566 566 consumer_queue_depth: usize = 0, 567 567 }; 568 568 569 - pub fn formatPrometheusMetrics(stats: *const Stats, cache_entries: usize, migration_queue_len: usize, attribution: AttributionMetrics, data_dir: []const u8, buf: []u8) []const u8 { 569 + pub fn formatPrometheusMetrics(stats: *const Stats, cache_entries: usize, migration_queue_len: usize, migration_pending_count: u32, attribution: AttributionMetrics, data_dir: []const u8, buf: []u8) []const u8 { 570 570 const uptime: i64 = std.time.timestamp() - stats.start_time; 571 571 var fbs = std.io.fixedBufferStream(buf); 572 572 const w = fbs.writer(); ··· 617 617 \\# TYPE relay_validator_migration_queue gauge 618 618 \\relay_validator_migration_queue {d} 619 619 \\ 620 + \\# TYPE relay_validator_migration_pending gauge 621 + \\# HELP relay_validator_migration_pending DIDs suppressed from re-queueing migration checks 622 + \\relay_validator_migration_pending {d} 623 + \\ 620 624 \\# TYPE relay_validator_cache_evictions_total counter 621 625 \\relay_validator_cache_evictions_total {d} 622 626 \\ ··· 654 658 uptime, 655 659 cache_entries, 656 660 migration_queue_len, 661 + migration_pending_count, 657 662 stats.cache_evictions.load(.acquire), 658 663 attribution.history_entries, 659 664 attribution.evtbuf_entries, ··· 981 986 stats.cache_misses.store(100, .release); 982 987 983 988 var buf: [16384]u8 = undefined; 984 - const output = formatPrometheusMetrics(&stats, 42, 3, .{}, "/tmp", &buf); 989 + const output = formatPrometheusMetrics(&stats, 42, 3, 7, .{}, "/tmp", &buf); 985 990 986 991 try std.testing.expect(std.mem.indexOf(u8, output, "relay_frames_received_total 10000") != null); 987 992 try std.testing.expect(std.mem.indexOf(u8, output, "relay_frames_broadcast_total 9000") != null);
+8 -37
src/event_log.zig
··· 12 12 13 13 const std = @import("std"); 14 14 const pg = @import("pg"); 15 + const lru = @import("lru.zig"); 15 16 16 17 const Allocator = std.mem.Allocator; 17 18 const log = std.log.scoped(.relay); ··· 92 93 retention_hours: u64 = 72, // 3 days 93 94 94 95 // DID → UID cache (matches indigo's bidirectional ARC cache) 95 - did_cache: std.StringHashMapUnmanaged(u64) = .{}, 96 - did_cache_mutex: std.Thread.Mutex = .{}, 97 - max_did_cache_size: u32 = 500_000, 96 + did_cache: lru.LruCache(u64), 98 97 99 98 // write buffer (flushed periodically or when threshold hit) 100 99 evtbuf: std.ArrayListUnmanaged(PersistJob) = .{}, ··· 113 112 } 114 113 115 114 /// current DID cache entry count (for metrics) 116 - pub fn didCacheLen(self: *DiskPersist) usize { 117 - self.did_cache_mutex.lock(); 118 - defer self.did_cache_mutex.unlock(); 115 + pub fn didCacheLen(self: *DiskPersist) u32 { 119 116 return self.did_cache.count(); 120 117 } 121 118 ··· 220 217 .dir_path = try allocator.dupe(u8, dir_path), 221 218 .dir = dir, 222 219 .db = pool, 220 + .did_cache = lru.LruCache(u64).init(allocator, 500_000), 223 221 }; 224 222 225 223 // recover from existing log files ··· 243 241 for (self.evtbuf.items) |job| self.allocator.free(job.data); 244 242 self.evtbuf.deinit(self.allocator); 245 243 246 - // free DID cache keys 247 - { 248 - var it = self.did_cache.iterator(); 249 - while (it.next()) |entry| self.allocator.free(entry.key_ptr.*); 250 - self.did_cache.deinit(self.allocator); 251 - } 244 + self.did_cache.deinit(); 252 245 253 246 if (self.current_file) |f| f.close(); 254 247 if (self.current_file_path) |p| self.allocator.free(p); ··· 293 286 /// matches indigo's Relay.DidToUid → Account.UID mapping. 
294 287 pub fn uidForDid(self: *DiskPersist, did: []const u8) !u64 { 295 288 // fast path: check in-memory cache 296 - { 297 - self.did_cache_mutex.lock(); 298 - defer self.did_cache_mutex.unlock(); 299 - if (self.did_cache.get(did)) |uid| return uid; 300 - } 289 + if (self.did_cache.get(did)) |uid| return uid; 301 290 302 291 // check database 303 292 if (try self.db.rowUnsafe( ··· 332 321 return uid; 333 322 } 334 323 335 - /// insert into did_cache, evicting if at capacity. 336 - /// this is a pure lookup cache over postgres — clearing it only costs 337 - /// ~0.5ms per miss on the next lookup for that DID. 324 + /// insert into did_cache (LRU handles eviction at capacity). 338 325 fn didCachePut(self: *DiskPersist, did: []const u8, uid: u64) void { 339 - self.did_cache_mutex.lock(); 340 - defer self.did_cache_mutex.unlock(); 341 - 342 - // evict when at capacity: free all keys and clear the map. 343 - // unlike the validator cache there's no per-entry timestamp to sort by, 344 - // and the postgres fallback is fast enough that a full clear is fine. 345 - if (self.did_cache.count() >= self.max_did_cache_size) { 346 - log.info("did_cache at capacity ({d}), clearing", .{self.did_cache.count()}); 347 - var it = self.did_cache.iterator(); 348 - while (it.next()) |entry| self.allocator.free(entry.key_ptr.*); 349 - self.did_cache.clearRetainingCapacity(); 350 - } 351 - 352 - const did_duped = self.allocator.dupe(u8, did) catch return; 353 - self.did_cache.put(self.allocator, did_duped, uid) catch { 354 - self.allocator.free(did_duped); 355 - }; 326 + self.did_cache.put(did, uid) catch {}; 356 327 } 357 328 358 329 /// per-DID sync state for chain tracking
+281
src/lru.zig
//! generic LRU cache with string keys
//!
//! O(1) get/put/remove via hashmap + intrusive doubly-linked list.
//! thread-safe (internal mutex). keys are duped on insert, freed on eviction.

const std = @import("std");
const Allocator = std.mem.Allocator;

pub fn LruCache(comptime V: type) type {
    return struct {
        const Self = @This();

        const Node = struct {
            key: []const u8,
            value: V,
            prev: ?*Node = null,
            next: ?*Node = null,
        };

        map: std.StringHashMapUnmanaged(*Node) = .{},
        head: ?*Node = null, // most recently used
        tail: ?*Node = null, // least recently used
        capacity: u32,
        len: u32 = 0,
        allocator: Allocator,
        mutex: std.Thread.Mutex = .{},

        pub fn init(allocator: Allocator, capacity: u32) Self {
            return .{
                .allocator = allocator,
                .capacity = capacity,
            };
        }

        /// free all nodes and their duped keys, then the map itself.
        /// not thread-safe: callers must ensure no concurrent use during deinit.
        pub fn deinit(self: *Self) void {
            var node = self.head;
            while (node) |n| {
                const next = n.next;
                self.allocator.free(n.key);
                self.allocator.destroy(n);
                node = next;
            }
            self.map.deinit(self.allocator);
        }

        /// look up a key; on hit, promotes the entry to most-recently-used.
        pub fn get(self: *Self, key: []const u8) ?V {
            self.mutex.lock();
            defer self.mutex.unlock();
            const node = self.map.get(key) orelse return null;
            self.moveToHead(node);
            return node.value;
        }

        /// insert or update. the key is duped on insert (cache owns the copy);
        /// at capacity the least-recently-used entry is evicted first.
        pub fn put(self: *Self, key: []const u8, value: V) Allocator.Error!void {
            self.mutex.lock();
            defer self.mutex.unlock();

            if (self.map.get(key)) |node| {
                // update existing
                node.value = value;
                self.moveToHead(node);
                return;
            }

            // evict if at capacity
            if (self.len >= self.capacity) {
                self.evictTail();
            }

            // allocate node + dupe key. each errdefer frees its allocation
            // exactly once on any error path; do NOT also free manually in a
            // catch block, or the errdefer would double-free on `return err`.
            const duped = try self.allocator.dupe(u8, key);
            errdefer self.allocator.free(duped);
            const node = try self.allocator.create(Node);
            errdefer self.allocator.destroy(node);
            node.* = .{ .key = duped, .value = value };

            try self.map.put(self.allocator, duped, node);

            // link at head
            node.next = self.head;
            if (self.head) |h| h.prev = node;
            self.head = node;
            if (self.tail == null) self.tail = node;
            self.len += 1;
        }

        /// remove a key if present. returns true if an entry was removed.
        pub fn remove(self: *Self, key: []const u8) bool {
            self.mutex.lock();
            defer self.mutex.unlock();
            const node = self.map.get(key) orelse return false;
            self.unlink(node);
            // look up removal via the node's owned key — those are the exact
            // bytes the map stores, and they are still live at this point
            _ = self.map.remove(node.key);
            self.allocator.free(node.key);
            self.allocator.destroy(node);
            self.len -= 1;
            return true;
        }

        /// check if a key exists without promoting it
        pub fn contains(self: *Self, key: []const u8) bool {
            self.mutex.lock();
            defer self.mutex.unlock();
            return self.map.contains(key);
        }

        pub fn count(self: *Self) u32 {
            self.mutex.lock();
            defer self.mutex.unlock();
            return self.len;
        }

        // --- internal (caller holds mutex) ---

        /// move an existing node to the head (most-recently-used) position.
        fn moveToHead(self: *Self, node: *Node) void {
            if (self.head == node) return;
            self.unlink(node);
            node.prev = null;
            node.next = self.head;
            if (self.head) |h| h.prev = node;
            self.head = node;
            // defensive: unreachable in practice (a sole node is always head,
            // which early-returns above), but keeps the invariant locally true
            if (self.tail == null) self.tail = node;
        }

        /// detach a node from the list, fixing head/tail as needed.
        fn unlink(self: *Self, node: *Node) void {
            if (node.prev) |p| {
                p.next = node.next;
            } else {
                self.head = node.next;
            }
            if (node.next) |n| {
                n.prev = node.prev;
            } else {
                self.tail = node.prev;
            }
            node.prev = null;
            node.next = null;
        }

        /// drop the least-recently-used entry (tail), freeing its key + node.
        fn evictTail(self: *Self) void {
            const t = self.tail orelse return;
            self.unlink(t);
            _ = self.map.remove(t.key);
            self.allocator.free(t.key);
            self.allocator.destroy(t);
            self.len -= 1;
        }
    };
}

// --- tests ---

const testing = std.testing;

test "basic get and put" {
    var cache = LruCache(u64).init(testing.allocator, 3);
    defer cache.deinit();

    try cache.put("a", 1);
    try cache.put("b", 2);
    try cache.put("c", 3);

    try testing.expectEqual(@as(u64, 1), cache.get("a").?);
    try testing.expectEqual(@as(u64, 2), cache.get("b").?);
    try testing.expectEqual(@as(u64, 3), cache.get("c").?);
    try testing.expect(cache.get("d") == null);
}

test "eviction order" {
    var cache = LruCache(u64).init(testing.allocator, 2);
    defer cache.deinit();

    try cache.put("a", 1);
    try cache.put("b", 2);
    // cache: b(head) → a(tail)

    try cache.put("c", 3); // evicts "a" (LRU)

    try testing.expect(cache.get("a") == null);
    try testing.expectEqual(@as(u64, 2), cache.get("b").?);
    try testing.expectEqual(@as(u64, 3), cache.get("c").?);
}

test "update moves to front" {
    var cache = LruCache(u64).init(testing.allocator, 2);
    defer cache.deinit();

    try cache.put("a", 1);
    try cache.put("b", 2);
    // cache: b(head) → a(tail)

    // access "a" — promotes it to head
    _ = cache.get("a");
    // cache: a(head) → b(tail)

    try cache.put("c", 3); // evicts "b" (now LRU)

    try testing.expect(cache.get("b") == null);
    try testing.expectEqual(@as(u64, 1), cache.get("a").?);
    try testing.expectEqual(@as(u64, 3), cache.get("c").?);
}

test "put update existing key" {
    var cache = LruCache(u64).init(testing.allocator, 2);
    defer cache.deinit();

    try cache.put("a", 1);
    try cache.put("b", 2);
    try cache.put("a", 10); // update, moves to head

    try testing.expectEqual(@as(u64, 10), cache.get("a").?);

    try cache.put("c", 3); // evicts "b" (LRU)
    try testing.expect(cache.get("b") == null);
    try testing.expectEqual(@as(u64, 10), cache.get("a").?);
}

test "remove" {
    var cache = LruCache(u64).init(testing.allocator, 3);
    defer cache.deinit();

    try cache.put("a", 1);
    try cache.put("b", 2);

    try testing.expect(cache.remove("a"));
    try testing.expect(cache.get("a") == null);
    try testing.expectEqual(@as(u32, 1), cache.count());
    try testing.expect(!cache.remove("nonexistent"));
}

test "capacity 1" {
    var cache = LruCache(u64).init(testing.allocator, 1);
    defer cache.deinit();

    try cache.put("a", 1);
    try testing.expectEqual(@as(u64, 1), cache.get("a").?);

    try cache.put("b", 2); // evicts "a"
    try testing.expect(cache.get("a") == null);
    try testing.expectEqual(@as(u64, 2), cache.get("b").?);
    try testing.expectEqual(@as(u32, 1), cache.count());
}

test "count" {
    var cache = LruCache(u64).init(testing.allocator, 10);
    defer cache.deinit();

    try testing.expectEqual(@as(u32, 0), cache.count());
    try cache.put("a", 1);
    try testing.expectEqual(@as(u32, 1), cache.count());
    try cache.put("b", 2);
    try testing.expectEqual(@as(u32, 2), cache.count());
    _ = cache.remove("a");
    try testing.expectEqual(@as(u32, 1), cache.count());
}

test "contains" {
    var cache = LruCache(u64).init(testing.allocator, 3);
    defer cache.deinit();

    try cache.put("a", 1);
    try testing.expect(cache.contains("a"));
    try testing.expect(!cache.contains("b"));
}

test "struct values" {
    const Val = struct { x: u32, y: u32 };
    var cache = LruCache(Val).init(testing.allocator, 2);
    defer cache.deinit();

    try cache.put("point1", .{ .x = 1, .y = 2 });
    try cache.put("point2", .{ .x = 3, .y = 4 });

    const v = cache.get("point1").?;
    try testing.expectEqual(@as(u32, 1), v.x);
    try testing.expectEqual(@as(u32, 2), v.y);
}
+2 -1
src/main.zig
··· 87 87 } else if (std.mem.eql(u8, path, "/metrics")) { 88 88 const cache_entries = validator.cacheSize(); 89 89 const migration_queue_len = validator.migrationQueueLen(); 90 + const migration_pending_count = validator.migrationPendingCount(); 90 91 const attribution = broadcaster.AttributionMetrics{ 91 92 .history_entries = bc.history.count(), 92 93 .evtbuf_entries = persist.evtbufLen(), ··· 95 96 }; 96 97 97 98 var metrics_buf: [16384]u8 = undefined; 98 - const body = broadcaster.formatPrometheusMetrics(stats, cache_entries, migration_queue_len, attribution, data_dir, &metrics_buf); 99 + const body = broadcaster.formatPrometheusMetrics(stats, cache_entries, migration_queue_len, migration_pending_count, attribution, data_dir, &metrics_buf); 99 100 request.respond(body, .{ .status = .ok, .keep_alive = false, .extra_headers = &.{ 100 101 .{ .name = "content-type", .value = "text/plain; version=0.0.4; charset=utf-8" }, 101 102 .{ .name = "server", .value = "zlay (atproto-relay)" },
+96 -136
src/validator.zig
··· 9 9 const zat = @import("zat"); 10 10 const broadcaster = @import("broadcaster.zig"); 11 11 const event_log_mod = @import("event_log.zig"); 12 + const lru = @import("lru.zig"); 12 13 13 14 const Allocator = std.mem.Allocator; 14 15 const log = std.log.scoped(.relay); ··· 51 52 config: ValidatorConfig, 52 53 persist: ?*event_log_mod.DiskPersist = null, 53 54 // DID → signing key cache (decoded, ready for verification) 54 - cache: std.StringHashMapUnmanaged(CachedKey) = .{}, 55 - cache_mutex: std.Thread.Mutex = .{}, 55 + cache: lru.LruCache(CachedKey), 56 56 // background resolve queue 57 57 queue: std.ArrayListUnmanaged([]const u8) = .{}, 58 58 // in-flight set — prevents duplicate DID entries in the queue 59 59 queued_set: std.StringHashMapUnmanaged(void) = .{}, 60 - // migration validation queue 60 + // migration validation queue (with dedup to prevent unbounded growth) 61 61 migration_queue: std.ArrayListUnmanaged(MigrationCheck) = .{}, 62 + migration_pending: std.StringHashMapUnmanaged(void) = .{}, 62 63 queue_mutex: std.Thread.Mutex = .{}, 63 64 queue_cond: std.Thread.Condition = .{}, 64 65 resolver_threads: [max_resolver_threads]?std.Thread = .{null} ** max_resolver_threads, ··· 81 82 .allocator = allocator, 82 83 .stats = stats, 83 84 .config = config, 85 + .cache = lru.LruCache(CachedKey).init(allocator, 250_000), 84 86 }; 85 87 } 86 88 ··· 94 96 } 95 97 } 96 98 97 - // free cache keys (CachedKey is inline, no separate free needed) 98 - var cache_it = self.cache.iterator(); 99 - while (cache_it.next()) |entry| { 100 - self.allocator.free(entry.key_ptr.*); 101 - } 102 - self.cache.deinit(self.allocator); 99 + self.cache.deinit(); 103 100 104 101 // free queued DIDs 105 102 for (self.queue.items) |did| { ··· 113 110 self.allocator.free(mc.did); 114 111 } 115 112 self.migration_queue.deinit(self.allocator); 113 + 114 + // free migration dedup set keys 115 + var mig_it = self.migration_pending.keyIterator(); 116 + while (mig_it.next()) |k| { 117 + 
self.allocator.free(k.*); 118 + } 119 + self.migration_pending.deinit(self.allocator); 116 120 } 117 121 118 122 /// start background resolver threads 119 123 pub fn start(self: *Validator) !void { 120 124 self.max_cache_size = parseEnvInt(u32, "VALIDATOR_CACHE_SIZE", self.max_cache_size); 125 + self.cache.capacity = self.max_cache_size; 121 126 const n = parseEnvInt(u8, "RESOLVER_THREADS", default_resolver_threads); 122 127 const count = @min(n, max_resolver_threads); 123 128 for (self.resolver_threads[0..count]) |*t| { ··· 160 165 } 161 166 162 167 // cache lookup 163 - const cached_key: ?CachedKey = blk: { 164 - self.cache_mutex.lock(); 165 - defer self.cache_mutex.unlock(); 166 - break :blk self.cache.get(did); 167 - }; 168 + const cached_key: ?CachedKey = self.cache.get(did); 168 169 169 170 if (cached_key == null) { 170 171 _ = self.stats.cache_misses.fetchAdd(1, .monotonic); ··· 217 218 }; 218 219 219 220 // check cache for pre-resolved signing key 220 - const cached_key: ?CachedKey = blk: { 221 - self.cache_mutex.lock(); 222 - defer self.cache_mutex.unlock(); 223 - break :blk self.cache.get(did); 224 - }; 221 + const cached_key: ?CachedKey = self.cache.get(did); 225 222 226 223 if (cached_key == null) { 227 224 // cache miss — queue for background resolution, skip validation ··· 394 391 395 392 fn queueResolve(self: *Validator, did: []const u8) void { 396 393 // check if already cached (race between validate and resolver) 397 - { 398 - self.cache_mutex.lock(); 399 - defer self.cache_mutex.unlock(); 400 - if (self.cache.contains(did)) return; 401 - } 394 + if (self.cache.contains(did)) return; 402 395 403 396 const duped = self.allocator.dupe(u8, did) catch return; 404 397 ··· 445 438 defer self.allocator.free(d); 446 439 447 440 // skip if already cached (resolved while queued) 448 - { 449 - self.cache_mutex.lock(); 450 - defer self.cache_mutex.unlock(); 451 - if (self.cache.contains(d)) continue; 452 - } 441 + if (self.cache.contains(d)) continue; 453 442 
454 443 // periodically recreate resolver to free accumulated http.Client state 455 444 resolve_count += 1; ··· 481 470 }; 482 471 @memcpy(cached.raw[0..public_key.raw.len], public_key.raw); 483 472 484 - // evict oldest entries if cache is at capacity 485 - const needs_eviction = blk: { 486 - self.cache_mutex.lock(); 487 - defer self.cache_mutex.unlock(); 488 - break :blk self.cache.count() >= self.max_cache_size; 489 - }; 490 - if (needs_eviction) { 491 - self.evictOldest(); 492 - } 493 - 494 - const did_duped = self.allocator.dupe(u8, d) catch continue; 495 - 496 - self.cache_mutex.lock(); 497 - defer self.cache_mutex.unlock(); 498 - self.cache.put(self.allocator, did_duped, cached) catch { 499 - self.allocator.free(did_duped); 500 - }; 473 + self.cache.put(d, cached) catch continue; 501 474 } else if (migration) |mc| { 502 475 defer self.allocator.free(mc.did); 503 - self.processMigrationCheck(&resolver, mc); 476 + const confirmed = self.processMigrationCheck(&resolver, mc); 477 + 478 + // remove from dedup set — if confirmed, allow future re-queue 479 + // if rejected, leave in set to suppress re-queueing 480 + if (confirmed) { 481 + self.queue_mutex.lock(); 482 + defer self.queue_mutex.unlock(); 483 + if (self.migration_pending.fetchRemove(mc.did)) |entry| { 484 + self.allocator.free(entry.key); 485 + } 486 + } 487 + // rejected DIDs stay in migration_pending, suppressing re-queueing 488 + // until the set is cleared (see migrationPendingCleanup) 504 489 } 505 490 } 506 491 } 507 492 508 - /// validate a host migration by resolving the DID document and checking the PDS endpoint 509 - fn processMigrationCheck(self: *Validator, resolver: *zat.DidResolver, mc: MigrationCheck) void { 510 - const persist = self.persist orelse return; 493 + /// validate a host migration by resolving the DID document and checking the PDS endpoint. 494 + /// returns true if migration was confirmed, false if rejected/failed. 
495 + fn processMigrationCheck(self: *Validator, resolver: *zat.DidResolver, mc: MigrationCheck) bool { 496 + const persist = self.persist orelse return false; 511 497 512 498 const parsed = zat.Did.parse(mc.did) orelse { 513 499 log.debug("migration check: invalid DID {s}", .{mc.did}); 514 - return; 500 + return false; 515 501 }; 516 502 517 503 var doc = resolver.resolve(parsed) catch |err| { 518 504 log.debug("migration check: DID resolve failed for {s}: {s}", .{ mc.did, @errorName(err) }); 519 - return; 505 + return false; 520 506 }; 521 507 defer doc.deinit(); 522 508 523 509 const pds_endpoint = doc.pdsEndpoint() orelse { 524 510 log.debug("migration check: no PDS endpoint for {s}", .{mc.did}); 525 - return; 511 + return false; 526 512 }; 527 513 528 514 // extract hostname from PDS endpoint URL (strip https:// prefix) 529 515 const pds_host = extractHostFromUrl(pds_endpoint) orelse { 530 516 log.debug("migration check: cannot parse PDS URL '{s}' for {s}", .{ pds_endpoint, mc.did }); 531 - return; 517 + return false; 532 518 }; 533 519 534 520 // look up the hostname → host_id 535 521 const resolved_host_id = (persist.getHostIdForHostname(pds_host) catch { 536 522 log.debug("migration check: host lookup failed for {s}", .{pds_host}); 537 - return; 523 + return false; 538 524 }) orelse { 539 525 log.debug("migration check: unknown host {s} for {s}", .{ pds_host, mc.did }); 540 - return; 526 + return false; 541 527 }; 542 528 543 529 if (resolved_host_id == mc.new_host_id) { 544 530 // DID document confirms the new host — update 545 - const uid = persist.uidForDid(mc.did) catch return; 546 - persist.setAccountHostId(uid, mc.new_host_id) catch return; 531 + const uid = persist.uidForDid(mc.did) catch return false; 532 + persist.setAccountHostId(uid, mc.new_host_id) catch return false; 547 533 log.info("migration validated: {s} → host {d} (confirmed by DID doc)", .{ mc.did, mc.new_host_id }); 534 + return true; 548 535 } else { 549 536 // mismatch — reject new 
accounts, warn on migrations 550 - const uid = persist.uidForDid(mc.did) catch return; 551 - const current_host = persist.getAccountHostId(uid) catch return; 537 + const uid = persist.uidForDid(mc.did) catch return false; 538 + const current_host = persist.getAccountHostId(uid) catch return false; 552 539 if (current_host == mc.new_host_id) { 553 540 // new account: host not confirmed by DID doc → reject 554 - persist.updateAccountUpstreamStatus(uid, "rejected") catch return; 541 + persist.updateAccountUpstreamStatus(uid, "rejected") catch return false; 555 542 log.warn("new account rejected: {s} on host {d}, DID doc says {s} (host {d})", .{ 556 543 mc.did, mc.new_host_id, pds_host, resolved_host_id, 557 544 }); ··· 561 548 mc.did, mc.new_host_id, pds_host, resolved_host_id, 562 549 }); 563 550 } 551 + return false; 564 552 } 565 553 } 566 554 567 555 /// evict a DID's cached signing key (e.g. on #identity event). 568 556 /// the next commit from this DID will trigger a fresh resolution. 557 + /// also clears migration suppression so host changes are re-evaluated. 569 558 pub fn evictKey(self: *Validator, did: []const u8) void { 570 - self.cache_mutex.lock(); 571 - defer self.cache_mutex.unlock(); 572 - if (self.cache.fetchRemove(did)) |entry| { 559 + _ = self.cache.remove(did); 560 + 561 + // clear migration suppression — DID doc may have changed 562 + self.queue_mutex.lock(); 563 + defer self.queue_mutex.unlock(); 564 + if (self.migration_pending.fetchRemove(did)) |entry| { 573 565 self.allocator.free(entry.key); 574 566 } 575 567 } 576 568 577 - /// queue a DID for async migration validation (host change detected) 569 + /// queue a DID for async migration validation (host change detected). 570 + /// deduped: only one pending check per DID at a time. 
578 571 pub fn queueMigrationCheck(self: *Validator, did: []const u8, new_host_id: u64) void { 572 + self.queue_mutex.lock(); 573 + defer self.queue_mutex.unlock(); 574 + 575 + // skip if already queued or recently checked (prevents unbounded growth) 576 + if (self.migration_pending.contains(did)) return; 577 + 579 578 const duped = self.allocator.dupe(u8, did) catch return; 579 + const set_key = self.allocator.dupe(u8, did) catch { 580 + self.allocator.free(duped); 581 + return; 582 + }; 580 583 581 - self.queue_mutex.lock(); 582 - defer self.queue_mutex.unlock(); 583 584 self.migration_queue.append(self.allocator, .{ 584 585 .did = duped, 585 586 .new_host_id = new_host_id, 586 587 }) catch { 587 588 self.allocator.free(duped); 589 + self.allocator.free(set_key); 588 590 return; 589 591 }; 592 + self.migration_pending.put(self.allocator, set_key, {}) catch { 593 + self.allocator.free(set_key); 594 + }; 590 595 self.queue_cond.signal(); 591 596 } 592 597 593 598 /// cache size (for diagnostics) 594 - pub fn cacheSize(self: *Validator) usize { 595 - self.cache_mutex.lock(); 596 - defer self.cache_mutex.unlock(); 599 + pub fn cacheSize(self: *Validator) u32 { 597 600 return self.cache.count(); 598 601 } 599 602 ··· 604 607 return self.migration_queue.items.len; 605 608 } 606 609 607 - /// evict the oldest 10% of cache entries by resolve_time. 608 - /// called from resolveLoop when cache is at capacity. 
609 - fn evictOldest(self: *Validator) void { 610 - self.cache_mutex.lock(); 611 - defer self.cache_mutex.unlock(); 612 - 613 - const count = self.cache.count(); 614 - if (count == 0) return; 615 - const evict_count = @max(1, count / 10); 616 - 617 - // collect all entries with their resolve_times 618 - const Entry = struct { key: []const u8, resolve_time: i64 }; 619 - const entries = self.allocator.alloc(Entry, count) catch return; 620 - defer self.allocator.free(entries); 621 - 622 - var i: usize = 0; 623 - var it = self.cache.iterator(); 624 - while (it.next()) |entry| { 625 - entries[i] = .{ .key = entry.key_ptr.*, .resolve_time = entry.value_ptr.resolve_time }; 626 - i += 1; 627 - } 628 - 629 - // sort by resolve_time ascending (oldest first) 630 - std.mem.sort(Entry, entries[0..i], {}, struct { 631 - fn lessThan(_: void, a: Entry, b: Entry) bool { 632 - return a.resolve_time < b.resolve_time; 633 - } 634 - }.lessThan); 635 - 636 - // evict the oldest entries 637 - for (entries[0..evict_count]) |entry| { 638 - if (self.cache.fetchRemove(entry.key)) |removed| { 639 - self.allocator.free(removed.key); 640 - } 641 - } 642 - 643 - _ = self.stats.cache_evictions.fetchAdd(evict_count, .monotonic); 610 + /// migration pending (suppressed) count (for diagnostics) 611 + pub fn migrationPendingCount(self: *Validator) u32 { 612 + self.queue_mutex.lock(); 613 + defer self.queue_mutex.unlock(); 614 + return self.migration_pending.count(); 644 615 } 645 616 }; 646 617 ··· 797 768 try std.testing.expectEqual(@as(u64, 1), stats.skipped.load(.acquire)); 798 769 } 799 770 800 - test "evictOldest removes oldest entries by resolve_time" { 771 + test "LRU cache evicts least recently used" { 801 772 var stats = broadcaster.Stats{}; 802 773 var v = Validator.init(std.testing.allocator, &stats); 803 - v.max_cache_size = 5; 774 + v.cache.capacity = 3; 804 775 defer v.deinit(); 805 776 806 - // insert 5 entries with staggered resolve_times 807 - const dids = [_][]const u8{ 
"did:plc:aaa", "did:plc:bbb", "did:plc:ccc", "did:plc:ddd", "did:plc:eee" }; 808 - for (dids, 0..) |did, i| { 809 - const key = try std.testing.allocator.dupe(u8, did); 810 - v.cache.put(std.testing.allocator, key, .{ 811 - .key_type = .p256, 812 - .raw = .{0} ** 33, 813 - .len = 33, 814 - .resolve_time = @intCast(100 + i), // 100, 101, 102, 103, 104 815 - }) catch { 816 - std.testing.allocator.free(key); 817 - return error.TestFailed; 818 - }; 819 - } 820 - try std.testing.expectEqual(@as(u32, 5), v.cache.count()); 777 + const mk = CachedKey{ .key_type = .p256, .raw = .{0} ** 33, .len = 33 }; 778 + 779 + try v.cache.put("did:plc:aaa", mk); 780 + try v.cache.put("did:plc:bbb", mk); 781 + try v.cache.put("did:plc:ccc", mk); 782 + 783 + // access "aaa" to promote it 784 + _ = v.cache.get("did:plc:aaa"); 821 785 822 - // evict — should remove oldest 10% = 1 entry (the one with resolve_time=100) 823 - v.evictOldest(); 786 + // insert "ddd" — should evict "bbb" (LRU) 787 + try v.cache.put("did:plc:ddd", mk); 824 788 825 - try std.testing.expectEqual(@as(u32, 4), v.cache.count()); 826 - // oldest entry ("did:plc:aaa" with resolve_time=100) should be gone 827 - try std.testing.expect(v.cache.get("did:plc:aaa") == null); 828 - // newest entries should still be present 829 - try std.testing.expect(v.cache.get("did:plc:eee") != null); 789 + try std.testing.expect(v.cache.get("did:plc:bbb") == null); 790 + try std.testing.expect(v.cache.get("did:plc:aaa") != null); 791 + try std.testing.expect(v.cache.get("did:plc:ccc") != null); 830 792 try std.testing.expect(v.cache.get("did:plc:ddd") != null); 831 - try std.testing.expectEqual(@as(u64, 1), stats.cache_evictions.load(.acquire)); 793 + try std.testing.expectEqual(@as(u32, 3), v.cache.count()); 832 794 } 833 795 834 796 test "checkCommitStructure rejects too many ops" { ··· 861 823 862 824 // insert a fake cached key so we reach the blocks size check 863 825 const did = "did:plc:test123"; 864 - const did_duped = try 
std.testing.allocator.dupe(u8, did); 865 - try v.cache.put(std.testing.allocator, did_duped, .{ 826 + try v.cache.put(did, .{ 866 827 .key_type = .p256, 867 828 .raw = .{0} ** 33, 868 829 .len = 33, ··· 891 852 defer v.deinit(); 892 853 893 854 const did = "did:plc:test123"; 894 - const did_duped = try std.testing.allocator.dupe(u8, did); 895 - try v.cache.put(std.testing.allocator, did_duped, .{ 855 + try v.cache.put(did, .{ 896 856 .key_type = .p256, 897 857 .raw = .{0} ** 33, 898 858 .len = 33,