fix: reduce memory usage — c_allocator + 2MB thread stacks

atproto relay implementation in zig zlay.waow.tech

zig's default thread stack is 16 MB. with ~2,750 subscriber threads
that's 44 GB of virtual memory mapped. switch to explicit 2 MB stacks
(generous for websocket read loops and CBOR decoding) and replace the
debug allocator (GPA) with glibc malloc, which returns freed pages to
the OS instead of accumulating metadata.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

zzstoatzz.io 1 week ago 9fc184aa e09f9fa9

+18 -12

6 changed files

expand all

unified split

src

backfill.zig

broadcaster.zig

event_log.zig

main.zig

slurper.zig

validator.zig

+1 -1

src/backfill.zig

··· 55 55 errdefer self.running.store(false, .release); 56 56 57 57 self.source = try self.allocator.dupe(u8, source); 58 - self.thread = try std.Thread.spawn(.{}, run, .{self}); 58 + self.thread = try std.Thread.spawn(.{ .stack_size = 2 * 1024 * 1024 }, run, .{self}); 59 59 } 60 60 61 61 fn run(self: *Backfiller) void {

+2 -2

src/broadcaster.zig

··· 308 308 .conn = conn, 309 309 .allocator = self.allocator, 310 310 }; 311 - consumer.thread = std.Thread.spawn(.{}, Consumer.writeLoop, .{consumer}) catch { 311 + consumer.thread = std.Thread.spawn(.{ .stack_size = 2 * 1024 * 1024 }, Consumer.writeLoop, .{consumer}) catch { 312 312 self.allocator.destroy(consumer); 313 313 return error.ThreadSpawnFailed; 314 314 }; ··· 394 394 consumer.conn.close(.{}) catch {}; 395 395 // clean up asynchronously to avoid joining thread while holding mutex 396 396 const alloc = self.allocator; 397 - const cleanup_thread = std.Thread.spawn(.{}, struct { 397 + const cleanup_thread = std.Thread.spawn(.{ .stack_size = 512 * 1024 }, struct { 398 398 fn run(c: *Consumer, a: Allocator) void { 399 399 c.shutdown(); 400 400 a.destroy(c);

+1 -1

src/event_log.zig

··· 230 230 231 231 /// start the background flush thread 232 232 pub fn start(self: *DiskPersist) !void { 233 - self.flush_thread = try std.Thread.spawn(.{}, flushLoop, .{self}); 233 + self.flush_thread = try std.Thread.spawn(.{ .stack_size = 2 * 1024 * 1024 }, flushLoop, .{self}); 234 234 } 235 235 236 236 /// resolve a DID to a numeric UID, associating with a host.

+11 -5

src/main.zig

··· 30 30 31 31 const log = std.log.scoped(.relay); 32 32 33 + /// zig's default thread stack is 16 MB. with ~2,750 subscriber threads that's 34 + /// 44 GB of virtual memory. most threads need far less — websocket read loops, 35 + /// CBOR decoding, HTTP handlers. 2 MB is generous for all of these. 36 + pub const default_stack_size = 2 * 1024 * 1024; 37 + 33 38 var shutdown_flag: std.atomic.Value(bool) = .{ .raw = false }; 34 39 35 40 /// HTTP server state — shared so main can close the socket to unblock accept() ··· 54 59 }; 55 60 56 61 pub fn main() !void { 57 - var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; 58 - defer _ = gpa.deinit(); 59 - const allocator = gpa.allocator(); 62 + // use libc allocator — glibc malloc has per-thread arenas, madvise-based page 63 + // return, and proven fragmentation mitigation. GPA is a debug allocator that 64 + // tracks per-allocation metadata and never returns freed pages to the OS. 65 + const allocator = std.heap.c_allocator; 60 66 61 67 // parse config from env 62 68 const port = parseEnvInt(u16, "RELAY_PORT", 3000); ··· 125 131 try slurper.start(); 126 132 127 133 // start GC thread (runs every 10 minutes) 128 - const gc_thread = try std.Thread.spawn(.{}, gcLoop, .{&dp}); 134 + const gc_thread = try std.Thread.spawn(.{ .stack_size = default_stack_size }, gcLoop, .{&dp}); 129 135 130 136 // start HTTP health/stats server 131 137 const address = std.net.Address.initIp4(.{ 0, 0, 0, 0 }, http_port); ··· 140 146 .collection_index = &ci, 141 147 .backfiller = &backfiller, 142 148 }; 143 - const http_thread = try std.Thread.spawn(.{}, HttpServer.run, .{&http_srv}); 149 + const http_thread = try std.Thread.spawn(.{ .stack_size = default_stack_size }, HttpServer.run, .{&http_srv}); 144 150 145 151 // start downstream WebSocket server 146 152 log.info("relay listening on :{d} (ws), :{d} (http)", .{ port, http_port });

+2 -2

src/slurper.zig

··· 263 263 log.info("slurper started with {d} host(s)", .{hosts.len}); 264 264 265 265 // start crawl queue processor thread 266 - self.crawl_thread = try std.Thread.spawn(.{}, processCrawlQueue, .{self}); 266 + self.crawl_thread = try std.Thread.spawn(.{ .stack_size = 2 * 1024 * 1024 }, processCrawlQueue, .{self}); 267 267 } 268 268 269 269 /// pull PDS host list from the seed relay's com.atproto.sync.listHosts endpoint. ··· 439 439 ); 440 440 sub.collection_index = self.collection_index; 441 441 442 - const thread = try std.Thread.spawn(.{}, runWorker, .{ self, host_id, sub }); 442 + const thread = try std.Thread.spawn(.{ .stack_size = 2 * 1024 * 1024 }, runWorker, .{ self, host_id, sub }); 443 443 444 444 self.workers_mutex.lock(); 445 445 defer self.workers_mutex.unlock();

+1 -1

src/validator.zig

··· 97 97 const n = parseEnvInt(u8, "RESOLVER_THREADS", default_resolver_threads); 98 98 const count = @min(n, max_resolver_threads); 99 99 for (self.resolver_threads[0..count]) |*t| { 100 - t.* = try std.Thread.spawn(.{}, resolveLoop, .{self}); 100 + t.* = try std.Thread.spawn(.{ .stack_size = 2 * 1024 * 1024 }, resolveLoop, .{self}); 101 101 } 102 102 } 103 103