//! spacez — named entity recognition in zig.
//!
//! a from-scratch NER inference engine, compatible with spaCy's
//! en_core_web_sm model weights. hash embeddings → CNN → transition
//! parser, all in pure zig with zero dependencies.

pub const hash = @import("hash.zig");
pub const ops = @import("ops.zig");
pub const embed = @import("embed.zig");
pub const parser = @import("parser.zig");
pub const model = @import("model.zig");
pub const tokenizer = @import("tokenizer.zig");
pub const tokenizer_data = @import("tokenizer_data.zig");

// re-export key types at the top level
pub const Model = model.Model;
pub const Entity = parser.Entity;
pub const Label = parser.Label;
pub const Token = tokenizer.Token;
pub const TokenAttrs = embed.TokenAttrs;
pub const hashString = hash.hashString;
pub const extractAttrs = embed.extractAttrs;
pub const computeShape = embed.computeShape;
pub const tokenizeText = tokenizer.tokenize;

/// bundled en_core_web_sm weights (~6MB, embedded at compile time).
pub const en_core_web_sm = @embedFile("weights/en_core_web_sm.bin");

/// a recognized entity with byte offsets into the source text.
pub const SpanEntity = struct {
    start: u32, // byte offset of entity start in source text
    end: u32, // byte offset of entity end (exclusive)
    label: Label,
};
/// run the full NER pipeline: tokenize → embed → CNN encode → parse.
/// returns the number of entities written to entities_out. entities are
/// reported as byte offsets into `text`; results beyond the capacity of
/// `entities_out` are silently dropped.
pub fn recognize(
    m: *const Model,
    text: []const u8,
    entities_out: []SpanEntity,
) u32 {
    // tokenize into a fixed stack buffer (tokenizer caps at MAX_TOKENS)
    var tok_buf: [tokenizer.MAX_TOKENS]Token = undefined;
    const n_toks = tokenizeText(text, &tok_buf);
    if (n_toks == 0) return 0;

    // collect token text slices for the model (capped at model's MAX_TOKENS,
    // which may be smaller than the tokenizer's cap)
    const n: u32 = @min(n_toks, model.MAX_TOKENS);
    var tok_slices: [model.MAX_TOKENS][]const u8 = undefined;
    for (0..n) |i| {
        tok_slices[i] = tok_buf[i].text(text);
    }

    // run model prediction; entities come back as token-index spans
    const state = m.predict(tok_slices[0..n]);
    const ents = state.entities();

    // map token-index entities back to byte offsets in the source text
    var count: u32 = 0;
    for (ents) |e| {
        if (count >= entities_out.len) break;
        // skip spans referencing tokens we never fed the model, and
        // degenerate spans (e.end <= e.start) — besides being invalid,
        // e.end == 0 would make `e.end - 1` underflow u32 below.
        if (e.start >= n or e.end > n or e.end <= e.start) continue;
        entities_out[count] = .{
            .start = tok_buf[e.start].start,
            .end = tok_buf[e.end - 1].end,
            .label = e.label,
        };
        count += 1;
    }
    return count;
}

test {
    // reference all submodules so their tests run under `zig build test`
    _ = hash;
    _ = ops;
    _ = embed;
    _ = parser;
    _ = model;
    _ = tokenizer;
    _ = tokenizer_data;
}