this repo has no description
at main 84 lines 2.6 kB view raw
//! spacez — named entity recognition in zig.
//!
//! a from-scratch NER inference engine, compatible with spaCy's
//! en_core_web_sm model weights. hash embeddings → CNN → transition
//! parser, all in pure zig with zero dependencies.

pub const hash = @import("hash.zig");
pub const ops = @import("ops.zig");
pub const embed = @import("embed.zig");
pub const parser = @import("parser.zig");
pub const model = @import("model.zig");
pub const tokenizer = @import("tokenizer.zig");
pub const tokenizer_data = @import("tokenizer_data.zig");

// re-export key types at the top level
pub const Model = model.Model;
pub const Entity = parser.Entity;
pub const Label = parser.Label;
pub const Token = tokenizer.Token;
pub const TokenAttrs = embed.TokenAttrs;

// re-export key functions at the top level
pub const hashString = hash.hashString;
pub const extractAttrs = embed.extractAttrs;
pub const computeShape = embed.computeShape;
pub const tokenizeText = tokenizer.tokenize;

/// bundled en_core_web_sm weights (~6MB, embedded at compile time).
pub const en_core_web_sm = @embedFile("weights/en_core_web_sm.bin");

/// a recognized entity with byte offsets into the source text.
/// offsets index into the `text` slice passed to `recognize`; `end` is
/// exclusive, so the entity text is `text[start..end]`.
pub const SpanEntity = struct {
    start: u32, // byte offset of entity start in source text
    end: u32, // byte offset of entity end (exclusive)
    label: Label, // entity label assigned by the parser
};

/// run the full NER pipeline: tokenize → embed → CNN encode → parse.
/// returns the number of entities written to entities_out.
/// input is capped at model.MAX_TOKENS tokens; output stops once
/// entities_out is full, so size it generously to avoid truncation.
pub fn recognize(
    m: *const Model,
    text: []const u8,
    entities_out: []SpanEntity,
) u32 {
    // tokenize the raw text into token spans over `text`
    var tok_buf: [tokenizer.MAX_TOKENS]Token = undefined;
    const n_toks = tokenizeText(text, &tok_buf);
    if (n_toks == 0) return 0;

    // collect token text slices for the model (capped at model's MAX_TOKENS)
    const n: u32 = @min(n_toks, model.MAX_TOKENS);
    var tok_slices: [model.MAX_TOKENS][]const u8 = undefined;
    for (0..n) |i| {
        tok_slices[i] = tok_buf[i].text(text);
    }

    // run model prediction; entities come back as token-index spans
    const state = m.predict(tok_slices[0..n]);
    const ents = state.entities();

    // map token-index entities back to byte offsets
    var count: u32 = 0;
    for (ents) |e| {
        if (count >= entities_out.len) break;
        // skip malformed spans: out-of-range indices, or empty/inverted
        // spans (e.end <= e.start). the latter check also prevents a u32
        // underflow in `e.end - 1` below when e.end == 0 — a panic in
        // safe builds and UB in ReleaseFast.
        if (e.start >= n or e.end > n or e.end <= e.start) continue;
        entities_out[count] = .{
            .start = tok_buf[e.start].start,
            .end = tok_buf[e.end - 1].end,
            .label = e.label,
        };
        count += 1;
    }

    return count;
}

test {
    _ = hash;
    _ = ops;
    _ = embed;
    _ = parser;
    _ = model;
    _ = tokenizer;
    _ = tokenizer_data;
}