spacez — a zero-dependency named entity recognition engine in Zig, compatible with spaCy's en_core_web_sm weights.
1//! spacez — named entity recognition in zig.
2//!
3//! a from-scratch NER inference engine, compatible with spaCy's
4//! en_core_web_sm model weights. hash embeddings → CNN → transition
5//! parser, all in pure zig with zero dependencies.
6
7pub const hash = @import("hash.zig");
8pub const ops = @import("ops.zig");
9pub const embed = @import("embed.zig");
10pub const parser = @import("parser.zig");
11pub const model = @import("model.zig");
12pub const tokenizer = @import("tokenizer.zig");
13pub const tokenizer_data = @import("tokenizer_data.zig");
14
15// re-export key types at the top level
16pub const Model = model.Model;
17pub const Entity = parser.Entity;
18pub const Label = parser.Label;
19pub const Token = tokenizer.Token;
20pub const TokenAttrs = embed.TokenAttrs;
21
22pub const hashString = hash.hashString;
23pub const extractAttrs = embed.extractAttrs;
24pub const computeShape = embed.computeShape;
25pub const tokenizeText = tokenizer.tokenize;
26
/// bundled en_core_web_sm weights (~6MB, embedded at compile time).
/// raw bytes of the project's binary weight file, exposed as `[]const u8`;
/// NOTE(review): presumably consumed by `model` to construct a `Model` —
/// confirm the expected format against model.zig.
pub const en_core_web_sm = @embedFile("weights/en_core_web_sm.bin");
29
/// a recognized entity with byte offsets into the source text.
/// produced by `recognize`, which converts the parser's token-index
/// entities into byte ranges; `text[start..end]` is the entity's surface form.
pub const SpanEntity = struct {
    /// byte offset of entity start in source text (inclusive).
    start: u32,
    /// byte offset of entity end in source text (exclusive).
    end: u32,
    /// entity label (e.g. person/org/location — see parser.Label).
    label: Label,
};
36
/// run the full NER pipeline: tokenize → embed → CNN encode → parse.
/// maps the parser's token-index entities back to byte offsets in `text`.
/// input is capped at `model.MAX_TOKENS` tokens; entities that do not fit
/// in `entities_out` are silently dropped.
/// returns the number of entities written to `entities_out`.
pub fn recognize(
    m: *const Model,
    text: []const u8,
    entities_out: []SpanEntity,
) u32 {
    // tokenize the raw text into token spans
    var tok_buf: [tokenizer.MAX_TOKENS]Token = undefined;
    const n_toks = tokenizeText(text, &tok_buf);
    if (n_toks == 0) return 0;

    // collect token text slices for the model, capped at the model's limit
    // (which may be smaller than the tokenizer's)
    const n: u32 = @min(n_toks, model.MAX_TOKENS);
    var tok_slices: [model.MAX_TOKENS][]const u8 = undefined;
    for (0..n) |i| {
        tok_slices[i] = tok_buf[i].text(text);
    }

    // run model prediction (hash embed → CNN → transition parser)
    const state = m.predict(tok_slices[0..n]);
    const ents = state.entities();

    // map token-index entities back to byte offsets
    var count: u32 = 0;
    for (ents) |e| {
        if (count >= entities_out.len) break;
        // skip spans outside the tokens we actually fed the model, and
        // empty/inverted spans: with e.end <= e.start, `e.end - 1` below
        // would underflow (panic in safe modes, UB in ReleaseFast).
        if (e.start >= n or e.end > n or e.end <= e.start) continue;
        entities_out[count] = .{
            .start = tok_buf[e.start].start,
            .end = tok_buf[e.end - 1].end,
            .label = e.label,
        };
        count += 1;
    }

    return count;
}
75
// anonymous test block: referencing each sub-module here pulls its `test`
// declarations into the build, so `zig build test` / `zig test` on this
// root file runs the whole suite. the discarded `_ =` assignments exist
// only to force semantic analysis of each import.
test {
    _ = hash;
    _ = ops;
    _ = embed;
    _ = parser;
    _ = model;
    _ = tokenizer;
    _ = tokenizer_data;
}