this repo has no description
at main 84 lines 2.6 kB view raw
//! spacez — named entity recognition in zig.
//!
//! a from-scratch NER inference engine, compatible with spaCy's
//! en_core_web_sm model weights. hash embeddings → CNN → transition
//! parser, all in pure zig with zero dependencies.

pub const hash = @import("hash.zig");
pub const ops = @import("ops.zig");
pub const embed = @import("embed.zig");
pub const parser = @import("parser.zig");
pub const model = @import("model.zig");
pub const tokenizer = @import("tokenizer.zig");
pub const tokenizer_data = @import("tokenizer_data.zig");

// re-export key types at the top level
pub const Model = model.Model;
pub const Entity = parser.Entity;
pub const Label = parser.Label;
pub const Token = tokenizer.Token;
pub const TokenAttrs = embed.TokenAttrs;

// re-export key functions at the top level
pub const hashString = hash.hashString;
pub const extractAttrs = embed.extractAttrs;
pub const computeShape = embed.computeShape;
pub const tokenizeText = tokenizer.tokenize;

/// bundled en_core_web_sm weights (~6MB, embedded at compile time).
pub const en_core_web_sm = @embedFile("weights/en_core_web_sm.bin");

/// a recognized entity with byte offsets into the source text.
/// offsets index into the `text` slice passed to `recognize`; `end` is
/// exclusive, so the entity text is `text[start..end]`.
pub const SpanEntity = struct {
    start: u32, // byte offset of entity start in source text
    end: u32, // byte offset of entity end (exclusive)
    label: Label, // entity label assigned by the parser
};

/// run the full NER pipeline: tokenize → embed → CNN encode → parse.
/// returns the number of entities written to entities_out.
/// input is capped at model.MAX_TOKENS tokens; output stops once
/// entities_out is full, so size it generously to avoid truncation.
pub fn recognize(
    m: *const Model,
    text: []const u8,
    entities_out: []SpanEntity,
) u32 {
    // tokenize the raw text into token spans over `text`
    var tok_buf: [tokenizer.MAX_TOKENS]Token = undefined;
    const n_toks = tokenizeText(text, &tok_buf);
    if (n_toks == 0) return 0;

    // collect token text slices for the model (capped at model's MAX_TOKENS)
    const n: u32 = @min(n_toks, model.MAX_TOKENS);
    var tok_slices: [model.MAX_TOKENS][]const u8 = undefined;
    for (0..n) |i| {
        tok_slices[i] = tok_buf[i].text(text);
    }

    // run model prediction; entities come back as token-index spans
    const state = m.predict(tok_slices[0..n]);
    const ents = state.entities();

    // map token-index entities back to byte offsets
    var count: u32 = 0;
    for (ents) |e| {
        if (count >= entities_out.len) break;
        // skip malformed spans: out-of-range indices, or empty/inverted
        // spans (e.end <= e.start). the latter check also prevents a u32
        // underflow in `e.end - 1` below when e.end == 0 — a panic in
        // safe builds and UB in ReleaseFast.
        if (e.start >= n or e.end > n or e.end <= e.start) continue;
        entities_out[count] = .{
            .start = tok_buf[e.start].start,
            .end = tok_buf[e.end - 1].end,
            .label = e.label,
        };
        count += 1;
    }

    return count;
}

test {
    _ = hash;
    _ = ops;
    _ = embed;
    _ = parser;
    _ = model;
    _ = tokenizer;
    _ = tokenizer_data;
}