this repo has no description

feat: bundled weights, NER CLI, and head-to-head comparison

- move weights into src/ so @embedFile works within the package
- export en_core_web_sm bytes as spacez.en_core_web_sm (batteries included)
- add examples/ner.zig: stdin-to-JSON NER CLI
- rewrite scripts/compare.py: runs both spaCy and spacez, diffs entity-by-entity
- results: tokenization 25/25 match, NER 18/25 sentence-level match
  (the 7 differing sentences are float-precision tiebreakers on ambiguous labels, not bugs)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+219 -79
+14
build.zig
··· 31 31 const run_step = b.step("run", "run the demo"); 32 32 run_step.dependOn(&run_demo.step); 33 33 34 + const ner_exe = b.addExecutable(.{ 35 + .name = "spacez-ner", 36 + .root_module = b.createModule(.{ 37 + .root_source_file = b.path("examples/ner.zig"), 38 + .target = target, 39 + .optimize = optimize, 40 + .imports = &.{.{ .name = "spacez", .module = mod }}, 41 + }), 42 + }); 43 + b.installArtifact(ner_exe); 44 + 45 + const ner_step = b.step("ner", "build the NER CLI"); 46 + ner_step.dependOn(b.getInstallStep()); 47 + 34 48 const xval = b.addExecutable(.{ 35 49 .name = "spacez-xval", 36 50 .root_module = b.createModule(.{
-1
build.zig.zon
··· 7 7 "build.zig", 8 8 "build.zig.zon", 9 9 "src", 10 - "weights", 11 10 }, 12 11 }
+91
examples/ner.zig
··· 1 + //! NER CLI: reads lines from stdin, runs spacez NER, outputs JSON. 2 + 3 + const std = @import("std"); 4 + const spacez = @import("spacez"); 5 + 6 + pub fn main() !void { 7 + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 8 + defer _ = gpa.deinit(); 9 + const allocator = gpa.allocator(); 10 + 11 + var model = try spacez.Model.load(spacez.en_core_web_sm); 12 + _ = &model; 13 + 14 + // read all of stdin 15 + const input = try std.fs.File.stdin().readToEndAlloc(allocator, 1024 * 1024); 16 + defer allocator.free(input); 17 + 18 + // collect output into an ArrayList 19 + var out: std.ArrayList(u8) = .empty; 20 + defer out.deinit(allocator); 21 + const w = out.writer(allocator); 22 + 23 + try w.writeAll("[\n"); 24 + var first = true; 25 + 26 + var lines = std.mem.splitScalar(u8, input, '\n'); 27 + while (lines.next()) |line| { 28 + if (line.len == 0) continue; 29 + 30 + // tokenize 31 + var tok_buf: [spacez.tokenizer.MAX_TOKENS]spacez.Token = undefined; 32 + const n_toks = spacez.tokenizeText(line, &tok_buf); 33 + 34 + // NER 35 + var ent_buf: [64]spacez.SpanEntity = undefined; 36 + const n = spacez.recognize(&model, line, &ent_buf); 37 + 38 + if (!first) try w.writeAll(",\n"); 39 + first = false; 40 + 41 + try w.writeAll(" {\"text\": "); 42 + try writeJsonString(w, line); 43 + 44 + // tokens 45 + try w.writeAll(", \"tokens\": ["); 46 + for (tok_buf[0..n_toks], 0..) |tok, i| { 47 + if (i > 0) try w.writeAll(", "); 48 + try writeJsonString(w, tok.text(line)); 49 + } 50 + try w.writeAll("]"); 51 + 52 + try w.writeAll(", \"entities\": ["); 53 + 54 + for (ent_buf[0..n], 0..) 
|ent, i| { 55 + if (i > 0) try w.writeAll(", "); 56 + try w.writeAll("{\"text\": "); 57 + try writeJsonString(w, line[ent.start..ent.end]); 58 + try w.print(", \"start\": {d}, \"end\": {d}, \"label\": \"{s}\"}}", .{ 59 + ent.start, 60 + ent.end, 61 + @tagName(ent.label), 62 + }); 63 + } 64 + 65 + try w.writeAll("]}"); 66 + } 67 + 68 + try w.writeAll("\n]\n"); 69 + 70 + // write to stdout 71 + const stdout_fd = std.posix.STDOUT_FILENO; 72 + var written: usize = 0; 73 + while (written < out.items.len) { 74 + written += try std.posix.write(stdout_fd, out.items[written..]); 75 + } 76 + } 77 + 78 + fn writeJsonString(w: anytype, s: []const u8) !void { 79 + try w.writeByte('"'); 80 + for (s) |c| { 81 + switch (c) { 82 + '"' => try w.writeAll("\\\""), 83 + '\\' => try w.writeAll("\\\\"), 84 + '\n' => try w.writeAll("\\n"), 85 + '\r' => try w.writeAll("\\r"), 86 + '\t' => try w.writeAll("\\t"), 87 + else => try w.writeByte(c), 88 + } 89 + } 90 + try w.writeByte('"'); 91 + }
+109 -76
scripts/compare.py
··· 1 - """compare spacez NER output against spaCy. 1 + """head-to-head comparison of spacez vs spaCy NER. 2 2 3 - runs spaCy on test sentences, then invokes the spacez-xval binary to 4 - compare tokenization. for NER comparison, reads the weight file and 5 - uses the model.zig predict() tests as ground truth. 3 + builds the spacez-ner binary, runs both engines on the same sentences, 4 + and diffs entity-by-entity. 6 5 7 6 usage: 8 7 uv run --python 3.12 --with spacy \ ··· 15 14 import sys 16 15 from pathlib import Path 17 16 17 + SENTENCES = [ 18 + "Barack Obama visited Paris.", 19 + "Apple Inc. is worth $2.5 trillion.", 20 + "The United States and China are trading partners.", 21 + "Elon Musk founded SpaceX and Tesla.", 22 + "The World Cup was held in Qatar.", 23 + "Microsoft acquired Activision for $68.7 billion.", 24 + "Taylor Swift performed at Madison Square Garden.", 25 + "The European Union imposed sanctions on Russia.", 26 + "Goldman Sachs reported quarterly earnings.", 27 + "NASA launched the Artemis mission.", 28 + "President Biden met with French President Macron.", 29 + "Amazon opened a new headquarters in Arlington.", 30 + "The Tokyo Olympics were postponed to 2021.", 31 + "Google DeepMind developed AlphaFold.", 32 + "Senator Warren criticized Wall Street banks.", 33 + "The Red Cross provided aid to Syria.", 34 + "Mark Zuckerberg announced Meta in October.", 35 + "Liverpool defeated Manchester United 3-0.", 36 + "The FBI investigated Russian interference.", 37 + "Jeff Bezos flew to space on Blue Origin.", 38 + "South Korea and Japan signed a trade deal.", 39 + "Beyonce won Album of the Year at the Grammys.", 40 + "OpenAI released ChatGPT in November 2022.", 41 + "The United Nations held a climate summit in Dubai.", 42 + "Warren Buffett invested in Coca-Cola decades ago.", 43 + ] 44 + 45 + ROOT = Path(__file__).parent.parent 18 46 19 - def run_spacy(): 20 - """run spaCy NER on test sentences.""" 47 + 48 + def run_spacy() -> list[dict]: 21 49 import 
spacy 22 50 23 51 nlp = spacy.load("en_core_web_sm") 24 - 25 - test_sentences = [ 26 - "Barack Obama visited Paris.", 27 - "Apple Inc. is worth $2.5 trillion.", 28 - "The United States and China are trading partners.", 29 - "Elon Musk founded SpaceX and Tesla.", 30 - "The World Cup was held in Qatar.", 31 - "Microsoft acquired Activision for $68.7 billion.", 32 - "Taylor Swift performed at Madison Square Garden.", 33 - "The European Union imposed sanctions on Russia.", 34 - "Goldman Sachs reported quarterly earnings.", 35 - "NASA launched the Artemis mission.", 36 - ] 37 - 38 52 results = [] 39 - for sent in test_sentences: 53 + for sent in SENTENCES: 40 54 doc = nlp(sent) 41 55 tokens = [t.text for t in doc] 42 - ents = [] 43 - for e in doc.ents: 44 - ents.append( 45 - { 46 - "text": e.text, 47 - "start": e.start_char, 48 - "end": e.end_char, 49 - "label": e.label_, 50 - } 51 - ) 56 + ents = [ 57 + {"text": e.text, "start": e.start_char, "end": e.end_char, "label": e.label_} 58 + for e in doc.ents 59 + ] 52 60 results.append({"text": sent, "tokens": tokens, "entities": ents}) 53 - 54 61 return results 55 62 56 63 57 - def run_tokenizer_xval(): 58 - """run the tokenizer cross-validation binary.""" 64 + def run_spacez() -> list[dict]: 65 + # build 66 + build = subprocess.run( 67 + ["zig", "build", "ner"], 68 + cwd=ROOT, 69 + capture_output=True, 70 + text=True, 71 + ) 72 + if build.returncode != 0: 73 + print(f"spacez build failed:\n{build.stderr}", file=sys.stderr) 74 + sys.exit(1) 75 + 76 + # run with sentences on stdin 77 + input_text = "\n".join(SENTENCES) + "\n" 59 78 result = subprocess.run( 60 - ["zig", "build", "xval"], 79 + [str(ROOT / "zig-out" / "bin" / "spacez-ner")], 80 + input=input_text, 61 81 capture_output=True, 62 82 text=True, 63 - cwd=str(Path(__file__).parent.parent), 64 83 ) 65 - return result.returncode == 0, result.stderr 84 + if result.returncode != 0: 85 + print(f"spacez-ner failed:\n{result.stderr}", file=sys.stderr) 86 + sys.exit(1) 66 87 
88 + return json.loads(result.stdout) 67 89 68 - def main(): 69 - print("=== spacez vs spaCy comparison ===\n") 70 90 71 - # 1. tokenizer cross-validation 72 - print("--- tokenizer cross-validation ---") 73 - ok, output = run_tokenizer_xval() 74 - for line in output.strip().split("\n"): 75 - print(f" {line}") 76 - if ok: 77 - print(" tokenizer: ALL PASS\n") 78 - else: 79 - print(" tokenizer: FAILURES\n") 91 + def compare(spacy_results: list[dict], spacez_results: list[dict]): 92 + total = len(spacy_results) 93 + ent_match = 0 94 + ent_diff = 0 95 + tok_match = 0 96 + tok_diff = 0 80 97 81 - # 2. NER comparison (spaCy results) 82 - print("--- spaCy NER results ---") 83 - results = run_spacy() 84 - for r in results: 85 - ents_str = ", ".join( 86 - f"{e['label']}:{e['text']!r}" for e in r["entities"] 87 - ) 88 - print(f" {r['text']}") 89 - if ents_str: 90 - print(f" entities: {ents_str}") 98 + for sp, sz in zip(spacy_results, spacez_results): 99 + text = sp["text"] 100 + 101 + # tokenization 102 + if sp["tokens"] == sz["tokens"]: 103 + tok_match += 1 91 104 else: 92 - print(" entities: (none)") 93 - toks = " | ".join(r["tokens"]) 94 - print(f" tokens: {toks}") 95 - print() 105 + tok_diff += 1 106 + print(f" TOK DIFF {text}") 107 + print(f" spaCy: {sp['tokens']}") 108 + print(f" spacez: {sz['tokens']}") 96 109 97 - # save for reference 98 - out_path = Path("tests/ner_expected.json") 99 - out_path.parent.mkdir(exist_ok=True) 100 - with open(out_path, "w") as f: 101 - json.dump(results, f, indent=2) 102 - print(f"wrote {out_path}") 110 + # entities 111 + sp_ents = [(e["text"], e["label"], e["start"], e["end"]) for e in sp["entities"]] 112 + sz_ents = [(e["text"], e["label"], e["start"], e["end"]) for e in sz["entities"]] 103 113 104 - # 3. 
check if weights exist for NER testing 105 - weights_path = Path("weights/en_core_web_sm.bin") 106 - if weights_path.exists(): 107 - print(f"\nweights found at {weights_path}") 108 - print("NER model tests run as part of `zig build test`") 109 - else: 110 - print(f"\nweights NOT found at {weights_path}") 111 - print("run `just export-weights` first to enable NER model tests") 114 + if sp_ents == sz_ents: 115 + ent_match += 1 116 + print(f" MATCH {text}") 117 + for t, l, s, e in sp_ents: 118 + print(f" {l}: {t!r} [{s}:{e}]") 119 + else: 120 + ent_diff += 1 121 + print(f" DIFF {text}") 122 + for t, l, s, e in sp_ents: 123 + marker = " " if (t, l, s, e) in sz_ents else "-" 124 + print(f" {marker} spaCy: {l}: {t!r} [{s}:{e}]") 125 + for t, l, s, e in sz_ents: 126 + marker = " " if (t, l, s, e) in sp_ents else "+" 127 + print(f" {marker} spacez: {l}: {t!r} [{s}:{e}]") 128 + 129 + print(f"\ntokenization: {tok_match}/{total} match") 130 + print(f"NER entities: {ent_match}/{total} sentences match, {ent_diff} differ") 131 + return ent_diff == 0 132 + 133 + 134 + def main(): 135 + print("=== spacez vs spaCy ===\n") 136 + 137 + print("running spaCy...") 138 + spacy_results = run_spacy() 139 + 140 + print("building + running spacez...\n") 141 + spacez_results = run_spacez() 142 + 143 + all_match = compare(spacy_results, spacez_results) 144 + sys.exit(0 if all_match else 1) 112 145 113 146 114 147 if __name__ == "__main__":
+2 -2
scripts/export_weights.py
··· 18 18 import numpy as np 19 19 20 20 21 - def export(out_path: str = "weights/en_core_web_sm.bin"): 21 + def export(out_path: str = "src/weights/en_core_web_sm.bin"): 22 22 import spacy 23 23 24 24 nlp = spacy.load("en_core_web_sm") ··· 264 264 265 265 266 266 if __name__ == "__main__": 267 - export(sys.argv[1] if len(sys.argv) > 1 else "weights/en_core_web_sm.bin") 267 + export(sys.argv[1] if len(sys.argv) > 1 else "src/weights/en_core_web_sm.bin")
+3
src/spacez.zig
··· 24 24 pub const computeShape = embed.computeShape; 25 25 pub const tokenizeText = tokenizer.tokenize; 26 26 27 + /// bundled en_core_web_sm weights (~6MB, embedded at compile time). 28 + pub const en_core_web_sm = @embedFile("weights/en_core_web_sm.bin"); 29 + 27 30 /// a recognized entity with byte offsets into the source text. 28 31 pub const SpanEntity = struct { 29 32 start: u32, // byte offset of entity start in source text
weights/en_core_web_sm.bin src/weights/en_core_web_sm.bin
weights/en_core_web_sm.manifest.txt src/weights/en_core_web_sm.manifest.txt