"""export en_core_web_sm NER weights to a flat binary file for spacez. binary format: [header] — struct of uint32 dimensions [weights] — contiguous float32 arrays in a fixed order the zig side mmap's this file and slices into named weight regions, following the karpathy/llama2.c pattern. usage: uv run --with spacy --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' python tools/export_weights.py """ import struct import sys from pathlib import Path import numpy as np def export(out_path: str = "src/weights/en_core_web_sm.bin"): import spacy nlp = spacy.load("en_core_web_sm") ner = nlp.get_pipe("ner") # disable everything except NER (same as coral) for pipe in nlp.pipe_names: if pipe != "ner": nlp.disable_pipe(pipe) model = ner.model tok2vec = model.get_ref("tok2vec") # ── extract dimensions ── # walk the model tree to find all named components # the tok2vec tree is: extract_features >> list2ragged >> with_array( # hashembed|hashembed|hashembed|hashembed) >> with_array(maxout >> layernorm >> dropout) # >> ragged2list >> with_array(residual(expand_window >> maxout >> layernorm >> dropout) * 4) # >> list2array >> linear hash_embeds = [] reduce_maxout = None reduce_ln = None for node in tok2vec.walk(): if node.name == "hashembed": hash_embeds.append(node) if node.name == "maxout" and reduce_maxout is None and len(hash_embeds) == 4: reduce_maxout = node if node.name == "layernorm" and reduce_ln is None and reduce_maxout is not None: reduce_ln = node break assert len(hash_embeds) == 4, f"expected 4 hash embeds, got {len(hash_embeds)}" assert reduce_maxout is not None, "missing reduction maxout" assert reduce_ln is not None, "missing reduction layernorm" # get embed table configs embed_configs = [] for he in hash_embeds: E = he.get_param("E") embed_configs.append({ "nV": E.shape[0], "nO": E.shape[1], "seed": he.attrs["seed"], }) print(f"hash embed configs: {embed_configs}") # CNN encoder blocks encoder_blocks = [] encoder_lns = [] in_encoder = False for node in tok2vec.walk(): if node.name == "maxout" and in_encoder: encoder_blocks.append(node) elif node.name == "layernorm" and in_encoder: encoder_lns.append(node) elif node.name == "residual": in_encoder = True # actually, let's just walk and collect all params in order # this is more reliable than trying to navigate the tree print("\n=== collecting weights ===") weights = [] # 1. hash embed tables (4x) for i, he in enumerate(hash_embeds): E = he.get_param("E") print(f"hash_embed[{i}] E: {E.shape} seed={he.attrs['seed']}") weights.append(("hash_embed_E", i, E)) # 2. reduction maxout (384 → 96) W = reduce_maxout.get_param("W") b = reduce_maxout.get_param("b") print(f"reduce_maxout W: {W.shape}, b: {b.shape}") weights.append(("reduce_maxout_W", 0, W)) weights.append(("reduce_maxout_b", 0, b)) # 3. reduction layernorm G = reduce_ln.get_param("G") b_ln = reduce_ln.get_param("b") print(f"reduce_ln G: {G.shape}, b: {b_ln.shape}") weights.append(("reduce_ln_G", 0, G)) weights.append(("reduce_ln_b", 0, b_ln)) # 4. CNN encoder blocks (4x residual: maxout + layernorm) # re-walk to find them properly cnn_maxouts = [] cnn_lns = [] found_reduce = False for node in tok2vec.walk(): if node.name == "maxout": if not found_reduce: # skip the reduction maxout (already handled) if node is reduce_maxout: found_reduce = True continue cnn_maxouts.append(node) elif node.name == "layernorm" and found_reduce: if node is not reduce_ln: cnn_lns.append(node) print(f"\nfound {len(cnn_maxouts)} CNN blocks, {len(cnn_lns)} CNN layernorms") for i, (mx, ln) in enumerate(zip(cnn_maxouts, cnn_lns)): W = mx.get_param("W") b = mx.get_param("b") G = ln.get_param("G") b_ln = ln.get_param("b") print(f"cnn_block[{i}] W: {W.shape}, b: {b.shape}, G: {G.shape}, b_ln: {b_ln.shape}") weights.append(("cnn_W", i, W)) weights.append(("cnn_b", i, b)) weights.append(("cnn_G", i, G)) weights.append(("cnn_b_ln", i, b_ln)) # 5. linear projection (tok2vec output → parser hidden) # this is the "upper" part of the transition model lower = model.get_ref("lower") upper = model.get_ref("upper") # find the linear projection at the end of tok2vec linear = None for node in tok2vec.walk(): if node.name == "linear": linear = node if linear is not None: W = linear.get_param("W") b = linear.get_param("b") print(f"linear_project W: {W.shape}, b: {b.shape}") weights.append(("linear_project_W", 0, W)) weights.append(("linear_project_b", 0, b)) # 6. precomputable affine (parser lower) for node in lower.walk(): if hasattr(node, 'get_param'): try: W = node.get_param("W") b = node.get_param("b") print(f"lower W: {W.shape}, b: {b.shape}") weights.append(("lower_W", 0, W)) weights.append(("lower_b", 0, b)) except Exception: pass try: pad = node.get_param("pad") print(f"lower pad: {pad.shape}") weights.append(("lower_pad", 0, pad)) except Exception: pass # 7. upper linear (hidden → actions) for node in upper.walk(): if hasattr(node, 'get_param'): try: W = node.get_param("W") b = node.get_param("b") print(f"upper W: {W.shape}, b: {b.shape}") weights.append(("upper_W", 0, W)) weights.append(("upper_b", 0, b)) except Exception: pass # ── write binary file ── out = Path(out_path) out.parent.mkdir(parents=True, exist_ok=True) # header: magic, version, then dimension values MAGIC = 0x5350435A # "SPCZ" for spacez VERSION = 1 # collect all dimension info we need tok2vec_width = embed_configs[0]["nO"] # 96 cnn_depth = len(cnn_maxouts) cnn_nP = 3 parser_hidden = 64 parser_nP = 2 parser_nF = 3 n_actions = 74 # 18*4 + 1(filler) + 1(OUT) if linear is not None: parser_hidden = linear.get_param("W").shape[0] header_values = [ MAGIC, VERSION, tok2vec_width, # 96 cnn_depth, # 4 cnn_nP, # 3 parser_hidden, # 64 parser_nP, # 2 parser_nF, # 3 n_actions, # 73 embed_configs[0]["nV"], # NORM table rows embed_configs[1]["nV"], # PREFIX table rows embed_configs[2]["nV"], # SUFFIX table rows embed_configs[3]["nV"], # SHAPE table rows embed_configs[0]["seed"], embed_configs[1]["seed"], embed_configs[2]["seed"], embed_configs[3]["seed"], ] total_floats = sum(w[2].size for w in weights) total_bytes = total_floats * 4 print(f"\ntotal: {len(weights)} weight arrays, {total_floats:,} floats, {total_bytes:,} bytes ({total_bytes/1024/1024:.2f} MB)") with open(out, "wb") as f: # write header (pad to 64 uint32s for alignment) header = header_values + [0] * (64 - len(header_values)) f.write(struct.pack(f"<{len(header)}I", *header)) # write weight arrays contiguously for name, idx, arr in weights: flat = arr.astype(np.float32).flatten() f.write(flat.tobytes()) print(f" wrote {name}[{idx}]: {arr.shape} = {flat.size} floats") print(f"\nwrote {out} ({out.stat().st_size:,} bytes)") # also write a manifest for debugging manifest_path = out.with_suffix(".manifest.txt") offset = 64 * 4 # header size in bytes with open(manifest_path, "w") as f: f.write(f"# spacez weight manifest\n") f.write(f"# header: {64 * 4} bytes ({64} uint32s)\n") f.write(f"# total weights: {total_floats:,} float32s ({total_bytes:,} bytes)\n\n") for name, idx, arr in weights: size = arr.size * 4 f.write(f"{offset:>10} {size:>10} {name}[{idx}] {arr.shape}\n") offset += size print(f"wrote {manifest_path}") # ── verify: run inference and compare ── print("\n=== verification ===") doc = nlp("Barack Obama visited Paris. SpaceX launched from Cape Canaveral.") print(f"spaCy entities:") for ent in doc.ents: print(f" {ent.text!r} → {ent.label_}") if __name__ == "__main__": export(sys.argv[1] if len(sys.argv) > 1 else "src/weights/en_core_web_sm.bin")