"""dump ALL intermediate values from spaCy's NER pipeline for debugging. compares these against the spacez zig reimplementation to find divergence. usage: uv run --python 3.12 --with spacy \ --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \ python scripts/dump_intermediates.py """ import numpy as np np.set_printoptions(precision=10, suppress=False, linewidth=200) SENTENCE = "NASA launched the Artemis mission." def main(): import spacy from spacy.attrs import NORM, PREFIX, SUFFIX, SHAPE from thinc.backends.numpy_ops import NumpyOps nlp = spacy.load("en_core_web_sm") ner = nlp.get_pipe("ner") model = ner.model ops = NumpyOps() # the NER model has its own tok2vec ref (not a listener — a standalone model) # with 4 hash embeds (NORM, PREFIX, SUFFIX, SHAPE), not the 6 from the pipe tok2vec tok2vec = model.get_ref("tok2vec") lower = model.get_ref("lower") upper = model.get_ref("upper") # walk the NER tok2vec to find components hash_embeds = [] reduce_maxout = None reduce_ln = None for node in tok2vec.walk(): if node.name == "hashembed": hash_embeds.append(node) if node.name == "maxout" and reduce_maxout is None and len(hash_embeds) == 4: reduce_maxout = node if node.name == "layernorm" and reduce_ln is None and reduce_maxout is not None: reduce_ln = node break assert len(hash_embeds) == 4, f"expected 4 hash embeds, got {len(hash_embeds)}" # find CNN residual blocks — filter to exactly the single-block residuals, # not the chain-of-residuals node residual_blocks = [] for node in tok2vec.walk(): if node.name == "residual(expand_window>>maxout>>layernorm>>dropout)": residual_blocks.append(node) assert len(residual_blocks) == 4, f"expected 4 residual blocks, got {len(residual_blocks)}" # find linear projection linear_proj = None for node in tok2vec.walk(): if node.name == "linear": linear_proj = node # ── (a) token texts ── print("=" * 80) print("(a) TOKEN TEXTS (from tokenizer)") print("=" * 80) doc = nlp.make_doc(SENTENCE) tokens = [t.text for t in doc] print(f"sentence: {SENTENCE!r}") print(f"tokens ({len(tokens)}): {tokens}") print() # ── (b) token attributes: NORM, PREFIX, SUFFIX, SHAPE hashes ── print("=" * 80) print("(b) TOKEN ATTRIBUTES (NORM, PREFIX, SUFFIX, SHAPE hashes)") print("=" * 80) attr_array = doc.to_array([NORM, PREFIX, SUFFIX, SHAPE]) for i, tok in enumerate(doc): print(f" token[{i}] = {tok.text!r}") print(f" NORM = 0x{attr_array[i][0]:016x} ({tok.norm_!r})") print(f" PREFIX = 0x{attr_array[i][1]:016x} ({tok.prefix_!r})") print(f" SUFFIX = 0x{attr_array[i][2]:016x} ({tok.suffix_!r})") print(f" SHAPE = 0x{attr_array[i][3]:016x} ({tok.shape_!r})") print() # ── (c) hash embedding table lookups ── print("=" * 80) print("(c) HASH EMBEDDING TABLE LOOKUPS (raw rows from each embed table)") print("=" * 80) attr_names = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] all_embed_rows = [] # [embed_idx][tok_idx] = row vector for embed_idx, he in enumerate(hash_embeds): E = he.get_param("E") seed = he.attrs["seed"] nV = E.shape[0] nO = E.shape[1] print(f"\n hash_embed[{embed_idx}] ({attr_names[embed_idx]}): E.shape={E.shape}, seed={seed}") embed_rows = [] for tok_idx, tok in enumerate(doc): attr_val = attr_array[tok_idx][embed_idx] key_arr = np.array([attr_val], dtype=np.uint64) hash_result = ops.hash(key_arr, seed) # shape (1, 4) of uint32 bucket = hash_result[0][0] % nV row = E[bucket] embed_rows.append(row) print(f" token[{tok_idx}] {tok.text!r}: attr=0x{attr_val:016x}, hash4={hash_result[0].tolist()}, bucket={bucket}") print(f" row = {row}") all_embed_rows.append(embed_rows) print() # ── (d) MultiHashEmbed output (after concat → maxout reduce → layernorm) ── print("=" * 80) print("(d) MULTIHASHEMBED OUTPUT (concat → maxout → layernorm → 96-dim)") print("=" * 80) # concatenate the 4 embed rows for each token: 4 × 96 = 384 n_tokens = len(doc) concat_matrix = np.zeros((n_tokens, 4 * 96), dtype=np.float32) for tok_idx in range(n_tokens): parts = [all_embed_rows[e][tok_idx] for e in range(4)] concat_matrix[tok_idx] = np.concatenate(parts) print(f"\n concatenated embeddings: shape={concat_matrix.shape}") for tok_idx, tok in enumerate(doc): print(f" token[{tok_idx}] {tok.text!r}: first 16 = {concat_matrix[tok_idx, :16]}") # apply reduction maxout W_maxout = reduce_maxout.get_param("W") b_maxout = reduce_maxout.get_param("b") nO = b_maxout.shape[0] nP = b_maxout.shape[1] nI = W_maxout.shape[-1] print(f"\n reduce maxout: W={W_maxout.shape}, b={b_maxout.shape} (nO={nO}, nP={nP}, nI={nI})") W_flat = W_maxout.reshape(nO * nP, nI) b_flat = b_maxout.reshape(nO * nP) Y_pre_max = concat_matrix @ W_flat.T + b_flat Y_pieces = Y_pre_max.reshape(-1, nO, nP) Y_maxout = Y_pieces.max(axis=-1) print(f"\n after maxout: shape={Y_maxout.shape}") for tok_idx, tok in enumerate(doc): print(f" token[{tok_idx}] {tok.text!r}: {Y_maxout[tok_idx]}") # apply layernorm G_ln = reduce_ln.get_param("G") b_ln = reduce_ln.get_param("b") mean = Y_maxout.mean(axis=-1, keepdims=True) var = Y_maxout.var(axis=-1, keepdims=True) std = np.sqrt(var + 1e-12) Y_ln = G_ln * (Y_maxout - mean) / std + b_ln print(f"\n after layernorm (manual): shape={Y_ln.shape}") for tok_idx, tok in enumerate(doc): print(f" token[{tok_idx}] {tok.text!r}: {Y_ln[tok_idx]}") # now get ground truth by running the embed portion of the model # the NER tok2vec structure is: # layers[0] = tok2vec_chain (embed+cnn) # layers[0]._layers[0] = embed (extract_features >> list2ragged >> with_array(concat_embeds) >> with_array(maxout>>ln>>drop) >> ragged2list) # layers[0]._layers[1] = with_array(4 × residual) # layers[1] = list2array # layers[2] = linear tok2vec_chain = tok2vec._layers[0] embed_chain = tok2vec_chain._layers[0] encode_with_array = tok2vec_chain._layers[1] list2array_layer = tok2vec._layers[1] linear_layer = tok2vec._layers[2] # run the embed chain to get ground-truth embed output doc_fresh = nlp.make_doc(SENTENCE) embed_output = embed_chain.predict([doc_fresh]) # list of arrays print(f"\n ground-truth embed output (from model): shape={embed_output[0].shape}") for tok_idx, tok in enumerate(doc_fresh): print(f" token[{tok_idx}] {tok.text!r}: {embed_output[0][tok_idx]}") print() # ── (e) after each CNN block ── print("=" * 80) print("(e) AFTER EACH CNN BLOCK (4 residual blocks)") print("=" * 80) # get the inner chain of residual blocks # encode_with_array wraps a chain of 4 residual blocks encode_inner = encode_with_array._layers[0] # the chain of 4 residuals residual_layers = encode_inner._layers # run residual blocks one at a time on the embed output current = embed_output[0].copy() print(f"\n input to CNN (embed output): shape={current.shape}") for block_idx, res_block in enumerate(residual_layers): current = res_block.predict(current) print(f"\n after CNN block {block_idx}: shape={current.shape}") for tok_idx, tok in enumerate(doc_fresh): print(f" token[{tok_idx}] {tok.text!r}: {current[tok_idx]}") # verify against full encode encoded_output = encode_with_array.predict(embed_output) print(f"\n ground-truth encode output: shape={encoded_output[0].shape}") for tok_idx, tok in enumerate(doc_fresh): print(f" token[{tok_idx}] {tok.text!r}: {encoded_output[0][tok_idx]}") # verify match diff = np.abs(current - encoded_output[0]).max() print(f" max diff between manual CNN and model encode: {diff}") print() # ── (f) after linear projection ── print("=" * 80) print("(f) AFTER LINEAR PROJECTION (96 → 64-dim)") print("=" * 80) # apply list2array then linear as_array = list2array_layer.predict(encoded_output) print(f" list2array output: shape={as_array.shape}") projected = linear_layer.predict(as_array) print(f" linear projection output: shape={projected.shape}") for tok_idx, tok in enumerate(doc_fresh): print(f" token[{tok_idx}] {tok.text!r}: {projected[tok_idx]}") # also verify via full tok2vec predict doc_verify = nlp.make_doc(SENTENCE) full_output = tok2vec.predict([doc_verify]) print(f"\n ground-truth full tok2vec output: shape={full_output.shape}") for tok_idx, tok in enumerate(doc_verify): print(f" token[{tok_idx}] {tok.text!r}: {full_output[tok_idx]}") diff = np.abs(projected - full_output).max() print(f" max diff between manual and model.predict: {diff}") # also print linear weights for reference W_lin = linear_layer.get_param("W") b_lin = linear_layer.get_param("b") print(f"\n linear W: {W_lin.shape}, b: {b_lin.shape}") print() # ── (g) parser steps ── print("=" * 80) print("(g) PARSER (NER) TRANSITION STEPS") print("=" * 80) # get weights lower_W = lower.get_param("W") # (3, 64, 2, 64) = (nF, nO, nP, nI) lower_b = lower.get_param("b") # (64, 2) = (nO, nP) lower_pad = lower.get_param("pad") # (1, 3, 64, 2) = (1, nF, nO, nP) upper_W = upper.get_param("W") # (74, 64) = (n_actions, nO) upper_b = upper.get_param("b") # (74,) = (n_actions,) nF, nO_l, nP_l, nI_l = lower_W.shape print(f"\n lower: W={lower_W.shape} (nF={nF}, nO={nO_l}, nP={nP_l}, nI={nI_l})") print(f" lower: b={lower_b.shape}, pad={lower_pad.shape}") print(f" upper: W={upper_W.shape}, b={upper_b.shape}") # get action names moves = ner.moves n_actions = moves.n_moves action_names = [moves.get_class_name(i) for i in range(n_actions)] print(f"\n actions ({n_actions}): {action_names}") # precompute lower features for all tokens # PrecomputableAffine: Y[t, f, o, p] = sum_i(X[t, i] * W[f, o, p, i]) # bias b[o,p] is added AFTER summing features, not per-token # X = tokvecs, shape (nT, nI_l=64) tokvecs = full_output # shape (nT, 64) precomputed = np.einsum('ti,fopi->tfop', tokvecs, lower_W) print(f"\n precomputed shape: {precomputed.shape}") print(f"\n lower pad vector:") print(f" pad shape: {lower_pad.shape}") for f in range(nF): print(f" pad[0, {f}]: {lower_pad[0, f]}") print() print(f" precomputed features per token:") for tok_idx in range(n_tokens): print(f" token[{tok_idx}] {doc[tok_idx].text!r}:") for f in range(nF): print(f" feat[{f}]: {precomputed[tok_idx, f]}") print() # step through the NER transition system print(" stepping through NER transitions...") print() # CRITICAL: for nF=3, spaCy's set_context_tokens uses: # ids[0] = B(0) — current buffer token # ids[1] = E(0) — first word of open entity, or -1 # ids[2] = B(0) - 1 — word before buffer (end of entity), or -1 # NOT [S(0), B(0), B(1)] as one might assume. from spacy.pipeline._parser_internals.stateclass import StateClass doc_step = nlp.make_doc(SENTENCE) # we need the tok2vec output on this doc — run tok2vec.predict step_tokvecs = tok2vec.predict([doc_step]) # precompute for this doc (NO bias — bias is added after summing features) step_precomputed = np.einsum('ti,fopi->tfop', step_tokvecs, lower_W) # use spaCy's actual get_token_ids via ParserStepModel to get correct features # we construct a lightweight wrapper that gives us token_ids state = StateClass(doc_step) step = 0 # helper: extract token ids using spaCy's actual C code def get_feat_ids(state): """get feature token ids using spaCy's set_context_tokens.""" ids = np.zeros((1, nF), dtype=np.int32) ids.fill(-1) # use the state's C-level set_context_tokens via the StateClass wrapper # StateClass wraps StateC; we can call get_token_ids on a step model # but it's simpler to just use the known nF=3 logic: # ids[0] = B(0), ids[1] = E(0) if entity open else -1, # ids[2] = B(0)-1 if both ids[0] and ids[1] are valid else -1 b0 = state.B(0) if b0 >= 0: ids[0, 0] = b0 else: ids[0, 0] = -1 if state.entity_is_open(): ids[0, 1] = state.E(0) else: ids[0, 1] = -1 if ids[0, 0] == -1 or ids[0, 1] == -1: ids[0, 2] = -1 else: ids[0, 2] = ids[0, 0] - 1 return ids[0] while not state.is_final(): feat_ids = get_feat_ids(state) print(f" --- step {step} ---") feat_labels = ["B(0)", "E(0)", "B(0)-1"] print(f" feature token indices: {feat_labels[0]}={feat_ids[0]}, {feat_labels[1]}={feat_ids[1]}, {feat_labels[2]}={feat_ids[2]}") for fi in range(nF): tid = feat_ids[fi] if 0 <= tid < n_tokens: print(f" feat[{fi}] ({feat_labels[fi]}) → token[{tid}] = {doc_step[tid].text!r}") else: print(f" feat[{fi}] ({feat_labels[fi]}) → PAD (index {tid})") # sum precomputed features (or pad) for each feature slot hidden_input = np.zeros((nO_l, nP_l), dtype=np.float32) for fi in range(nF): tid = feat_ids[fi] if 0 <= tid < n_tokens: contrib = step_precomputed[tid, fi] hidden_input += contrib print(f" precomp[{tid},{fi}] = {contrib}") else: contrib = lower_pad[0, fi] hidden_input += contrib print(f" pad[{fi}] = {contrib}") # add bias (applied after summing, before maxout) hidden_input += lower_b print(f" summed + bias (nO×nP = {nO_l}×{nP_l}): {hidden_input}") # maxout over pieces hidden = hidden_input.max(axis=-1) which = hidden_input.argmax(axis=-1) print(f" after maxout: {hidden}") print(f" maxout winners: {which}") # apply upper: scores = hidden @ W.T + b scores = hidden @ upper_W.T + upper_b print(f" raw scores ({n_actions}): {scores}") # valid actions and best — is_valid takes a move name string valid = [] for i in range(n_actions): if moves.is_valid(state, action_names[i]): valid.append(i) # find best valid best_idx = -1 best_score = -float('inf') for i in valid: if scores[i] > best_score: best_score = scores[i] best_idx = i print(f" valid actions: {[action_names[i] for i in valid]}") print(f" top-5 by score:") top5 = np.argsort(scores)[::-1][:5] for rank, idx in enumerate(top5): v = "(valid)" if idx in valid else "(INVALID)" print(f" [{rank}] {action_names[idx]}: {scores[idx]:.10f} {v}") if best_idx >= 0: print(f" chosen: {action_names[best_idx]} (score={best_score:.10f})") moves.apply_transition(state, action_names[best_idx]) else: print(f" no valid actions, breaking") break print() step += 1 if step > 30: print(" (safety limit: stopping after 30 steps)") break # apply state annotations to the doc so entities are visible moves.set_annotations(state, doc_step) # show final entities print(f" final entities (from manual stepping):") for ent in doc_step.ents: print(f" {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]") # compare with nlp() result print() doc_auto = nlp(SENTENCE) print(f" final entities (from nlp()):") for ent in doc_auto.ents: print(f" {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]") match = [(e.text, e.label_) for e in doc_step.ents] == [(e.text, e.label_) for e in doc_auto.ents] print(f" manual vs nlp() match: {match}") print("\n" + "=" * 80) print("DONE") print("=" * 80) if __name__ == "__main__": main()