this repo has no description
at main 428 lines 17 kB view raw
"""dump ALL intermediate values from spaCy's NER pipeline for debugging.

compares these against the spacez zig reimplementation to find divergence.

usage:
    uv run --python 3.12 --with spacy \
        --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
        python scripts/dump_intermediates.py
"""

import numpy as np

# full precision + wide lines so dumps can be diffed against the zig port
np.set_printoptions(precision=10, suppress=False, linewidth=200)

SENTENCE = "NASA launched the Artemis mission."


def main():
    """Dump every intermediate value of the en_core_web_sm NER pipeline.

    Prints, in order: (a) tokens, (b) attribute hashes, (c) raw hash-embed
    table rows, (d) MultiHashEmbed output, (e) per-CNN-block activations,
    (f) the final linear projection, and (g) each parser transition step.
    Manual recomputations are cross-checked against the model's own
    ``predict`` output at each stage so divergence can be localized.
    """
    import spacy
    from spacy.attrs import NORM, PREFIX, SUFFIX, SHAPE
    from thinc.backends.numpy_ops import NumpyOps

    nlp = spacy.load("en_core_web_sm")
    ner = nlp.get_pipe("ner")
    model = ner.model
    ops = NumpyOps()

    # the NER model has its own tok2vec ref (not a listener — a standalone model)
    # with 4 hash embeds (NORM, PREFIX, SUFFIX, SHAPE), not the 6 from the pipe tok2vec
    tok2vec = model.get_ref("tok2vec")
    lower = model.get_ref("lower")
    upper = model.get_ref("upper")

    # walk the NER tok2vec to find components
    hash_embeds = []
    reduce_maxout = None
    reduce_ln = None
    for node in tok2vec.walk():
        if node.name == "hashembed":
            hash_embeds.append(node)
        if node.name == "maxout" and reduce_maxout is None and len(hash_embeds) == 4:
            reduce_maxout = node
        if node.name == "layernorm" and reduce_ln is None and reduce_maxout is not None:
            reduce_ln = node
            break
    assert len(hash_embeds) == 4, f"expected 4 hash embeds, got {len(hash_embeds)}"
    # fail early with a clear message instead of an AttributeError on get_param later
    assert reduce_maxout is not None and reduce_ln is not None, "failed to locate reduce maxout/layernorm in tok2vec"

    # find CNN residual blocks — filter to exactly the single-block residuals,
    # not the chain-of-residuals node
    residual_blocks = []
    for node in tok2vec.walk():
        if node.name == "residual(expand_window>>maxout>>layernorm>>dropout)":
            residual_blocks.append(node)
    assert len(residual_blocks) == 4, f"expected 4 residual blocks, got {len(residual_blocks)}"

    # ── (a) token texts ──
    print("=" * 80)
    print("(a) TOKEN TEXTS (from tokenizer)")
    print("=" * 80)
    doc = nlp.make_doc(SENTENCE)
    tokens = [t.text for t in doc]
    print(f"sentence: {SENTENCE!r}")
    print(f"tokens ({len(tokens)}): {tokens}")
    print()

    # ── (b) token attributes: NORM, PREFIX, SUFFIX, SHAPE hashes ──
    print("=" * 80)
    print("(b) TOKEN ATTRIBUTES (NORM, PREFIX, SUFFIX, SHAPE hashes)")
    print("=" * 80)
    attr_array = doc.to_array([NORM, PREFIX, SUFFIX, SHAPE])
    for i, tok in enumerate(doc):
        print(f"  token[{i}] = {tok.text!r}")
        print(f"    NORM = 0x{attr_array[i][0]:016x} ({tok.norm_!r})")
        print(f"    PREFIX = 0x{attr_array[i][1]:016x} ({tok.prefix_!r})")
        print(f"    SUFFIX = 0x{attr_array[i][2]:016x} ({tok.suffix_!r})")
        print(f"    SHAPE = 0x{attr_array[i][3]:016x} ({tok.shape_!r})")
    print()

    # ── (c) hash embedding table lookups ──
    print("=" * 80)
    print("(c) HASH EMBEDDING TABLE LOOKUPS (raw rows from each embed table)")
    print("=" * 80)
    # column order in attr_array matches the order hash_embeds were discovered in
    attr_names = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]

    all_embed_rows = []  # [embed_idx][tok_idx] = row vector
    for embed_idx, he in enumerate(hash_embeds):
        E = he.get_param("E")
        seed = he.attrs["seed"]
        nV = E.shape[0]
        print(f"\n  hash_embed[{embed_idx}] ({attr_names[embed_idx]}): E.shape={E.shape}, seed={seed}")

        embed_rows = []
        for tok_idx, tok in enumerate(doc):
            attr_val = attr_array[tok_idx][embed_idx]
            key_arr = np.array([attr_val], dtype=np.uint64)
            hash_result = ops.hash(key_arr, seed)  # shape (1, 4) of uint32
            # only the first of the 4 sub-hashes selects the row bucket here
            bucket = hash_result[0][0] % nV
            row = E[bucket]
            embed_rows.append(row)
            print(f"    token[{tok_idx}] {tok.text!r}: attr=0x{attr_val:016x}, hash4={hash_result[0].tolist()}, bucket={bucket}")
            print(f"      row = {row}")
        all_embed_rows.append(embed_rows)
    print()

    # ── (d) MultiHashEmbed output (after concat → maxout reduce → layernorm) ──
    print("=" * 80)
    print("(d) MULTIHASHEMBED OUTPUT (concat → maxout → layernorm → 96-dim)")
    print("=" * 80)

    # concatenate the 4 embed rows for each token: 4 × 96 = 384
    n_tokens = len(doc)
    concat_matrix = np.zeros((n_tokens, 4 * 96), dtype=np.float32)
    for tok_idx in range(n_tokens):
        parts = [all_embed_rows[e][tok_idx] for e in range(4)]
        concat_matrix[tok_idx] = np.concatenate(parts)

    print(f"\n  concatenated embeddings: shape={concat_matrix.shape}")
    for tok_idx, tok in enumerate(doc):
        print(f"    token[{tok_idx}] {tok.text!r}: first 16 = {concat_matrix[tok_idx, :16]}")

    # apply reduction maxout
    W_maxout = reduce_maxout.get_param("W")
    b_maxout = reduce_maxout.get_param("b")
    nO = b_maxout.shape[0]
    nP = b_maxout.shape[1]
    nI = W_maxout.shape[-1]
    print(f"\n  reduce maxout: W={W_maxout.shape}, b={b_maxout.shape} (nO={nO}, nP={nP}, nI={nI})")

    # compute all nO*nP pieces with one matmul, then take the max over pieces
    W_flat = W_maxout.reshape(nO * nP, nI)
    b_flat = b_maxout.reshape(nO * nP)
    Y_pre_max = concat_matrix @ W_flat.T + b_flat
    Y_pieces = Y_pre_max.reshape(-1, nO, nP)
    Y_maxout = Y_pieces.max(axis=-1)

    print(f"\n  after maxout: shape={Y_maxout.shape}")
    for tok_idx, tok in enumerate(doc):
        print(f"    token[{tok_idx}] {tok.text!r}: {Y_maxout[tok_idx]}")

    # apply layernorm
    G_ln = reduce_ln.get_param("G")
    b_ln = reduce_ln.get_param("b")
    mean = Y_maxout.mean(axis=-1, keepdims=True)
    var = Y_maxout.var(axis=-1, keepdims=True)
    std = np.sqrt(var + 1e-12)
    Y_ln = G_ln * (Y_maxout - mean) / std + b_ln

    print(f"\n  after layernorm (manual): shape={Y_ln.shape}")
    for tok_idx, tok in enumerate(doc):
        print(f"    token[{tok_idx}] {tok.text!r}: {Y_ln[tok_idx]}")

    # now get ground truth by running the embed portion of the model
    # the NER tok2vec structure is:
    #   layers[0] = tok2vec_chain (embed+cnn)
    #   layers[0]._layers[0] = embed (extract_features >> list2ragged >> with_array(concat_embeds) >> with_array(maxout>>ln>>drop) >> ragged2list)
    #   layers[0]._layers[1] = with_array(4 × residual)
    #   layers[1] = list2array
    #   layers[2] = linear
    tok2vec_chain = tok2vec._layers[0]
    embed_chain = tok2vec_chain._layers[0]
    encode_with_array = tok2vec_chain._layers[1]
    list2array_layer = tok2vec._layers[1]
    linear_layer = tok2vec._layers[2]

    # run the embed chain to get ground-truth embed output
    doc_fresh = nlp.make_doc(SENTENCE)
    embed_output = embed_chain.predict([doc_fresh])  # list of arrays
    print(f"\n  ground-truth embed output (from model): shape={embed_output[0].shape}")
    for tok_idx, tok in enumerate(doc_fresh):
        print(f"    token[{tok_idx}] {tok.text!r}: {embed_output[0][tok_idx]}")
    print()

    # ── (e) after each CNN block ──
    print("=" * 80)
    print("(e) AFTER EACH CNN BLOCK (4 residual blocks)")
    print("=" * 80)

    # get the inner chain of residual blocks
    # encode_with_array wraps a chain of 4 residual blocks
    encode_inner = encode_with_array._layers[0]  # the chain of 4 residuals
    residual_layers = encode_inner._layers

    # run residual blocks one at a time on the embed output
    current = embed_output[0].copy()
    print(f"\n  input to CNN (embed output): shape={current.shape}")
    for block_idx, res_block in enumerate(residual_layers):
        current = res_block.predict(current)
        print(f"\n  after CNN block {block_idx}: shape={current.shape}")
        for tok_idx, tok in enumerate(doc_fresh):
            print(f"    token[{tok_idx}] {tok.text!r}: {current[tok_idx]}")

    # verify against full encode
    encoded_output = encode_with_array.predict(embed_output)
    print(f"\n  ground-truth encode output: shape={encoded_output[0].shape}")
    for tok_idx, tok in enumerate(doc_fresh):
        print(f"    token[{tok_idx}] {tok.text!r}: {encoded_output[0][tok_idx]}")

    # verify match
    diff = np.abs(current - encoded_output[0]).max()
    print(f"  max diff between manual CNN and model encode: {diff}")
    print()

    # ── (f) after linear projection ──
    print("=" * 80)
    print("(f) AFTER LINEAR PROJECTION (96 → 64-dim)")
    print("=" * 80)

    # apply list2array then linear
    as_array = list2array_layer.predict(encoded_output)
    print(f"  list2array output: shape={as_array.shape}")

    projected = linear_layer.predict(as_array)
    print(f"  linear projection output: shape={projected.shape}")
    for tok_idx, tok in enumerate(doc_fresh):
        print(f"    token[{tok_idx}] {tok.text!r}: {projected[tok_idx]}")

    # also verify via full tok2vec predict
    doc_verify = nlp.make_doc(SENTENCE)
    full_output = tok2vec.predict([doc_verify])
    print(f"\n  ground-truth full tok2vec output: shape={full_output.shape}")
    for tok_idx, tok in enumerate(doc_verify):
        print(f"    token[{tok_idx}] {tok.text!r}: {full_output[tok_idx]}")
    diff = np.abs(projected - full_output).max()
    print(f"  max diff between manual and model.predict: {diff}")

    # also print linear weights for reference
    W_lin = linear_layer.get_param("W")
    b_lin = linear_layer.get_param("b")
    print(f"\n  linear W: {W_lin.shape}, b: {b_lin.shape}")
    print()

    # ── (g) parser steps ──
    print("=" * 80)
    print("(g) PARSER (NER) TRANSITION STEPS")
    print("=" * 80)

    # get weights
    lower_W = lower.get_param("W")      # (3, 64, 2, 64) = (nF, nO, nP, nI)
    lower_b = lower.get_param("b")      # (64, 2) = (nO, nP)
    lower_pad = lower.get_param("pad")  # (1, 3, 64, 2) = (1, nF, nO, nP)
    upper_W = upper.get_param("W")      # (74, 64) = (n_actions, nO)
    upper_b = upper.get_param("b")      # (74,) = (n_actions,)

    nF, nO_l, nP_l, nI_l = lower_W.shape
    print(f"\n  lower: W={lower_W.shape} (nF={nF}, nO={nO_l}, nP={nP_l}, nI={nI_l})")
    print(f"  lower: b={lower_b.shape}, pad={lower_pad.shape}")
    print(f"  upper: W={upper_W.shape}, b={upper_b.shape}")

    # get action names
    moves = ner.moves
    n_actions = moves.n_moves
    action_names = [moves.get_class_name(i) for i in range(n_actions)]
    print(f"\n  actions ({n_actions}): {action_names}")

    # precompute lower features for all tokens
    # PrecomputableAffine: Y[t, f, o, p] = sum_i(X[t, i] * W[f, o, p, i])
    # bias b[o,p] is added AFTER summing features, not per-token
    # X = tokvecs, shape (nT, nI_l=64)
    tokvecs = full_output  # shape (nT, 64)
    precomputed = np.einsum('ti,fopi->tfop', tokvecs, lower_W)
    print(f"\n  precomputed shape: {precomputed.shape}")

    print(f"\n  lower pad vector:")
    print(f"    pad shape: {lower_pad.shape}")
    for f in range(nF):
        print(f"    pad[0, {f}]: {lower_pad[0, f]}")
    print()

    print(f"  precomputed features per token:")
    for tok_idx in range(n_tokens):
        print(f"    token[{tok_idx}] {doc[tok_idx].text!r}:")
        for f in range(nF):
            print(f"      feat[{f}]: {precomputed[tok_idx, f]}")
    print()

    # step through the NER transition system
    print("  stepping through NER transitions...")
    print()

    # CRITICAL: for nF=3, spaCy's set_context_tokens uses:
    #   ids[0] = B(0)     — current buffer token
    #   ids[1] = E(0)     — first word of open entity, or -1
    #   ids[2] = B(0) - 1 — word before buffer (end of entity), or -1
    # NOT [S(0), B(0), B(1)] as one might assume.

    from spacy.pipeline._parser_internals.stateclass import StateClass

    doc_step = nlp.make_doc(SENTENCE)
    # we need the tok2vec output on this doc — run tok2vec.predict
    step_tokvecs = tok2vec.predict([doc_step])
    # precompute for this doc (NO bias — bias is added after summing features)
    step_precomputed = np.einsum('ti,fopi->tfop', step_tokvecs, lower_W)

    # use spaCy's actual get_token_ids via ParserStepModel to get correct features
    # we construct a lightweight wrapper that gives us token_ids
    state = StateClass(doc_step)
    step = 0

    # helper: extract token ids using spaCy's actual C code
    def get_feat_ids(state):
        """get feature token ids using spaCy's set_context_tokens."""
        ids = np.zeros((1, nF), dtype=np.int32)
        ids.fill(-1)
        # use the state's C-level set_context_tokens via the StateClass wrapper
        # StateClass wraps StateC; we can call get_token_ids on a step model
        # but it's simpler to just use the known nF=3 logic:
        #   ids[0] = B(0), ids[1] = E(0) if entity open else -1,
        #   ids[2] = B(0)-1 if both ids[0] and ids[1] are valid else -1
        b0 = state.B(0)
        if b0 >= 0:
            ids[0, 0] = b0
        else:
            ids[0, 0] = -1
        if state.entity_is_open():
            ids[0, 1] = state.E(0)
        else:
            ids[0, 1] = -1
        if ids[0, 0] == -1 or ids[0, 1] == -1:
            ids[0, 2] = -1
        else:
            ids[0, 2] = ids[0, 0] - 1
        return ids[0]

    while not state.is_final():
        feat_ids = get_feat_ids(state)

        print(f"  --- step {step} ---")
        feat_labels = ["B(0)", "E(0)", "B(0)-1"]
        print(f"    feature token indices: {feat_labels[0]}={feat_ids[0]}, {feat_labels[1]}={feat_ids[1]}, {feat_labels[2]}={feat_ids[2]}")
        for fi in range(nF):
            tid = feat_ids[fi]
            if 0 <= tid < n_tokens:
                print(f"    feat[{fi}] ({feat_labels[fi]}) → token[{tid}] = {doc_step[tid].text!r}")
            else:
                print(f"    feat[{fi}] ({feat_labels[fi]}) → PAD (index {tid})")

        # sum precomputed features (or pad) for each feature slot
        hidden_input = np.zeros((nO_l, nP_l), dtype=np.float32)
        for fi in range(nF):
            tid = feat_ids[fi]
            if 0 <= tid < n_tokens:
                contrib = step_precomputed[tid, fi]
                hidden_input += contrib
                print(f"    precomp[{tid},{fi}] = {contrib}")
            else:
                contrib = lower_pad[0, fi]
                hidden_input += contrib
                print(f"    pad[{fi}] = {contrib}")

        # add bias (applied after summing, before maxout)
        hidden_input += lower_b
        print(f"    summed + bias (nO×nP = {nO_l}×{nP_l}): {hidden_input}")

        # maxout over pieces
        hidden = hidden_input.max(axis=-1)
        which = hidden_input.argmax(axis=-1)
        print(f"    after maxout: {hidden}")
        print(f"    maxout winners: {which}")

        # apply upper: scores = hidden @ W.T + b
        scores = hidden @ upper_W.T + upper_b
        print(f"    raw scores ({n_actions}): {scores}")

        # valid actions and best — is_valid takes a move name string
        valid = []
        for i in range(n_actions):
            if moves.is_valid(state, action_names[i]):
                valid.append(i)

        # find best valid
        best_idx = -1
        best_score = -float('inf')
        for i in valid:
            if scores[i] > best_score:
                best_score = scores[i]
                best_idx = i

        print(f"    valid actions: {[action_names[i] for i in valid]}")
        print(f"    top-5 by score:")
        top5 = np.argsort(scores)[::-1][:5]
        for rank, idx in enumerate(top5):
            v = "(valid)" if idx in valid else "(INVALID)"
            print(f"      [{rank}] {action_names[idx]}: {scores[idx]:.10f} {v}")

        if best_idx >= 0:
            print(f"    chosen: {action_names[best_idx]} (score={best_score:.10f})")
            moves.apply_transition(state, action_names[best_idx])
        else:
            print(f"    no valid actions, breaking")
            break

        print()
        step += 1
        if step > 30:
            print("  (safety limit: stopping after 30 steps)")
            break

    # apply state annotations to the doc so entities are visible
    moves.set_annotations(state, doc_step)

    # show final entities
    print(f"  final entities (from manual stepping):")
    for ent in doc_step.ents:
        # NOTE: separator added between text and label (was '{ent.text!r}{ent.label_}')
        print(f"    {ent.text!r} {ent.label_} [{ent.start_char}:{ent.end_char}]")

    # compare with nlp() result
    print()
    doc_auto = nlp(SENTENCE)
    print(f"  final entities (from nlp()):")
    for ent in doc_auto.ents:
        print(f"    {ent.text!r} {ent.label_} [{ent.start_char}:{ent.end_char}]")

    match = [(e.text, e.label_) for e in doc_step.ents] == [(e.text, e.label_) for e in doc_auto.ents]
    print(f"  manual vs nlp() match: {match}")

    print("\n" + "=" * 80)
    print("DONE")
    print("=" * 80)


if __name__ == "__main__":
    main()