"""dump ALL intermediate values from spaCy's NER pipeline for debugging.

compares these against the spacez zig reimplementation to find divergence.

usage:
  uv run --python 3.12 --with spacy \
    --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
    python scripts/dump_intermediates.py
"""

import numpy as np

# print arrays with 10 digits after the point on 200-column lines, and leave
# suppress off, so the dumped values can be compared digit by digit against
# the other implementation
np.set_printoptions(linewidth=200, precision=10, suppress=False)

# the one input sentence that is pushed through every stage below
SENTENCE = "NASA launched the Artemis mission."


def _banner(title):
    """print *title* framed above and below by an 80-column rule of '='."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


def _print_token_rows(doc, rows):
    """print rows[i] next to the text of token i, for every token of *doc*."""
    for tok_idx, tok in enumerate(doc):
        print(f"    token[{tok_idx}] {tok.text!r}: {rows[tok_idx]}")


def _context_token_ids(state):
    """return the three feature token ids [B(0), E(0), B(0)-1] for *state*.

    CRITICAL: for nF=3, spaCy's set_context_tokens uses:
      ids[0] = B(0)         — current buffer token
      ids[1] = E(0)         — first word of open entity, or -1
      ids[2] = B(0) - 1     — word before buffer (end of entity), or -1
    NOT [S(0), B(0), B(1)] as one might assume.

    a value of -1 marks a slot that has no token; the caller substitutes the
    lower layer's pad vector for it. *state* is expected to be a spaCy
    StateClass (only B, E and entity_is_open are called on it).
    """
    b0 = state.B(0)
    buffer_id = b0 if b0 >= 0 else -1
    entity_start = state.E(0) if state.entity_is_open() else -1
    # the "word before the buffer" is only meaningful inside an open entity
    if buffer_id == -1 or entity_start == -1:
        entity_end = -1
    else:
        entity_end = buffer_id - 1
    return [buffer_id, entity_start, entity_end]


def main():
    """dump every intermediate value of the en_core_web_sm NER model for SENTENCE.

    prints, in order: (a) token texts, (b) the four hashed token attributes,
    (c) the hash-embedding rows, (d) the MultiHashEmbed output, (e) the output
    of each CNN residual block, (f) the linear projection and (g) every step
    of the NER transition system. wherever a stage is recomputed by hand, the
    max abs difference against the model's own output is printed, so a wrong
    manual formula shows up as a non-negligible diff.

    takes no arguments and returns nothing; everything goes to stdout. spaCy,
    thinc and the en_core_web_sm package must be importable (see the usage
    line in the module docstring); the imports are done here so that merely
    importing this module does not need them.
    """
    import spacy
    from spacy.attrs import NORM, PREFIX, SUFFIX, SHAPE
    from spacy.pipeline._parser_internals.stateclass import StateClass
    from thinc.backends.numpy_ops import NumpyOps

    nlp = spacy.load("en_core_web_sm")
    ner = nlp.get_pipe("ner")
    model = ner.model
    ops = NumpyOps()

    # the NER model has its own tok2vec ref (not a listener — a standalone model)
    # with 4 hash embeds (NORM, PREFIX, SUFFIX, SHAPE), not the 6 from the pipe tok2vec
    tok2vec = model.get_ref("tok2vec")
    lower = model.get_ref("lower")
    upper = model.get_ref("upper")

    # walk the NER tok2vec to find components: the 4 hash embeds, then the
    # first maxout seen after them (the reduce maxout) and the layernorm that
    # follows it
    hash_embeds = []
    reduce_maxout = None
    reduce_ln = None
    for node in tok2vec.walk():
        if node.name == "hashembed":
            hash_embeds.append(node)
        elif node.name == "maxout" and reduce_maxout is None and len(hash_embeds) == 4:
            reduce_maxout = node
        elif node.name == "layernorm" and reduce_maxout is not None:
            reduce_ln = node
            break
    assert len(hash_embeds) == 4, f"expected 4 hash embeds, got {len(hash_embeds)}"
    # fail here with a readable message instead of an AttributeError on None below
    assert reduce_maxout is not None, "no maxout layer found after the 4 hash embeds"
    assert reduce_ln is not None, "no layernorm layer found after the reduce maxout"

    # find CNN residual blocks — filter to exactly the single-block residuals,
    # not the chain-of-residuals node
    residual_blocks = [
        node
        for node in tok2vec.walk()
        if node.name == "residual(expand_window>>maxout>>layernorm>>dropout)"
    ]
    assert len(residual_blocks) == 4, f"expected 4 residual blocks, got {len(residual_blocks)}"

    # ── (a) token texts ──
    _banner("(a) TOKEN TEXTS (from tokenizer)")
    doc = nlp.make_doc(SENTENCE)
    n_tokens = len(doc)
    print(f"sentence: {SENTENCE!r}")
    print(f"tokens ({n_tokens}): {[t.text for t in doc]}")
    print()

    # ── (b) token attributes: NORM, PREFIX, SUFFIX, SHAPE hashes ──
    _banner("(b) TOKEN ATTRIBUTES (NORM, PREFIX, SUFFIX, SHAPE hashes)")
    attr_array = doc.to_array([NORM, PREFIX, SUFFIX, SHAPE])
    for i, tok in enumerate(doc):
        print(f"  token[{i}] = {tok.text!r}")
        print(f"    NORM   = 0x{attr_array[i][0]:016x}  ({tok.norm_!r})")
        print(f"    PREFIX = 0x{attr_array[i][1]:016x}  ({tok.prefix_!r})")
        print(f"    SUFFIX = 0x{attr_array[i][2]:016x}  ({tok.suffix_!r})")
        print(f"    SHAPE  = 0x{attr_array[i][3]:016x}  ({tok.shape_!r})")
    print()

    # ── (c) hash embedding table lookups ──
    _banner("(c) HASH EMBEDDING TABLE LOOKUPS (sum of the 4 hashed rows per embed table)")
    attr_names = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]

    all_embed_rows = []  # [embed_idx][tok_idx] = embedding vector
    for embed_idx, he in enumerate(hash_embeds):
        E = he.get_param("E")
        seed = he.attrs["seed"]
        nV = E.shape[0]
        print(f"\n  hash_embed[{embed_idx}] ({attr_names[embed_idx]}): E.shape={E.shape}, seed={seed}")

        embed_rows = []
        for tok_idx, tok in enumerate(doc):
            attr_val = attr_array[tok_idx][embed_idx]
            key_arr = np.array([attr_val], dtype=np.uint64)
            hash4 = ops.hash(key_arr, seed)[0]  # the 4 uint32 hashes of this one key
            # thinc's HashEmbed maps ALL four hashes into the table and sums the
            # four rows; taking only the first bucket gives a different vector
            # than the model (the diff printed in section (d) checks this)
            buckets = hash4 % nV
            row = E[buckets].sum(axis=0)
            embed_rows.append(row)
            print(f"    token[{tok_idx}] {tok.text!r}: attr=0x{attr_val:016x}, hash4={hash4.tolist()}, buckets={buckets.tolist()}")
            print(f"      row = {row}")
        all_embed_rows.append(embed_rows)
    print()

    # ── (d) MultiHashEmbed output (after concat → maxout reduce → layernorm) ──
    _banner("(d) MULTIHASHEMBED OUTPUT (concat → maxout → layernorm → 96-dim)")

    # concatenate the 4 embed vectors of each token; the width comes from the
    # tables themselves instead of a hard-coded 96
    concat_matrix = np.array(
        [
            np.concatenate([rows[tok_idx] for rows in all_embed_rows])
            for tok_idx in range(n_tokens)
        ],
        dtype=np.float32,
    )

    print(f"\n  concatenated embeddings: shape={concat_matrix.shape}")
    for tok_idx, tok in enumerate(doc):
        print(f"    token[{tok_idx}] {tok.text!r}: first 16 = {concat_matrix[tok_idx, :16]}")

    # apply reduction maxout
    W_maxout = reduce_maxout.get_param("W")
    b_maxout = reduce_maxout.get_param("b")
    nO = b_maxout.shape[0]
    nP = b_maxout.shape[1]
    nI = W_maxout.shape[-1]
    print(f"\n  reduce maxout: W={W_maxout.shape}, b={b_maxout.shape} (nO={nO}, nP={nP}, nI={nI})")

    # one affine map producing nO*nP values, then the max over the nP pieces
    W_flat = W_maxout.reshape(nO * nP, nI)
    b_flat = b_maxout.reshape(nO * nP)
    Y_pre_max = concat_matrix @ W_flat.T + b_flat
    Y_pieces = Y_pre_max.reshape(-1, nO, nP)
    Y_maxout = Y_pieces.max(axis=-1)

    print(f"\n  after maxout: shape={Y_maxout.shape}")
    _print_token_rows(doc, Y_maxout)

    # apply layernorm; thinc's LayerNorm adds 1e-08 to the variance (not 1e-12)
    G_ln = reduce_ln.get_param("G")
    b_ln = reduce_ln.get_param("b")
    mean = Y_maxout.mean(axis=-1, keepdims=True)
    var = Y_maxout.var(axis=-1, keepdims=True)
    std = np.sqrt(var + 1e-8)
    Y_ln = G_ln * (Y_maxout - mean) / std + b_ln

    print(f"\n  after layernorm (manual): shape={Y_ln.shape}")
    _print_token_rows(doc, Y_ln)

    # now get ground truth by running the embed portion of the model
    # the NER tok2vec structure is:
    #   layers[0] = tok2vec_chain (embed+cnn)
    #     layers[0]._layers[0] = embed (extract_features >> list2ragged >> with_array(concat_embeds) >> with_array(maxout>>ln>>drop) >> ragged2list)
    #     layers[0]._layers[1] = with_array(4 × residual)
    #   layers[1] = list2array
    #   layers[2] = linear
    tok2vec_chain = tok2vec._layers[0]
    embed_chain = tok2vec_chain._layers[0]
    encode_with_array = tok2vec_chain._layers[1]
    list2array_layer = tok2vec._layers[1]
    linear_layer = tok2vec._layers[2]

    # run the embed chain to get ground-truth embed output
    doc_fresh = nlp.make_doc(SENTENCE)
    embed_output = embed_chain.predict([doc_fresh])  # list of arrays
    print(f"\n  ground-truth embed output (from model): shape={embed_output[0].shape}")
    _print_token_rows(doc_fresh, embed_output[0])

    # verify the hand-computed embedding against the model
    diff = np.abs(Y_ln - embed_output[0]).max()
    print(f"  max diff between manual embed and model embed: {diff}")
    print()

    # ── (e) after each CNN block ──
    _banner("(e) AFTER EACH CNN BLOCK (4 residual blocks)")

    # get the inner chain of residual blocks
    # encode_with_array wraps a chain of 4 residual blocks
    encode_inner = encode_with_array._layers[0]  # the chain of 4 residuals
    residual_layers = encode_inner._layers

    # with_array(..., pad=N) surrounds the doc with N zero rows before running
    # the wrapped chain and strips them afterwards. those rows go through every
    # residual block too and stop being zero after the first one, so they
    # influence the edge tokens from block 2 on; stepping block-by-block must
    # reproduce the same padding. falls back to no padding if the attr is absent.
    pad = int(encode_with_array.attrs.get("pad", 0))

    # run residual blocks one at a time on the embed output
    current = embed_output[0].copy()
    print(f"\n  input to CNN (embed output): shape={current.shape}")
    if pad > 0:
        margin = np.zeros((pad, current.shape[1]), dtype=current.dtype)
        current = np.concatenate([margin, current, margin])
    real_rows = slice(pad, pad + n_tokens)  # rows that belong to actual tokens
    for block_idx, res_block in enumerate(residual_layers):
        current = res_block.predict(current)
        visible = current[real_rows]
        print(f"\n  after CNN block {block_idx}: shape={visible.shape}")
        _print_token_rows(doc_fresh, visible)
    current = current[real_rows]

    # verify against full encode
    encoded_output = encode_with_array.predict(embed_output)
    print(f"\n  ground-truth encode output: shape={encoded_output[0].shape}")
    _print_token_rows(doc_fresh, encoded_output[0])

    # verify match
    diff = np.abs(current - encoded_output[0]).max()
    print(f"  max diff between manual CNN and model encode: {diff}")
    print()

    # ── (f) after linear projection ──
    _banner("(f) AFTER LINEAR PROJECTION (96 → 64-dim)")

    # apply list2array then linear
    as_array = list2array_layer.predict(encoded_output)
    print(f"  list2array output: shape={as_array.shape}")

    projected = linear_layer.predict(as_array)
    print(f"  linear projection output: shape={projected.shape}")
    _print_token_rows(doc_fresh, projected)

    # also verify via full tok2vec predict
    doc_verify = nlp.make_doc(SENTENCE)
    full_output = tok2vec.predict([doc_verify])
    print(f"\n  ground-truth full tok2vec output: shape={full_output.shape}")
    _print_token_rows(doc_verify, full_output)
    diff = np.abs(projected - full_output).max()
    print(f"  max diff between manual and model.predict: {diff}")

    # also print linear weights for reference
    W_lin = linear_layer.get_param("W")
    b_lin = linear_layer.get_param("b")
    print(f"\n  linear W: {W_lin.shape}, b: {b_lin.shape}")
    print()

    # ── (g) parser steps ──
    _banner("(g) PARSER (NER) TRANSITION STEPS")

    # get weights
    lower_W = lower.get_param("W")  # (3, 64, 2, 64) = (nF, nO, nP, nI)
    lower_b = lower.get_param("b")  # (64, 2) = (nO, nP)
    lower_pad = lower.get_param("pad")  # (1, 3, 64, 2) = (1, nF, nO, nP)
    upper_W = upper.get_param("W")  # (74, 64) = (n_actions, nO)
    upper_b = upper.get_param("b")  # (74,) = (n_actions,)

    nF, nO_l, nP_l, nI_l = lower_W.shape
    # the context-id rule used below is only the nF=3 one
    assert nF == 3, f"expected 3 context features, got {nF}"
    print(f"\n  lower: W={lower_W.shape} (nF={nF}, nO={nO_l}, nP={nP_l}, nI={nI_l})")
    print(f"  lower: b={lower_b.shape}, pad={lower_pad.shape}")
    print(f"  upper: W={upper_W.shape}, b={upper_b.shape}")

    # get action names
    moves = ner.moves
    n_actions = moves.n_moves
    action_names = [moves.get_class_name(i) for i in range(n_actions)]
    print(f"\n  actions ({n_actions}): {action_names}")

    # precompute lower features for all tokens
    # PrecomputableAffine: Y[t, f, o, p] = sum_i(X[t, i] * W[f, o, p, i])
    # bias b[o,p] is added AFTER summing features, not per-token
    # X = tokvecs, shape (nT, nI_l=64)
    tokvecs = full_output  # shape (nT, 64)
    precomputed = np.einsum('ti,fopi->tfop', tokvecs, lower_W)
    print(f"\n  precomputed shape: {precomputed.shape}")

    print(f"\n  lower pad vector:")
    print(f"    pad shape: {lower_pad.shape}")
    for f in range(nF):
        print(f"    pad[0, {f}]: {lower_pad[0, f]}")
    print()

    print(f"  precomputed features per token:")
    for tok_idx in range(n_tokens):
        print(f"    token[{tok_idx}] {doc[tok_idx].text!r}:")
        for f in range(nF):
            print(f"      feat[{f}]: {precomputed[tok_idx, f]}")
    print()

    # step through the NER transition system
    print("  stepping through NER transitions...")
    print()

    # a fresh doc carries the parse state; the token vectors depend only on the
    # text, so the features precomputed above are reused instead of recomputed
    doc_step = nlp.make_doc(SENTENCE)
    state = StateClass(doc_step)
    feat_labels = ["B(0)", "E(0)", "B(0)-1"]
    max_steps = 30  # guard against a transition loop that never reaches a final state
    step = 0

    while not state.is_final():
        feat_ids = _context_token_ids(state)

        print(f"  --- step {step} ---")
        print(f"    feature token indices: {feat_labels[0]}={feat_ids[0]}, {feat_labels[1]}={feat_ids[1]}, {feat_labels[2]}={feat_ids[2]}")
        for fi in range(nF):
            tid = feat_ids[fi]
            if 0 <= tid < n_tokens:
                print(f"      feat[{fi}] ({feat_labels[fi]}) → token[{tid}] = {doc_step[tid].text!r}")
            else:
                print(f"      feat[{fi}] ({feat_labels[fi]}) → PAD (index {tid})")

        # sum precomputed features (or pad) for each feature slot
        hidden_input = np.zeros((nO_l, nP_l), dtype=np.float32)
        for fi in range(nF):
            tid = feat_ids[fi]
            if 0 <= tid < n_tokens:
                contrib = precomputed[tid, fi]
                hidden_input += contrib
                print(f"    precomp[{tid},{fi}] = {contrib}")
            else:
                contrib = lower_pad[0, fi]
                hidden_input += contrib
                print(f"    pad[{fi}] = {contrib}")

        # add bias (applied after summing, before maxout)
        hidden_input += lower_b
        print(f"    summed + bias (nO×nP = {nO_l}×{nP_l}): {hidden_input}")

        # maxout over pieces
        hidden = hidden_input.max(axis=-1)
        which = hidden_input.argmax(axis=-1)
        print(f"    after maxout: {hidden}")
        print(f"    maxout winners: {which}")

        # apply upper: scores = hidden @ W.T + b
        scores = hidden @ upper_W.T + upper_b
        print(f"    raw scores ({n_actions}): {scores}")

        # valid actions and best — is_valid takes a move name string
        valid = [i for i in range(n_actions) if moves.is_valid(state, action_names[i])]

        # find best valid (first one wins on ties)
        best_idx = -1
        best_score = -float('inf')
        for i in valid:
            if scores[i] > best_score:
                best_score = scores[i]
                best_idx = i

        print(f"    valid actions: {[action_names[i] for i in valid]}")
        print(f"    top-5 by score:")
        top5 = np.argsort(scores)[::-1][:5]
        for rank, idx in enumerate(top5):
            v = "(valid)" if idx in valid else "(INVALID)"
            print(f"      [{rank}] {action_names[idx]}: {scores[idx]:.10f} {v}")

        if best_idx >= 0:
            print(f"    chosen: {action_names[best_idx]} (score={best_score:.10f})")
            moves.apply_transition(state, action_names[best_idx])
        else:
            print(f"    no valid actions, breaking")
            break

        print()
        step += 1
        # >= so that exactly max_steps transitions run, as the message says
        if step >= max_steps:
            print(f"  (safety limit: stopping after {max_steps} steps)")
            break

    # apply state annotations to the doc so entities are visible
    moves.set_annotations(state, doc_step)

    # show final entities
    print(f"  final entities (from manual stepping):")
    for ent in doc_step.ents:
        print(f"    {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]")

    # compare with nlp() result
    print()
    doc_auto = nlp(SENTENCE)
    print(f"  final entities (from nlp()):")
    for ent in doc_auto.ents:
        print(f"    {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]")

    manual_ents = [(e.text, e.label_) for e in doc_step.ents]
    auto_ents = [(e.text, e.label_) for e in doc_auto.ents]
    print(f"  manual vs nlp() match: {manual_ents == auto_ents}")

    print("\n" + "=" * 80)
    print("DONE")
    print("=" * 80)

# run the dump only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()