"""export en_core_web_sm NER weights to a flat binary file for spacez.

binary format:
  [header]     — 64 little-endian uint32s: magic, version, dimensions, embed table rows/seeds, zero padding
  [weights]    — contiguous float32 arrays in a fixed order

the zig side mmaps this file and slices it into named weight regions,
following the karpathy/llama2.c pattern.

usage:
  uv run --with spacy --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' python tools/export_weights.py
"""

import struct
import sys
from pathlib import Path

import numpy as np


# ── binary layout constants ──
_MAGIC = 0x5350435A  # "SPCZ" for spacez
_VERSION = 1
_HEADER_WORDS = 64  # header is zero-padded to this many uint32s
_FLOAT_BYTES = 4  # every weight is stored as a float32


def _find_embed_layers(tok2vec):
    """return (hash_embeds, reduce_maxout, reduce_ln) found by walking tok2vec.

    the reduction maxout is the first "maxout" node seen once four "hashembed"
    nodes have been collected; the reduction layernorm is the first "layernorm"
    node seen after that maxout.

    raises RuntimeError if any of the expected nodes is missing.
    """
    hash_embeds = []
    reduce_maxout = None
    reduce_ln = None

    for node in tok2vec.walk():
        if node.name == "hashembed":
            hash_embeds.append(node)
        if node.name == "maxout" and reduce_maxout is None and len(hash_embeds) == 4:
            reduce_maxout = node
        if node.name == "layernorm" and reduce_ln is None and reduce_maxout is not None:
            reduce_ln = node
            break

    # explicit raises instead of assert: asserts vanish under `python -O`
    if len(hash_embeds) != 4:
        raise RuntimeError(f"expected 4 hash embeds, got {len(hash_embeds)}")
    if reduce_maxout is None:
        raise RuntimeError("missing reduction maxout")
    if reduce_ln is None:
        raise RuntimeError("missing reduction layernorm")
    return hash_embeds, reduce_maxout, reduce_ln


def _find_cnn_layers(tok2vec, reduce_maxout, reduce_ln):
    """return (cnn_maxouts, cnn_lns): maxout/layernorm nodes walked after the reduction maxout."""
    cnn_maxouts = []
    cnn_lns = []
    found_reduce = False
    for node in tok2vec.walk():
        if node.name == "maxout":
            if found_reduce:
                cnn_maxouts.append(node)
            elif node is reduce_maxout:
                # the reduction maxout itself is exported separately
                found_reduce = True
        elif node.name == "layernorm" and found_reduce and node is not reduce_ln:
            cnn_lns.append(node)
    return cnn_maxouts, cnn_lns


def _find_last_linear(tok2vec):
    """return the last node named "linear" in walk order, or None if there is none."""
    linear = None
    for node in tok2vec.walk():
        if node.name == "linear":
            linear = node
    return linear


def _collect_affine(root, prefix, *, include_pad=False):
    """return (name, 0, array) entries for every node under root that has W and b set.

    nodes lacking either parameter are skipped (best-effort, as before), but
    unrelated errors are no longer swallowed. with include_pad, a node's "pad"
    parameter is exported too when present.
    """
    collected = []
    for node in root.walk():
        # has_param is falsy for unknown *and* for registered-but-unset params
        if node.has_param("W") and node.has_param("b"):
            W = node.get_param("W")
            b = node.get_param("b")
            print(f"{prefix} W: {W.shape}, b: {b.shape}")
            collected.append((f"{prefix}_W", 0, W))
            collected.append((f"{prefix}_b", 0, b))
        if include_pad and node.has_param("pad"):
            pad = node.get_param("pad")
            print(f"{prefix} pad: {pad.shape}")
            collected.append((f"{prefix}_pad", 0, pad))
    return collected


def _write_weights(out, header_values, weights):
    """write the zero-padded uint32 header followed by every weight array as float32."""
    header = [int(v) for v in header_values]
    header += [0] * (_HEADER_WORDS - len(header))
    with open(out, "wb") as f:
        f.write(struct.pack(f"<{len(header)}I", *header))

        # write weight arrays contiguously, in C order.
        # "<f4" pins little-endian so the floats match the "<" header packing
        # regardless of host byte order.
        for name, idx, arr in weights:
            flat = np.asarray(arr, dtype="<f4").ravel()
            f.write(flat.tobytes())
            print(f"  wrote {name}[{idx}]: {arr.shape} = {flat.size} floats")


def _write_manifest(manifest_path, weights, total_floats, total_bytes):
    """write a text manifest listing byte offset, byte size, name and shape of each array."""
    header_bytes = _HEADER_WORDS * 4
    offset = header_bytes  # weights start right after the header
    with open(manifest_path, "w", encoding="utf-8") as f:
        f.write("# spacez weight manifest\n")
        f.write(f"# header: {header_bytes} bytes ({_HEADER_WORDS} uint32s)\n")
        f.write(f"# total weights: {total_floats:,} float32s ({total_bytes:,} bytes)\n\n")
        for name, idx, arr in weights:
            size = arr.size * _FLOAT_BYTES
            f.write(f"{offset:>10}  {size:>10}  {name}[{idx}]  {arr.shape}\n")
            offset += size


def export(out_path: str = "src/weights/en_core_web_sm.bin") -> None:
    """export the en_core_web_sm NER weights to a flat binary file plus a manifest.

    loads the spaCy pipeline, collects the NER model's parameters in a fixed
    order (hash embeds, reduction maxout + layernorm, CNN blocks, linear
    projection, parser lower, parser upper), and writes:

      * out_path: a 64-uint32 little-endian header followed by the arrays as
        contiguous float32
      * out_path with its suffix replaced by ".manifest.txt": offsets, sizes,
        names and shapes, for debugging

    finally prints spaCy's own entities for a sample sentence as a reference.

    args:
      out_path: destination of the binary file; parent directories are created.

    raises:
      RuntimeError: if the expected layers cannot be found in the tok2vec tree,
        or the number of CNN maxouts and layernorms disagree.
    """
    import spacy

    nlp = spacy.load("en_core_web_sm")
    ner = nlp.get_pipe("ner")

    # disable everything except NER (same as coral)
    for pipe in nlp.pipe_names:
        if pipe != "ner":
            nlp.disable_pipe(pipe)

    model = ner.model
    tok2vec = model.get_ref("tok2vec")

    # ── locate layers ──
    # the tok2vec tree is: extract_features >> list2ragged >> with_array(
    #   hashembed|hashembed|hashembed|hashembed) >> with_array(maxout >> layernorm >> dropout)
    #   >> ragged2list >> with_array(residual(expand_window >> maxout >> layernorm >> dropout) * 4)
    #   >> list2array >> linear
    hash_embeds, reduce_maxout, reduce_ln = _find_embed_layers(tok2vec)

    # get embed table configs
    embed_tables = [he.get_param("E") for he in hash_embeds]
    embed_configs = [
        {"nV": table.shape[0], "nO": table.shape[1], "seed": he.attrs["seed"]}
        for he, table in zip(hash_embeds, embed_tables)
    ]
    print(f"hash embed configs: {embed_configs}")

    print("\n=== collecting weights ===")
    weights = []

    # 1. hash embed tables (4x)
    for i, (he, table) in enumerate(zip(hash_embeds, embed_tables)):
        print(f"hash_embed[{i}] E: {table.shape} seed={he.attrs['seed']}")
        weights.append(("hash_embed_E", i, table))

    # 2. reduction maxout (384 → 96)
    W = reduce_maxout.get_param("W")
    b = reduce_maxout.get_param("b")
    print(f"reduce_maxout W: {W.shape}, b: {b.shape}")
    weights.append(("reduce_maxout_W", 0, W))
    weights.append(("reduce_maxout_b", 0, b))

    # 3. reduction layernorm
    G = reduce_ln.get_param("G")
    b_ln = reduce_ln.get_param("b")
    print(f"reduce_ln G: {G.shape}, b: {b_ln.shape}")
    weights.append(("reduce_ln_G", 0, G))
    weights.append(("reduce_ln_b", 0, b_ln))

    # 4. CNN encoder blocks (residual: maxout + layernorm)
    cnn_maxouts, cnn_lns = _find_cnn_layers(tok2vec, reduce_maxout, reduce_ln)
    print(f"\nfound {len(cnn_maxouts)} CNN blocks, {len(cnn_lns)} CNN layernorms")
    if len(cnn_maxouts) != len(cnn_lns):
        # zip() would silently drop the surplus while the header still
        # advertised len(cnn_maxouts) blocks, producing a corrupt file
        raise RuntimeError(
            f"CNN maxout/layernorm count mismatch: {len(cnn_maxouts)} vs {len(cnn_lns)}"
        )

    for i, (mx, ln) in enumerate(zip(cnn_maxouts, cnn_lns)):
        W = mx.get_param("W")
        b = mx.get_param("b")
        G = ln.get_param("G")
        b_ln = ln.get_param("b")
        print(f"cnn_block[{i}] W: {W.shape}, b: {b.shape}, G: {G.shape}, b_ln: {b_ln.shape}")
        weights.append(("cnn_W", i, W))
        weights.append(("cnn_b", i, b))
        weights.append(("cnn_G", i, G))
        weights.append(("cnn_b_ln", i, b_ln))

    lower = model.get_ref("lower")
    upper = model.get_ref("upper")

    # 5. linear projection at the end of tok2vec (tok2vec output → parser hidden)
    linear = _find_last_linear(tok2vec)
    if linear is not None:
        W = linear.get_param("W")
        b = linear.get_param("b")
        print(f"linear_project W: {W.shape}, b: {b.shape}")
        weights.append(("linear_project_W", 0, W))
        weights.append(("linear_project_b", 0, b))

    # 6. precomputable affine (parser lower)
    weights.extend(_collect_affine(lower, "lower", include_pad=True))

    # 7. upper linear (hidden → actions)
    upper_params = _collect_affine(upper, "upper")
    weights.extend(upper_params)

    # ── write binary file ──

    out = Path(out_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    # collect all dimension info we need
    tok2vec_width = embed_configs[0]["nO"]  # 96
    cnn_depth = len(cnn_maxouts)
    # NOTE(review): the three values below are not read from the model —
    # TODO confirm they match the exported weight shapes
    cnn_nP = 3
    parser_hidden = 64
    parser_nP = 2
    parser_nF = 3
    n_actions = 74  # fallback: 18*4 + 1(filler) + 1(OUT)

    if linear is not None:
        parser_hidden = linear.get_param("W").shape[0]
    if upper_params:
        # take the action count from the exported upper W so the header cannot
        # disagree with the data; assumes W is laid out (n_actions, hidden)
        n_actions = upper_params[0][2].shape[0]

    # header: magic, version, then dimension values
    header_values = [
        _MAGIC,
        _VERSION,
        tok2vec_width,      # 96
        cnn_depth,          # 4
        cnn_nP,             # 3
        parser_hidden,      # 64
        parser_nP,          # 2
        parser_nF,          # 3
        n_actions,          # 74
        embed_configs[0]["nV"],  # NORM table rows
        embed_configs[1]["nV"],  # PREFIX table rows
        embed_configs[2]["nV"],  # SUFFIX table rows
        embed_configs[3]["nV"],  # SHAPE table rows
        embed_configs[0]["seed"],
        embed_configs[1]["seed"],
        embed_configs[2]["seed"],
        embed_configs[3]["seed"],
    ]

    total_floats = sum(arr.size for _, _, arr in weights)
    total_bytes = total_floats * _FLOAT_BYTES
    print(f"\ntotal: {len(weights)} weight arrays, {total_floats:,} floats, {total_bytes:,} bytes ({total_bytes/1024/1024:.2f} MB)")

    _write_weights(out, header_values, weights)
    print(f"\nwrote {out} ({out.stat().st_size:,} bytes)")

    # also write a manifest for debugging
    manifest_path = out.with_suffix(".manifest.txt")
    _write_manifest(manifest_path, weights, total_floats, total_bytes)
    print(f"wrote {manifest_path}")

    # ── verification reference: print what spaCy itself predicts ──
    print("\n=== verification ===")
    doc = nlp("Barack Obama visited Paris. SpaceX launched from Cape Canaveral.")
    print("spaCy entities:")
    for ent in doc.ents:
        print(f"  {ent.text!r} → {ent.label_}")


if __name__ == "__main__":
    # an optional first CLI argument overrides the default output path
    cli_args = sys.argv[1:]
    export(cli_args[0] if cli_args else "src/weights/en_core_web_sm.bin")