"""generate tokenizer_data.zig from spaCy's en_core_web_sm tokenizer config.

extracts:
  - unicode character class tables (sorted ranges for binary search)
  - prefix single-char set + multi-char literals + special rules
  - suffix data (single-char set, multi-char literals, lookbehind rules)
  - special cases table (1347 entries)

the matching LOGIC lives in tokenizer.zig. this script only generates DATA tables.

usage:
  uv run --python 3.12 --with spacy \
    --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
    python scripts/gen_tokenizer_data.py
"""

import json
import re
import sre_parse
import sys
from pathlib import Path


def load_spacy():
    """load the en_core_web_sm pipeline and return its tokenizer.

    spaCy is imported inside the function so that importing this module
    does not itself require spaCy to be installed.
    """
    import spacy

    pipeline = spacy.load("en_core_web_sm")
    return pipeline.tokenizer


def extract_ranges(items):
    """turn the members of an sre_parse IN node into sorted, merged spans.

    items is an iterable of (op, value) pairs. LITERAL contributes a
    one-codepoint span, RANGE contributes its (lo, hi) pair, and the
    CATEGORY_DIGIT / CATEGORY_WORD categories contribute their ASCII
    spans only. every other op or category is ignored.

    returns a list of inclusive (lo, hi) tuples, sorted ascending, where
    overlapping or directly adjacent spans have been fused.
    """
    ascii_digits = (0x30, 0x39)
    ascii_word = [(0x30, 0x39), (0x41, 0x5A), (0x5F, 0x5F), (0x61, 0x7A)]

    spans = []
    for op, val in items:
        if op == sre_parse.LITERAL:
            spans.append((val, val))
        elif op == sre_parse.RANGE:
            spans.append(val)
        elif op == sre_parse.CATEGORY and val == sre_parse.CATEGORY_DIGIT:
            spans.append(ascii_digits)
        elif op == sre_parse.CATEGORY and val == sre_parse.CATEGORY_WORD:
            spans.extend(ascii_word)

    fused = []
    for lo, hi in sorted(spans):
        # start a new span unless this one touches or overlaps the previous
        if not fused or lo > fused[-1][1] + 1:
            fused.append((lo, hi))
            continue
        prev_lo, prev_hi = fused[-1]
        fused[-1] = (prev_lo, max(prev_hi, hi))
    return fused


def class_from_in_node(in_items):
    """split an IN node's members into (ranges, negated).

    any NEGATE marker sets the negated flag; all remaining members are
    passed to extract_ranges. the returned ranges always describe the
    listed members themselves, never their complement.
    """
    negated = False
    members = []
    for entry in in_items:
        if entry[0] == sre_parse.NEGATE:
            negated = True
        else:
            members.append(entry)
    return extract_ranges(members), negated


# ── prefix data extraction ──


def extract_prefix_data(tok):
    """pull the prefix regex apart into plain data tables.

    the pattern is read from tok.prefix_search.__self__.pattern and parsed
    with sre_parse; the code expects the parse to be an AT_BEGINNING node
    followed by one BRANCH node whose alternatives are classified here.

    returns a dict with:
      single_chars          sorted unique codepoints matched on their own
      multi_literals        literal strings, longest first
      symbol_ranges         ranges of the one class with more than 50 ranges
      has_dots              True when a "literal + repeat" branch was seen
      literal_unless_digit  codepoints from "literal + negative lookahead"
    """
    pattern = tok.prefix_search.__self__.pattern
    tree = sre_parse.parse(pattern)
    alternatives = tree[1][1][1]  # AT_BEGINNING, BRANCH

    lone_codepoints = []
    literal_strings = []
    big_class = []
    saw_dots = False
    guarded_literals = []

    for branch in alternatives:
        ops = [node[0] for node in branch]

        if len(ops) == 1:
            op, val = branch[0]
            if op == sre_parse.LITERAL:
                lone_codepoints.append(val)
            elif op == sre_parse.IN:
                ranges, _ = class_from_in_node(val)
                if len(ranges) > 50:
                    big_class = ranges
                else:
                    # small class: enumerate every codepoint it covers
                    for lo, hi in ranges:
                        lone_codepoints.extend(range(lo, hi + 1))
            continue

        if all(op == sre_parse.LITERAL for op in ops):
            literal_strings.append("".join(chr(node[1]) for node in branch))
            continue

        if len(ops) == 2 and ops[0] == sre_parse.LITERAL:
            if ops[1] == sre_parse.MAX_REPEAT:
                saw_dots = True
            elif ops[1] == sre_parse.ASSERT_NOT:
                guarded_literals.append(branch[0][1])

    return {
        "single_chars": sorted(set(lone_codepoints)),
        "multi_literals": sorted(literal_strings, key=lambda s: -len(s)),
        "symbol_ranges": big_class,
        "has_dots": saw_dots,
        "literal_unless_digit": guarded_literals,
    }


# ── suffix data extraction ──


def extract_suffix_data(tok):
    """pull the suffix regex apart into plain data tables.

    the pattern is read from tok.suffix_search.__self__.pattern; the code
    expects its parse to start with a single BRANCH node. a trailing
    end-of-string anchor is stripped from each alternative before it is
    classified. alternatives that fit none of the shapes are skipped.

    returns a dict with:
      single_chars      sorted unique codepoints matched on their own
      multi_literals    literal strings, longest first
      symbol_ranges     ranges of the one class with more than 50 ranges
      has_dots          True when a "." followed by a repeat was seen
      lookbehind_rules  rules built by _extract_lookbehind_rule
    """
    pattern = tok.suffix_search.__self__.pattern
    tree = sre_parse.parse(pattern)
    alternatives = tree[0][1][1]  # BRANCH

    end_anchor = (sre_parse.AT, sre_parse.AT_END)
    dot = (sre_parse.LITERAL, ord("."))

    lone_codepoints = []
    literal_strings = []
    big_class = []
    saw_dots = False
    rules = []

    for branch in alternatives:
        nodes = list(branch)
        if nodes and nodes[-1] == end_anchor:
            nodes.pop()
        if not nodes:
            continue

        if all(node[0] == sre_parse.LITERAL for node in nodes):
            # one literal is a single char; several form a string
            if len(nodes) == 1:
                lone_codepoints.append(nodes[0][1])
            else:
                literal_strings.append("".join(chr(node[1]) for node in nodes))
        elif len(nodes) == 1 and nodes[0][0] == sre_parse.IN:
            ranges, _ = class_from_in_node(nodes[0][1])
            if len(ranges) > 50:
                big_class = ranges
            else:
                for lo, hi in ranges:
                    lone_codepoints.extend(range(lo, hi + 1))
        elif (
            len(nodes) >= 2
            and nodes[0] == dot
            and nodes[1][0] == sre_parse.MAX_REPEAT
        ):
            saw_dots = True
        elif nodes[0][0] == sre_parse.ASSERT and nodes[0][1][0] == -1:
            # direction -1 marks a lookbehind assertion
            rule = _extract_lookbehind_rule(nodes)
            if rule:
                rules.append(rule)

    return {
        "single_chars": sorted(set(lone_codepoints)),
        "multi_literals": sorted(literal_strings, key=lambda s: -len(s)),
        "symbol_ranges": big_class,
        "has_dots": saw_dots,
        "lookbehind_rules": rules,
    }


def _extract_lookbehind_rule(items):
    """build a {"behind", "suffix"} rule from a lookbehind-led branch.

    items[0] must be the ASSERT node; everything after it is the suffix
    part. returns None when either half cannot be described, checking the
    lookbehind first.
    """
    assertion, *tail = items

    behind = _parse_assert_content(assertion[1][1])
    if behind is None:
        return None

    suffix = _parse_suffix_part(tail)
    if suffix is None:
        return None

    return {"behind": behind, "suffix": suffix}


def _parse_assert_content(content):
    """describe the body of a lookbehind/lookahead assertion.

    only IN and LITERAL nodes are understood; any other node makes the
    whole assertion unsupported and None is returned. an empty body also
    yields None. one node yields its own descriptor, several yield a
    {"type": "sequence", "parts": [...]} wrapper.
    """
    described = []
    for node in content:
        kind = node[0]
        if kind == sre_parse.LITERAL:
            described.append({"type": "literal", "char": node[1]})
        elif kind == sre_parse.IN:
            ranges, negated = class_from_in_node(node[1])
            described.append({"type": "class", "ranges": ranges, "negated": negated})
        else:
            return None

    if not described:
        return None
    if len(described) == 1:
        return described[0]
    return {"type": "sequence", "parts": described}


def _parse_suffix_part(items):
    """describe what follows the lookbehind in a suffix rule.

    returns {"type": "literal", "text": ...} when every node is a LITERAL
    (an empty node list gives an empty text), or
    {"type": "alternatives", "texts": [...]} for a lone BRANCH, either
    bare or as the first node inside a lone SUBPATTERN. alternatives that
    are not purely literal are left out. anything else gives None.
    """

    def literal_texts(branch_node):
        # keep only the alternatives made entirely of LITERAL nodes
        return [
            "".join(chr(node[1]) for node in alt)
            for alt in branch_node[1][1]
            if all(node[0] == sre_parse.LITERAL for node in alt)
        ]

    if all(node[0] == sre_parse.LITERAL for node in items):
        return {"type": "literal", "text": "".join(chr(node[1]) for node in items)}

    if len(items) != 1:
        return None

    only = items[0]
    texts = []
    if only[0] == sre_parse.SUBPATTERN:
        inner = only[1][3]
        if inner and inner[0][0] == sre_parse.BRANCH:
            texts = literal_texts(inner[0])
    elif only[0] == sre_parse.BRANCH:
        texts = literal_texts(only)

    if texts:
        return {"type": "alternatives", "texts": texts}
    return None


# ── unicode class extraction from all patterns ──


def extract_named_classes(tok):
    """collect the large unicode character classes used by the patterns.

    the suffix pattern is walked first, then the infix pattern. every IN
    node with more than 5 merged ranges is handed to _classify, which
    files it under a name chosen from its content:
    - lower: lowercase letters (contains a-z)
    - upper: uppercase letters (contains A-Z)
    - alpha: lower + upper
    - alnum: alpha + digits
    - wide_N: any other very large class with N ranges

    the walk descends into BRANCH alternatives, ASSERT / ASSERT_NOT bodies
    and non-empty SUBPATTERN bodies; other node types are not entered.
    """
    found = {}

    def visit(nodes):
        for node in nodes:
            kind = node[0]
            if kind == sre_parse.IN:
                ranges, _negated = class_from_in_node(node[1])
                if len(ranges) > 5:
                    _classify(ranges, found)
            elif kind == sre_parse.BRANCH:
                for alternative in node[1][1]:
                    visit(alternative)
            elif kind in (sre_parse.ASSERT, sre_parse.ASSERT_NOT):
                visit(node[1][1])
            elif kind == sre_parse.SUBPATTERN:
                body = node[1][3]
                if body:
                    visit(list(body))

    # suffix first, then infix — the order decides which class wins a tie
    suffix_tree = sre_parse.parse(tok.suffix_search.__self__.pattern)
    visit(list(suffix_tree))

    infix_tree = sre_parse.parse(tok.infix_finditer.__self__.pattern)
    visit(list(infix_tree))

    return found


def _classify(ranges, classes):
    """file a merged range list under a descriptive key in classes.

    ranges is a list of inclusive (lo, hi) codepoint spans as produced by
    extract_ranges; classes is mutated in place.

    a class counts as containing a-z, A-Z or 0-9 when some span fully
    covers that ASCII run. containment is used rather than an exact-tuple
    match because extract_ranges fuses adjacent spans: a class such as
    [0-9:] is stored as (0x30, 0x3A) and would otherwise be missed.

    classes with more than 1000 codepoints are named:
      a-z only          -> "lower"
      A-Z only          -> "upper"
      a-z and A-Z       -> "alpha"
      a-z, A-Z and 0-9  -> "alnum"
    for each name the candidate with the most ranges is kept (first one
    wins on a tie). anything else with more than 100000 codepoints and
    more than 300 ranges is stored as "wide_<number of ranges>".
    """

    def covers(lo, hi):
        return any(r_lo <= lo and hi <= r_hi for r_lo, r_hi in ranges)

    has_az = covers(0x61, 0x7A)
    has_AZ = covers(0x41, 0x5A)
    has_09 = covers(0x30, 0x39)

    n_ranges = len(ranges)
    n_cp = sum(hi - lo + 1 for lo, hi in ranges)

    # (has a-z, has A-Z, has 0-9) -> class name
    names = {
        (True, False, False): "lower",
        (False, True, False): "upper",
        (True, True, False): "alpha",
        (True, True, True): "alnum",
    }
    name = names.get((has_az, has_AZ, has_09)) if n_cp > 1000 else None

    if name is not None:
        if name not in classes or n_ranges > len(classes[name]):
            classes[name] = ranges
    elif n_cp > 100000 and n_ranges > 300:
        # very large class — likely "lower_or_punct" or similar
        classes[f"wide_{n_ranges}"] = ranges


# ── special cases ──


def extract_specials(tok):
    """list the tokenizer's special-case rules as (key, orths) pairs.

    tok.rules maps each special-case string to a list of attribute dicts;
    the value stored under attribute id 65 (ORTH) is taken from each dict.
    pairs are returned in sorted key order.
    """
    orth_attr = 65  # 65 = ORTH
    rules = tok.rules
    return [
        (key, [piece[orth_attr] for piece in rules[key]])
        for key in sorted(rules)
    ]


# ── zig code generation ──


def zig_str(s):
    """render a python string as a double-quoted zig string literal.

    quote, backslash, newline and tab get their short escapes; other
    printable ASCII is copied through; everything else (control chars,
    DEL, all non-ASCII) is written as \\xNN escapes of its utf-8 bytes.
    """
    short_escapes = {'"': '\\"', "\\": "\\\\", "\n": "\\n", "\t": "\\t"}

    def render(ch):
        if ch in short_escapes:
            return short_escapes[ch]
        if ord(ch) < 128 and ch.isprintable():
            return ch
        # ASCII control chars encode to a single byte, so this covers both
        # the non-printable ASCII case and the multi-byte case
        return "".join(f"\\x{byte:02x}" for byte in ch.encode("utf-8"))

    return '"' + "".join(render(ch) for ch in s) + '"'


def zig_char(cp):
    """render a codepoint as a zig u21 literal.

    printable ASCII (space through ~) becomes a quoted character literal,
    except for the single quote, double quote and backslash; every other
    codepoint becomes an uppercase hex literal of at least four digits.
    """
    if 32 <= cp < 127:
        ch = chr(cp)
        if ch not in ("'", "\\", '"'):
            return f"'{ch}'"
    return f"0x{cp:04X}"


def gen_range_table(name, ranges):
    """emit zig source for a range table plus its lookup function.

    produces a `{name}_ranges` array of [2]u21 pairs, one per (lo, hi)
    span in ranges, and a `{name}(c)` function that calls rangeContains
    on that array. returns the source as one newline-joined string.
    """
    rows = [f"    .{{ 0x{lo:04X}, 0x{hi:04X} }}," for lo, hi in ranges]
    table = [f"pub const {name}_ranges = [_][2]u21{{", *rows, "};"]
    lookup = [
        f"pub fn {name}(c: u21) bool {{",
        f"    return rangeContains(&{name}_ranges, c);",
        "}",
    ]
    return "\n".join([*table, "", *lookup])


def gen_codepoint_set(name, codepoints):
    """emit a zig function `{name}(c)` that tests set membership by switch.

    codepoints are de-duplicated and sorted, and runs of consecutive
    values are collapsed into `lo...hi` switch prongs. an empty set
    yields a switch with only the `else => false` prong.
    """
    # collapse consecutive codepoints into [start, end] runs
    runs = []
    for cp in sorted(set(codepoints)):
        if runs and cp == runs[-1][1] + 1:
            runs[-1][1] = cp
        else:
            runs.append([cp, cp])

    prongs = []
    for start, end in runs:
        if start == end:
            prongs.append(f"        {zig_char(start)} => true,")
        else:
            prongs.append(f"        {zig_char(start)}...{zig_char(end)} => true,")

    return "\n".join(
        [
            f"pub fn {name}(c: u21) bool {{",
            "    return switch (c) {",
            *prongs,
            "        else => false,",
            "    };",
            "}",
        ]
    )


def gen_specials(entries):
    """emit the SpecialCase struct and the `specials` StaticStringMap.

    entries is a sequence of (key, orths) pairs, where orths is the list
    of token texts the key splits into. each map value carries a fixed
    3-slot token array padded with empty strings, plus the real count in
    `len`. an empty entries list yields an empty map.

    raises ValueError if any entry has more than 3 tokens, since it would
    not fit the fixed-size array. this is an explicit raise rather than
    an assert so the check still runs under `python -O`.
    """
    slots = 3  # capacity of SpecialCase.tokens in the emitted struct

    # default=0 keeps an empty table from crashing max()
    max_tokens = max((len(orths) for _, orths in entries), default=0)
    if max_tokens > slots:
        raise ValueError(f"max tokens {max_tokens} > {slots}")

    lines = [
        "pub const SpecialCase = struct {",
        f"    tokens: [{slots}][]const u8,",
        "    len: u8,",
        "};",
        "",
        "pub const specials = std.StaticStringMap(SpecialCase).initComptime(.{",
    ]
    for key, orths in entries:
        k = zig_str(key)
        toks = [zig_str(o) for o in orths]
        toks.extend(['""'] * (slots - len(toks)))
        tok_str = ", ".join(toks)
        lines.append(
            f"    .{{ {k}, SpecialCase{{ .tokens = .{{ {tok_str} }}, .len = {len(orths)} }} }},"
        )
    lines.append("});")
    return "\n".join(lines)


def gen_multi_literals(name, literals):
    """emit a zig array `{name}` of string literals, in the order given.

    each literal is escaped with zig_str and placed on its own line.
    """
    rows = [f"    {zig_str(text)}," for text in literals]
    return "\n".join([f"pub const {name} = [_][]const u8{{", *rows, "};"])


def gen_lookbehind_rules(rules):
    """emit lookbehind class tables and summarise each suffix rule.

    rules is the "lookbehind_rules" list from extract_suffix_data: dicts
    with a "behind" descriptor and a "suffix" descriptor.

    returns (zig_source, rule_descs). zig_source holds, for every distinct
    character class used in a lookbehind, a `lookbehind_class_N_ranges`
    table and a `matchLookbehindN(c)` function. for a negated class
    ([^...]) the function returns the inverse of the range lookup, so
    callers can use every matchLookbehindN the same way. rule_descs has
    one dict per rule with keys "behind_id", "behind" and "suffix_texts".

    class ids come from the module-level cache used by _get_class_id;
    callers that want ids to start at 0 must reset _class_counter and
    _class_cache first.
    """
    # identify unique character classes used in lookbehinds
    class_tables = {}
    negated_ids = set()
    rule_descs = []

    for rule in rules:
        behind = rule["behind"]
        suffix = rule["suffix"]

        behind_id = _get_class_id(behind, class_tables)
        _collect_negated_ids(behind, behind_id, negated_ids)
        suffix_texts = (
            [suffix["text"]]
            if suffix["type"] == "literal"
            else suffix.get("texts", [])
        )
        rule_descs.append(
            {"behind_id": behind_id, "behind": behind, "suffix_texts": suffix_texts}
        )

    lines = []

    # generate class tables for lookbehinds
    for cid, ranges in class_tables.items():
        lines.append(f"const lookbehind_class_{cid}_ranges = [_][2]u21{{")
        for lo, hi in ranges:
            lines.append(f"    .{{ 0x{lo:04X}, 0x{hi:04X} }},")
        lines.append("};")
        lines.append("")
        lookup = f"rangeContains(&lookbehind_class_{cid}_ranges, c)"
        if cid in negated_ids:
            # the table lists the excluded codepoints, so invert the test
            lookup = "!" + lookup
        lines.append(f"pub fn matchLookbehind{cid}(c: u21) bool {{")
        lines.append(f"    return {lookup};")
        lines.append("}")
        lines.append("")

    return "\n".join(lines), rule_descs


def _collect_negated_ids(behind, behind_id, negated_ids):
    """record in negated_ids the class ids whose descriptor is negated.

    behind_id must be the value _get_class_id returned for behind; for a
    sequence it is a tuple that lines up with behind["parts"].
    """
    if behind["type"] == "class":
        if behind.get("negated"):
            negated_ids.add(behind_id)
    elif behind["type"] == "sequence":
        for part, part_id in zip(behind["parts"], behind_id):
            _collect_negated_ids(part, part_id, negated_ids)


# id allocator and cache shared across _get_class_id calls; generate()
# resets both before building the lookbehind tables.
_class_counter = 0
_class_cache = {}


def _get_class_id(behind, class_tables):
    """return the id used to reference a lookbehind descriptor.

    - "class": an int id, allocated on first sight and cached. the cache
      key includes the negated flag, so [x] and [^x] get different ids.
      the class ranges are registered in class_tables under that id.
    - "sequence": a tuple with one id per part, in order.
    - "literal": the tuple ("literal", codepoint).
    - anything else: None.
    """
    global _class_counter
    kind = behind["type"]
    if kind == "class":
        key = str((behind["ranges"], bool(behind.get("negated", False))))
        if key not in _class_cache:
            _class_cache[key] = _class_counter
            _class_counter += 1
        cid = _class_cache[key]
        # register on cache hits too, so a fresh class_tables dict never
        # misses a class that an earlier call already cached
        class_tables.setdefault(cid, behind["ranges"])
        return cid
    if kind == "sequence":
        # sequence of tests — generate IDs for each part
        return tuple(_get_class_id(part, class_tables) for part in behind["parts"])
    if kind == "literal":
        return ("literal", behind["char"])
    return None


def generate(tok):
    """generate the complete tokenizer_data.zig.

    tok is the spaCy tokenizer returned by load_spacy(). the function runs
    the extract_* helpers, prints a short progress summary for each, and
    assembles the zig source section by section:

      1. file header and the std import
      2. hand-written utf-8 helpers (decodeUtf8, lastCodepoint)
      3. the rangeContains binary search
      4. the shared symbol class table (isSymbol)
      5. prefix tables
      6. suffix tables, lookbehind class helpers and matchSuffixLookbehind
      7. infix lookbehind/lookahead class tables
      8. the special-cases map

    returns the whole source as one newline-joined string; nothing is
    written to disk here.

    side effect: resets the module-level _class_counter and _class_cache
    before the lookbehind tables are generated.
    """
    print("extracting prefix data...")
    prefix = extract_prefix_data(tok)
    print(
        f"  {len(prefix['single_chars'])} single chars, "
        f"{len(prefix['multi_literals'])} multi literals, "
        f"{len(prefix['symbol_ranges'])} symbol ranges"
    )

    print("extracting suffix data...")
    suffix = extract_suffix_data(tok)
    print(
        f"  {len(suffix['single_chars'])} single chars, "
        f"{len(suffix['multi_literals'])} multi literals, "
        f"{len(suffix['lookbehind_rules'])} lookbehind rules"
    )

    # NOTE: `classes` is only reported in the print below; it is not used
    # when building the output in this function.
    print("extracting unicode classes...")
    classes = extract_named_classes(tok)
    print(f"  classes found: {list(classes.keys())}")

    print("extracting specials...")
    specials = extract_specials(tok)
    print(f"  {len(specials)} entries")

    # also extract the infix character classes directly
    infix_pat = tok.infix_finditer.__self__.pattern
    ip = sre_parse.parse(infix_pat)
    infix_branches = ip[0][1][1]

    # NOTE(review): the branch indices below are hard-coded and presumably
    # match the infix pattern of the en_core_web_sm version named in the
    # module docstring — confirm when upgrading the model.
    # infix[2] is the symbol class (same as prefix)
    # infix[3] lookbehind is digits, chars are +-*^, lookahead is digits+hyphen
    # infix[4] lookbehind is lower/punct, ahead is upper/alpha
    # infix[5] lookbehind is alpha, ahead is alpha
    # infix[6] branch alternatives: -, --, ---, ~, en-dash, em-dash, em-dash*2
    # infix[7] lookbehind is alnum, chars :/~<=>, ahead is alpha

    # extract infix lookbehind/lookahead classes
    # only ASSERT nodes whose body is exactly one IN node are picked up;
    # the negation flag of the class is discarded here.
    infix_classes = {}
    for idx in [3, 4, 5, 6, 7]:
        branch = infix_branches[idx]
        for item in branch:
            if item[0] == sre_parse.ASSERT:
                direction = item[1][0]
                content = item[1][1]
                if len(content) == 1 and content[0][0] == sre_parse.IN:
                    ranges, _ = class_from_in_node(content[0][1])
                    # direction -1 is a lookbehind, anything else is
                    # labelled as lookahead
                    label = (
                        f"infix_{idx}_{'behind' if direction == -1 else 'ahead'}"
                    )
                    infix_classes[label] = ranges

    # build output
    # each entry of `sections` is one chunk of zig text (often one line);
    # they are joined with newlines at the very end.
    sections = []
    sections.append("//! generated by scripts/gen_tokenizer_data.py — do not edit.")
    sections.append("//! tokenizer pattern data compiled from spaCy en_core_web_sm.")
    sections.append("")
    sections.append('const std = @import("std");')
    sections.append("")

    # ── utf-8 helpers ──
    # fixed zig text: decodeUtf8 decodes the codepoint at the start of a
    # byte slice and falls back to U+FFFD with length 1 when no branch
    # applies; lastCodepoint steps back over continuation bytes and then
    # decodes from there.
    sections.append("// ── utf-8 helpers ──")
    sections.append("")
    sections.append("pub const Codepoint = struct { value: u21, len: u3 };")
    sections.append("")
    sections.append("pub fn decodeUtf8(bytes: []const u8) ?Codepoint {")
    sections.append("    if (bytes.len == 0) return null;")
    sections.append("    const b0 = bytes[0];")
    sections.append("    if (b0 < 0x80) return .{ .value = b0, .len = 1 };")
    sections.append("    if (b0 & 0xE0 == 0xC0 and bytes.len >= 2)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 };"
    )
    sections.append("    if (b0 & 0xF0 == 0xE0 and bytes.len >= 3)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 };"
    )
    sections.append("    if (b0 & 0xF8 == 0xF0 and bytes.len >= 4)")
    sections.append(
        "        return .{ .value = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 };"
    )
    sections.append(
        '    return .{ .value = 0xFFFD, .len = 1 }; // replacement char'
    )
    sections.append("}")
    sections.append("")
    sections.append("pub fn lastCodepoint(text: []const u8) ?Codepoint {")
    sections.append("    if (text.len == 0) return null;")
    sections.append("    var i = text.len - 1;")
    sections.append("    while (i > 0 and text[i] & 0xC0 == 0x80) : (i -= 1) {}")
    sections.append("    return decodeUtf8(text[i..]);")
    sections.append("}")
    sections.append("")

    # ── range search ──
    # fixed zig text: binary search over [lo, hi] pairs; the range tables
    # emitted below come from extract_ranges, which sorts and merges them.
    sections.append("// ── range search ──")
    sections.append("")
    sections.append("fn rangeContains(ranges: []const [2]u21, c: u21) bool {")
    sections.append("    var lo: usize = 0;")
    sections.append("    var hi: usize = ranges.len;")
    sections.append("    while (lo < hi) {")
    sections.append("        const mid = lo + (hi - lo) / 2;")
    sections.append("        if (c > ranges[mid][1]) { lo = mid + 1; }")
    sections.append("        else if (c < ranges[mid][0]) { hi = mid; }")
    sections.append("        else return true;")
    sections.append("    }")
    sections.append("    return false;")
    sections.append("}")
    sections.append("")

    # ── symbol class (shared by prefix, suffix, infix) ──
    # only the prefix copy of the symbol class is emitted.
    sections.append("// ── symbol class (So/Sc unicode categories) ──")
    sections.append("")
    sections.append(gen_range_table("isSymbol", prefix["symbol_ranges"]))
    sections.append("")

    # ── prefix data ──
    sections.append("// ── prefix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isPrefixChar", prefix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("prefix_multi_literals", prefix["multi_literals"])
    )
    sections.append("")
    # isPrefixUnlessDigit is only emitted when at least one such literal
    # was found in the prefix pattern.
    if prefix["literal_unless_digit"]:
        cps = prefix["literal_unless_digit"]
        sections.append(gen_codepoint_set("isPrefixUnlessDigit", cps))
        sections.append("")

    # ── suffix data ──
    sections.append("// ── suffix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isSuffixChar", suffix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("suffix_multi_literals", suffix["multi_literals"])
    )
    sections.append("")

    # lookbehind helpers
    # reset the shared id allocator so class ids start at 0 on every run
    global _class_counter, _class_cache
    _class_counter = 0
    _class_cache = {}

    lookbehind_code, rule_descs = gen_lookbehind_rules(suffix["lookbehind_rules"])
    if lookbehind_code.strip():
        sections.append("// ── suffix lookbehind helpers ──")
        sections.append("")
        sections.append(lookbehind_code)

    # generate a compact suffix lookbehind rule table
    # each rule is: check lookbehind condition, then try matching suffix text(s)
    sections.append("// ── suffix lookbehind rules ──")
    sections.append("// these are checked by tokenizer.zig matchSuffix()")
    sections.append(
        "// format: for each rule, check behind condition then try suffix literal(s)"
    )
    sections.append("")

    # encode rules as Zig code in a single function
    # the emitted function returns the byte length of the matched suffix,
    # or 0 when no rule applies.
    sections.append("pub fn matchSuffixLookbehind(text: []const u8) usize {")
    sections.append("    if (text.len < 2) return 0;")
    sections.append("")

    for ri, desc in enumerate(rule_descs):
        behind = desc["behind"]
        suffix_texts = desc["suffix_texts"]

        # sort suffix texts longest first
        # (length is measured in utf-8 bytes, matching the byte-wise
        # endsWith comparison emitted below)
        suffix_texts_sorted = sorted(suffix_texts, key=lambda s: -len(s.encode("utf-8")))

        for st in suffix_texts_sorted:
            blen = len(st.encode("utf-8"))
            zig_lit = zig_str(st)

            sections.append(
                f"    if (std.mem.endsWith(u8, text, {zig_lit}) and text.len > {blen}) {{"
            )

            # behind_id shape (see _get_class_id): int for a class,
            # ("literal", cp) for a literal, tuple of part ids for a sequence
            bid = desc["behind_id"]
            if isinstance(bid, int):
                # simple class check
                sections.append(
                    f"        const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f"        if (before != null and matchLookbehind{bid}(before.?.value)) return {blen};"
                )
            elif isinstance(bid, tuple) and isinstance(bid[0], str) and bid[0] == "literal":
                # literal check
                cp = bid[1]
                sections.append(
                    f"        const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f"        if (before != null and before.?.value == {zig_char(cp)}) return {blen};"
                )
            elif isinstance(bid, tuple):
                # sequence check (multiple lookbehinds)
                sections.append(
                    f"        const b1 = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(f"        if (b1) |bp1| {{")

                # only two-part sequences get a check emitted; for any
                # other length the `if (b1)` block is left empty.
                if len(bid) == 2:
                    sections.append(
                        f"            const b2 = lastCodepoint(text[0 .. text.len - {blen} - bp1.len]);"
                    )
                    # bid[0] is the class before bp2, bid[1] is the class for bp1
                    test1 = (
                        f"matchLookbehind{bid[1]}(bp1.value)"
                        if isinstance(bid[1], int)
                        else f"bp1.value == {zig_char(bid[1][1])}"
                    )
                    test0 = (
                        f"matchLookbehind{bid[0]}(b2p.value)"
                        if isinstance(bid[0], int)
                        else f"b2p.value == {zig_char(bid[0][1])}"
                    )
                    sections.append(f"            if ({test1}) {{")
                    sections.append(f"                if (b2) |b2p| {{")
                    sections.append(
                        f"                    if ({test0}) return {blen};"
                    )
                    sections.append(f"                }}")
                    sections.append(f"            }}")

                sections.append(f"        }}")

            sections.append("    }")

    sections.append("    return 0;")
    sections.append("}")
    sections.append("")

    # ── infix character class tables ──
    # one table + lookup function per extracted class, in sorted label order
    sections.append("// ── infix character classes ──")
    sections.append("")
    for label, ranges in sorted(infix_classes.items()):
        name = f"is_{label}"
        sections.append(gen_range_table(name, ranges))
        sections.append("")

    # ── specials ──
    sections.append("// ── special cases ──")
    sections.append("")
    sections.append(gen_specials(specials))
    sections.append("")

    return "\n".join(sections)


def main():
    """generate src/tokenizer_data.zig and tests/tokenizer_expected.json.

    paths are relative to the current working directory. both files are
    written as utf-8 explicitly: the zig source contains non-ASCII
    characters in its comments, so the platform default encoding must not
    be relied on. output directories are created when missing.

    the expected-token fixture is produced with the same tokenizer object
    the data tables were extracted from, so the model is loaded only once.
    """
    print("loading spaCy...")
    tok = load_spacy()

    print("\ngenerating zig source...")
    zig_source = generate(tok)

    out_path = Path("src/tokenizer_data.zig")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(zig_source, encoding="utf-8")
    # report the encoded size; len(str) would count characters, not bytes
    n_bytes = len(zig_source.encode("utf-8"))
    n_lines = zig_source.count("\n") + 1
    print(f"\nwrote {out_path} ({n_bytes:,} bytes, {n_lines:,} lines)")

    # verification: run spaCy tokenizer on test inputs and dump expected output
    print("\ngenerating test data...")
    test_sentences = [
        "Barack Obama visited Paris.",
        "Apple Inc. is worth $2.5 trillion.",
        "I can't believe it's not butter!",
        "Dr. Smith's office (room 42) is closed.",
        "U.S.A. and U.K. are allies.",
        "They're going to the store.",
        'He said "hello" and left.',
        "The cost is $500.00/month.",
        "New York-based company",
        "e-mail: test@example.com",
        "10,000 people",
        "3.14159 is pi",
        "state-of-the-art technology",
        "Mr. and Mrs. Jones",
        "it's 5:30pm",
    ]

    # calling the tokenizer directly returns a Doc, the same thing
    # nlp.make_doc() produces for the loaded pipeline
    test_data = [
        {"text": sent, "tokens": [t.text for t in tok(sent)]}
        for sent in test_sentences
    ]

    test_path = Path("tests/tokenizer_expected.json")
    test_path.parent.mkdir(parents=True, exist_ok=True)
    test_path.write_text(json.dumps(test_data, indent=2), encoding="utf-8")
    print(f"wrote {test_path} ({len(test_data)} test cases)")


if __name__ == "__main__":
    main()