"""generate tokenizer_data.zig from spaCy's en_core_web_sm tokenizer config. extracts: - unicode character class tables (sorted ranges for binary search) - prefix single-char set + multi-char literals + special rules - suffix data (single-char set, multi-char literals, lookbehind rules) - special cases table (1347 entries) the matching LOGIC lives in tokenizer.zig. this script only generates DATA tables. usage: uv run --python 3.12 --with spacy \ --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \ python scripts/gen_tokenizer_data.py """ import json import re import sre_parse import sys from pathlib import Path def load_spacy(): """load spaCy and extract all tokenizer config.""" import spacy nlp = spacy.load("en_core_web_sm") tok = nlp.tokenizer return tok def extract_ranges(items): """convert sre_parse IN items to sorted, merged (lo, hi) ranges.""" ranges = [] for op, val in items: if op == sre_parse.LITERAL: ranges.append((val, val)) elif op == sre_parse.RANGE: ranges.append(val) elif op == sre_parse.CATEGORY: if val == sre_parse.CATEGORY_DIGIT: ranges.append((0x30, 0x39)) elif val == sre_parse.CATEGORY_WORD: ranges.extend([(0x30, 0x39), (0x41, 0x5A), (0x5F, 0x5F), (0x61, 0x7A)]) ranges.sort() merged = [] for lo, hi in ranges: if merged and lo <= merged[-1][1] + 1: merged[-1] = (merged[-1][0], max(merged[-1][1], hi)) else: merged.append((lo, hi)) return merged def class_from_in_node(in_items): """extract character class from an IN node, handling NEGATE.""" negated = any(x[0] == sre_parse.NEGATE for x in in_items) non_neg = [x for x in in_items if x[0] != sre_parse.NEGATE] ranges = extract_ranges(non_neg) return ranges, negated # ── prefix data extraction ── def extract_prefix_data(tok): """extract prefix pattern data: single chars, multi-char literals, char class, specials.""" pat = tok.prefix_search.__self__.pattern parsed = sre_parse.parse(pat) branches = parsed[1][1][1] # AT_BEGINNING, BRANCH single_chars = [] # codepoints matched as single-char prefix multi_literals = [] # multi-byte string prefixes symbol_ranges = [] # the big unicode symbol class dots = False # whether ..+ is a prefix literal_unless_digit = [] # chars like + that don't match before digits for branch in branches: if len(branch) == 1: op, val = branch[0] if op == sre_parse.LITERAL: single_chars.append(val) elif op == sre_parse.IN: ranges, _ = class_from_in_node(val) if len(ranges) > 50: symbol_ranges = ranges else: # small class — expand to individual chars for lo, hi in ranges: for cp in range(lo, hi + 1): single_chars.append(cp) elif all(b[0] == sre_parse.LITERAL for b in branch): s = "".join(chr(b[1]) for b in branch) multi_literals.append(s) elif ( len(branch) == 2 and branch[0][0] == sre_parse.LITERAL and branch[1][0] == sre_parse.MAX_REPEAT ): dots = True elif ( len(branch) == 2 and branch[0][0] == sre_parse.LITERAL and branch[1][0] == sre_parse.ASSERT_NOT ): literal_unless_digit.append(branch[0][1]) return { "single_chars": sorted(set(single_chars)), "multi_literals": sorted(multi_literals, key=lambda s: -len(s)), "symbol_ranges": symbol_ranges, "has_dots": dots, "literal_unless_digit": literal_unless_digit, } # ── suffix data extraction ── def extract_suffix_data(tok): """extract suffix pattern data.""" pat = tok.suffix_search.__self__.pattern parsed = sre_parse.parse(pat) branches = parsed[0][1][1] # BRANCH single_chars = [] multi_literals = [] symbol_ranges = [] has_dots = False lookbehind_rules = [] for branch in branches: items = list(branch) if items and items[-1] == (sre_parse.AT, sre_parse.AT_END): items = items[:-1] if not items: continue # simple literal(s) if all(x[0] == sre_parse.LITERAL for x in items): s = "".join(chr(x[1]) for x in items) if len(s) == 1: single_chars.append(ord(s)) else: multi_literals.append(s) continue # character class if len(items) == 1 and items[0][0] == sre_parse.IN: ranges, _ = class_from_in_node(items[0][1]) if len(ranges) > 50: symbol_ranges = ranges else: for lo, hi in ranges: for cp in range(lo, hi + 1): single_chars.append(cp) continue # dots if ( len(items) >= 2 and items[0] == (sre_parse.LITERAL, ord(".")) and items[1][0] == sre_parse.MAX_REPEAT ): has_dots = True continue # lookbehind rule if items[0][0] == sre_parse.ASSERT: direction = items[0][1][0] if direction == -1: # lookbehind rule = _extract_lookbehind_rule(items) if rule: lookbehind_rules.append(rule) continue return { "single_chars": sorted(set(single_chars)), "multi_literals": sorted(multi_literals, key=lambda s: -len(s)), "symbol_ranges": symbol_ranges, "has_dots": has_dots, "lookbehind_rules": lookbehind_rules, } def _extract_lookbehind_rule(items): """extract a suffix lookbehind rule into a serializable structure.""" behind_content = items[0][1][1] rest = items[1:] # parse lookbehind behind = _parse_assert_content(behind_content) if behind is None: return None # parse suffix part suffix = _parse_suffix_part(rest) if suffix is None: return None return {"behind": behind, "suffix": suffix} def _parse_assert_content(content): """parse lookbehind/lookahead content into a descriptor.""" parts = [] for item in content: if item[0] == sre_parse.IN: ranges, negated = class_from_in_node(item[1]) parts.append({"type": "class", "ranges": ranges, "negated": negated}) elif item[0] == sre_parse.LITERAL: parts.append({"type": "literal", "char": item[1]}) else: return None if len(parts) == 1: return parts[0] elif len(parts) > 1: return {"type": "sequence", "parts": parts} return None def _parse_suffix_part(items): """parse the suffix portion after lookbehind.""" if all(x[0] == sre_parse.LITERAL for x in items): s = "".join(chr(x[1]) for x in items) return {"type": "literal", "text": s} # subpattern with alternatives if len(items) == 1 and items[0][0] == sre_parse.SUBPATTERN: content = items[0][1][3] if content and content[0][0] == sre_parse.BRANCH: alts = [] for branch in content[0][1][1]: if all(x[0] == sre_parse.LITERAL for x in branch): alts.append("".join(chr(x[1]) for x in branch)) if alts: return {"type": "alternatives", "texts": alts} # BRANCH directly if len(items) == 1 and items[0][0] == sre_parse.BRANCH: alts = [] for branch in items[0][1][1]: if all(x[0] == sre_parse.LITERAL for x in branch): alts.append("".join(chr(x[1]) for x in branch)) if alts: return {"type": "alternatives", "texts": alts} return None # ── unicode class extraction from all patterns ── def extract_named_classes(tok): """extract the specific unicode character classes used across patterns. we identify them by their content: - symbol: the big So/Sc class (~174 ranges) - lower: lowercase letters (contains a-z) - upper: uppercase letters (contains A-Z) - alpha: lower + upper - alnum: alpha + digits - lower_or_punct: the wide "not just upper" class used in suffix lookbehinds """ classes = {} # extract from suffix lookbehinds suffix_pat = tok.suffix_search.__self__.pattern sp = sre_parse.parse(suffix_pat) def walk_for_classes(items, label=""): for item in items: op = item[0] if op == sre_parse.IN: ranges, negated = class_from_in_node(item[1]) if len(ranges) > 5: _classify(ranges, classes) elif op == sre_parse.BRANCH: for b in item[1][1]: walk_for_classes(b, label) elif op in (sre_parse.ASSERT, sre_parse.ASSERT_NOT): walk_for_classes(item[1][1], label) elif op == sre_parse.SUBPATTERN: if item[1][3]: walk_for_classes(list(item[1][3]), label) walk_for_classes(list(sp), "suffix") # also from infix infix_pat = tok.infix_finditer.__self__.pattern ip = sre_parse.parse(infix_pat) walk_for_classes(list(ip), "infix") return classes def _classify(ranges, classes): """classify a character range set by its content.""" range_set = set(ranges) # check for a-z presence → lower has_az = (0x61, 0x7A) in range_set has_AZ = (0x41, 0x5A) in range_set has_09 = (0x30, 0x39) in range_set or (0x30, 0x39) in range_set n_ranges = len(ranges) n_cp = sum(hi - lo + 1 for lo, hi in ranges) if has_az and not has_AZ and not has_09 and n_cp > 1000: if "lower" not in classes or len(ranges) > len(classes["lower"]): classes["lower"] = ranges elif has_AZ and not has_az and not has_09 and n_cp > 1000: if "upper" not in classes or len(ranges) > len(classes["upper"]): classes["upper"] = ranges elif has_az and has_AZ and not has_09 and n_cp > 1000: if "alpha" not in classes or len(ranges) > len(classes["alpha"]): classes["alpha"] = ranges elif has_az and has_AZ and has_09 and n_cp > 1000: if "alnum" not in classes or len(ranges) > len(classes["alnum"]): classes["alnum"] = ranges elif n_cp > 100000 and n_ranges > 300: # very large class — likely "lower_or_punct" or similar key = f"wide_{n_ranges}" classes[key] = ranges # ── special cases ── def extract_specials(tok): """extract special case rules.""" entries = [] for key, val in sorted(tok.rules.items()): orths = [d[65] for d in val] # 65 = ORTH entries.append((key, orths)) return entries # ── zig code generation ── def zig_str(s): """convert a python string to a zig string literal.""" parts = [] for c in s: cp = ord(c) if cp < 128: if c == '"': parts.append('\\"') elif c == "\\": parts.append("\\\\") elif c == "\n": parts.append("\\n") elif c == "\t": parts.append("\\t") elif c.isprintable(): parts.append(c) else: parts.append(f"\\x{cp:02x}") else: for b in c.encode("utf-8"): parts.append(f"\\x{b:02x}") return '"' + "".join(parts) + '"' def zig_char(cp): """convert a codepoint to a zig u21 literal.""" if 32 <= cp < 127 and chr(cp) not in "'\\\"": return f"'{chr(cp)}'" return f"0x{cp:04X}" def gen_range_table(name, ranges): """generate a const range table + lookup function.""" lines = [] lines.append(f"pub const {name}_ranges = [_][2]u21{{") for lo, hi in ranges: lines.append(f" .{{ 0x{lo:04X}, 0x{hi:04X} }},") lines.append("};") lines.append("") lines.append(f"pub fn {name}(c: u21) bool {{") lines.append(f" return rangeContains(&{name}_ranges, c);") lines.append("}") return "\n".join(lines) def gen_codepoint_set(name, codepoints): """generate a switch-based codepoint set.""" lines = [] lines.append(f"pub fn {name}(c: u21) bool {{") lines.append(" return switch (c) {") # group consecutive codepoints into ranges ranges = [] cps = sorted(set(codepoints)) i = 0 while i < len(cps): start = cps[i] end = start while i + 1 < len(cps) and cps[i + 1] == end + 1: end = cps[i + 1] i += 1 ranges.append((start, end)) i += 1 for lo, hi in ranges: if lo == hi: lines.append(f" {zig_char(lo)} => true,") else: lines.append(f" {zig_char(lo)}...{zig_char(hi)} => true,") lines.append(" else => false,") lines.append(" };") lines.append("}") return "\n".join(lines) def gen_specials(entries): """generate the special cases StaticStringMap.""" max_tokens = max(len(orths) for _, orths in entries) assert max_tokens <= 3, f"max tokens {max_tokens} > 3" lines = [] lines.append("pub const SpecialCase = struct {") lines.append(" tokens: [3][]const u8,") lines.append(" len: u8,") lines.append("};") lines.append("") lines.append( "pub const specials = std.StaticStringMap(SpecialCase).initComptime(.{" ) for key, orths in entries: k = zig_str(key) toks = [zig_str(o) for o in orths] while len(toks) < 3: toks.append('""') tok_str = ", ".join(toks) lines.append( f" .{{ {k}, SpecialCase{{ .tokens = .{{ {tok_str} }}, .len = {len(orths)} }} }}," ) lines.append("});") return "\n".join(lines) def gen_multi_literals(name, literals): """generate an array of multi-char literals for matching.""" lines = [] lines.append(f"pub const {name} = [_][]const u8{{") for lit in literals: lines.append(f" {zig_str(lit)},") lines.append("};") return "\n".join(lines) def gen_lookbehind_rules(rules): """generate suffix lookbehind rule data structures.""" # identify unique character classes used in lookbehinds class_tables = {} rule_descs = [] for rule in rules: behind = rule["behind"] suffix = rule["suffix"] behind_id = _get_class_id(behind, class_tables) suffix_texts = ( [suffix["text"]] if suffix["type"] == "literal" else suffix.get("texts", []) ) rule_descs.append( {"behind_id": behind_id, "behind": behind, "suffix_texts": suffix_texts} ) lines = [] # generate class tables for lookbehinds for cid, ranges in class_tables.items(): lines.append(f"const lookbehind_class_{cid}_ranges = [_][2]u21{{") for lo, hi in ranges: lines.append(f" .{{ 0x{lo:04X}, 0x{hi:04X} }},") lines.append("};") lines.append("") lines.append(f"pub fn matchLookbehind{cid}(c: u21) bool {{") lines.append(f" return rangeContains(&lookbehind_class_{cid}_ranges, c);") lines.append("}") lines.append("") return "\n".join(lines), rule_descs _class_counter = 0 _class_cache = {} def _get_class_id(behind, class_tables): global _class_counter if behind["type"] == "class": key = str(behind["ranges"]) if key not in _class_cache: cid = _class_counter _class_counter += 1 _class_cache[key] = cid class_tables[cid] = behind["ranges"] return _class_cache[key] elif behind["type"] == "sequence": # sequence of tests — generate IDs for each part ids = [] for part in behind["parts"]: ids.append(_get_class_id(part, class_tables)) return tuple(ids) elif behind["type"] == "literal": return ("literal", behind["char"]) return None def generate(tok): """generate the complete tokenizer_data.zig.""" print("extracting prefix data...") prefix = extract_prefix_data(tok) print( f" {len(prefix['single_chars'])} single chars, " f"{len(prefix['multi_literals'])} multi literals, " f"{len(prefix['symbol_ranges'])} symbol ranges" ) print("extracting suffix data...") suffix = extract_suffix_data(tok) print( f" {len(suffix['single_chars'])} single chars, " f"{len(suffix['multi_literals'])} multi literals, " f"{len(suffix['lookbehind_rules'])} lookbehind rules" ) print("extracting unicode classes...") classes = extract_named_classes(tok) print(f" classes found: {list(classes.keys())}") print("extracting specials...") specials = extract_specials(tok) print(f" {len(specials)} entries") # also extract the infix character classes directly infix_pat = tok.infix_finditer.__self__.pattern ip = sre_parse.parse(infix_pat) infix_branches = ip[0][1][1] # infix[2] is the symbol class (same as prefix) # infix[3] lookbehind is digits, chars are +-*^, lookahead is digits+hyphen # infix[4] lookbehind is lower/punct, ahead is upper/alpha # infix[5] lookbehind is alpha, ahead is alpha # infix[6] branch alternatives: -, --, ---, ~, en-dash, em-dash, em-dash*2 # infix[7] lookbehind is alnum, chars :/~<=>, ahead is alpha # extract infix lookbehind/lookahead classes infix_classes = {} for idx in [3, 4, 5, 6, 7]: branch = infix_branches[idx] for item in branch: if item[0] == sre_parse.ASSERT: direction = item[1][0] content = item[1][1] if len(content) == 1 and content[0][0] == sre_parse.IN: ranges, _ = class_from_in_node(content[0][1]) label = ( f"infix_{idx}_{'behind' if direction == -1 else 'ahead'}" ) infix_classes[label] = ranges # build output sections = [] sections.append("//! generated by scripts/gen_tokenizer_data.py — do not edit.") sections.append("//! tokenizer pattern data compiled from spaCy en_core_web_sm.") sections.append("") sections.append('const std = @import("std");') sections.append("") # ── utf-8 helpers ── sections.append("// ── utf-8 helpers ──") sections.append("") sections.append("pub const Codepoint = struct { value: u21, len: u3 };") sections.append("") sections.append("pub fn decodeUtf8(bytes: []const u8) ?Codepoint {") sections.append(" if (bytes.len == 0) return null;") sections.append(" const b0 = bytes[0];") sections.append(" if (b0 < 0x80) return .{ .value = b0, .len = 1 };") sections.append(" if (b0 & 0xE0 == 0xC0 and bytes.len >= 2)") sections.append( " return .{ .value = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 };" ) sections.append(" if (b0 & 0xF0 == 0xE0 and bytes.len >= 3)") sections.append( " return .{ .value = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 };" ) sections.append(" if (b0 & 0xF8 == 0xF0 and bytes.len >= 4)") sections.append( " return .{ .value = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 };" ) sections.append( ' return .{ .value = 0xFFFD, .len = 1 }; // replacement char' ) sections.append("}") sections.append("") sections.append("pub fn lastCodepoint(text: []const u8) ?Codepoint {") sections.append(" if (text.len == 0) return null;") sections.append(" var i = text.len - 1;") sections.append(" while (i > 0 and text[i] & 0xC0 == 0x80) : (i -= 1) {}") sections.append(" return decodeUtf8(text[i..]);") sections.append("}") sections.append("") # ── range search ── sections.append("// ── range search ──") sections.append("") sections.append("fn rangeContains(ranges: []const [2]u21, c: u21) bool {") sections.append(" var lo: usize = 0;") sections.append(" var hi: usize = ranges.len;") sections.append(" while (lo < hi) {") sections.append(" const mid = lo + (hi - lo) / 2;") sections.append(" if (c > ranges[mid][1]) { lo = mid + 1; }") sections.append(" else if (c < ranges[mid][0]) { hi = mid; }") sections.append(" else return true;") sections.append(" }") sections.append(" return false;") sections.append("}") sections.append("") # ── symbol class (shared by prefix, suffix, infix) ── sections.append("// ── symbol class (So/Sc unicode categories) ──") sections.append("") sections.append(gen_range_table("isSymbol", prefix["symbol_ranges"])) sections.append("") # ── prefix data ── sections.append("// ── prefix data ──") sections.append("") sections.append(gen_codepoint_set("isPrefixChar", prefix["single_chars"])) sections.append("") sections.append( gen_multi_literals("prefix_multi_literals", prefix["multi_literals"]) ) sections.append("") if prefix["literal_unless_digit"]: cps = prefix["literal_unless_digit"] sections.append(gen_codepoint_set("isPrefixUnlessDigit", cps)) sections.append("") # ── suffix data ── sections.append("// ── suffix data ──") sections.append("") sections.append(gen_codepoint_set("isSuffixChar", suffix["single_chars"])) sections.append("") sections.append( gen_multi_literals("suffix_multi_literals", suffix["multi_literals"]) ) sections.append("") # lookbehind helpers global _class_counter, _class_cache _class_counter = 0 _class_cache = {} lookbehind_code, rule_descs = gen_lookbehind_rules(suffix["lookbehind_rules"]) if lookbehind_code.strip(): sections.append("// ── suffix lookbehind helpers ──") sections.append("") sections.append(lookbehind_code) # generate a compact suffix lookbehind rule table # each rule is: check lookbehind condition, then try matching suffix text(s) sections.append("// ── suffix lookbehind rules ──") sections.append("// these are checked by tokenizer.zig matchSuffix()") sections.append( "// format: for each rule, check behind condition then try suffix literal(s)" ) sections.append("") # encode rules as Zig code in a single function sections.append("pub fn matchSuffixLookbehind(text: []const u8) usize {") sections.append(" if (text.len < 2) return 0;") sections.append("") for ri, desc in enumerate(rule_descs): behind = desc["behind"] suffix_texts = desc["suffix_texts"] # sort suffix texts longest first suffix_texts_sorted = sorted(suffix_texts, key=lambda s: -len(s.encode("utf-8"))) for st in suffix_texts_sorted: blen = len(st.encode("utf-8")) zig_lit = zig_str(st) sections.append( f" if (std.mem.endsWith(u8, text, {zig_lit}) and text.len > {blen}) {{" ) bid = desc["behind_id"] if isinstance(bid, int): # simple class check sections.append( f" const before = lastCodepoint(text[0 .. text.len - {blen}]);" ) sections.append( f" if (before != null and matchLookbehind{bid}(before.?.value)) return {blen};" ) elif isinstance(bid, tuple) and isinstance(bid[0], str) and bid[0] == "literal": # literal check cp = bid[1] sections.append( f" const before = lastCodepoint(text[0 .. text.len - {blen}]);" ) sections.append( f" if (before != null and before.?.value == {zig_char(cp)}) return {blen};" ) elif isinstance(bid, tuple): # sequence check (multiple lookbehinds) sections.append( f" const b1 = lastCodepoint(text[0 .. text.len - {blen}]);" ) sections.append(f" if (b1) |bp1| {{") if len(bid) == 2: sections.append( f" const b2 = lastCodepoint(text[0 .. text.len - {blen} - bp1.len]);" ) # bid[0] is the class before bp2, bid[1] is the class for bp1 test1 = ( f"matchLookbehind{bid[1]}(bp1.value)" if isinstance(bid[1], int) else f"bp1.value == {zig_char(bid[1][1])}" ) test0 = ( f"matchLookbehind{bid[0]}(b2p.value)" if isinstance(bid[0], int) else f"b2p.value == {zig_char(bid[0][1])}" ) sections.append(f" if ({test1}) {{") sections.append(f" if (b2) |b2p| {{") sections.append( f" if ({test0}) return {blen};" ) sections.append(f" }}") sections.append(f" }}") sections.append(f" }}") sections.append(" }") sections.append(" return 0;") sections.append("}") sections.append("") # ── infix character class tables ── sections.append("// ── infix character classes ──") sections.append("") for label, ranges in sorted(infix_classes.items()): name = f"is_{label}" sections.append(gen_range_table(name, ranges)) sections.append("") # ── specials ── sections.append("// ── special cases ──") sections.append("") sections.append(gen_specials(specials)) sections.append("") return "\n".join(sections) def main(): print("loading spaCy...") tok = load_spacy() print("\ngenerating zig source...") zig_source = generate(tok) out_path = Path("src/tokenizer_data.zig") out_path.write_text(zig_source) n_lines = zig_source.count("\n") + 1 print(f"\nwrote {out_path} ({len(zig_source):,} bytes, {n_lines:,} lines)") # verification: run spaCy tokenizer on test inputs and dump expected output print("\ngenerating test data...") import spacy nlp = spacy.load("en_core_web_sm") test_sentences = [ "Barack Obama visited Paris.", "Apple Inc. is worth $2.5 trillion.", "I can't believe it's not butter!", "Dr. Smith's office (room 42) is closed.", "U.S.A. and U.K. are allies.", "They're going to the store.", 'He said "hello" and left.', "The cost is $500.00/month.", "New York-based company", "e-mail: test@example.com", "10,000 people", "3.14159 is pi", "state-of-the-art technology", "Mr. and Mrs. Jones", "it's 5:30pm", ] test_data = [] for sent in test_sentences: doc = nlp.make_doc(sent) tokens = [t.text for t in doc] test_data.append({"text": sent, "tokens": tokens}) test_path = Path("tests/tokenizer_expected.json") test_path.parent.mkdir(exist_ok=True) test_path.write_text(json.dumps(test_data, indent=2)) print(f"wrote {test_path} ({len(test_data)} test cases)") if __name__ == "__main__": main()