this repo has no description

feat: model inference, tokenizer, and full NER pipeline

- model.zig: weight loading from binary + forward pass (embed → CNN → linear → parser scoring)
- tokenizer.zig: full port of spaCy's tokenizer algorithm (whitespace split → iterative prefix/suffix stripping → infix splitting → special cases)
- tokenizer_data.zig: generated unicode tables, pattern data, and 1347 special cases from spaCy en_core_web_sm
- parser.zig: label enum reordered to match spaCy training order, N_ACTIONS=74, decodeAction rewritten
- spacez.zig: top-level recognize() function connecting tokenizer → model → byte-offset entities
- scripts/gen_tokenizer_data.py: extracts tokenizer patterns from spaCy via sre_parse
- scripts/compare.py: cross-validation against spaCy
- 15/15 tokenizer cross-validation tests pass
- model NER tests match spaCy: "Barack Obama visited Paris" → PERSON[0..2), GPE[3..4)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+7138 -56
+16
build.zig
··· 30 30 run_demo.step.dependOn(b.getInstallStep()); 31 31 const run_step = b.step("run", "run the demo"); 32 32 run_step.dependOn(&run_demo.step); 33 + 34 + const xval = b.addExecutable(.{ 35 + .name = "spacez-xval", 36 + .root_module = b.createModule(.{ 37 + .root_source_file = b.path("examples/cross_validate.zig"), 38 + .target = target, 39 + .optimize = optimize, 40 + .imports = &.{.{ .name = "spacez", .module = mod }}, 41 + }), 42 + }); 43 + b.installArtifact(xval); 44 + 45 + const run_xval = b.addRunArtifact(xval); 46 + run_xval.step.dependOn(b.getInstallStep()); 47 + const xval_step = b.step("xval", "cross-validate tokenizer against spaCy"); 48 + xval_step.dependOn(&run_xval.step); 33 49 }
+72
examples/cross_validate.zig
//! cross-validate spacez tokenizer against spaCy expected output.
//! run with: zig build xval
//!
//! every case in `test_cases` is tokenized with `spacez.tokenizeText` and the
//! resulting token texts are compared, in order, with the expected list.
//! the report goes through `std.debug.print`; the process exits with status 1
//! when at least one case fails.

const std = @import("std");
const spacez = @import("spacez");

/// one input sentence paired with the token texts expected for it.
const TestCase = struct {
    text: []const u8,
    expected: []const []const u8,
};

const test_cases = [_]TestCase{
    .{ .text = "Barack Obama visited Paris.", .expected = &.{ "Barack", "Obama", "visited", "Paris", "." } },
    .{ .text = "Apple Inc. is worth $2.5 trillion.", .expected = &.{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." } },
    .{ .text = "I can't believe it's not butter!", .expected = &.{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" } },
    .{ .text = "Dr. Smith's office (room 42) is closed.", .expected = &.{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." } },
    .{ .text = "U.S.A. and U.K. are allies.", .expected = &.{ "U.S.A.", "and", "U.K.", "are", "allies", "." } },
    .{ .text = "They're going to the store.", .expected = &.{ "They", "'re", "going", "to", "the", "store", "." } },
    .{ .text = "He said \"hello\" and left.", .expected = &.{ "He", "said", "\"", "hello", "\"", "and", "left", "." } },
    .{ .text = "The cost is $500.00/month.", .expected = &.{ "The", "cost", "is", "$", "500.00", "/", "month", "." } },
    .{ .text = "New York-based company", .expected = &.{ "New", "York", "-", "based", "company" } },
    .{ .text = "e-mail: test@example.com", .expected = &.{ "e", "-", "mail", ":", "test@example.com" } },
    .{ .text = "10,000 people", .expected = &.{ "10,000", "people" } },
    .{ .text = "3.14159 is pi", .expected = &.{ "3.14159", "is", "pi" } },
    .{ .text = "state-of-the-art technology", .expected = &.{ "state", "-", "of", "-", "the", "-", "art", "technology" } },
    .{ .text = "Mr. and Mrs. Jones", .expected = &.{ "Mr.", "and", "Mrs.", "Jones" } },
    .{ .text = "it's 5:30pm", .expected = &.{ "it", "'s", "5:30pm" } },
};

/// tokenize every test case, print PASS/FAIL per case plus a summary line,
/// and exit with status 1 if any case failed.
pub fn main() void {
    const print = std.debug.print;
    var passed: u32 = 0;
    var failed: u32 = 0;

    for (test_cases) |case| {
        var tokens: [1024]spacez.Token = undefined;
        const n = spacez.tokenizeText(case.text, &tokens);

        // a case matches when the token count agrees and every token text,
        // taken in order, equals the expected text.
        const matches = blk: {
            if (n != case.expected.len) break :blk false;
            for (case.expected, 0..) |want, i| {
                if (!std.mem.eql(u8, want, tokens[i].text(case.text))) break :blk false;
            }
            break :blk true;
        };

        if (matches) {
            passed += 1;
            print("PASS: {s}\n", .{case.text});
            continue;
        }

        // on failure, dump both token lists so the divergence is visible.
        failed += 1;
        print("FAIL: {s}\n", .{case.text});
        print(" expected ({d}):", .{case.expected.len});
        for (case.expected) |want| {
            print(" |{s}|", .{want});
        }
        print("\n got ({d}):", .{n});
        var i: u32 = 0;
        while (i < n) : (i += 1) {
            print(" |{s}|", .{tokens[i].text(case.text)});
        }
        print("\n", .{});
    }

    print("\n{d}/{d} passed\n", .{ passed, passed + failed });
    if (failed > 0) std.process.exit(1);
}
+115
scripts/compare.py
"""compare spacez output against spaCy.

what this script does, in order:

1. runs the tokenizer cross-validation (`zig build xval`) from the repo root
   and echoes its report.
2. runs spaCy on a fixed list of sentences, prints the tokens and entities it
   finds, and saves them to tests/ner_expected.json for reference.
3. reports whether weights/en_core_web_sm.bin exists. the weight file is NOT
   read here; the script only checks for its presence.

all paths are resolved against the repository root (the parent of scripts/),
so the script behaves the same regardless of the current directory.

the exit status is 1 when the tokenizer cross-validation fails, 0 otherwise.

usage:
    uv run --python 3.12 --with spacy \
      --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
      python scripts/compare.py
"""

import json
import subprocess
import sys
from pathlib import Path

# repository root: this file lives in <root>/scripts/
REPO_ROOT = Path(__file__).resolve().parent.parent


def run_spacy():
    """run spaCy NER on test sentences.

    returns a list with one dict per sentence:
    {"text": str, "tokens": [str, ...], "entities": [{"text", "start", "end", "label"}, ...]}
    where start/end are the entity's character offsets into the sentence.
    """
    # imported lazily so the module can be imported without spaCy installed
    import spacy

    nlp = spacy.load("en_core_web_sm")

    test_sentences = [
        "Barack Obama visited Paris.",
        "Apple Inc. is worth $2.5 trillion.",
        "The United States and China are trading partners.",
        "Elon Musk founded SpaceX and Tesla.",
        "The World Cup was held in Qatar.",
        "Microsoft acquired Activision for $68.7 billion.",
        "Taylor Swift performed at Madison Square Garden.",
        "The European Union imposed sanctions on Russia.",
        "Goldman Sachs reported quarterly earnings.",
        "NASA launched the Artemis mission.",
    ]

    results = []
    for sent in test_sentences:
        doc = nlp(sent)
        ents = [
            {
                "text": e.text,
                "start": e.start_char,
                "end": e.end_char,
                "label": e.label_,
            }
            for e in doc.ents
        ]
        results.append(
            {"text": sent, "tokens": [t.text for t in doc], "entities": ents}
        )

    return results


def run_tokenizer_xval(zig_exe="zig"):
    """run the tokenizer cross-validation binary via `zig build xval`.

    zig_exe: name or path of the zig executable (defaults to "zig" on PATH).

    returns (ok, output): ok is True when the build step exits with status 0;
    output is the captured stderr, which is where the xval program prints its
    report (it uses std.debug.print). if the zig executable cannot be started,
    returns (False, <error message>) instead of raising.
    """
    try:
        result = subprocess.run(
            [zig_exe, "build", "xval"],
            capture_output=True,
            text=True,
            cwd=str(REPO_ROOT),
        )
    except FileNotFoundError as err:
        return False, f"could not run {zig_exe!r}: {err}"
    return result.returncode == 0, result.stderr


def main():
    """run the comparison and return a process exit status (0 ok, 1 tokenizer failures)."""
    print("=== spacez vs spaCy comparison ===\n")

    # 1. tokenizer cross-validation
    print("--- tokenizer cross-validation ---")
    ok, output = run_tokenizer_xval()
    for line in output.strip().split("\n"):
        print(f" {line}")
    if ok:
        print(" tokenizer: ALL PASS\n")
    else:
        print(" tokenizer: FAILURES\n")

    # 2. NER comparison (spaCy results)
    print("--- spaCy NER results ---")
    results = run_spacy()
    for r in results:
        ents_str = ", ".join(
            f"{e['label']}:{e['text']!r}" for e in r["entities"]
        )
        print(f" {r['text']}")
        if ents_str:
            print(f" entities: {ents_str}")
        else:
            print(" entities: (none)")
        toks = " | ".join(r["tokens"])
        print(f" tokens: {toks}")
        print()

    # save for reference (anchored at the repo root, like the zig build above)
    out_rel = Path("tests/ner_expected.json")
    out_path = REPO_ROOT / out_rel
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"wrote {out_rel}")

    # 3. check if weights exist for NER testing
    weights_rel = Path("weights/en_core_web_sm.bin")
    if (REPO_ROOT / weights_rel).exists():
        print(f"\nweights found at {weights_rel}")
        print("NER model tests run as part of `zig build test`")
    else:
        print(f"\nweights NOT found at {weights_rel}")
        print("run `just export-weights` first to enable NER model tests")

    # surface tokenizer failures to the caller (CI, just recipes, ...)
    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())
+6 -14
scripts/export_weights.py
··· 33 33 tok2vec = model.get_ref("tok2vec") 34 34 35 35 # ── extract dimensions ── 36 - 37 - # MultiHashEmbed: 4 tables with different sizes 38 - embed = tok2vec.get_ref("embed") 39 - # the embed layer is: chain(FeatureExtractor, list2ragged, with_array(chain( 40 - # MultiHashEmbed(...), MaxoutWindowEncoder(...) 41 - # ))) 42 - # navigate to the actual MultiHashEmbed 43 - multi_hash = None 44 - for node in embed.walk(): 45 - if node.name == "hashembed": 46 - multi_hash = node 47 - break 36 + # walk the model tree to find all named components 37 + # the tok2vec tree is: extract_features >> list2ragged >> with_array( 38 + # hashembed|hashembed|hashembed|hashembed) >> with_array(maxout >> layernorm >> dropout) 39 + # >> ragged2list >> with_array(residual(expand_window >> maxout >> layernorm >> dropout) * 4) 40 + # >> list2array >> linear 48 41 49 - # find all hash embeds and the reduction maxout 50 42 hash_embeds = [] 51 43 reduce_maxout = None 52 44 reduce_ln = None ··· 207 199 parser_hidden = 64 208 200 parser_nP = 2 209 201 parser_nF = 3 210 - n_actions = 73 # 18*4 + 1 202 + n_actions = 74 # 18*4 + 1(filler) + 1(OUT) 211 203 212 204 if linear is not None: 213 205 parser_hidden = linear.get_param("W").shape[0]
+817
scripts/gen_tokenizer_data.py
"""generate tokenizer_data.zig from spaCy's en_core_web_sm tokenizer config.

extracts:
- unicode character class tables (sorted ranges for binary search)
- prefix single-char set + multi-char literals + special rules
- suffix data (single-char set, multi-char literals, lookbehind rules)
- special cases table (1347 entries)

the matching LOGIC lives in tokenizer.zig. this script only generates DATA tables.

output files (relative to the current directory, so run from the repo root):
- src/tokenizer_data.zig
- tests/tokenizer_expected.json

usage:
    uv run --python 3.12 --with spacy \
      --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
      python scripts/gen_tokenizer_data.py
"""

import json
import re
import sre_parse
import sys
from pathlib import Path


def load_spacy():
    """load spaCy and return the en_core_web_sm tokenizer object."""
    # imported lazily so the pure helpers below work without spaCy installed
    import spacy

    nlp = spacy.load("en_core_web_sm")
    tok = nlp.tokenizer
    return tok


def extract_ranges(items):
    """convert sre_parse IN items to sorted, merged (lo, hi) ranges.

    handles LITERAL, RANGE, and the DIGIT / WORD categories (ASCII only);
    any other item kind is ignored. adjacent and overlapping ranges are merged.
    """
    ranges = []
    for op, val in items:
        if op == sre_parse.LITERAL:
            ranges.append((val, val))
        elif op == sre_parse.RANGE:
            ranges.append(val)
        elif op == sre_parse.CATEGORY:
            if val == sre_parse.CATEGORY_DIGIT:
                ranges.append((0x30, 0x39))
            elif val == sre_parse.CATEGORY_WORD:
                ranges.extend([(0x30, 0x39), (0x41, 0x5A), (0x5F, 0x5F), (0x61, 0x7A)])
    ranges.sort()
    merged = []
    for lo, hi in ranges:
        # `+ 1` also merges ranges that merely touch (e.g. a-m and n-z)
        if merged and lo <= merged[-1][1] + 1:
            merged[-1] = (merged[-1][0], max(merged[-1][1], hi))
        else:
            merged.append((lo, hi))
    return merged


def class_from_in_node(in_items):
    """extract character class from an IN node, handling NEGATE.

    returns (ranges, negated): the merged ranges of the non-NEGATE items and
    whether a NEGATE marker was present.
    """
    negated = any(x[0] == sre_parse.NEGATE for x in in_items)
    non_neg = [x for x in in_items if x[0] != sre_parse.NEGATE]
    ranges = extract_ranges(non_neg)
    return ranges, negated


# ── prefix data extraction ──


def extract_prefix_data(tok):
    """extract prefix pattern data: single chars, multi-char literals, char class, specials.

    returns a dict with keys single_chars, multi_literals, symbol_ranges,
    has_dots and literal_unless_digit.
    """
    pat = tok.prefix_search.__self__.pattern
    parsed = sre_parse.parse(pat)
    branches = parsed[1][1][1]  # AT_BEGINNING, BRANCH

    single_chars = []  # codepoints matched as single-char prefix
    multi_literals = []  # multi-byte string prefixes
    symbol_ranges = []  # the big unicode symbol class
    dots = False  # whether ..+ is a prefix
    literal_unless_digit = []  # chars like + that don't match before digits

    for branch in branches:
        if len(branch) == 1:
            op, val = branch[0]
            if op == sre_parse.LITERAL:
                single_chars.append(val)
            elif op == sre_parse.IN:
                ranges, _ = class_from_in_node(val)
                if len(ranges) > 50:
                    symbol_ranges = ranges
                else:
                    # small class — expand to individual chars
                    for lo, hi in ranges:
                        for cp in range(lo, hi + 1):
                            single_chars.append(cp)
        elif all(b[0] == sre_parse.LITERAL for b in branch):
            s = "".join(chr(b[1]) for b in branch)
            multi_literals.append(s)
        elif (
            len(branch) == 2
            and branch[0][0] == sre_parse.LITERAL
            and branch[1][0] == sre_parse.MAX_REPEAT
        ):
            dots = True
        elif (
            len(branch) == 2
            and branch[0][0] == sre_parse.LITERAL
            and branch[1][0] == sre_parse.ASSERT_NOT
        ):
            literal_unless_digit.append(branch[0][1])

    return {
        "single_chars": sorted(set(single_chars)),
        # longest first so longer literals win over their own prefixes
        "multi_literals": sorted(multi_literals, key=lambda s: -len(s)),
        "symbol_ranges": symbol_ranges,
        "has_dots": dots,
        "literal_unless_digit": literal_unless_digit,
    }


# ── suffix data extraction ──


def extract_suffix_data(tok):
    """extract suffix pattern data.

    returns a dict with keys single_chars, multi_literals, symbol_ranges,
    has_dots and lookbehind_rules. branches that fit none of the recognized
    shapes are skipped.
    """
    pat = tok.suffix_search.__self__.pattern
    parsed = sre_parse.parse(pat)
    branches = parsed[0][1][1]  # BRANCH

    single_chars = []
    multi_literals = []
    symbol_ranges = []
    has_dots = False
    lookbehind_rules = []

    for branch in branches:
        items = list(branch)
        # every suffix alternative ends in `$`; drop it before classifying
        if items and items[-1] == (sre_parse.AT, sre_parse.AT_END):
            items = items[:-1]
        if not items:
            continue

        # simple literal(s)
        if all(x[0] == sre_parse.LITERAL for x in items):
            s = "".join(chr(x[1]) for x in items)
            if len(s) == 1:
                single_chars.append(ord(s))
            else:
                multi_literals.append(s)
            continue

        # character class
        if len(items) == 1 and items[0][0] == sre_parse.IN:
            ranges, _ = class_from_in_node(items[0][1])
            if len(ranges) > 50:
                symbol_ranges = ranges
            else:
                for lo, hi in ranges:
                    for cp in range(lo, hi + 1):
                        single_chars.append(cp)
            continue

        # dots
        if (
            len(items) >= 2
            and items[0] == (sre_parse.LITERAL, ord("."))
            and items[1][0] == sre_parse.MAX_REPEAT
        ):
            has_dots = True
            continue

        # lookbehind rule
        if items[0][0] == sre_parse.ASSERT:
            direction = items[0][1][0]
            if direction == -1:  # lookbehind
                rule = _extract_lookbehind_rule(items)
                if rule:
                    lookbehind_rules.append(rule)
            continue

    return {
        "single_chars": sorted(set(single_chars)),
        "multi_literals": sorted(multi_literals, key=lambda s: -len(s)),
        "symbol_ranges": symbol_ranges,
        "has_dots": has_dots,
        "lookbehind_rules": lookbehind_rules,
    }


def _extract_lookbehind_rule(items):
    """extract a suffix lookbehind rule into a serializable structure.

    returns {"behind": ..., "suffix": ...}, or None when either half has a
    shape the parsers below do not understand.
    """
    behind_content = items[0][1][1]
    rest = items[1:]

    # parse lookbehind
    behind = _parse_assert_content(behind_content)
    if behind is None:
        return None

    # parse suffix part
    suffix = _parse_suffix_part(rest)
    if suffix is None:
        return None

    return {"behind": behind, "suffix": suffix}


def _parse_assert_content(content):
    """parse lookbehind/lookahead content into a descriptor.

    returns a "class" or "literal" descriptor for a single item, a "sequence"
    descriptor for several, or None for empty / unsupported content.
    """
    parts = []
    for item in content:
        if item[0] == sre_parse.IN:
            ranges, negated = class_from_in_node(item[1])
            parts.append({"type": "class", "ranges": ranges, "negated": negated})
        elif item[0] == sre_parse.LITERAL:
            parts.append({"type": "literal", "char": item[1]})
        else:
            return None
    if len(parts) == 1:
        return parts[0]
    elif len(parts) > 1:
        return {"type": "sequence", "parts": parts}
    return None


def _parse_suffix_part(items):
    """parse the suffix portion after lookbehind.

    returns a "literal" descriptor, an "alternatives" descriptor, or None.
    alternatives that are not pure literals are dropped from the list.
    """
    if all(x[0] == sre_parse.LITERAL for x in items):
        s = "".join(chr(x[1]) for x in items)
        return {"type": "literal", "text": s}

    # subpattern with alternatives
    if len(items) == 1 and items[0][0] == sre_parse.SUBPATTERN:
        content = items[0][1][3]
        if content and content[0][0] == sre_parse.BRANCH:
            alts = []
            for branch in content[0][1][1]:
                if all(x[0] == sre_parse.LITERAL for x in branch):
                    alts.append("".join(chr(x[1]) for x in branch))
            if alts:
                return {"type": "alternatives", "texts": alts}

    # BRANCH directly
    if len(items) == 1 and items[0][0] == sre_parse.BRANCH:
        alts = []
        for branch in items[0][1][1]:
            if all(x[0] == sre_parse.LITERAL for x in branch):
                alts.append("".join(chr(x[1]) for x in branch))
        if alts:
            return {"type": "alternatives", "texts": alts}

    return None


# ── unicode class extraction from all patterns ──


def extract_named_classes(tok):
    """extract the specific unicode character classes used across patterns.

    we identify them by their content:
    - symbol: the big So/Sc class (~174 ranges)
    - lower: lowercase letters (contains a-z)
    - upper: uppercase letters (contains A-Z)
    - alpha: lower + upper
    - alnum: alpha + digits
    - lower_or_punct: the wide "not just upper" class used in suffix lookbehinds

    returns a dict mapping class name to its merged range list.
    """
    classes = {}

    # extract from suffix lookbehinds
    suffix_pat = tok.suffix_search.__self__.pattern
    sp = sre_parse.parse(suffix_pat)

    def walk_for_classes(items, label=""):
        # recursive descent through branches, assertions and subpatterns;
        # `label` is only threaded through, it does not affect the result
        for item in items:
            op = item[0]
            if op == sre_parse.IN:
                ranges, negated = class_from_in_node(item[1])
                if len(ranges) > 5:
                    _classify(ranges, classes)
            elif op == sre_parse.BRANCH:
                for b in item[1][1]:
                    walk_for_classes(b, label)
            elif op in (sre_parse.ASSERT, sre_parse.ASSERT_NOT):
                walk_for_classes(item[1][1], label)
            elif op == sre_parse.SUBPATTERN:
                if item[1][3]:
                    walk_for_classes(list(item[1][3]), label)

    walk_for_classes(list(sp), "suffix")

    # also from infix
    infix_pat = tok.infix_finditer.__self__.pattern
    ip = sre_parse.parse(infix_pat)
    walk_for_classes(list(ip), "infix")

    return classes


def _classify(ranges, classes):
    """classify a character range set by its content.

    stores `ranges` in `classes` under lower / upper / alpha / alnum (keeping
    the candidate with the most ranges) or under a `wide_<n>` key for very
    large classes. range sets matching none of the rules are ignored.
    """
    range_set = set(ranges)

    # membership tests look for the exact ASCII range tuples
    has_az = (0x61, 0x7A) in range_set
    has_AZ = (0x41, 0x5A) in range_set
    has_09 = (0x30, 0x39) in range_set

    n_ranges = len(ranges)
    n_cp = sum(hi - lo + 1 for lo, hi in ranges)

    if has_az and not has_AZ and not has_09 and n_cp > 1000:
        if "lower" not in classes or len(ranges) > len(classes["lower"]):
            classes["lower"] = ranges
    elif has_AZ and not has_az and not has_09 and n_cp > 1000:
        if "upper" not in classes or len(ranges) > len(classes["upper"]):
            classes["upper"] = ranges
    elif has_az and has_AZ and not has_09 and n_cp > 1000:
        if "alpha" not in classes or len(ranges) > len(classes["alpha"]):
            classes["alpha"] = ranges
    elif has_az and has_AZ and has_09 and n_cp > 1000:
        if "alnum" not in classes or len(ranges) > len(classes["alnum"]):
            classes["alnum"] = ranges
    elif n_cp > 100000 and n_ranges > 300:
        # very large class — likely "lower_or_punct" or similar
        key = f"wide_{n_ranges}"
        classes[key] = ranges


# ── special cases ──


def extract_specials(tok):
    """extract special case rules.

    returns a list of (key, orths) tuples sorted by key, where orths is the
    list of token texts the key is split into.
    """
    entries = []
    for key, val in sorted(tok.rules.items()):
        orths = [d[65] for d in val]  # 65 = ORTH
        entries.append((key, orths))
    return entries


# ── zig code generation ──


def zig_str(s):
    """convert a python string to a zig string literal.

    ASCII printable characters are emitted as-is (with `"` and `\\` escaped);
    everything else is emitted as `\\xNN` escapes of its UTF-8 bytes.
    """
    parts = []
    for c in s:
        cp = ord(c)
        if cp < 128:
            if c == '"':
                parts.append('\\"')
            elif c == "\\":
                parts.append("\\\\")
            elif c == "\n":
                parts.append("\\n")
            elif c == "\t":
                parts.append("\\t")
            elif c.isprintable():
                parts.append(c)
            else:
                parts.append(f"\\x{cp:02x}")
        else:
            for b in c.encode("utf-8"):
                parts.append(f"\\x{b:02x}")
    return '"' + "".join(parts) + '"'


def zig_char(cp):
    """convert a codepoint to a zig u21 literal.

    printable ASCII other than quote characters and backslash becomes a char
    literal; everything else becomes a hex integer literal.
    """
    if 32 <= cp < 127 and chr(cp) not in "'\\\"":
        return f"'{chr(cp)}'"
    return f"0x{cp:04X}"


def gen_range_table(name, ranges):
    """generate a const range table + lookup function.

    the lookup function calls `rangeContains`, which generate() emits into
    the same Zig file.
    """
    lines = []
    lines.append(f"pub const {name}_ranges = [_][2]u21{{")
    for lo, hi in ranges:
        lines.append(f" .{{ 0x{lo:04X}, 0x{hi:04X} }},")
    lines.append("};")
    lines.append("")
    lines.append(f"pub fn {name}(c: u21) bool {{")
    lines.append(f" return rangeContains(&{name}_ranges, c);")
    lines.append("}")
    return "\n".join(lines)


def gen_codepoint_set(name, codepoints):
    """generate a switch-based codepoint set.

    duplicate codepoints are removed and consecutive ones are collapsed into
    `lo...hi` switch ranges.
    """
    lines = []
    lines.append(f"pub fn {name}(c: u21) bool {{")
    lines.append(" return switch (c) {")
    # group consecutive codepoints into ranges
    ranges = []
    cps = sorted(set(codepoints))
    i = 0
    while i < len(cps):
        start = cps[i]
        end = start
        while i + 1 < len(cps) and cps[i + 1] == end + 1:
            end = cps[i + 1]
            i += 1
        ranges.append((start, end))
        i += 1

    for lo, hi in ranges:
        if lo == hi:
            lines.append(f" {zig_char(lo)} => true,")
        else:
            lines.append(f" {zig_char(lo)}...{zig_char(hi)} => true,")
    lines.append(" else => false,")
    lines.append(" };")
    lines.append("}")
    return "\n".join(lines)


def gen_specials(entries):
    """generate the special cases StaticStringMap.

    entries: list of (key, orths) tuples as produced by extract_specials().
    each entry is padded with empty strings to the fixed 3-token array.
    an empty list yields an empty map instead of failing.
    """
    # default=0 keeps max() from raising ValueError when there are no entries
    max_tokens = max((len(orths) for _, orths in entries), default=0)
    assert max_tokens <= 3, f"max tokens {max_tokens} > 3"

    lines = []
    lines.append("pub const SpecialCase = struct {")
    lines.append(" tokens: [3][]const u8,")
    lines.append(" len: u8,")
    lines.append("};")
    lines.append("")
    lines.append(
        "pub const specials = std.StaticStringMap(SpecialCase).initComptime(.{"
    )
    for key, orths in entries:
        k = zig_str(key)
        toks = [zig_str(o) for o in orths]
        while len(toks) < 3:
            toks.append('""')
        tok_str = ", ".join(toks)
        lines.append(
            f" .{{ {k}, SpecialCase{{ .tokens = .{{ {tok_str} }}, .len = {len(orths)} }} }},"
        )
    lines.append("});")
    return "\n".join(lines)


def gen_multi_literals(name, literals):
    """generate an array of multi-char literals for matching."""
    lines = []
    lines.append(f"pub const {name} = [_][]const u8{{")
    for lit in literals:
        lines.append(f" {zig_str(lit)},")
    lines.append("};")
    return "\n".join(lines)


def gen_lookbehind_rules(rules):
    """generate suffix lookbehind rule data structures.

    returns (code, rule_descs): `code` is Zig source with one range table and
    one `matchLookbehind<N>` function per distinct character class, and
    `rule_descs` is a list of dicts with behind_id, behind and suffix_texts
    that generate() turns into the matchSuffixLookbehind function.
    """
    # identify unique character classes used in lookbehinds
    class_tables = {}
    rule_descs = []

    for rule in rules:
        behind = rule["behind"]
        suffix = rule["suffix"]

        behind_id = _get_class_id(behind, class_tables)
        suffix_texts = (
            [suffix["text"]]
            if suffix["type"] == "literal"
            else suffix.get("texts", [])
        )
        rule_descs.append(
            {"behind_id": behind_id, "behind": behind, "suffix_texts": suffix_texts}
        )

    lines = []

    # generate class tables for lookbehinds
    for cid, ranges in class_tables.items():
        lines.append(f"const lookbehind_class_{cid}_ranges = [_][2]u21{{")
        for lo, hi in ranges:
            lines.append(f" .{{ 0x{lo:04X}, 0x{hi:04X} }},")
        lines.append("};")
        lines.append("")
        lines.append(f"pub fn matchLookbehind{cid}(c: u21) bool {{")
        lines.append(f" return rangeContains(&lookbehind_class_{cid}_ranges, c);")
        lines.append("}")
        lines.append("")

    return "\n".join(lines), rule_descs


# module-level state for _get_class_id; generate() resets both before use
_class_counter = 0
_class_cache = {}


def _get_class_id(behind, class_tables):
    """map a lookbehind descriptor to an identifier.

    - "class"    → int id; a new id is allocated (and its ranges recorded in
                   `class_tables`) the first time a given range list is seen
    - "sequence" → tuple with one identifier per part
    - "literal"  → ("literal", codepoint)
    - otherwise  → None

    NOTE(review): the cache key is the range list only, so the descriptor's
    "negated" flag is not reflected in the generated matcher — confirm that no
    suffix lookbehind in the source patterns uses a negated class.
    """
    global _class_counter
    if behind["type"] == "class":
        key = str(behind["ranges"])
        if key not in _class_cache:
            cid = _class_counter
            _class_counter += 1
            _class_cache[key] = cid
            class_tables[cid] = behind["ranges"]
        return _class_cache[key]
    elif behind["type"] == "sequence":
        # sequence of tests — generate IDs for each part
        ids = []
        for part in behind["parts"]:
            ids.append(_get_class_id(part, class_tables))
        return tuple(ids)
    elif behind["type"] == "literal":
        return ("literal", behind["char"])
    return None


def generate(tok):
    """generate the complete tokenizer_data.zig.

    tok: the spaCy tokenizer returned by load_spacy().
    returns the Zig source as a single string.
    raises ValueError if a suffix lookbehind is a sequence of a length other
    than 2, because no valid Zig can be emitted for it.
    """
    print("extracting prefix data...")
    prefix = extract_prefix_data(tok)
    print(
        f" {len(prefix['single_chars'])} single chars, "
        f"{len(prefix['multi_literals'])} multi literals, "
        f"{len(prefix['symbol_ranges'])} symbol ranges"
    )

    print("extracting suffix data...")
    suffix = extract_suffix_data(tok)
    print(
        f" {len(suffix['single_chars'])} single chars, "
        f"{len(suffix['multi_literals'])} multi literals, "
        f"{len(suffix['lookbehind_rules'])} lookbehind rules"
    )

    print("extracting unicode classes...")
    classes = extract_named_classes(tok)
    print(f" classes found: {list(classes.keys())}")

    print("extracting specials...")
    specials = extract_specials(tok)
    print(f" {len(specials)} entries")

    # also extract the infix character classes directly
    infix_pat = tok.infix_finditer.__self__.pattern
    ip = sre_parse.parse(infix_pat)
    infix_branches = ip[0][1][1]

    # infix[2] is the symbol class (same as prefix)
    # infix[3] lookbehind is digits, chars are +-*^, lookahead is digits+hyphen
    # infix[4] lookbehind is lower/punct, ahead is upper/alpha
    # infix[5] lookbehind is alpha, ahead is alpha
    # infix[6] branch alternatives: -, --, ---, ~, en-dash, em-dash, em-dash*2
    # infix[7] lookbehind is alnum, chars :/~<=>, ahead is alpha
    # NOTE(review): these indices are tied to the branch order of the infix
    # pattern in the pinned model version — re-check when upgrading spaCy.

    # extract infix lookbehind/lookahead classes
    infix_classes = {}
    for idx in [3, 4, 5, 6, 7]:
        branch = infix_branches[idx]
        for item in branch:
            if item[0] == sre_parse.ASSERT:
                direction = item[1][0]
                content = item[1][1]
                if len(content) == 1 and content[0][0] == sre_parse.IN:
                    ranges, _ = class_from_in_node(content[0][1])
                    label = (
                        f"infix_{idx}_{'behind' if direction == -1 else 'ahead'}"
                    )
                    infix_classes[label] = ranges

    # build output
    sections = []
    sections.append("//! generated by scripts/gen_tokenizer_data.py — do not edit.")
    sections.append("//! tokenizer pattern data compiled from spaCy en_core_web_sm.")
    sections.append("")
    sections.append('const std = @import("std");')
    sections.append("")

    # ── utf-8 helpers ──
    sections.append("// ── utf-8 helpers ──")
    sections.append("")
    sections.append("pub const Codepoint = struct { value: u21, len: u3 };")
    sections.append("")
    sections.append("pub fn decodeUtf8(bytes: []const u8) ?Codepoint {")
    sections.append(" if (bytes.len == 0) return null;")
    sections.append(" const b0 = bytes[0];")
    sections.append(" if (b0 < 0x80) return .{ .value = b0, .len = 1 };")
    sections.append(" if (b0 & 0xE0 == 0xC0 and bytes.len >= 2)")
    sections.append(
        " return .{ .value = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 };"
    )
    sections.append(" if (b0 & 0xF0 == 0xE0 and bytes.len >= 3)")
    sections.append(
        " return .{ .value = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 };"
    )
    sections.append(" if (b0 & 0xF8 == 0xF0 and bytes.len >= 4)")
    sections.append(
        " return .{ .value = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 };"
    )
    sections.append(
        ' return .{ .value = 0xFFFD, .len = 1 }; // replacement char'
    )
    sections.append("}")
    sections.append("")
    sections.append("pub fn lastCodepoint(text: []const u8) ?Codepoint {")
    sections.append(" if (text.len == 0) return null;")
    sections.append(" var i = text.len - 1;")
    sections.append(" while (i > 0 and text[i] & 0xC0 == 0x80) : (i -= 1) {}")
    sections.append(" return decodeUtf8(text[i..]);")
    sections.append("}")
    sections.append("")

    # ── range search ──
    sections.append("// ── range search ──")
    sections.append("")
    sections.append("fn rangeContains(ranges: []const [2]u21, c: u21) bool {")
    sections.append(" var lo: usize = 0;")
    sections.append(" var hi: usize = ranges.len;")
    sections.append(" while (lo < hi) {")
    sections.append(" const mid = lo + (hi - lo) / 2;")
    sections.append(" if (c > ranges[mid][1]) { lo = mid + 1; }")
    sections.append(" else if (c < ranges[mid][0]) { hi = mid; }")
    sections.append(" else return true;")
    sections.append(" }")
    sections.append(" return false;")
    sections.append("}")
    sections.append("")

    # ── symbol class (shared by prefix, suffix, infix) ──
    sections.append("// ── symbol class (So/Sc unicode categories) ──")
    sections.append("")
    sections.append(gen_range_table("isSymbol", prefix["symbol_ranges"]))
    sections.append("")

    # ── prefix data ──
    sections.append("// ── prefix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isPrefixChar", prefix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("prefix_multi_literals", prefix["multi_literals"])
    )
    sections.append("")
    if prefix["literal_unless_digit"]:
        cps = prefix["literal_unless_digit"]
        sections.append(gen_codepoint_set("isPrefixUnlessDigit", cps))
        sections.append("")

    # ── suffix data ──
    sections.append("// ── suffix data ──")
    sections.append("")
    sections.append(gen_codepoint_set("isSuffixChar", suffix["single_chars"]))
    sections.append("")
    sections.append(
        gen_multi_literals("suffix_multi_literals", suffix["multi_literals"])
    )
    sections.append("")

    # lookbehind helpers
    # reset the id allocator so repeated generate() calls number classes from 0
    global _class_counter, _class_cache
    _class_counter = 0
    _class_cache = {}

    lookbehind_code, rule_descs = gen_lookbehind_rules(suffix["lookbehind_rules"])
    if lookbehind_code.strip():
        sections.append("// ── suffix lookbehind helpers ──")
        sections.append("")
        sections.append(lookbehind_code)

    # generate a compact suffix lookbehind rule table
    # each rule is: check lookbehind condition, then try matching suffix text(s)
    sections.append("// ── suffix lookbehind rules ──")
    sections.append("// these are checked by tokenizer.zig matchSuffix()")
    sections.append(
        "// format: for each rule, check behind condition then try suffix literal(s)"
    )
    sections.append("")

    # encode rules as Zig code in a single function
    sections.append("pub fn matchSuffixLookbehind(text: []const u8) usize {")
    sections.append(" if (text.len < 2) return 0;")
    sections.append("")

    for ri, desc in enumerate(rule_descs):
        behind = desc["behind"]
        suffix_texts = desc["suffix_texts"]

        # sort suffix texts longest first
        suffix_texts_sorted = sorted(suffix_texts, key=lambda s: -len(s.encode("utf-8")))

        for st in suffix_texts_sorted:
            # the generated code works on bytes, so lengths are UTF-8 byte counts
            blen = len(st.encode("utf-8"))
            zig_lit = zig_str(st)

            sections.append(
                f" if (std.mem.endsWith(u8, text, {zig_lit}) and text.len > {blen}) {{"
            )

            bid = desc["behind_id"]
            if isinstance(bid, int):
                # simple class check
                sections.append(
                    f" const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f" if (before != null and matchLookbehind{bid}(before.?.value)) return {blen};"
                )
            elif isinstance(bid, tuple) and isinstance(bid[0], str) and bid[0] == "literal":
                # literal check
                cp = bid[1]
                sections.append(
                    f" const before = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(
                    f" if (before != null and before.?.value == {zig_char(cp)}) return {blen};"
                )
            elif isinstance(bid, tuple):
                # sequence check (multiple lookbehinds)
                if len(bid) != 2:
                    # only two-character lookbehinds are implemented; emitting
                    # the wrapper alone would leave an unused `bp1` capture,
                    # which the Zig compiler rejects
                    raise ValueError(
                        f"unsupported lookbehind sequence of length {len(bid)} "
                        f"for suffix {st!r}"
                    )

                sections.append(
                    f" const b1 = lastCodepoint(text[0 .. text.len - {blen}]);"
                )
                sections.append(" if (b1) |bp1| {")
                sections.append(
                    f" const b2 = lastCodepoint(text[0 .. text.len - {blen} - bp1.len]);"
                )
                # bid[0] is the class before bp2, bid[1] is the class for bp1
                test1 = (
                    f"matchLookbehind{bid[1]}(bp1.value)"
                    if isinstance(bid[1], int)
                    else f"bp1.value == {zig_char(bid[1][1])}"
                )
                test0 = (
                    f"matchLookbehind{bid[0]}(b2p.value)"
                    if isinstance(bid[0], int)
                    else f"b2p.value == {zig_char(bid[0][1])}"
                )
                sections.append(f" if ({test1}) {{")
                sections.append(" if (b2) |b2p| {")
                sections.append(
                    f" if ({test0}) return {blen};"
                )
                sections.append(" }")
                sections.append(" }")
                sections.append(" }")
            # a behind_id of None (unrecognized descriptor) leaves the block
            # empty, so that rule never matches

            sections.append(" }")

    sections.append(" return 0;")
    sections.append("}")
    sections.append("")

    # ── infix character class tables ──
    sections.append("// ── infix character classes ──")
    sections.append("")
    for label, ranges in sorted(infix_classes.items()):
        name = f"is_{label}"
        sections.append(gen_range_table(name, ranges))
        sections.append("")

    # ── specials ──
    sections.append("// ── special cases ──")
    sections.append("")
    sections.append(gen_specials(specials))
    sections.append("")

    return "\n".join(sections)


def main():
    """generate src/tokenizer_data.zig and tests/tokenizer_expected.json."""
    print("loading spaCy...")
    tok = load_spacy()

    print("\ngenerating zig source...")
    zig_source = generate(tok)

    out_path = Path("src/tokenizer_data.zig")
    # the generated source contains non-ASCII comment characters and Zig
    # source must be UTF-8, so do not rely on the locale's default encoding
    out_path.write_text(zig_source, encoding="utf-8")
    n_lines = zig_source.count("\n") + 1
    print(f"\nwrote {out_path} ({len(zig_source):,} bytes, {n_lines:,} lines)")

    # verification: run spaCy tokenizer on test inputs and dump expected output
    print("\ngenerating test data...")
    test_sentences = [
        "Barack Obama visited Paris.",
        "Apple Inc. is worth $2.5 trillion.",
        "I can't believe it's not butter!",
        "Dr. Smith's office (room 42) is closed.",
        "U.S.A. and U.K. are allies.",
        "They're going to the store.",
        'He said "hello" and left.',
        "The cost is $500.00/month.",
        "New York-based company",
        "e-mail: test@example.com",
        "10,000 people",
        "3.14159 is pi",
        "state-of-the-art technology",
        "Mr. and Mrs. Jones",
        "it's 5:30pm",
    ]

    test_data = []
    for sent in test_sentences:
        # calling the tokenizer directly yields a tokenized Doc, so the
        # pipeline does not have to be loaded a second time
        doc = tok(sent)
        tokens = [t.text for t in doc]
        test_data.append({"text": sent, "tokens": tokens})

    test_path = Path("tests/tokenizer_expected.json")
    test_path.parent.mkdir(parents=True, exist_ok=True)
    test_path.write_text(json.dumps(test_data, indent=2), encoding="utf-8")
    print(f"wrote {test_path} ({len(test_data)} test cases)")


if __name__ == "__main__":
    main()
+389
src/model.zig
//! model weight loading and NER inference pipeline.
//!
//! loads en_core_web_sm weights from a flat binary (header + contiguous float32s),
//! then runs the full pipeline: hash embed → CNN encode → linear → parser scoring.
//! follows the karpathy/llama2.c pattern: mmap/embed bytes, slice into named regions.

const std = @import("std");
const ops = @import("ops.zig");
const embed = @import("embed.zig");
const parser = @import("parser.zig");

/// maximum tokens per document (coral limits text to 500 chars ≈ ~120 tokens)
pub const MAX_TOKENS = 128;

const HEADER_MAGIC = 0x5350435A; // "SPCZ"
const HEADER_VERSION = 1;
const HEADER_UINT32S = 64;
const HEADER_BYTES = HEADER_UINT32S * 4;

// capacities of the fixed stack scratch buffers used during inference.
// `Model.load` rejects any header whose dimensions would not fit in them.
const MAX_WIDTH = 96; // tok2vec width
const MAX_CNN_PIECES = 3; // CNN / reduction maxout pieces
const MAX_HIDDEN = 64; // parser hidden size
const MAX_LOWER_DIM = 128; // hidden * parser maxout pieces

/// weights of one residual CNN block (maxout + layernorm).
pub const CnnBlock = struct {
    W: []const f32, // (nO * nP, nI) = (width*3, width*3)
    b: []const f32, // (nO * nP,) = (width*3,)
    G: []const f32, // (width,)
    b_ln: []const f32, // (width,)
};

/// all weight slices and dimensions needed for inference.
/// the slices point into the byte buffer given to `load`, which must outlive the model.
pub const Model = struct {
    // embedding
    embeds: [4]embed.HashEmbed,
    reduce_W: []const f32, // (width*nP, 4*width)
    reduce_b: []const f32, // (width*nP,)
    reduce_G: []const f32, // (width,)
    reduce_b_ln: []const f32, // (width,)

    // CNN encoder (4 residual blocks)
    cnn: [4]CnnBlock,

    // linear projection (tok2vec → parser input)
    linear_W: []const f32, // (hidden, width)
    linear_b: []const f32, // (hidden,)

    // parser lower (precomputable affine)
    lower_W: []const f32, // (nF * hidden * nP, hidden) = (3 * lower_dim, input_dim)
    lower_b: []const f32, // (lower_dim,)
    lower_pad: []const f32, // (nF * lower_dim,) = (3 * lower_dim,)

    // parser upper
    upper_W: []const f32, // (n_actions, hidden)
    upper_b: []const f32, // (n_actions,)

    // dimensions
    width: u32, // tok2vec width (96)
    hidden: u32, // parser hidden / linear output (64)
    n_actions: u32, // 74
    cnn_nP: u32, // CNN maxout pieces (3)
    parser_nP: u32, // parser lower maxout pieces (2)

    /// load model from raw weight bytes (mmap'd, @embedFile'd, or heap).
    ///
    /// the returned model borrows `bytes`; no copy is made.
    ///
    /// errors:
    /// - `FileTooSmall`: shorter than the header, or the weight region is
    ///   shorter than the header's dimensions require.
    /// - `BadMagic` / `BadVersion`: header does not identify a supported file.
    /// - `UnsupportedCnnDepth` / `UnsupportedParserNF`: depth != 4 or nF != 3.
    /// - `UnsupportedDimensions`: a dimension is zero, exceeds the fixed scratch
    ///   buffers used by inference, or n_actions != parser.N_ACTIONS.
    /// - `Misaligned`: `bytes` is not 4-byte aligned, so the float region
    ///   cannot be viewed as `[]const f32`.
    pub fn load(bytes: []const u8) !Model {
        if (bytes.len < HEADER_BYTES) return error.FileTooSmall;

        // header words are read in native byte order
        const header = std.mem.bytesAsValue([HEADER_UINT32S]u32, bytes[0..HEADER_BYTES]);
        if (header[0] != HEADER_MAGIC) return error.BadMagic;
        if (header[1] != HEADER_VERSION) return error.BadVersion;

        const width: u32 = header[2]; // 96
        const cnn_depth: u32 = header[3]; // 4
        const cnn_nP: u32 = header[4]; // 3
        const hidden: u32 = header[5]; // 64
        const parser_nP: u32 = header[6]; // 2
        const parser_nF: u32 = header[7]; // 3
        const n_actions: u32 = header[8]; // 74

        if (cnn_depth != 4) return error.UnsupportedCnnDepth;
        if (parser_nF != 3) return error.UnsupportedParserNF;

        // inference uses fixed-size stack buffers; refuse headers that would overflow them
        if (width == 0 or width > MAX_WIDTH) return error.UnsupportedDimensions;
        if (cnn_nP == 0 or cnn_nP > MAX_CNN_PIECES) return error.UnsupportedDimensions;
        if (hidden == 0 or hidden > MAX_HIDDEN) return error.UnsupportedDimensions;
        if (parser_nP == 0 or @as(u64, hidden) * parser_nP > MAX_LOWER_DIM) return error.UnsupportedDimensions;
        // predict() scores into a [parser.N_ACTIONS]f32 and decodes with parser.decodeAction
        if (n_actions != parser.N_ACTIONS) return error.UnsupportedDimensions;

        const embed_nVs = [4]u32{ header[9], header[10], header[11], header[12] };
        const embed_seeds = [4]u32{ header[13], header[14], header[15], header[16] };

        // slice weights from the data region after the header.
        // HEADER_BYTES is a multiple of 4, so the region is aligned iff `bytes` is.
        if (!std.mem.isAligned(@intFromPtr(bytes.ptr), @alignOf(f32))) return error.Misaligned;
        const payload = bytes[HEADER_BYTES..];
        // ignore trailing bytes that do not form a whole float
        const usable = payload.len - (payload.len % @sizeOf(f32));
        const aligned: []align(4) const u8 = @alignCast(payload[0..usable]);
        const data = std.mem.bytesAsSlice(f32, aligned);
        var off: usize = 0;

        // helper to advance through contiguous weights; fails instead of
        // slicing past the end of a truncated file
        const take = struct {
            fn f(d: []const f32, o: *usize, n: usize) error{FileTooSmall}![]const f32 {
                if (n > d.len - o.*) return error.FileTooSmall;
                const s = d[o.*..][0..n];
                o.* += n;
                return s;
            }
        }.f;

        // all size arithmetic below is done in usize
        const w: usize = width;
        const h: usize = hidden;
        const nF: usize = parser_nF;
        const na: usize = n_actions;

        // 1. hash embed tables (4x)
        var embeds: [4]embed.HashEmbed = undefined;
        for (0..4) |i| {
            const nV = embed_nVs[i];
            const nO = width;
            // nV comes straight from the header and is otherwise unbounded
            const n_floats = std.math.mul(usize, nV, w) catch return error.FileTooSmall;
            embeds[i] = .{
                .E = try take(data, &off, n_floats),
                .nV = nV,
                .nO = nO,
                .seed = embed_seeds[i],
            };
        }

        // 2. reduction maxout + layernorm
        const reduce_dim = w * cnn_nP; // 288
        const reduce_in = 4 * w; // 384
        const reduce_W = try take(data, &off, reduce_dim * reduce_in);
        const reduce_b = try take(data, &off, reduce_dim);
        const reduce_G = try take(data, &off, w);
        const reduce_b_ln = try take(data, &off, w);

        // 3. CNN blocks (4x)
        var cnn: [4]CnnBlock = undefined;
        const cnn_out = w * cnn_nP; // 288
        const cnn_in = w * 3; // 288 (from seq2col)
        for (0..4) |i| {
            cnn[i] = .{
                .W = try take(data, &off, cnn_out * cnn_in),
                .b = try take(data, &off, cnn_out),
                .G = try take(data, &off, w),
                .b_ln = try take(data, &off, w),
            };
        }

        // 4. linear projection
        const linear_W = try take(data, &off, h * w);
        const linear_b = try take(data, &off, h);

        // 5. parser lower (precomputable affine)
        const lower_dim = h * parser_nP; // 128
        const lower_W = try take(data, &off, nF * lower_dim * h);
        const lower_b = try take(data, &off, lower_dim);
        const lower_pad = try take(data, &off, nF * lower_dim);

        // 6. parser upper
        const upper_W = try take(data, &off, na * h);
        const upper_b = try take(data, &off, na);

        return .{
            .embeds = embeds,
            .reduce_W = reduce_W,
            .reduce_b = reduce_b,
            .reduce_G = reduce_G,
            .reduce_b_ln = reduce_b_ln,
            .cnn = cnn,
            .linear_W = linear_W,
            .linear_b = linear_b,
            .lower_W = lower_W,
            .lower_b = lower_b,
            .lower_pad = lower_pad,
            .upper_W = upper_W,
            .upper_b = upper_b,
            .width = width,
            .hidden = hidden,
            .n_actions = n_actions,
            .cnn_nP = cnn_nP,
            .parser_nP = parser_nP,
        };
    }

    /// embed all tokens via MultiHashEmbed (hash lookups → maxout → layernorm).
    /// tok_vecs is (n_tokens, width) output buffer.
    pub fn embedTokens(
        self: *const Model,
        tokens: []const []const u8,
        tok_vecs: []f32,
    ) void {
        const w = self.width;
        const mhe = embed.MultiHashEmbed{
            .embeds = self.embeds,
            .maxout_W = self.reduce_W,
            .maxout_b = self.reduce_b,
            .ln_G = self.reduce_G,
            .ln_b = self.reduce_b_ln,
            .nO = w,
            .nP = self.cnn_nP,
        };
        // sized for the largest dimensions `load` accepts
        var scratch: [4 * MAX_WIDTH + MAX_WIDTH * MAX_CNN_PIECES]f32 = undefined;
        for (tokens, 0..) |tok, t| {
            const attrs = embed.extractAttrs(tok);
            mhe.forward(attrs.asArray(), tok_vecs[t * w ..][0..w], &scratch);
        }
    }

    /// run 4 CNN residual blocks in-place on tok_vecs, then linear project to tok2vec_out.
    /// tok_vecs: (n_tokens, width), modified in place.
    /// expanded: scratch buffer, must be >= n_tokens * width * 3.
    /// tok2vec_out: (n_tokens, hidden) output buffer.
    pub fn encode(
        self: *const Model,
        tok_vecs: []f32,
        n_tokens: usize,
        expanded: []f32,
        tok2vec_out: []f32,
    ) void {
        const w = self.width;
        const cnn_out_dim = w * self.cnn_nP; // 288
        const cnn_in_dim = w * 3; // 288
        var pre_maxout: [MAX_WIDTH * MAX_CNN_PIECES]f32 = undefined;
        var post_maxout: [MAX_WIDTH]f32 = undefined;

        for (0..4) |blk| {
            // seq2col: expand windows
            ops.seq2col(expanded, tok_vecs, n_tokens, w);

            // per-token: maxout + layernorm + residual
            for (0..n_tokens) |t| {
                const exp_t = expanded[t * cnn_in_dim ..][0..cnn_in_dim];
                ops.matvec_bias(&pre_maxout, exp_t, self.cnn[blk].W, self.cnn[blk].b, cnn_in_dim, cnn_out_dim);
                ops.maxout(&post_maxout, &pre_maxout, w, self.cnn_nP);
                ops.layernorm(&post_maxout, &post_maxout, self.cnn[blk].G, self.cnn[blk].b_ln, w);
                // residual: tok_vecs[t] += post_maxout
                const tv = tok_vecs[t * w ..][0..w];
                ops.vadd(tv, tv, &post_maxout, w);
            }
        }

        // linear projection: tok_vecs (width) → tok2vec_out (hidden)
        const h = self.hidden;
        for (0..n_tokens) |t| {
            ops.matvec_bias(
                tok2vec_out[t * h ..][0..h],
                tok_vecs[t * w ..][0..w],
                self.linear_W,
                self.linear_b,
                w,
                h,
            );
        }
    }

    /// compute action scores for the parser at one step.
    /// ctx: [B(0), E(0), B(0)-1] token indices (n_tokens = padding sentinel).
    /// tok2vec_out: (n_tokens, hidden) from encode().
    /// scores: output, at least n_actions long; scores[0..n_actions] is written.
    pub fn scoreActions(
        self: *const Model,
        ctx: [3]u32,
        tok2vec_out: []const f32,
        n_tokens: u32,
        scores: []f32,
    ) void {
        const h: usize = self.hidden;
        const nP: usize = self.parser_nP;
        const lower_dim = h * nP; // 128

        // accumulate 3 features into hidden
        var hidden: [MAX_LOWER_DIM]f32 = [_]f32{0} ** MAX_LOWER_DIM;
        var tmp: [MAX_LOWER_DIM]f32 = undefined;

        for (0..3) |f| {
            if (ctx[f] >= n_tokens) {
                // out-of-bounds → use padding vector
                ops.vadd(
                    hidden[0..lower_dim],
                    hidden[0..lower_dim],
                    self.lower_pad[f * lower_dim ..][0..lower_dim],
                    lower_dim,
                );
            } else {
                // W_f @ tok2vec[tok_idx]
                const W_f = self.lower_W[f * lower_dim * h ..][0 .. lower_dim * h];
                const x = tok2vec_out[ctx[f] * h ..][0..h];
                ops.matvec(tmp[0..lower_dim], x, W_f, h, lower_dim);
                ops.vadd(hidden[0..lower_dim], hidden[0..lower_dim], tmp[0..lower_dim], lower_dim);
            }
        }

        // add bias
        ops.vadd(hidden[0..lower_dim], hidden[0..lower_dim], self.lower_b, lower_dim);

        // maxout: (hidden * nP) → (hidden)
        var maxed: [MAX_HIDDEN]f32 = undefined;
        ops.maxout(maxed[0..h], hidden[0..lower_dim], h, nP);

        // upper: (n_actions, hidden) @ hidden → scores
        const na: usize = self.n_actions;
        ops.matvec_bias(scores[0..na], maxed[0..h], self.upper_W, self.upper_b, h, na);
    }

    /// run the full NER pipeline on pre-tokenized text.
    /// tokens: array of token byte slices (pointing into original text);
    /// only the first MAX_TOKENS are used.
    /// returns parser state with recognized entities (token-index spans).
    pub fn predict(self: *const Model, tokens: []const []const u8) parser.State {
        const n: u32 = @intCast(@min(tokens.len, MAX_TOKENS));
        if (n == 0) return parser.State.init(0);

        const w = self.width;
        const h = self.hidden;

        // scratch buffers (stack-allocated)
        var tok_vecs: [MAX_TOKENS * MAX_WIDTH]f32 = undefined;
        var expanded: [MAX_TOKENS * MAX_WIDTH * 3]f32 = undefined;
        var tok2vec_out: [MAX_TOKENS * MAX_HIDDEN]f32 = undefined;

        // embed
        self.embedTokens(tokens[0..n], tok_vecs[0 .. n * w]);

        // CNN encode + linear project
        self.encode(
            tok_vecs[0 .. n * w],
            n,
            expanded[0 .. n * w * 3],
            tok2vec_out[0 .. n * h],
        );

        // greedy parse
        var state = parser.State.init(n);
        var scores: [parser.N_ACTIONS]f32 = undefined;

        while (!state.isFinal()) {
            const ctx = state.contextIds();
            self.scoreActions(ctx, tok2vec_out[0 .. n * h], n, &scores);
            const valid = state.validMask();
            const best = parser.argmaxValid(&scores, valid);
            const decoded = parser.decodeAction(best);
            state.apply(decoded.action, decoded.label);
        }

        return state;
    }
};

// === tests ===

const testing = std.testing;

test "Model.load validates header" {
    // too small
    try testing.expectError(error.FileTooSmall, Model.load(""));

    // wrong magic
    var bad_header: [HEADER_BYTES]u8 = [_]u8{0} ** HEADER_BYTES;
    try testing.expectError(error.BadMagic, Model.load(&bad_header));
}

/// load weight file from disk for tests (returns null if not found or unreadable).
/// the caller frees the result with std.testing.allocator.
fn loadWeightFile() ?[]align(4) const u8 {
    const file = std.fs.cwd().openFile("weights/en_core_web_sm.bin", .{}) catch return null;
    defer file.close();
    const stat = file.stat() catch return null;
    const bytes = std.testing.allocator.alignedAlloc(u8, .@"4", stat.size) catch return null;
    const n = file.readAll(bytes) catch {
        std.testing.allocator.free(bytes);
        return null;
    };
    // a short read would make the caller free a slice of the wrong length
    if (n != bytes.len) {
        std.testing.allocator.free(bytes);
        return null;
    }
    return bytes;
}

test "Model.load from weight file" {
    const weights = loadWeightFile() orelse return; // skip if weights not available
    defer std.testing.allocator.free(weights);

    const m = try Model.load(weights);

    try testing.expectEqual(@as(u32, 96), m.width);
    try testing.expectEqual(@as(u32, 64), m.hidden);
    try testing.expectEqual(@as(u32, 74), m.n_actions);
    try testing.expectEqual(@as(u32, 3), m.cnn_nP);
    try testing.expectEqual(@as(u32, 2), m.parser_nP);

    // verify embed table sizes
    try testing.expectEqual(@as(usize, 5000), m.embeds[0].nV);
    try testing.expectEqual(@as(usize, 1000), m.embeds[1].nV);
    try testing.expectEqual(@as(usize, 2500), m.embeds[2].nV);
    try testing.expectEqual(@as(usize, 2500), m.embeds[3].nV);
}

test "Model.predict basic NER" {
    const weights = loadWeightFile() orelse return;
    defer std.testing.allocator.free(weights);

    const m = try Model.load(weights);

    // "Barack Obama visited Paris"
    const tokens = [_][]const u8{ "Barack", "Obama", "visited", "Paris" };
    const state = m.predict(&tokens);
    const ents = state.entities();

    // should find at least one entity
    try testing.expect(ents.len > 0);

    // log what we found
    for (ents) |e| {
        std.debug.print(" [{d}..{d}) {s}\n", .{ e.start, e.end, @tagName(e.label) });
    }
}
+53 -42
src/parser.zig
··· 12 12 const std = @import("std"); 13 13 const ops = @import("ops.zig"); 14 14 15 - /// entity label indices — matches en_core_web_sm's label set. 16 - /// the model's output layer has actions for each (action_type, label) pair. 15 + /// entity label indices — matches en_core_web_sm's NER action table ordering. 16 + /// this is the training order, NOT alphabetical. 17 17 pub const Label = enum(u8) { 18 - CARDINAL = 0, 18 + ORG = 0, 19 19 DATE = 1, 20 - EVENT = 2, 21 - FAC = 3, 22 - GPE = 4, 23 - LANGUAGE = 5, 24 - LAW = 6, 25 - LOC = 7, 26 - MONEY = 8, 27 - NORP = 9, 28 - ORDINAL = 10, 29 - ORG = 11, 30 - PERCENT = 12, 31 - PERSON = 13, 32 - PRODUCT = 14, 33 - QUANTITY = 15, 34 - TIME = 16, 35 - WORK_OF_ART = 17, 20 + PERSON = 2, 21 + GPE = 3, 22 + MONEY = 4, 23 + CARDINAL = 5, 24 + NORP = 6, 25 + PERCENT = 7, 26 + WORK_OF_ART = 8, 27 + LOC = 9, 28 + TIME = 10, 29 + QUANTITY = 11, 30 + FAC = 12, 31 + EVENT = 13, 32 + ORDINAL = 14, 33 + PRODUCT = 15, 34 + LAW = 16, 35 + LANGUAGE = 17, 36 36 37 37 pub const COUNT = 18; 38 38 }; ··· 53 53 label: Label, 54 54 }; 55 55 56 - /// total number of possible actions: B/I/L/U for each label + O. 57 - pub const N_ACTIONS = Label.COUNT * 4 + 1; 56 + /// total number of possible actions: B/I/L/U for each label + filler + OUT. 57 + pub const N_ACTIONS = Label.COUNT * 4 + 2; 58 58 59 59 /// decode an action index (0..N_ACTIONS-1) into (action_type, label). 60 - /// the layout matches spaCy's ner.pyx move ordering. 60 + /// layout: [B*18, I*18, L*18, U*18, filler, OUT] — matches spaCy's get_class_name(). 
pub fn decodeAction(idx: usize) struct { action: Action, label: ?Label } {
    const n = Label.COUNT;
    // the first four groups of n indices carry a label: B, I, L, U in that order
    const labelled = [_]Action{ .BEGIN, .IN, .LAST, .UNIT };

    const group = idx / n;
    if (group < labelled.len) {
        return .{
            .action = labelled[group],
            .label = @enumFromInt(@as(u8, @intCast(idx % n))),
        };
    }

    // the two trailing slots carry no label: 4*n + 1 is OUT
    if (idx == 4 * n + 1) return .{ .action = .OUT, .label = null };

    // idx == 4*n: filler (U-""), always invalid because label is null
    return .{ .action = .UNIT, .label = null };
}

/// parser state for a single document.
const testing = std.testing;

test "decodeAction round-trip" {
    // layout: [B*18, I*18, L*18, U*18, filler, OUT]
    const Case = struct { idx: usize, action: Action, label: ?Label };
    const cases = [_]Case{
        // index 0 = B-ORG (first label in training order)
        .{ .idx = 0, .action = .BEGIN, .label = .ORG },
        // index 2 = B-PERSON
        .{ .idx = 2, .action = .BEGIN, .label = .PERSON },
        // index 18 = I-ORG
        .{ .idx = 18, .action = .IN, .label = .ORG },
        // index 56 = U-PERSON (54 + 2)
        .{ .idx = 56, .action = .UNIT, .label = .PERSON },
        // index 72 = filler (U-""), label is null
        .{ .idx = 72, .action = .UNIT, .label = null },
        // index 73 = OUT
        .{ .idx = 73, .action = .OUT, .label = null },
    };

    for (cases) |want| {
        const got = decodeAction(want.idx);
        try testing.expectEqual(want.action, got.action);
        try testing.expectEqual(want.label, got.label);
    }
}
+55
src/spacez.zig
pub const ops = @import("ops.zig");
pub const embed = @import("embed.zig");
pub const parser = @import("parser.zig");
pub const model = @import("model.zig");
pub const tokenizer = @import("tokenizer.zig");
pub const tokenizer_data = @import("tokenizer_data.zig");

// re-export key types at the top level
pub const Model = model.Model;
pub const Entity = parser.Entity;
pub const Label = parser.Label;
pub const Token = tokenizer.Token;
pub const TokenAttrs = embed.TokenAttrs;

// re-export key functions at the top level
pub const hashString = hash.hashString;
pub const extractAttrs = embed.extractAttrs;
pub const computeShape = embed.computeShape;
pub const tokenizeText = tokenizer.tokenize;

/// a recognized entity with byte offsets into the source text.
/// `recognize` fills these from token spans: `start` is the first token's
/// start offset and `end` is the last token's end offset.
pub const SpanEntity = struct {
    start: u32, // byte offset of entity start in source text
    end: u32, // byte offset of entity end (exclusive)
    label: Label,
};

/// run the full NER pipeline: tokenize → embed → CNN encode → parse.
/// only the first `model.MAX_TOKENS` tokens of `text` are passed to the model.
/// writes at most `entities_out.len` entities.
/// returns the number of entities written to entities_out.
pub fn recognize(
    m: *const Model,
    text: []const u8,
    entities_out: []SpanEntity,
) u32 {
    // tokenize
    var tok_buf: [tokenizer.MAX_TOKENS]Token = undefined;
    const n_toks = tokenizeText(text, &tok_buf);
    if (n_toks == 0) return 0;

    // collect token text slices for the model (capped at model's MAX_TOKENS)
    const n: u32 = @min(n_toks, model.MAX_TOKENS);
    var tok_slices: [model.MAX_TOKENS][]const u8 = undefined;
    for (0..n) |i| {
        tok_slices[i] = tok_buf[i].text(text);
    }

    // run model prediction
    const state = m.predict(tok_slices[0..n]);
    const ents = state.entities();

    // map token-index entities back to byte offsets
    var count: u32 = 0;
    for (ents) |e| {
        if (count >= entities_out.len) break;
        // skip spans that are empty, inverted, or reach past the tokens we fed in.
        // `start < end <= n` also implies `start < n`, and guarantees that
        // `e.end - 1` below cannot underflow.
        if (e.start >= e.end or e.end > n) continue;
        entities_out[count] = .{
            .start = tok_buf[e.start].start,
            .end = tok_buf[e.end - 1].end,
            .label = e.label,
        };
        count += 1;
    }

    return count;
}

test {
    _ = hash;
    _ = ops;
    _ = embed;
    _ = parser;
    _ = model;
    _ = tokenizer;
    _ = tokenizer_data;
}
+723
src/tokenizer.zig
//! spaCy-compatible tokenizer.
//!
//! port of spaCy's `tokenizer.pyx` algorithm: whitespace split → per-chunk
//! iterative prefix/suffix stripping → infix splitting → special case lookup.
//! uses generated data tables from `tokenizer_data.zig`.

const std = @import("std");
const data = @import("tokenizer_data.zig");

/// a token is a byte-offset slice into the original text.
pub const Token = struct {
    /// byte offset of start in original text
    start: u32,
    /// byte offset of end (exclusive) in original text
    end: u32,

    /// the bytes of `source` this token covers.
    pub fn text(self: Token, source: []const u8) []const u8 {
        return source[self.start..self.end];
    }
};

/// maximum tokens per document.
pub const MAX_TOKENS = 1024;

/// split `text` on whitespace and tokenize each chunk into `out`.
/// tokens are byte-offset spans into `text`; returns how many were written.
/// stops early once `out` is full.
pub fn tokenize(text: []const u8, out: []Token) u32 {
    var n_written: u32 = 0;
    var cursor: usize = 0;

    while (true) {
        // skip the whitespace run in front of the next chunk
        while (cursor < text.len and isWhitespace(text[cursor])) cursor += 1;
        if (cursor == text.len) break;

        // the chunk extends to the next whitespace byte (or end of text)
        const chunk_start = cursor;
        while (cursor < text.len and !isWhitespace(text[cursor])) cursor += 1;

        n_written = tokenizeChunk(text, chunk_start, cursor, out, n_written);
        if (n_written >= out.len) break;
    }

    return n_written;
}

/// tokenize a single whitespace-delimited chunk.
/// text[start..end] is the chunk. writes tokens to out[count..].
fn tokenizeChunk(
    text: []const u8,
    start: usize,
    end: usize,
    out: []Token,
    count_in: u32,
) u32 {
    var count = count_in;
    const chunk = text[start..end];

    // check special cases first: the whole chunk is replaced by its pieces
    if (data.specials.get(chunk)) |special| {
        return emitSpecial(text, @intCast(start), @intCast(end), special, out, count);
    }

    // split affixes iteratively: [lo, hi) is the part not yet consumed
    var lo: u32 = @intCast(start);
    var hi: u32 = @intCast(end);

    // prefixes go straight to `out`; suffixes are stacked and emitted last
    var suffix_buf: [64]Token = undefined;
    var n_suffixes: u32 = 0;

    var last_len: u32 = 0;
    while (lo < hi and (hi - lo) != last_len) {
        const span = text[lo..hi];
        const span_len = hi - lo;

        // check if remaining span is a special case
        if (data.specials.get(span) != null) break;

        // check URL match
        if (matchUrl(span) > 0) break;

        last_len = span_len;

        // try prefix
        const pre_len: u32 = @intCast(matchPrefix(span));

        // try suffix on span[pre_len..] but strip from end of full span
        const suf_len: u32 = if (pre_len < span_len)
            @as(u32, @intCast(matchSuffix(span[pre_len..])))
        else
            0;

        const both = pre_len > 0 and suf_len > 0 and (pre_len + suf_len) <= span_len;

        // stripping the prefix reveals a special: emit prefix, let middle handle the rest
        if (pre_len > 0 and pre_len < span_len and
            data.specials.get(text[lo + pre_len .. hi]) != null)
        {
            count = emitSpan(out, count, lo, lo + pre_len);
            lo += pre_len;
            break;
        }

        // a suffix is stripped when it does not overlap the prefix, or there is no prefix
        if (suf_len > 0 and (both or pre_len == 0)) {
            // suffix stack full: stop stripping so the remaining text stays in
            // the middle instead of being dropped
            if (n_suffixes == suffix_buf.len) break;

            const minus_suf = text[lo .. hi - suf_len];
            const reveals_special = minus_suf.len > 0 and data.specials.get(minus_suf) != null;

            suffix_buf[n_suffixes] = .{ .start = hi - suf_len, .end = hi };
            n_suffixes += 1;
            hi -= suf_len;

            // stripping the suffix reveals a special (any prefix stays part of it)
            if (reveals_special) break;
        }

        if (pre_len > 0) {
            count = emitSpan(out, count, lo, lo + pre_len);
            lo += pre_len;
        }
        // else if nothing matched: last_len == span_len, loop exits
    }

    // emit middle portion
    if (lo < hi) {
        const middle = text[lo..hi];

        if (data.specials.get(middle)) |special| {
            // try special cases for the remaining middle
            count = emitSpecial(text, lo, hi, special, out, count);
        } else if (matchUrl(middle) > 0) {
            // URL — emit as single token
            count = emitSpan(out, count, lo, hi);
        } else {
            // try infix splitting; with no infixes the loop is skipped and the
            // whole middle is emitted as a single token below
            var infixes: [64]Infix = undefined;
            const n_infixes = findInfixes(middle, &infixes);

            var pos: u32 = lo;
            for (infixes[0..n_infixes]) |inf| {
                const inf_start = lo + @as(u32, @intCast(inf.start));
                const inf_end = lo + @as(u32, @intCast(inf.end));

                // spaCy skips a match that starts where the previous piece
                // ended (not only at offset 0); its text joins the next piece
                if (inf_start <= pos) continue;

                // emit text before infix
                count = emitSpan(out, count, pos, inf_start);

                // emit infix (zero-width matches only split)
                if (inf_start != inf_end) {
                    count = emitSpan(out, count, inf_start, inf_end);
                }

                pos = inf_end;
            }

            // emit text after last infix
            if (pos < hi) count = emitSpan(out, count, pos, hi);
        }
    }

    // emit suffixes in reverse order (innermost first)
    var si = n_suffixes;
    while (si > 0) {
        si -= 1;
        count = emitSpan(out, count, suffix_buf[si].start, suffix_buf[si].end);
    }

    return count;
}

/// append the span [start, end) to `out` if there is room. returns the new count.
fn emitSpan(out: []Token, count: u32, start: u32, end: u32) u32 {
    if (count >= out.len) return count;
    out[count] = .{ .start = start, .end = end };
    return count + 1;
}

/// emit the pieces of a special case that covers text[lo..hi]. returns the new count.
/// each piece is located in the source from the current offset; a piece that
/// cannot be found is placed at the offset by length, clamped to `hi` so no
/// span ever points outside the chunk.
fn emitSpecial(
    text: []const u8,
    lo: u32,
    hi: u32,
    special: anytype,
    out: []Token,
    count_in: u32,
) u32 {
    var count = count_in;
    var offset: u32 = lo;
    for (0..special.len) |ti| {
        const tok_text = special.tokens[ti];
        const tok_len: u32 = @intCast(tok_text.len);

        // find this token text in the source at the expected position
        const rel: u32 = if (findSubstr(text[offset..hi], tok_text)) |ts|
            @as(u32, @intCast(ts))
        else
            0; // not found — fall back to placing it by length

        const tok_start = @min(offset + rel, hi);
        const tok_end = @min(tok_start + tok_len, hi);
        if (tok_start < tok_end) {
            count = emitSpan(out, count, tok_start, tok_end);
        }
        offset = tok_end;
    }
    return count;
}

// ── pattern matching ──

/// match a prefix at position 0. returns byte length of match, or 0.
pub fn matchPrefix(text: []const u8) usize {
    // rules are tried in a fixed order; the first one that matches at byte 0 wins.
    if (text.len == 0) return 0;

    const cp = data.decodeUtf8(text) orelse return 0;

    // 1. single-character prefixes (switch on codepoint)
    if (data.isPrefixChar(cp.value)) return cp.len;

    // 2. multi-char literals (longest first)
    for (data.prefix_multi_literals) |lit| {
        if (std.mem.startsWith(u8, text, lit)) return lit.len;
    }

    // 3. symbol class (unicode So/Sc categories)
    if (data.isSymbol(cp.value)) return cp.len;

    // 4. 2+ dots
    if (text.len >= 2 and text[0] == '.' and text[1] == '.') {
        var i: usize = 2;
        while (i < text.len and text[i] == '.') : (i += 1) {}
        return i;
    }

    // 5. literal-unless-digit (e.g., + not followed by digit)
    if (data.isPrefixUnlessDigit(cp.value)) {
        if (cp.len >= text.len) return cp.len;
        const next = data.decodeUtf8(text[cp.len..]);
        if (next == null or !isAsciiDigit(next.?.value)) return cp.len;
    }

    return 0;
}

/// match a suffix at the end of text. returns byte length of suffix, or 0.
/// rules are tried in a fixed order; the first one that matches wins.
pub fn matchSuffix(text: []const u8) usize {
    if (text.len == 0) return 0;

    const last = data.lastCodepoint(text) orelse return 0;

    // 1. single-character suffixes
    if (data.isSuffixChar(last.value)) return last.len;

    // 2. symbol class
    if (data.isSymbol(last.value)) return last.len;

    // 3. multi-char literal suffixes (longest first)
    for (data.suffix_multi_literals) |lit| {
        if (std.mem.endsWith(u8, text, lit)) return lit.len;
    }

    // 4. 2+ dots at end
    if (text.len >= 2 and text[text.len - 1] == '.' and text[text.len - 2] == '.') {
        var i: usize = text.len - 2;
        while (i > 0 and text[i - 1] == '.') : (i -= 1) {}
        return text.len - i;
    }

    // 5. lookbehind rules (generated)
    const lb = data.matchSuffixLookbehind(text);
    if (lb > 0) return lb;

    return 0;
}

/// infix match result: byte range [start, end) relative to the text passed to findInfixes.
const Infix = struct { start: usize, end: usize };

/// find all infix split points, scanning left to right without overlap.
/// writes at most `out.len` matches into `out` and returns the count written;
/// matches beyond the buffer capacity are skipped (but still consumed).
fn findInfixes(text: []const u8, out: []Infix) usize {
    // hyphen/dash alternatives for infix[6]: ---, --, —— (U+2014 U+2014), —, –, -, ~
    const hyphen_alts = [_][]const u8{
        "---",
        "--",
        "\xe2\x80\x94\xe2\x80\x94",
        "\xe2\x80\x94",
        "\xe2\x80\x93",
        "-",
        "~",
    };

    var count: usize = 0;
    if (text.len == 0) return 0;

    var i: usize = 0;
    while (i < text.len) {
        const cp = data.decodeUtf8(text[i..]) orelse {
            i += 1;
            continue;
        };
        var matched: usize = 0;

        // 1. 2+ dots (infix[0])
        if (text[i] == '.' and i + 1 < text.len and text[i + 1] == '.') {
            var end = i + 2;
            while (end < text.len and text[end] == '.') : (end += 1) {}
            matched = end - i;
        }
        // 2. ellipsis U+2026 (infix[1])
        else if (cp.value == 0x2026) {
            matched = cp.len;
        }
        // 3. symbol class (infix[2])
        else if (data.isSymbol(cp.value)) {
            matched = cp.len;
        }
        // contextual rules require lookbehind/lookahead
        else {
            const prev_cp = if (i > 0) data.lastCodepoint(text[0..i]) else null;
            const next_start = i + cp.len;
            const next_cp = if (next_start < text.len) data.decodeUtf8(text[next_start..]) else null;

            // 4. math ops between digits: (?<=[0-9])[+\-*^](?=[0-9\-]) (infix[3])
            if (prev_cp != null and isAsciiDigit(prev_cp.?.value)) {
                if (cp.value == '+' or cp.value == '-' or cp.value == '*' or cp.value == '^') {
                    if (next_cp != null and (isAsciiDigit(next_cp.?.value) or next_cp.?.value == '-')) {
                        matched = cp.len;
                    }
                }
            }

            // 5. period between lower/punct and upper (infix[4])
            if (matched == 0 and cp.value == '.') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_4_behind(prev_cp.?.value) and data.is_infix_4_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 6. comma between alpha chars (infix[5])
            if (matched == 0 and cp.value == ',') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_5_behind(prev_cp.?.value) and data.is_infix_5_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 7. hyphens/dashes between alnum (infix[6])
            // the lookahead must be tested on the codepoint that follows the
            // *whole* candidate, not the one after its first character;
            // otherwise "---", "--" and "——" can never match because the
            // character after the first dash is another dash.
            // NOTE(review): the lookahead reuses is_infix_7_ahead, as the
            // previous code did — presumably the same class as infix[6]'s
            // lookahead; confirm against tokenizer_data.
            if (matched == 0 and prev_cp != null and data.is_infix_6_behind(prev_cp.?.value)) {
                const rest = text[i..];
                for (hyphen_alts) |alt| {
                    if (!std.mem.startsWith(u8, rest, alt)) continue;
                    const ahead = data.decodeUtf8(rest[alt.len..]) orelse continue;
                    if (data.is_infix_7_ahead(ahead.value)) {
                        matched = alt.len;
                        break;
                    }
                }
            }

            // 8. separators between alnum (infix[7])
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_7_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    if (cp.value == '/' or cp.value == ':' or cp.value == '<' or
                        cp.value == '>' or cp.value == '=')
                    {
                        matched = cp.len;
                    }
                }
            }
        }

        if (matched > 0) {
            if (count < out.len) {
                out[count] = .{ .start = i, .end = i + matched };
                count += 1;
            }
            i += matched;
        } else {
            i += cp.len;
        }
    }

    return count;
}

/// simplified URL matcher. matches scheme://... or domain.tld patterns.
/// returns length of match from start, or 0.
///
/// a scheme with nothing after it ("http://") is not a match. a bare domain
/// or e-mail address that spans the whole text ("my-site.com") is a match.
fn matchUrl(text: []const u8) usize {
    if (text.len < 4) return 0;

    // check for scheme://
    var pos: usize = 0;
    var from_bare_domain = false;
    if (std.mem.startsWith(u8, text, "http://")) {
        pos = 7;
    } else if (std.mem.startsWith(u8, text, "https://")) {
        pos = 8;
    } else if (std.mem.startsWith(u8, text, "ftp://")) {
        pos = 6;
    } else {
        // try generic scheme:// (scheme limited to the first 20 bytes)
        var j: usize = 0;
        while (j < text.len and j < 20) : (j += 1) {
            const c = text[j];
            if (c == ':') {
                if (j + 2 < text.len and text[j + 1] == '/' and text[j + 2] == '/') {
                    pos = j + 3;
                    break;
                } else break;
            }
            if (!isAsciiAlnum(c) and c != '+' and c != '-' and c != '.') break;
        }

        // no scheme — try bare domain: word.word or word@word.word
        if (pos == 0) {
            pos = matchBareDomain(text);
            from_bare_domain = pos > 0;
        }
    }

    if (pos == 0) return 0;

    // nothing left after the matched head: only a bare domain is complete on
    // its own, a lone scheme is not a URL.
    if (pos >= text.len) return if (from_bare_domain) pos else 0;

    // consume until whitespace
    while (pos < text.len and !isWhitespace(text[pos])) : (pos += 1) {}

    return pos;
}

/// match a bare domain like example.com or user@example.com.
/// returns the byte length of the domain-like head of `text`, or 0.
fn matchBareDomain(text: []const u8) usize {
    // look for word.word or word@word.word pattern
    var i: usize = 0;
    var has_dot = false;
    var has_at = false;
    var last_was_alnum = false;

    while (i < text.len) {
        const c = text[i];
        if (isAsciiAlnum(c) or c == '-' or c == '_') {
            last_was_alnum = isAsciiAlnum(c);
            i += 1;
        } else if (c == '.' and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            has_dot = true;
            last_was_alnum = false;
            i += 1;
        } else if (c == '@' and !has_at and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            has_at = true;
            last_was_alnum = false;
            i += 1;
        } else break;
    }

    // must have at least one dot to be a domain
    if (!has_dot) return 0;

    // locate the TLD: everything after the last dot in the scanned head
    var last_dot: usize = 0;
    var j: usize = 0;
    while (j < i) : (j += 1) {
        if (text[j] == '.') last_dot = j;
    }
    const tld_start = last_dot + 1;
    const tld_len = i - tld_start;
    if (tld_len < 2) return 0;

    // TLD must be lowercase ASCII letters only. this rejects numbers such as
    // 500.00 and, like spaCy's URL pattern (which limits the TLD to lowercase
    // letters), leaves strings such as "lower.Upper" or "end.well-known" to
    // the infix rules instead of swallowing them as a URL.
    for (text[tld_start..i]) |c| {
        if (c < 'a' or c > 'z') return 0;
    }

    return i;
}

// ── helpers ──

/// ASCII whitespace: space, tab, LF, CR.
fn isWhitespace(c: u8) bool {
    return c == ' ' or c == '\t' or c == '\n' or c == '\r';
}

/// true for the codepoints '0'..'9'.
fn isAsciiDigit(c: u21) bool {
    return c >= '0' and c <= '9';
}

/// true for ASCII letters and digits.
fn isAsciiAlnum(c: u8) bool {
    return (c >= '0' and c <= '9') or (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
}

/// byte offset of the first occurrence of `needle` in `haystack`, or null.
/// an empty needle matches at offset 0.
fn findSubstr(haystack: []const u8, needle: []const u8) ?usize {
    if (needle.len > haystack.len) return null;
    if (needle.len == 0) return 0;
    var i: usize = 0;
    while (i + needle.len <= haystack.len) : (i += 1) {
        if (std.mem.eql(u8, haystack[i..][0..needle.len], needle)) return i;
    }
    return null;
}

// ── tests ──

const testing = std.testing;

test "tokenize basic sentence" {
    var tokens: [64]Token = undefined;
    const n = tokenize("Barack Obama visited Paris.", &tokens);

    const expected = [_][]const u8{ "Barack", "Obama", "visited", "Paris", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text("Barack Obama visited Paris."));
    }
}

test "tokenize contractions" {
    var tokens: [64]Token = undefined;
    const text = "I can't believe it's not butter!";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize currency and punctuation" {
    var tokens: [64]Token = undefined;
    const text = "Apple Inc. is worth $2.5 trillion.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize parentheses" {
    var tokens: [64]Token = undefined;
    const text = "Dr. Smith's office (room 42) is closed.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize hyphenated words" {
    var tokens: [64]Token = undefined;
    const text = "New York-based company";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "New", "York", "-", "based", "company" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize abbreviations" {
    var tokens: [64]Token = undefined;
    const text = "U.S.A. and U.K. are allies.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "U.S.A.", "and", "U.K.", "are", "allies", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize email" {
    var tokens: [64]Token = undefined;
    const text = "e-mail: test@example.com";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "e", "-", "mail", ":", "test@example.com" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "matchPrefix" {
    // single chars
    try testing.expectEqual(@as(usize, 1), matchPrefix("$100"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("(hello)"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("\"quote"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("!"));

    // multi-char
    try testing.expectEqual(@as(usize, 3), matchPrefix("US$100"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("C$100"));

    // dots
    try testing.expectEqual(@as(usize, 3), matchPrefix("...hello"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("..hello"));

    // no match
    try testing.expectEqual(@as(usize, 0), matchPrefix("hello"));
    try testing.expectEqual(@as(usize, 0), matchPrefix("123"));
}

test "matchSuffix" {
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello."));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello!"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello)"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello,"));
    try testing.expectEqual(@as(usize, 0), matchSuffix("hello"));
}

test "findInfixes" {
    var infixes: [64]Infix = undefined;

    // hyphen between words
    const n1 = findInfixes("York-based", &infixes);
    try testing.expect(n1 > 0);
    try testing.expectEqual(@as(usize, 4), infixes[0].start);
    try testing.expectEqual(@as(usize, 5), infixes[0].end);
}

test "findInfixes multi-char hyphen" {
    var infixes: [64]Infix = undefined;

    // the lookahead is checked after the full "--", not after the first '-'
    const n = findInfixes("well--known", &infixes);
    try testing.expectEqual(@as(usize, 1), n);
    try testing.expectEqual(@as(usize, 4), infixes[0].start);
    try testing.expectEqual(@as(usize, 6), infixes[0].end);
}

test "matchUrl bare domain" {
    // a bare domain / e-mail that spans the whole text is a match
    try testing.expectEqual(@as(usize, 16), matchUrl("test@example.com"));
    try testing.expectEqual(@as(usize, 11), matchUrl("my-site.com"));

    // not URLs: uppercase or numeric "TLD", scheme with nothing after it
    try testing.expectEqual(@as(usize, 0), matchUrl("closed.The"));
    try testing.expectEqual(@as(usize, 0), matchUrl("500.00/month"));
    try testing.expectEqual(@as(usize, 0), matchUrl("http://"));
}
+4452
src/tokenizer_data.zig
··· 1 + //! generated by scripts/gen_tokenizer_data.py — do not edit. 2 + //! tokenizer pattern data compiled from spaCy en_core_web_sm. 3 + 4 + const std = @import("std"); 5 + 6 + // ── utf-8 helpers ── 7 + 8 + pub const Codepoint = struct { value: u21, len: u3 }; 9 + 10 + pub fn decodeUtf8(bytes: []const u8) ?Codepoint { 11 + if (bytes.len == 0) return null; 12 + const b0 = bytes[0]; 13 + if (b0 < 0x80) return .{ .value = b0, .len = 1 }; 14 + if (b0 & 0xE0 == 0xC0 and bytes.len >= 2) 15 + return .{ .value = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 }; 16 + if (b0 & 0xF0 == 0xE0 and bytes.len >= 3) 17 + return .{ .value = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 }; 18 + if (b0 & 0xF8 == 0xF0 and bytes.len >= 4) 19 + return .{ .value = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 }; 20 + return .{ .value = 0xFFFD, .len = 1 }; // replacement char 21 + } 22 + 23 + pub fn lastCodepoint(text: []const u8) ?Codepoint { 24 + if (text.len == 0) return null; 25 + var i = text.len - 1; 26 + while (i > 0 and text[i] & 0xC0 == 0x80) : (i -= 1) {} 27 + return decodeUtf8(text[i..]); 28 + } 29 + 30 + // ── range search ── 31 + 32 + fn rangeContains(ranges: []const [2]u21, c: u21) bool { 33 + var lo: usize = 0; 34 + var hi: usize = ranges.len; 35 + while (lo < hi) { 36 + const mid = lo + (hi - lo) / 2; 37 + if (c > ranges[mid][1]) { lo = mid + 1; } 38 + else if (c < ranges[mid][0]) { hi = mid; } 39 + else return true; 40 + } 41 + return false; 42 + } 43 + 44 + // ── symbol class (So/Sc unicode categories) ── 45 + 46 + pub const isSymbol_ranges = [_][2]u21{ 47 + .{ 0x00A6, 0x00A6 }, 48 + .{ 0x00A9, 0x00A9 }, 49 + .{ 0x00AE, 0x00AE }, 50 + .{ 0x00B0, 0x00B0 }, 51 + .{ 0x0482, 0x0482 }, 52 + .{ 0x058D, 0x058E }, 53 + .{ 0x060E, 0x060F }, 54 + .{ 0x06DE, 0x06DE }, 55 + .{ 0x06E9, 0x06E9 }, 56 + .{ 0x06FD, 0x06FE }, 57 + .{ 0x07F6, 
0x07F6 }, 58 + .{ 0x09FA, 0x09FA }, 59 + .{ 0x0B70, 0x0B70 }, 60 + .{ 0x0BF3, 0x0BF8 }, 61 + .{ 0x0BFA, 0x0BFA }, 62 + .{ 0x0C7F, 0x0C7F }, 63 + .{ 0x0D4F, 0x0D4F }, 64 + .{ 0x0D79, 0x0D79 }, 65 + .{ 0x0F01, 0x0F03 }, 66 + .{ 0x0F13, 0x0F13 }, 67 + .{ 0x0F15, 0x0F17 }, 68 + .{ 0x0F1A, 0x0F1F }, 69 + .{ 0x0F34, 0x0F34 }, 70 + .{ 0x0F36, 0x0F36 }, 71 + .{ 0x0F38, 0x0F38 }, 72 + .{ 0x0FBE, 0x0FC5 }, 73 + .{ 0x0FC7, 0x0FCC }, 74 + .{ 0x0FCE, 0x0FCF }, 75 + .{ 0x0FD5, 0x0FD8 }, 76 + .{ 0x109E, 0x109F }, 77 + .{ 0x1390, 0x1399 }, 78 + .{ 0x1940, 0x1940 }, 79 + .{ 0x19DE, 0x19FF }, 80 + .{ 0x1B61, 0x1B6A }, 81 + .{ 0x1B74, 0x1B7C }, 82 + .{ 0x2100, 0x2101 }, 83 + .{ 0x2103, 0x2106 }, 84 + .{ 0x2108, 0x2109 }, 85 + .{ 0x2114, 0x2114 }, 86 + .{ 0x2116, 0x2117 }, 87 + .{ 0x211E, 0x2123 }, 88 + .{ 0x2125, 0x2125 }, 89 + .{ 0x2127, 0x2127 }, 90 + .{ 0x2129, 0x2129 }, 91 + .{ 0x212E, 0x212E }, 92 + .{ 0x213A, 0x213B }, 93 + .{ 0x214A, 0x214A }, 94 + .{ 0x214C, 0x214D }, 95 + .{ 0x214F, 0x214F }, 96 + .{ 0x218A, 0x218B }, 97 + .{ 0x2195, 0x2199 }, 98 + .{ 0x219C, 0x219F }, 99 + .{ 0x21A1, 0x21A2 }, 100 + .{ 0x21A4, 0x21A5 }, 101 + .{ 0x21A7, 0x21AD }, 102 + .{ 0x21AF, 0x21CD }, 103 + .{ 0x21D0, 0x21D1 }, 104 + .{ 0x21D3, 0x21D3 }, 105 + .{ 0x21D5, 0x21F3 }, 106 + .{ 0x2300, 0x2307 }, 107 + .{ 0x230C, 0x231F }, 108 + .{ 0x2322, 0x2328 }, 109 + .{ 0x232B, 0x237B }, 110 + .{ 0x237D, 0x239A }, 111 + .{ 0x23B4, 0x23DB }, 112 + .{ 0x23E2, 0x2426 }, 113 + .{ 0x2440, 0x244A }, 114 + .{ 0x249C, 0x24E9 }, 115 + .{ 0x2500, 0x25B6 }, 116 + .{ 0x25B8, 0x25C0 }, 117 + .{ 0x25C2, 0x25F7 }, 118 + .{ 0x2600, 0x266E }, 119 + .{ 0x2670, 0x2767 }, 120 + .{ 0x2794, 0x27BF }, 121 + .{ 0x2800, 0x28FF }, 122 + .{ 0x2B00, 0x2B2F }, 123 + .{ 0x2B45, 0x2B46 }, 124 + .{ 0x2B4D, 0x2B73 }, 125 + .{ 0x2B76, 0x2B95 }, 126 + .{ 0x2B98, 0x2BC8 }, 127 + .{ 0x2BCA, 0x2BFE }, 128 + .{ 0x2CE5, 0x2CEA }, 129 + .{ 0x2E80, 0x2E99 }, 130 + .{ 0x2E9B, 0x2EF3 }, 131 + .{ 0x2F00, 0x2FD5 }, 132 + .{ 0x2FF0, 0x2FFB }, 133 + 
.{ 0x3004, 0x3004 }, 134 + .{ 0x3012, 0x3013 }, 135 + .{ 0x3020, 0x3020 }, 136 + .{ 0x3036, 0x3037 }, 137 + .{ 0x303E, 0x303F }, 138 + .{ 0x3190, 0x3191 }, 139 + .{ 0x3196, 0x319F }, 140 + .{ 0x31C0, 0x31E3 }, 141 + .{ 0x3200, 0x321E }, 142 + .{ 0x322A, 0x3247 }, 143 + .{ 0x3250, 0x3250 }, 144 + .{ 0x3260, 0x327F }, 145 + .{ 0x328A, 0x32B0 }, 146 + .{ 0x32C0, 0x32FE }, 147 + .{ 0x3300, 0x33FF }, 148 + .{ 0x4DC0, 0x4DFF }, 149 + .{ 0xA490, 0xA4C6 }, 150 + .{ 0xA828, 0xA82B }, 151 + .{ 0xA836, 0xA837 }, 152 + .{ 0xA839, 0xA839 }, 153 + .{ 0xAA77, 0xAA79 }, 154 + .{ 0xFDFD, 0xFDFD }, 155 + .{ 0xFFE4, 0xFFE4 }, 156 + .{ 0xFFE8, 0xFFE8 }, 157 + .{ 0xFFED, 0xFFEE }, 158 + .{ 0xFFFC, 0xFFFD }, 159 + .{ 0x10137, 0x1013F }, 160 + .{ 0x10179, 0x10189 }, 161 + .{ 0x1018C, 0x1018E }, 162 + .{ 0x10190, 0x1019B }, 163 + .{ 0x101A0, 0x101A0 }, 164 + .{ 0x101D0, 0x101FC }, 165 + .{ 0x10877, 0x10878 }, 166 + .{ 0x10AC8, 0x10AC8 }, 167 + .{ 0x1173F, 0x1173F }, 168 + .{ 0x16B3C, 0x16B3F }, 169 + .{ 0x16B45, 0x16B45 }, 170 + .{ 0x1BC9C, 0x1BC9C }, 171 + .{ 0x1D000, 0x1D0F5 }, 172 + .{ 0x1D100, 0x1D126 }, 173 + .{ 0x1D129, 0x1D164 }, 174 + .{ 0x1D16A, 0x1D16C }, 175 + .{ 0x1D183, 0x1D184 }, 176 + .{ 0x1D18C, 0x1D1A9 }, 177 + .{ 0x1D1AE, 0x1D1E8 }, 178 + .{ 0x1D200, 0x1D241 }, 179 + .{ 0x1D245, 0x1D245 }, 180 + .{ 0x1D300, 0x1D356 }, 181 + .{ 0x1D800, 0x1D9FF }, 182 + .{ 0x1DA37, 0x1DA3A }, 183 + .{ 0x1DA6D, 0x1DA74 }, 184 + .{ 0x1DA76, 0x1DA83 }, 185 + .{ 0x1DA85, 0x1DA86 }, 186 + .{ 0x1ECAC, 0x1ECAC }, 187 + .{ 0x1F000, 0x1F02B }, 188 + .{ 0x1F030, 0x1F093 }, 189 + .{ 0x1F0A0, 0x1F0AE }, 190 + .{ 0x1F0B1, 0x1F0BF }, 191 + .{ 0x1F0C1, 0x1F0CF }, 192 + .{ 0x1F0D1, 0x1F0F5 }, 193 + .{ 0x1F110, 0x1F16B }, 194 + .{ 0x1F170, 0x1F1AC }, 195 + .{ 0x1F1E6, 0x1F202 }, 196 + .{ 0x1F210, 0x1F23B }, 197 + .{ 0x1F240, 0x1F248 }, 198 + .{ 0x1F250, 0x1F251 }, 199 + .{ 0x1F260, 0x1F265 }, 200 + .{ 0x1F300, 0x1F3FA }, 201 + .{ 0x1F400, 0x1F6D4 }, 202 + .{ 0x1F6E0, 0x1F6EC }, 203 + .{ 0x1F6F0, 0x1F6F9 
}, 204 + .{ 0x1F700, 0x1F773 }, 205 + .{ 0x1F780, 0x1F7D8 }, 206 + .{ 0x1F800, 0x1F80B }, 207 + .{ 0x1F810, 0x1F847 }, 208 + .{ 0x1F850, 0x1F859 }, 209 + .{ 0x1F860, 0x1F887 }, 210 + .{ 0x1F890, 0x1F8AD }, 211 + .{ 0x1F900, 0x1F90B }, 212 + .{ 0x1F910, 0x1F93E }, 213 + .{ 0x1F940, 0x1F970 }, 214 + .{ 0x1F973, 0x1F976 }, 215 + .{ 0x1F97A, 0x1F97A }, 216 + .{ 0x1F97C, 0x1F9A2 }, 217 + .{ 0x1F9B0, 0x1F9B9 }, 218 + .{ 0x1F9C0, 0x1F9C2 }, 219 + .{ 0x1F9D0, 0x1F9FF }, 220 + .{ 0x1FA60, 0x1FA6D }, 221 + }; 222 + 223 + pub fn isSymbol(c: u21) bool { 224 + return rangeContains(&isSymbol_ranges, c); 225 + } 226 + 227 + // ── prefix data ── 228 + 229 + pub fn isPrefixChar(c: u21) bool { 230 + return switch (c) { 231 + '!'...'*' => true, 232 + ',' => true, 233 + ':'...'?' => true, 234 + '[' => true, 235 + ']' => true, 236 + '_'...'`' => true, 237 + '{' => true, 238 + '}' => true, 239 + 0x00A1 => true, 240 + 0x00A3 => true, 241 + 0x00A5 => true, 242 + 0x00A7 => true, 243 + 0x00AB => true, 244 + 0x00B4 => true, 245 + 0x00B7 => true, 246 + 0x00BB => true, 247 + 0x00BF => true, 248 + 0x060C => true, 249 + 0x061B => true, 250 + 0x061F => true, 251 + 0x066A => true, 252 + 0x06D4 => true, 253 + 0x0964 => true, 254 + 0x0E3F => true, 255 + 0x2013...0x2014 => true, 256 + 0x2018...0x201A => true, 257 + 0x201C...0x201E => true, 258 + 0x2026 => true, 259 + 0x20A0...0x20BF => true, 260 + 0x2329...0x232A => true, 261 + 0x27E6...0x27E7 => true, 262 + 0x3001...0x3002 => true, 263 + 0x3008...0x3011 => true, 264 + 0x3014...0x3015 => true, 265 + 0xFDFC => true, 266 + 0xFF01 => true, 267 + 0xFF08...0xFF09 => true, 268 + 0xFF0C => true, 269 + 0xFF1A...0xFF1B => true, 270 + 0xFF1F => true, 271 + 0xFF5E => true, 272 + else => false, 273 + }; 274 + } 275 + 276 + pub const prefix_multi_literals = [_][]const u8{ 277 + "US$", 278 + "\xe2\x80\xa6\xe2\x80\xa6", 279 + "C$", 280 + "A$", 281 + }; 282 + 283 + pub fn isPrefixUnlessDigit(c: u21) bool { 284 + return switch (c) { 285 + '+' => true, 286 + else => 
false, 287 + }; 288 + } 289 + 290 + // ── suffix data ── 291 + 292 + pub fn isSuffixChar(c: u21) bool { 293 + return switch (c) { 294 + '!'...'#' => true, 295 + '&'...'*' => true, 296 + ',' => true, 297 + ':'...'<' => true, 298 + '>'...'?' => true, 299 + '[' => true, 300 + ']' => true, 301 + '_'...'`' => true, 302 + '{' => true, 303 + '}' => true, 304 + 0x00A1 => true, 305 + 0x00AB => true, 306 + 0x00B4 => true, 307 + 0x00B7 => true, 308 + 0x00BB => true, 309 + 0x00BF => true, 310 + 0x060C => true, 311 + 0x061B => true, 312 + 0x061F => true, 313 + 0x066A => true, 314 + 0x06D4 => true, 315 + 0x0964 => true, 316 + 0x2013...0x2014 => true, 317 + 0x2018...0x201A => true, 318 + 0x201C...0x201E => true, 319 + 0x2026 => true, 320 + 0x2329...0x232A => true, 321 + 0x27E6...0x27E7 => true, 322 + 0x3001...0x3002 => true, 323 + 0x3008...0x3011 => true, 324 + 0x3014...0x3015 => true, 325 + 0xFF01 => true, 326 + 0xFF08...0xFF09 => true, 327 + 0xFF0C => true, 328 + 0xFF1A...0xFF1B => true, 329 + 0xFF1F => true, 330 + 0xFF5E => true, 331 + else => false, 332 + }; 333 + } 334 + 335 + pub const suffix_multi_literals = [_][]const u8{ 336 + "\xe2\x80\xa6\xe2\x80\xa6", 337 + "'s", 338 + "'S", 339 + "\xe2\x80\x99s", 340 + "\xe2\x80\x99S", 341 + }; 342 + 343 + // ── suffix lookbehind helpers ── 344 + 345 + const lookbehind_class_0_ranges = [_][2]u21{ 346 + .{ 0x0030, 0x0039 }, 347 + }; 348 + 349 + pub fn matchLookbehind0(c: u21) bool { 350 + return rangeContains(&lookbehind_class_0_ranges, c); 351 + } 352 + 353 + const lookbehind_class_1_ranges = [_][2]u21{ 354 + .{ 0x0043, 0x0043 }, 355 + .{ 0x0046, 0x0046 }, 356 + .{ 0x004B, 0x004B }, 357 + .{ 0x0063, 0x0063 }, 358 + .{ 0x0066, 0x0066 }, 359 + .{ 0x006B, 0x006B }, 360 + }; 361 + 362 + pub fn matchLookbehind1(c: u21) bool { 363 + return rangeContains(&lookbehind_class_1_ranges, c); 364 + } 365 + 366 + const lookbehind_class_2_ranges = [_][2]u21{ 367 + .{ 0x0021, 0x0023 }, 368 + .{ 0x0025, 0x002D }, 369 + .{ 0x0030, 0x003C }, 370 + .{ 
0x003E, 0x003F }, 371 + .{ 0x005B, 0x005B }, 372 + .{ 0x005D, 0x005D }, 373 + .{ 0x005F, 0x007D }, 374 + .{ 0x00A1, 0x00A1 }, 375 + .{ 0x00AB, 0x00AB }, 376 + .{ 0x00B2, 0x00B2 }, 377 + .{ 0x00B4, 0x00B4 }, 378 + .{ 0x00B7, 0x00B7 }, 379 + .{ 0x00BB, 0x00BB }, 380 + .{ 0x00BF, 0x00BF }, 381 + .{ 0x00DF, 0x00F6 }, 382 + .{ 0x00F8, 0x00FF }, 383 + .{ 0x0101, 0x0101 }, 384 + .{ 0x0103, 0x0103 }, 385 + .{ 0x0105, 0x0105 }, 386 + .{ 0x0107, 0x0107 }, 387 + .{ 0x0109, 0x0109 }, 388 + .{ 0x010B, 0x010B }, 389 + .{ 0x010D, 0x010D }, 390 + .{ 0x010F, 0x010F }, 391 + .{ 0x0111, 0x0111 }, 392 + .{ 0x0113, 0x0113 }, 393 + .{ 0x0115, 0x0115 }, 394 + .{ 0x0117, 0x0117 }, 395 + .{ 0x0119, 0x0119 }, 396 + .{ 0x011B, 0x011B }, 397 + .{ 0x011D, 0x011D }, 398 + .{ 0x011F, 0x011F }, 399 + .{ 0x0121, 0x0121 }, 400 + .{ 0x0123, 0x0123 }, 401 + .{ 0x0125, 0x0125 }, 402 + .{ 0x0127, 0x0127 }, 403 + .{ 0x0129, 0x0129 }, 404 + .{ 0x012B, 0x012B }, 405 + .{ 0x012D, 0x012D }, 406 + .{ 0x012F, 0x012F }, 407 + .{ 0x0131, 0x0131 }, 408 + .{ 0x0133, 0x0133 }, 409 + .{ 0x0135, 0x0135 }, 410 + .{ 0x0137, 0x0138 }, 411 + .{ 0x013A, 0x013A }, 412 + .{ 0x013C, 0x013C }, 413 + .{ 0x013E, 0x013E }, 414 + .{ 0x0140, 0x0140 }, 415 + .{ 0x0142, 0x0142 }, 416 + .{ 0x0144, 0x0144 }, 417 + .{ 0x0146, 0x0146 }, 418 + .{ 0x0148, 0x0149 }, 419 + .{ 0x014B, 0x014B }, 420 + .{ 0x014D, 0x014D }, 421 + .{ 0x014F, 0x014F }, 422 + .{ 0x0151, 0x0151 }, 423 + .{ 0x0153, 0x0153 }, 424 + .{ 0x0155, 0x0155 }, 425 + .{ 0x0157, 0x0157 }, 426 + .{ 0x0159, 0x0159 }, 427 + .{ 0x015B, 0x015B }, 428 + .{ 0x015D, 0x015D }, 429 + .{ 0x015F, 0x015F }, 430 + .{ 0x0161, 0x0161 }, 431 + .{ 0x0163, 0x0163 }, 432 + .{ 0x0165, 0x0165 }, 433 + .{ 0x0167, 0x0167 }, 434 + .{ 0x0169, 0x0169 }, 435 + .{ 0x016B, 0x016B }, 436 + .{ 0x016D, 0x016D }, 437 + .{ 0x016F, 0x016F }, 438 + .{ 0x0171, 0x0171 }, 439 + .{ 0x0173, 0x0173 }, 440 + .{ 0x0175, 0x0175 }, 441 + .{ 0x0177, 0x0177 }, 442 + .{ 0x017A, 0x017A }, 443 + .{ 0x017C, 0x017C }, 444 + .{ 
0x017E, 0x0180 }, 445 + .{ 0x0183, 0x0183 }, 446 + .{ 0x0185, 0x0185 }, 447 + .{ 0x0188, 0x0188 }, 448 + .{ 0x018C, 0x018D }, 449 + .{ 0x0192, 0x0192 }, 450 + .{ 0x0195, 0x0195 }, 451 + .{ 0x0199, 0x019B }, 452 + .{ 0x019E, 0x019E }, 453 + .{ 0x01A1, 0x01A1 }, 454 + .{ 0x01A3, 0x01A3 }, 455 + .{ 0x01A5, 0x01A5 }, 456 + .{ 0x01A8, 0x01A8 }, 457 + .{ 0x01AA, 0x01AB }, 458 + .{ 0x01AD, 0x01AD }, 459 + .{ 0x01B0, 0x01B0 }, 460 + .{ 0x01B4, 0x01B4 }, 461 + .{ 0x01B6, 0x01B6 }, 462 + .{ 0x01B9, 0x01BA }, 463 + .{ 0x01BD, 0x01BF }, 464 + .{ 0x01C6, 0x01C6 }, 465 + .{ 0x01C9, 0x01C9 }, 466 + .{ 0x01CC, 0x01CC }, 467 + .{ 0x01CE, 0x01CE }, 468 + .{ 0x01D0, 0x01D0 }, 469 + .{ 0x01D2, 0x01D2 }, 470 + .{ 0x01D4, 0x01D4 }, 471 + .{ 0x01D6, 0x01D6 }, 472 + .{ 0x01D8, 0x01D8 }, 473 + .{ 0x01DA, 0x01DA }, 474 + .{ 0x01DC, 0x01DD }, 475 + .{ 0x01DF, 0x01DF }, 476 + .{ 0x01E1, 0x01E1 }, 477 + .{ 0x01E3, 0x01E3 }, 478 + .{ 0x01E5, 0x01E5 }, 479 + .{ 0x01E7, 0x01E7 }, 480 + .{ 0x01E9, 0x01E9 }, 481 + .{ 0x01EB, 0x01EB }, 482 + .{ 0x01ED, 0x01ED }, 483 + .{ 0x01EF, 0x01F0 }, 484 + .{ 0x01F3, 0x01F3 }, 485 + .{ 0x01F5, 0x01F5 }, 486 + .{ 0x01F9, 0x01F9 }, 487 + .{ 0x01FB, 0x01FB }, 488 + .{ 0x01FD, 0x01FD }, 489 + .{ 0x01FF, 0x01FF }, 490 + .{ 0x0201, 0x0201 }, 491 + .{ 0x0203, 0x0203 }, 492 + .{ 0x0205, 0x0205 }, 493 + .{ 0x0207, 0x0207 }, 494 + .{ 0x0209, 0x0209 }, 495 + .{ 0x020B, 0x020B }, 496 + .{ 0x020D, 0x020D }, 497 + .{ 0x020F, 0x020F }, 498 + .{ 0x0211, 0x0211 }, 499 + .{ 0x0213, 0x0213 }, 500 + .{ 0x0215, 0x0215 }, 501 + .{ 0x0217, 0x0217 }, 502 + .{ 0x0219, 0x0219 }, 503 + .{ 0x021B, 0x021B }, 504 + .{ 0x021D, 0x021D }, 505 + .{ 0x021F, 0x021F }, 506 + .{ 0x0221, 0x0221 }, 507 + .{ 0x0223, 0x0223 }, 508 + .{ 0x0225, 0x0225 }, 509 + .{ 0x0227, 0x0227 }, 510 + .{ 0x0229, 0x0229 }, 511 + .{ 0x022B, 0x022B }, 512 + .{ 0x022D, 0x022D }, 513 + .{ 0x022F, 0x022F }, 514 + .{ 0x0231, 0x0231 }, 515 + .{ 0x0233, 0x0239 }, 516 + .{ 0x023C, 0x023C }, 517 + .{ 0x023F, 0x0240 }, 518 + .{ 
0x0242, 0x0242 }, 519 + .{ 0x0247, 0x0247 }, 520 + .{ 0x0249, 0x0249 }, 521 + .{ 0x024B, 0x024B }, 522 + .{ 0x024D, 0x024D }, 523 + .{ 0x024F, 0x02AF }, 524 + .{ 0x03AC, 0x03AF }, 525 + .{ 0x03B1, 0x03C9 }, 526 + .{ 0x03CC, 0x03CE }, 527 + .{ 0x0430, 0x0451 }, 528 + .{ 0x0453, 0x045A }, 529 + .{ 0x045C, 0x045D }, 530 + .{ 0x0491, 0x0491 }, 531 + .{ 0x0497, 0x0497 }, 532 + .{ 0x04A3, 0x04A3 }, 533 + .{ 0x04AF, 0x04AF }, 534 + .{ 0x04BB, 0x04BB }, 535 + .{ 0x04D9, 0x04D9 }, 536 + .{ 0x04E9, 0x04E9 }, 537 + .{ 0x0591, 0x05F4 }, 538 + .{ 0x060C, 0x060C }, 539 + .{ 0x061B, 0x061B }, 540 + .{ 0x061F, 0x064A }, 541 + .{ 0x066A, 0x066A }, 542 + .{ 0x066E, 0x06D5 }, 543 + .{ 0x06E5, 0x06FF }, 544 + .{ 0x0750, 0x077F }, 545 + .{ 0x08A0, 0x08BD }, 546 + .{ 0x0900, 0x09FF }, 547 + .{ 0x0B80, 0x0CFF }, 548 + .{ 0x0D80, 0x0DFF }, 549 + .{ 0x1100, 0x137F }, 550 + .{ 0x1D00, 0x1D25 }, 551 + .{ 0x1D6B, 0x1D77 }, 552 + .{ 0x1D79, 0x1D9A }, 553 + .{ 0x1E01, 0x1E01 }, 554 + .{ 0x1E03, 0x1E03 }, 555 + .{ 0x1E05, 0x1E05 }, 556 + .{ 0x1E07, 0x1E07 }, 557 + .{ 0x1E09, 0x1E09 }, 558 + .{ 0x1E0B, 0x1E0B }, 559 + .{ 0x1E0D, 0x1E0D }, 560 + .{ 0x1E0F, 0x1E0F }, 561 + .{ 0x1E11, 0x1E11 }, 562 + .{ 0x1E13, 0x1E13 }, 563 + .{ 0x1E15, 0x1E15 }, 564 + .{ 0x1E17, 0x1E17 }, 565 + .{ 0x1E19, 0x1E19 }, 566 + .{ 0x1E1B, 0x1E1B }, 567 + .{ 0x1E1D, 0x1E1D }, 568 + .{ 0x1E1F, 0x1E1F }, 569 + .{ 0x1E21, 0x1E21 }, 570 + .{ 0x1E23, 0x1E23 }, 571 + .{ 0x1E25, 0x1E25 }, 572 + .{ 0x1E27, 0x1E27 }, 573 + .{ 0x1E29, 0x1E29 }, 574 + .{ 0x1E2B, 0x1E2B }, 575 + .{ 0x1E2D, 0x1E2D }, 576 + .{ 0x1E2F, 0x1E2F }, 577 + .{ 0x1E31, 0x1E31 }, 578 + .{ 0x1E33, 0x1E33 }, 579 + .{ 0x1E35, 0x1E35 }, 580 + .{ 0x1E37, 0x1E37 }, 581 + .{ 0x1E39, 0x1E39 }, 582 + .{ 0x1E3B, 0x1E3B }, 583 + .{ 0x1E3D, 0x1E3D }, 584 + .{ 0x1E3F, 0x1E3F }, 585 + .{ 0x1E41, 0x1E41 }, 586 + .{ 0x1E43, 0x1E43 }, 587 + .{ 0x1E45, 0x1E45 }, 588 + .{ 0x1E47, 0x1E47 }, 589 + .{ 0x1E49, 0x1E49 }, 590 + .{ 0x1E4B, 0x1E4B }, 591 + .{ 0x1E4D, 0x1E4D }, 592 + .{ 
0x1E4F, 0x1E4F }, 593 + .{ 0x1E51, 0x1E51 }, 594 + .{ 0x1E53, 0x1E53 }, 595 + .{ 0x1E55, 0x1E55 }, 596 + .{ 0x1E57, 0x1E57 }, 597 + .{ 0x1E59, 0x1E59 }, 598 + .{ 0x1E5B, 0x1E5B }, 599 + .{ 0x1E5D, 0x1E5D }, 600 + .{ 0x1E5F, 0x1E5F }, 601 + .{ 0x1E61, 0x1E61 }, 602 + .{ 0x1E63, 0x1E63 }, 603 + .{ 0x1E65, 0x1E65 }, 604 + .{ 0x1E67, 0x1E67 }, 605 + .{ 0x1E69, 0x1E69 }, 606 + .{ 0x1E6B, 0x1E6B }, 607 + .{ 0x1E6D, 0x1E6D }, 608 + .{ 0x1E6F, 0x1E6F }, 609 + .{ 0x1E71, 0x1E71 }, 610 + .{ 0x1E73, 0x1E73 }, 611 + .{ 0x1E75, 0x1E75 }, 612 + .{ 0x1E77, 0x1E77 }, 613 + .{ 0x1E79, 0x1E79 }, 614 + .{ 0x1E7B, 0x1E7B }, 615 + .{ 0x1E7D, 0x1E7D }, 616 + .{ 0x1E7F, 0x1E7F }, 617 + .{ 0x1E81, 0x1E81 }, 618 + .{ 0x1E83, 0x1E83 }, 619 + .{ 0x1E85, 0x1E85 }, 620 + .{ 0x1E87, 0x1E87 }, 621 + .{ 0x1E89, 0x1E89 }, 622 + .{ 0x1E8B, 0x1E8B }, 623 + .{ 0x1E8D, 0x1E8D }, 624 + .{ 0x1E8F, 0x1E8F }, 625 + .{ 0x1E91, 0x1E91 }, 626 + .{ 0x1E93, 0x1E93 }, 627 + .{ 0x1E95, 0x1E9D }, 628 + .{ 0x1E9F, 0x1E9F }, 629 + .{ 0x1EA1, 0x1EA1 }, 630 + .{ 0x1EA3, 0x1EA3 }, 631 + .{ 0x1EA5, 0x1EA5 }, 632 + .{ 0x1EA7, 0x1EA7 }, 633 + .{ 0x1EA9, 0x1EA9 }, 634 + .{ 0x1EAB, 0x1EAB }, 635 + .{ 0x1EAD, 0x1EAD }, 636 + .{ 0x1EAF, 0x1EAF }, 637 + .{ 0x1EB1, 0x1EB1 }, 638 + .{ 0x1EB3, 0x1EB3 }, 639 + .{ 0x1EB5, 0x1EB5 }, 640 + .{ 0x1EB7, 0x1EB7 }, 641 + .{ 0x1EB9, 0x1EB9 }, 642 + .{ 0x1EBB, 0x1EBB }, 643 + .{ 0x1EBD, 0x1EBD }, 644 + .{ 0x1EBF, 0x1EBF }, 645 + .{ 0x1EC1, 0x1EC1 }, 646 + .{ 0x1EC3, 0x1EC3 }, 647 + .{ 0x1EC5, 0x1EC5 }, 648 + .{ 0x1EC7, 0x1EC7 }, 649 + .{ 0x1EC9, 0x1EC9 }, 650 + .{ 0x1ECB, 0x1ECB }, 651 + .{ 0x1ECD, 0x1ECD }, 652 + .{ 0x1ECF, 0x1ECF }, 653 + .{ 0x1ED1, 0x1ED1 }, 654 + .{ 0x1ED3, 0x1ED3 }, 655 + .{ 0x1ED5, 0x1ED5 }, 656 + .{ 0x1ED7, 0x1ED7 }, 657 + .{ 0x1ED9, 0x1ED9 }, 658 + .{ 0x1EDB, 0x1EDB }, 659 + .{ 0x1EDD, 0x1EDD }, 660 + .{ 0x1EDF, 0x1EDF }, 661 + .{ 0x1EE1, 0x1EE1 }, 662 + .{ 0x1EE3, 0x1EE3 }, 663 + .{ 0x1EE5, 0x1EE5 }, 664 + .{ 0x1EE7, 0x1EE7 }, 665 + .{ 0x1EE9, 0x1EE9 }, 666 + .{ 
0x1EEB, 0x1EEB }, 667 + .{ 0x1EED, 0x1EED }, 668 + .{ 0x1EEF, 0x1EEF }, 669 + .{ 0x1EF1, 0x1EF1 }, 670 + .{ 0x1EF3, 0x1EF3 }, 671 + .{ 0x1EF5, 0x1EF5 }, 672 + .{ 0x1EF7, 0x1EF7 }, 673 + .{ 0x1EF9, 0x1EF9 }, 674 + .{ 0x1EFB, 0x1EFB }, 675 + .{ 0x1EFD, 0x1EFD }, 676 + .{ 0x1EFF, 0x1EFF }, 677 + .{ 0x2018, 0x201A }, 678 + .{ 0x201C, 0x201E }, 679 + .{ 0x2026, 0x2026 }, 680 + .{ 0x2329, 0x232A }, 681 + .{ 0x27E6, 0x27E7 }, 682 + .{ 0x2C61, 0x2C61 }, 683 + .{ 0x2C65, 0x2C66 }, 684 + .{ 0x2C68, 0x2C68 }, 685 + .{ 0x2C6A, 0x2C6A }, 686 + .{ 0x2C6C, 0x2C6C }, 687 + .{ 0x2C71, 0x2C71 }, 688 + .{ 0x2C73, 0x2C74 }, 689 + .{ 0x2C76, 0x2C7B }, 690 + .{ 0x2E80, 0x2FDF }, 691 + .{ 0x2FF0, 0x30FF }, 692 + .{ 0x31C0, 0x31EF }, 693 + .{ 0x3200, 0x4DBF }, 694 + .{ 0x4E00, 0x9FFF }, 695 + .{ 0xA723, 0xA723 }, 696 + .{ 0xA725, 0xA725 }, 697 + .{ 0xA727, 0xA727 }, 698 + .{ 0xA729, 0xA729 }, 699 + .{ 0xA72B, 0xA72B }, 700 + .{ 0xA72D, 0xA72D }, 701 + .{ 0xA72F, 0xA731 }, 702 + .{ 0xA733, 0xA733 }, 703 + .{ 0xA735, 0xA735 }, 704 + .{ 0xA737, 0xA737 }, 705 + .{ 0xA739, 0xA739 }, 706 + .{ 0xA73B, 0xA73B }, 707 + .{ 0xA73D, 0xA73D }, 708 + .{ 0xA73F, 0xA73F }, 709 + .{ 0xA741, 0xA741 }, 710 + .{ 0xA743, 0xA743 }, 711 + .{ 0xA745, 0xA745 }, 712 + .{ 0xA747, 0xA747 }, 713 + .{ 0xA749, 0xA749 }, 714 + .{ 0xA74B, 0xA74B }, 715 + .{ 0xA74D, 0xA74D }, 716 + .{ 0xA74F, 0xA74F }, 717 + .{ 0xA751, 0xA751 }, 718 + .{ 0xA753, 0xA753 }, 719 + .{ 0xA755, 0xA755 }, 720 + .{ 0xA757, 0xA757 }, 721 + .{ 0xA759, 0xA759 }, 722 + .{ 0xA75B, 0xA75B }, 723 + .{ 0xA75D, 0xA75D }, 724 + .{ 0xA75F, 0xA75F }, 725 + .{ 0xA761, 0xA761 }, 726 + .{ 0xA763, 0xA763 }, 727 + .{ 0xA765, 0xA765 }, 728 + .{ 0xA767, 0xA767 }, 729 + .{ 0xA769, 0xA769 }, 730 + .{ 0xA76B, 0xA76B }, 731 + .{ 0xA76D, 0xA76D }, 732 + .{ 0xA76F, 0xA76F }, 733 + .{ 0xA771, 0xA778 }, 734 + .{ 0xA77A, 0xA77A }, 735 + .{ 0xA77C, 0xA77C }, 736 + .{ 0xA77F, 0xA77F }, 737 + .{ 0xA781, 0xA781 }, 738 + .{ 0xA783, 0xA783 }, 739 + .{ 0xA785, 0xA785 }, 740 + .{ 
0xA787, 0xA787 }, 741 + .{ 0xA78C, 0xA78C }, 742 + .{ 0xA78E, 0xA78E }, 743 + .{ 0xA791, 0xA791 }, 744 + .{ 0xA793, 0xA795 }, 745 + .{ 0xA797, 0xA797 }, 746 + .{ 0xA799, 0xA799 }, 747 + .{ 0xA79B, 0xA79B }, 748 + .{ 0xA79D, 0xA79D }, 749 + .{ 0xA79F, 0xA79F }, 750 + .{ 0xA7A1, 0xA7A1 }, 751 + .{ 0xA7A3, 0xA7A3 }, 752 + .{ 0xA7A5, 0xA7A5 }, 753 + .{ 0xA7A7, 0xA7A7 }, 754 + .{ 0xA7A9, 0xA7A9 }, 755 + .{ 0xA7AF, 0xA7AF }, 756 + .{ 0xA7B5, 0xA7B5 }, 757 + .{ 0xA7B7, 0xA7B7 }, 758 + .{ 0xA7B9, 0xA7B9 }, 759 + .{ 0xA7FA, 0xA7FA }, 760 + .{ 0xAB30, 0xAB5A }, 761 + .{ 0xAB60, 0xAB64 }, 762 + .{ 0xAC00, 0xD7AF }, 763 + .{ 0xF900, 0xFAFF }, 764 + .{ 0xFB1D, 0xFBB1 }, 765 + .{ 0xFBD3, 0xFD3D }, 766 + .{ 0xFD50, 0xFDC7 }, 767 + .{ 0xFDF0, 0xFDFB }, 768 + .{ 0xFE30, 0xFE4F }, 769 + .{ 0xFE70, 0xFEFC }, 770 + .{ 0xFF01, 0xFF01 }, 771 + .{ 0xFF08, 0xFF09 }, 772 + .{ 0xFF0C, 0xFF0C }, 773 + .{ 0xFF1A, 0xFF1B }, 774 + .{ 0xFF1F, 0xFF1F }, 775 + .{ 0xFF41, 0xFF5A }, 776 + .{ 0xFF5E, 0xFF5E }, 777 + .{ 0x1EE00, 0x1EEBB }, 778 + .{ 0x1F200, 0x1F2FF }, 779 + .{ 0x20000, 0x2A6DF }, 780 + .{ 0x2A700, 0x2EBEF }, 781 + .{ 0x2F800, 0x2FA1F }, 782 + }; 783 + 
// matchLookbehind2: tests the codepoint `c` against `lookbehind_class_2_ranges`
// by delegating to `rangeContains`, and returns that result unchanged.
// NOTE(review): `rangeContains` is defined outside this view. Entries such as
// `.{ 0xFF01, 0xFF01 }` repeat one value for both bounds, so each pair is
// presumably an inclusive [first, last] range — confirm against rangeContains.
784 + pub fn matchLookbehind2(c: u21) bool { 785 + return rangeContains(&lookbehind_class_2_ranges, c); 786 + } 787 + 
// lookbehind_class_3_ranges: table of `[2]u21` codepoint pairs consulted by
// `matchLookbehind3`. Entries are listed in ascending codepoint order.
// NOTE(review): the contents (A-Z, U+00C0-U+00DE, alternating Latin Extended
// codepoints, plus whole blocks of caseless scripts) look like an
// "upper-case letter" character class extracted from spaCy — confirm against
// the generator script before editing by hand.
788 + const lookbehind_class_3_ranges = [_][2]u21{ 789 + .{ 0x0041, 0x005A }, 790 + .{ 0x00C0, 0x00D6 }, 791 + .{ 0x00D8, 0x00DE }, 792 + .{ 0x0100, 0x0100 }, 793 + .{ 0x0102, 0x0102 }, 794 + .{ 0x0104, 0x0104 }, 795 + .{ 0x0106, 0x0106 }, 796 + .{ 0x0108, 0x0108 }, 797 + .{ 0x010A, 0x010A }, 798 + .{ 0x010C, 0x010C }, 799 + .{ 0x010E, 0x010E }, 800 + .{ 0x0110, 0x0110 }, 801 + .{ 0x0112, 0x0112 }, 802 + .{ 0x0114, 0x0114 }, 803 + .{ 0x0116, 0x0116 }, 804 + .{ 0x0118, 0x0118 }, 805 + .{ 0x011A, 0x011A }, 806 + .{ 0x011C, 0x011C }, 807 + .{ 0x011E, 0x011E }, 808 + .{ 0x0120, 0x0120 }, 809 + .{ 0x0122, 0x0122 }, 810 + .{ 0x0124, 0x0124 }, 811 + .{ 0x0126, 0x0126 }, 812 + .{ 0x0128, 0x0128 }, 813 + .{ 0x012A, 0x012A }, 814 + 
.{ 0x012C, 0x012C }, 815 + .{ 0x012E, 0x012E }, 816 + .{ 0x0130, 0x0130 }, 817 + .{ 0x0132, 0x0132 }, 818 + .{ 0x0134, 0x0134 }, 819 + .{ 0x0136, 0x0136 }, 820 + .{ 0x0139, 0x0139 }, 821 + .{ 0x013B, 0x013B }, 822 + .{ 0x013D, 0x013D }, 823 + .{ 0x013F, 0x013F }, 824 + .{ 0x0141, 0x0141 }, 825 + .{ 0x0143, 0x0143 }, 826 + .{ 0x0145, 0x0145 }, 827 + .{ 0x0147, 0x0147 }, 828 + .{ 0x014A, 0x014A }, 829 + .{ 0x014C, 0x014C }, 830 + .{ 0x014E, 0x014E }, 831 + .{ 0x0150, 0x0150 }, 832 + .{ 0x0152, 0x0152 }, 833 + .{ 0x0154, 0x0154 }, 834 + .{ 0x0156, 0x0156 }, 835 + .{ 0x0158, 0x0158 }, 836 + .{ 0x015A, 0x015A }, 837 + .{ 0x015C, 0x015C }, 838 + .{ 0x015E, 0x015E }, 839 + .{ 0x0160, 0x0160 }, 840 + .{ 0x0162, 0x0162 }, 841 + .{ 0x0164, 0x0164 }, 842 + .{ 0x0166, 0x0166 }, 843 + .{ 0x0168, 0x0168 }, 844 + .{ 0x016A, 0x016A }, 845 + .{ 0x016C, 0x016C }, 846 + .{ 0x016E, 0x016E }, 847 + .{ 0x0170, 0x0170 }, 848 + .{ 0x0172, 0x0172 }, 849 + .{ 0x0174, 0x0174 }, 850 + .{ 0x0176, 0x0176 }, 851 + .{ 0x0178, 0x0179 }, 852 + .{ 0x017B, 0x017B }, 853 + .{ 0x017D, 0x017D }, 854 + .{ 0x0181, 0x0182 }, 855 + .{ 0x0184, 0x0184 }, 856 + .{ 0x0186, 0x0187 }, 857 + .{ 0x0189, 0x018B }, 858 + .{ 0x018E, 0x0191 }, 859 + .{ 0x0193, 0x0194 }, 860 + .{ 0x0196, 0x0198 }, 861 + .{ 0x019C, 0x019D }, 862 + .{ 0x019F, 0x01A0 }, 863 + .{ 0x01A2, 0x01A2 }, 864 + .{ 0x01A4, 0x01A4 }, 865 + .{ 0x01A6, 0x01A7 }, 866 + .{ 0x01A9, 0x01A9 }, 867 + .{ 0x01AC, 0x01AC }, 868 + .{ 0x01AE, 0x01AF }, 869 + .{ 0x01B1, 0x01B3 }, 870 + .{ 0x01B5, 0x01B5 }, 871 + .{ 0x01B7, 0x01B8 }, 872 + .{ 0x01BC, 0x01BC }, 873 + .{ 0x01C4, 0x01C4 }, 874 + .{ 0x01C7, 0x01C7 }, 875 + .{ 0x01CA, 0x01CA }, 876 + .{ 0x01CD, 0x01CD }, 877 + .{ 0x01CF, 0x01CF }, 878 + .{ 0x01D1, 0x01D1 }, 879 + .{ 0x01D3, 0x01D3 }, 880 + .{ 0x01D5, 0x01D5 }, 881 + .{ 0x01D7, 0x01D7 }, 882 + .{ 0x01D9, 0x01D9 }, 883 + .{ 0x01DB, 0x01DB }, 884 + .{ 0x01DE, 0x01DE }, 885 + .{ 0x01E0, 0x01E0 }, 886 + .{ 0x01E2, 0x01E2 }, 887 + .{ 0x01E4, 0x01E4 }, 888 + 
.{ 0x01E6, 0x01E6 }, 889 + .{ 0x01E8, 0x01E8 }, 890 + .{ 0x01EA, 0x01EA }, 891 + .{ 0x01EC, 0x01EC }, 892 + .{ 0x01EE, 0x01EE }, 893 + .{ 0x01F1, 0x01F1 }, 894 + .{ 0x01F4, 0x01F4 }, 895 + .{ 0x01F6, 0x01F8 }, 896 + .{ 0x01FA, 0x01FA }, 897 + .{ 0x01FC, 0x01FC }, 898 + .{ 0x01FE, 0x01FE }, 899 + .{ 0x0200, 0x0200 }, 900 + .{ 0x0202, 0x0202 }, 901 + .{ 0x0204, 0x0204 }, 902 + .{ 0x0206, 0x0206 }, 903 + .{ 0x0208, 0x0208 }, 904 + .{ 0x020A, 0x020A }, 905 + .{ 0x020C, 0x020C }, 906 + .{ 0x020E, 0x020E }, 907 + .{ 0x0210, 0x0210 }, 908 + .{ 0x0212, 0x0212 }, 909 + .{ 0x0214, 0x0214 }, 910 + .{ 0x0216, 0x0216 }, 911 + .{ 0x0218, 0x0218 }, 912 + .{ 0x021A, 0x021A }, 913 + .{ 0x021C, 0x021C }, 914 + .{ 0x021E, 0x021E }, 915 + .{ 0x0220, 0x0220 }, 916 + .{ 0x0222, 0x0222 }, 917 + .{ 0x0224, 0x0224 }, 918 + .{ 0x0226, 0x0226 }, 919 + .{ 0x0228, 0x0228 }, 920 + .{ 0x022A, 0x022A }, 921 + .{ 0x022C, 0x022C }, 922 + .{ 0x022E, 0x022E }, 923 + .{ 0x0230, 0x0230 }, 924 + .{ 0x0232, 0x0232 }, 925 + .{ 0x023A, 0x023B }, 926 + .{ 0x023D, 0x023E }, 927 + .{ 0x0241, 0x0241 }, 928 + .{ 0x0243, 0x0246 }, 929 + .{ 0x0248, 0x0248 }, 930 + .{ 0x024A, 0x024A }, 931 + .{ 0x024C, 0x024C }, 932 + .{ 0x024E, 0x024E }, 933 + .{ 0x0386, 0x0386 }, 934 + .{ 0x0388, 0x038A }, 935 + .{ 0x038C, 0x038C }, 936 + .{ 0x038E, 0x038F }, 937 + .{ 0x0391, 0x03A9 }, 938 + .{ 0x0400, 0x0401 }, 939 + .{ 0x0403, 0x040A }, 940 + .{ 0x040C, 0x040D }, 941 + .{ 0x0410, 0x042F }, 942 + .{ 0x0490, 0x0490 }, 943 + .{ 0x0496, 0x0496 }, 944 + .{ 0x04A2, 0x04A2 }, 945 + .{ 0x04AE, 0x04AE }, 946 + .{ 0x04BA, 0x04BA }, 947 + .{ 0x04D8, 0x04D8 }, 948 + .{ 0x04E8, 0x04E8 }, 949 + .{ 0x0591, 0x05F4 }, 950 + .{ 0x0620, 0x064A }, 951 + .{ 0x066E, 0x06D5 }, 952 + .{ 0x06E5, 0x06FF }, 953 + .{ 0x0750, 0x077F }, 954 + .{ 0x08A0, 0x08BD }, 955 + .{ 0x0900, 0x09FF }, 956 + .{ 0x0B80, 0x0CFF }, 957 + .{ 0x0D80, 0x0DFF }, 958 + .{ 0x1100, 0x137F }, 959 + .{ 0x1E00, 0x1E00 }, 960 + .{ 0x1E02, 0x1E02 }, 961 + .{ 0x1E04, 0x1E04 }, 962 + 
.{ 0x1E06, 0x1E06 }, 963 + .{ 0x1E08, 0x1E08 }, 964 + .{ 0x1E0A, 0x1E0A }, 965 + .{ 0x1E0C, 0x1E0C }, 966 + .{ 0x1E0E, 0x1E0E }, 967 + .{ 0x1E10, 0x1E10 }, 968 + .{ 0x1E12, 0x1E12 }, 969 + .{ 0x1E14, 0x1E14 }, 970 + .{ 0x1E16, 0x1E16 }, 971 + .{ 0x1E18, 0x1E18 }, 972 + .{ 0x1E1A, 0x1E1A }, 973 + .{ 0x1E1C, 0x1E1C }, 974 + .{ 0x1E1E, 0x1E1E }, 975 + .{ 0x1E20, 0x1E20 }, 976 + .{ 0x1E22, 0x1E22 }, 977 + .{ 0x1E24, 0x1E24 }, 978 + .{ 0x1E26, 0x1E26 }, 979 + .{ 0x1E28, 0x1E28 }, 980 + .{ 0x1E2A, 0x1E2A }, 981 + .{ 0x1E2C, 0x1E2C }, 982 + .{ 0x1E2E, 0x1E2E }, 983 + .{ 0x1E30, 0x1E30 }, 984 + .{ 0x1E32, 0x1E32 }, 985 + .{ 0x1E34, 0x1E34 }, 986 + .{ 0x1E36, 0x1E36 }, 987 + .{ 0x1E38, 0x1E38 }, 988 + .{ 0x1E3A, 0x1E3A }, 989 + .{ 0x1E3C, 0x1E3C }, 990 + .{ 0x1E3E, 0x1E3E }, 991 + .{ 0x1E40, 0x1E40 }, 992 + .{ 0x1E42, 0x1E42 }, 993 + .{ 0x1E44, 0x1E44 }, 994 + .{ 0x1E46, 0x1E46 }, 995 + .{ 0x1E48, 0x1E48 }, 996 + .{ 0x1E4A, 0x1E4A }, 997 + .{ 0x1E4C, 0x1E4C }, 998 + .{ 0x1E4E, 0x1E4E }, 999 + .{ 0x1E50, 0x1E50 }, 1000 + .{ 0x1E52, 0x1E52 }, 1001 + .{ 0x1E54, 0x1E54 }, 1002 + .{ 0x1E56, 0x1E56 }, 1003 + .{ 0x1E58, 0x1E58 }, 1004 + .{ 0x1E5A, 0x1E5A }, 1005 + .{ 0x1E5C, 0x1E5C }, 1006 + .{ 0x1E5E, 0x1E5E }, 1007 + .{ 0x1E60, 0x1E60 }, 1008 + .{ 0x1E62, 0x1E62 }, 1009 + .{ 0x1E64, 0x1E64 }, 1010 + .{ 0x1E66, 0x1E66 }, 1011 + .{ 0x1E68, 0x1E68 }, 1012 + .{ 0x1E6A, 0x1E6A }, 1013 + .{ 0x1E6C, 0x1E6C }, 1014 + .{ 0x1E6E, 0x1E6E }, 1015 + .{ 0x1E70, 0x1E70 }, 1016 + .{ 0x1E72, 0x1E72 }, 1017 + .{ 0x1E74, 0x1E74 }, 1018 + .{ 0x1E76, 0x1E76 }, 1019 + .{ 0x1E78, 0x1E78 }, 1020 + .{ 0x1E7A, 0x1E7A }, 1021 + .{ 0x1E7C, 0x1E7C }, 1022 + .{ 0x1E7E, 0x1E7E }, 1023 + .{ 0x1E80, 0x1E80 }, 1024 + .{ 0x1E82, 0x1E82 }, 1025 + .{ 0x1E84, 0x1E84 }, 1026 + .{ 0x1E86, 0x1E86 }, 1027 + .{ 0x1E88, 0x1E88 }, 1028 + .{ 0x1E8A, 0x1E8A }, 1029 + .{ 0x1E8C, 0x1E8C }, 1030 + .{ 0x1E8E, 0x1E8E }, 1031 + .{ 0x1E90, 0x1E90 }, 1032 + .{ 0x1E92, 0x1E92 }, 1033 + .{ 0x1E94, 0x1E94 }, 1034 + .{ 0x1E9E, 0x1E9E }, 
1035 + .{ 0x1EA0, 0x1EA0 }, 1036 + .{ 0x1EA2, 0x1EA2 }, 1037 + .{ 0x1EA4, 0x1EA4 }, 1038 + .{ 0x1EA6, 0x1EA6 }, 1039 + .{ 0x1EA8, 0x1EA8 }, 1040 + .{ 0x1EAA, 0x1EAA }, 1041 + .{ 0x1EAC, 0x1EAC }, 1042 + .{ 0x1EAE, 0x1EAE }, 1043 + .{ 0x1EB0, 0x1EB0 }, 1044 + .{ 0x1EB2, 0x1EB2 }, 1045 + .{ 0x1EB4, 0x1EB4 }, 1046 + .{ 0x1EB6, 0x1EB6 }, 1047 + .{ 0x1EB8, 0x1EB8 }, 1048 + .{ 0x1EBA, 0x1EBA }, 1049 + .{ 0x1EBC, 0x1EBC }, 1050 + .{ 0x1EBE, 0x1EBE }, 1051 + .{ 0x1EC0, 0x1EC0 }, 1052 + .{ 0x1EC2, 0x1EC2 }, 1053 + .{ 0x1EC4, 0x1EC4 }, 1054 + .{ 0x1EC6, 0x1EC6 }, 1055 + .{ 0x1EC8, 0x1EC8 }, 1056 + .{ 0x1ECA, 0x1ECA }, 1057 + .{ 0x1ECC, 0x1ECC }, 1058 + .{ 0x1ECE, 0x1ECE }, 1059 + .{ 0x1ED0, 0x1ED0 }, 1060 + .{ 0x1ED2, 0x1ED2 }, 1061 + .{ 0x1ED4, 0x1ED4 }, 1062 + .{ 0x1ED6, 0x1ED6 }, 1063 + .{ 0x1ED8, 0x1ED8 }, 1064 + .{ 0x1EDA, 0x1EDA }, 1065 + .{ 0x1EDC, 0x1EDC }, 1066 + .{ 0x1EDE, 0x1EDE }, 1067 + .{ 0x1EE0, 0x1EE0 }, 1068 + .{ 0x1EE2, 0x1EE2 }, 1069 + .{ 0x1EE4, 0x1EE4 }, 1070 + .{ 0x1EE6, 0x1EE6 }, 1071 + .{ 0x1EE8, 0x1EE8 }, 1072 + .{ 0x1EEA, 0x1EEA }, 1073 + .{ 0x1EEC, 0x1EEC }, 1074 + .{ 0x1EEE, 0x1EEE }, 1075 + .{ 0x1EF0, 0x1EF0 }, 1076 + .{ 0x1EF2, 0x1EF2 }, 1077 + .{ 0x1EF4, 0x1EF4 }, 1078 + .{ 0x1EF6, 0x1EF6 }, 1079 + .{ 0x1EF8, 0x1EF8 }, 1080 + .{ 0x1EFA, 0x1EFA }, 1081 + .{ 0x1EFC, 0x1EFC }, 1082 + .{ 0x1EFE, 0x1EFE }, 1083 + .{ 0x2C60, 0x2C60 }, 1084 + .{ 0x2C62, 0x2C64 }, 1085 + .{ 0x2C67, 0x2C67 }, 1086 + .{ 0x2C69, 0x2C69 }, 1087 + .{ 0x2C6B, 0x2C6B }, 1088 + .{ 0x2C6D, 0x2C70 }, 1089 + .{ 0x2C72, 0x2C72 }, 1090 + .{ 0x2C75, 0x2C75 }, 1091 + .{ 0x2C7E, 0x2C7F }, 1092 + .{ 0x2E80, 0x2FDF }, 1093 + .{ 0x2FF0, 0x30FF }, 1094 + .{ 0x31C0, 0x31EF }, 1095 + .{ 0x3200, 0x4DBF }, 1096 + .{ 0x4E00, 0x9FFF }, 1097 + .{ 0xA722, 0xA722 }, 1098 + .{ 0xA724, 0xA724 }, 1099 + .{ 0xA726, 0xA726 }, 1100 + .{ 0xA728, 0xA728 }, 1101 + .{ 0xA72A, 0xA72A }, 1102 + .{ 0xA72C, 0xA72C }, 1103 + .{ 0xA72E, 0xA72E }, 1104 + .{ 0xA732, 0xA732 }, 1105 + .{ 0xA734, 0xA734 }, 1106 + .{ 
0xA736, 0xA736 }, 1107 + .{ 0xA738, 0xA738 }, 1108 + .{ 0xA73A, 0xA73A }, 1109 + .{ 0xA73C, 0xA73C }, 1110 + .{ 0xA73E, 0xA73E }, 1111 + .{ 0xA740, 0xA740 }, 1112 + .{ 0xA742, 0xA742 }, 1113 + .{ 0xA744, 0xA744 }, 1114 + .{ 0xA746, 0xA746 }, 1115 + .{ 0xA748, 0xA748 }, 1116 + .{ 0xA74A, 0xA74A }, 1117 + .{ 0xA74C, 0xA74C }, 1118 + .{ 0xA74E, 0xA74E }, 1119 + .{ 0xA750, 0xA750 }, 1120 + .{ 0xA752, 0xA752 }, 1121 + .{ 0xA754, 0xA754 }, 1122 + .{ 0xA756, 0xA756 }, 1123 + .{ 0xA758, 0xA758 }, 1124 + .{ 0xA75A, 0xA75A }, 1125 + .{ 0xA75C, 0xA75C }, 1126 + .{ 0xA75E, 0xA75E }, 1127 + .{ 0xA760, 0xA760 }, 1128 + .{ 0xA762, 0xA762 }, 1129 + .{ 0xA764, 0xA764 }, 1130 + .{ 0xA766, 0xA766 }, 1131 + .{ 0xA768, 0xA768 }, 1132 + .{ 0xA76A, 0xA76A }, 1133 + .{ 0xA76C, 0xA76C }, 1134 + .{ 0xA76E, 0xA76E }, 1135 + .{ 0xA779, 0xA779 }, 1136 + .{ 0xA77B, 0xA77B }, 1137 + .{ 0xA77D, 0xA77E }, 1138 + .{ 0xA780, 0xA780 }, 1139 + .{ 0xA782, 0xA782 }, 1140 + .{ 0xA784, 0xA784 }, 1141 + .{ 0xA786, 0xA786 }, 1142 + .{ 0xA78B, 0xA78B }, 1143 + .{ 0xA78D, 0xA78D }, 1144 + .{ 0xA790, 0xA790 }, 1145 + .{ 0xA792, 0xA792 }, 1146 + .{ 0xA796, 0xA796 }, 1147 + .{ 0xA798, 0xA798 }, 1148 + .{ 0xA79A, 0xA79A }, 1149 + .{ 0xA79C, 0xA79C }, 1150 + .{ 0xA79E, 0xA79E }, 1151 + .{ 0xA7A0, 0xA7A0 }, 1152 + .{ 0xA7A2, 0xA7A2 }, 1153 + .{ 0xA7A4, 0xA7A4 }, 1154 + .{ 0xA7A6, 0xA7A6 }, 1155 + .{ 0xA7A8, 0xA7A8 }, 1156 + .{ 0xA7AA, 0xA7AE }, 1157 + .{ 0xA7B0, 0xA7B4 }, 1158 + .{ 0xA7B6, 0xA7B6 }, 1159 + .{ 0xA7B8, 0xA7B8 }, 1160 + .{ 0xAC00, 0xD7AF }, 1161 + .{ 0xF900, 0xFAFF }, 1162 + .{ 0xFB1D, 0xFBB1 }, 1163 + .{ 0xFBD3, 0xFD3D }, 1164 + .{ 0xFD50, 0xFDC7 }, 1165 + .{ 0xFDF0, 0xFDFB }, 1166 + .{ 0xFE30, 0xFE4F }, 1167 + .{ 0xFE70, 0xFEFC }, 1168 + .{ 0xFF21, 0xFF3A }, 1169 + .{ 0x1EE00, 0x1EEBB }, 1170 + .{ 0x1F200, 0x1F2FF }, 1171 + .{ 0x20000, 0x2A6DF }, 1172 + .{ 0x2A700, 0x2EBEF }, 1173 + .{ 0x2F800, 0x2FA1F }, 1174 + }; 1175 + 
// matchLookbehind3: tests the codepoint `c` against `lookbehind_class_3_ranges`
// by delegating to `rangeContains`, and returns that result unchanged.
// NOTE(review): `rangeContains` is defined outside this view; the table's
// single-codepoint entries repeat one value for both bounds, so membership is
// presumably an inclusive [first, last] test — confirm against rangeContains.
1176 + pub fn matchLookbehind3(c: u21) bool { 1177 + return 
rangeContains(&lookbehind_class_3_ranges, c); 1178 + } 1179 + 1180 + // ── suffix lookbehind rules ── 1181 + // these are checked by tokenizer.zig matchSuffix() 1182 + // format: for each rule, check behind condition then try suffix literal(s) 1183 + 1184 + pub fn matchSuffixLookbehind(text: []const u8) usize { 1185 + if (text.len < 2) return 0; 1186 + 1187 + if (std.mem.endsWith(u8, text, "+") and text.len > 1) { 1188 + const before = lastCodepoint(text[0 .. text.len - 1]); 1189 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1190 + } 1191 + if (std.mem.endsWith(u8, text, ".") and text.len > 1) { 1192 + const b1 = lastCodepoint(text[0 .. text.len - 1]); 1193 + if (b1) |bp1| { 1194 + const b2 = lastCodepoint(text[0 .. text.len - 1 - bp1.len]); 1195 + if (matchLookbehind1(bp1.value)) { 1196 + if (b2) |b2p| { 1197 + if (b2p.value == 0x00B0) return 1; 1198 + } 1199 + } 1200 + } 1201 + } 1202 + if (std.mem.endsWith(u8, text, "\xe2\x82\xac") and text.len > 3) { 1203 + const before = lastCodepoint(text[0 .. text.len - 3]); 1204 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1205 + } 1206 + if (std.mem.endsWith(u8, text, "\xe0\xb8\xbf") and text.len > 3) { 1207 + const before = lastCodepoint(text[0 .. text.len - 3]); 1208 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1209 + } 1210 + if (std.mem.endsWith(u8, text, "US$") and text.len > 3) { 1211 + const before = lastCodepoint(text[0 .. text.len - 3]); 1212 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1213 + } 1214 + if (std.mem.endsWith(u8, text, "\xe2\x82\xbd") and text.len > 3) { 1215 + const before = lastCodepoint(text[0 .. text.len - 3]); 1216 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1217 + } 1218 + if (std.mem.endsWith(u8, text, "\xef\xb7\xbc") and text.len > 3) { 1219 + const before = lastCodepoint(text[0 .. 
text.len - 3]); 1220 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1221 + } 1222 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb4") and text.len > 3) { 1223 + const before = lastCodepoint(text[0 .. text.len - 3]); 1224 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1225 + } 1226 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa0") and text.len > 3) { 1227 + const before = lastCodepoint(text[0 .. text.len - 3]); 1228 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1229 + } 1230 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa1") and text.len > 3) { 1231 + const before = lastCodepoint(text[0 .. text.len - 3]); 1232 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1233 + } 1234 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa2") and text.len > 3) { 1235 + const before = lastCodepoint(text[0 .. text.len - 3]); 1236 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1237 + } 1238 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa3") and text.len > 3) { 1239 + const before = lastCodepoint(text[0 .. text.len - 3]); 1240 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1241 + } 1242 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa4") and text.len > 3) { 1243 + const before = lastCodepoint(text[0 .. text.len - 3]); 1244 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1245 + } 1246 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa5") and text.len > 3) { 1247 + const before = lastCodepoint(text[0 .. text.len - 3]); 1248 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1249 + } 1250 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa6") and text.len > 3) { 1251 + const before = lastCodepoint(text[0 .. text.len - 3]); 1252 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1253 + } 1254 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa7") and text.len > 3) { 1255 + const before = lastCodepoint(text[0 .. 
text.len - 3]); 1256 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1257 + } 1258 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa8") and text.len > 3) { 1259 + const before = lastCodepoint(text[0 .. text.len - 3]); 1260 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1261 + } 1262 + if (std.mem.endsWith(u8, text, "\xe2\x82\xa9") and text.len > 3) { 1263 + const before = lastCodepoint(text[0 .. text.len - 3]); 1264 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1265 + } 1266 + if (std.mem.endsWith(u8, text, "\xe2\x82\xaa") and text.len > 3) { 1267 + const before = lastCodepoint(text[0 .. text.len - 3]); 1268 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1269 + } 1270 + if (std.mem.endsWith(u8, text, "\xe2\x82\xab") and text.len > 3) { 1271 + const before = lastCodepoint(text[0 .. text.len - 3]); 1272 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1273 + } 1274 + if (std.mem.endsWith(u8, text, "\xe2\x82\xac") and text.len > 3) { 1275 + const before = lastCodepoint(text[0 .. text.len - 3]); 1276 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1277 + } 1278 + if (std.mem.endsWith(u8, text, "\xe2\x82\xad") and text.len > 3) { 1279 + const before = lastCodepoint(text[0 .. text.len - 3]); 1280 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1281 + } 1282 + if (std.mem.endsWith(u8, text, "\xe2\x82\xae") and text.len > 3) { 1283 + const before = lastCodepoint(text[0 .. text.len - 3]); 1284 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1285 + } 1286 + if (std.mem.endsWith(u8, text, "\xe2\x82\xaf") and text.len > 3) { 1287 + const before = lastCodepoint(text[0 .. text.len - 3]); 1288 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1289 + } 1290 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb0") and text.len > 3) { 1291 + const before = lastCodepoint(text[0 .. 
text.len - 3]); 1292 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1293 + } 1294 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb1") and text.len > 3) { 1295 + const before = lastCodepoint(text[0 .. text.len - 3]); 1296 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1297 + } 1298 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb2") and text.len > 3) { 1299 + const before = lastCodepoint(text[0 .. text.len - 3]); 1300 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1301 + } 1302 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb3") and text.len > 3) { 1303 + const before = lastCodepoint(text[0 .. text.len - 3]); 1304 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1305 + } 1306 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb4") and text.len > 3) { 1307 + const before = lastCodepoint(text[0 .. text.len - 3]); 1308 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1309 + } 1310 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb5") and text.len > 3) { 1311 + const before = lastCodepoint(text[0 .. text.len - 3]); 1312 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1313 + } 1314 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb6") and text.len > 3) { 1315 + const before = lastCodepoint(text[0 .. text.len - 3]); 1316 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1317 + } 1318 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb7") and text.len > 3) { 1319 + const before = lastCodepoint(text[0 .. text.len - 3]); 1320 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1321 + } 1322 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb8") and text.len > 3) { 1323 + const before = lastCodepoint(text[0 .. text.len - 3]); 1324 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1325 + } 1326 + if (std.mem.endsWith(u8, text, "\xe2\x82\xb9") and text.len > 3) { 1327 + const before = lastCodepoint(text[0 .. 
text.len - 3]); 1328 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1329 + } 1330 + if (std.mem.endsWith(u8, text, "\xe2\x82\xba") and text.len > 3) { 1331 + const before = lastCodepoint(text[0 .. text.len - 3]); 1332 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1333 + } 1334 + if (std.mem.endsWith(u8, text, "\xe2\x82\xbb") and text.len > 3) { 1335 + const before = lastCodepoint(text[0 .. text.len - 3]); 1336 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1337 + } 1338 + if (std.mem.endsWith(u8, text, "\xe2\x82\xbc") and text.len > 3) { 1339 + const before = lastCodepoint(text[0 .. text.len - 3]); 1340 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1341 + } 1342 + if (std.mem.endsWith(u8, text, "\xe2\x82\xbd") and text.len > 3) { 1343 + const before = lastCodepoint(text[0 .. text.len - 3]); 1344 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1345 + } 1346 + if (std.mem.endsWith(u8, text, "\xe2\x82\xbe") and text.len > 3) { 1347 + const before = lastCodepoint(text[0 .. text.len - 3]); 1348 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1349 + } 1350 + if (std.mem.endsWith(u8, text, "\xe2\x82\xbf") and text.len > 3) { 1351 + const before = lastCodepoint(text[0 .. text.len - 3]); 1352 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1353 + } 1354 + if (std.mem.endsWith(u8, text, "\xc2\xa3") and text.len > 2) { 1355 + const before = lastCodepoint(text[0 .. text.len - 2]); 1356 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1357 + } 1358 + if (std.mem.endsWith(u8, text, "\xc2\xa5") and text.len > 2) { 1359 + const before = lastCodepoint(text[0 .. text.len - 2]); 1360 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1361 + } 1362 + if (std.mem.endsWith(u8, text, "C$") and text.len > 2) { 1363 + const before = lastCodepoint(text[0 .. 
text.len - 2]); 1364 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1365 + } 1366 + if (std.mem.endsWith(u8, text, "A$") and text.len > 2) { 1367 + const before = lastCodepoint(text[0 .. text.len - 2]); 1368 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1369 + } 1370 + if (std.mem.endsWith(u8, text, "$") and text.len > 1) { 1371 + const before = lastCodepoint(text[0 .. text.len - 1]); 1372 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1373 + } 1374 + if (std.mem.endsWith(u8, text, "\xd8\xa7\xd9\x83\xd9\x88\xd8\xa7\xd8\xa8") and text.len > 10) { 1375 + const before = lastCodepoint(text[0 .. text.len - 10]); 1376 + if (before != null and matchLookbehind0(before.?.value)) return 10; 1377 + } 1378 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xd0\xb1\xd0\xb0\xd1\x80") and text.len > 8) { 1379 + const before = lastCodepoint(text[0 .. text.len - 8]); 1380 + if (before != null and matchLookbehind0(before.?.value)) return 8; 1381 + } 1382 + if (std.mem.endsWith(u8, text, "\xd1\x82\xd0\xb1\xd9\x83\xd9\x85") and text.len > 8) { 1383 + const before = lastCodepoint(text[0 .. text.len - 8]); 1384 + if (before != null and matchLookbehind0(before.?.value)) return 8; 1385 + } 1386 + if (std.mem.endsWith(u8, text, "\xd8\xba\xd8\xb1\xd8\xa7\xd9\x85") and text.len > 8) { 1387 + const before = lastCodepoint(text[0 .. text.len - 8]); 1388 + if (before != null and matchLookbehind0(before.?.value)) return 8; 1389 + } 1390 + if (std.mem.endsWith(u8, text, "\xd8\xac\xd8\xb1\xd8\xa7\xd9\x85") and text.len > 8) { 1391 + const before = lastCodepoint(text[0 .. text.len - 8]); 1392 + if (before != null and matchLookbehind0(before.?.value)) return 8; 1393 + } 1394 + if (std.mem.endsWith(u8, text, "\xd0\xba\xd0\xbc/\xd1\x87") and text.len > 7) { 1395 + const before = lastCodepoint(text[0 .. 
text.len - 7]); 1396 + if (before != null and matchLookbehind0(before.?.value)) return 7; 1397 + } 1398 + if (std.mem.endsWith(u8, text, "\xd0\xba\xd0\xbc\xc2\xb2") and text.len > 6) { 1399 + const before = lastCodepoint(text[0 .. text.len - 6]); 1400 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1401 + } 1402 + if (std.mem.endsWith(u8, text, "\xd0\xba\xd0\xbc\xc2\xb3") and text.len > 6) { 1403 + const before = lastCodepoint(text[0 .. text.len - 6]); 1404 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1405 + } 1406 + if (std.mem.endsWith(u8, text, "\xd0\xb4\xd0\xbc\xc2\xb2") and text.len > 6) { 1407 + const before = lastCodepoint(text[0 .. text.len - 6]); 1408 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1409 + } 1410 + if (std.mem.endsWith(u8, text, "\xd0\xb4\xd0\xbc\xc2\xb3") and text.len > 6) { 1411 + const before = lastCodepoint(text[0 .. text.len - 6]); 1412 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1413 + } 1414 + if (std.mem.endsWith(u8, text, "\xd1\x81\xd0\xbc\xc2\xb2") and text.len > 6) { 1415 + const before = lastCodepoint(text[0 .. text.len - 6]); 1416 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1417 + } 1418 + if (std.mem.endsWith(u8, text, "\xd1\x81\xd0\xbc\xc2\xb3") and text.len > 6) { 1419 + const before = lastCodepoint(text[0 .. text.len - 6]); 1420 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1421 + } 1422 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xd0\xbc\xc2\xb2") and text.len > 6) { 1423 + const before = lastCodepoint(text[0 .. text.len - 6]); 1424 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1425 + } 1426 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xd0\xbc\xc2\xb3") and text.len > 6) { 1427 + const before = lastCodepoint(text[0 .. 
text.len - 6]); 1428 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1429 + } 1430 + if (std.mem.endsWith(u8, text, "\xd0\xba\xd0\x9f\xd0\xb0") and text.len > 6) { 1431 + const before = lastCodepoint(text[0 .. text.len - 6]); 1432 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1433 + } 1434 + if (std.mem.endsWith(u8, text, "\xd9\x83\xd9\x85\xc2\xb2") and text.len > 6) { 1435 + const before = lastCodepoint(text[0 .. text.len - 6]); 1436 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1437 + } 1438 + if (std.mem.endsWith(u8, text, "\xd9\x83\xd9\x85\xc2\xb3") and text.len > 6) { 1439 + const before = lastCodepoint(text[0 .. text.len - 6]); 1440 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1441 + } 1442 + if (std.mem.endsWith(u8, text, "\xd8\xb3\xd9\x85\xc2\xb2") and text.len > 6) { 1443 + const before = lastCodepoint(text[0 .. text.len - 6]); 1444 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1445 + } 1446 + if (std.mem.endsWith(u8, text, "\xd8\xb3\xd9\x85\xc2\xb3") and text.len > 6) { 1447 + const before = lastCodepoint(text[0 .. text.len - 6]); 1448 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1449 + } 1450 + if (std.mem.endsWith(u8, text, "\xd9\x85\xd9\x85\xc2\xb2") and text.len > 6) { 1451 + const before = lastCodepoint(text[0 .. text.len - 6]); 1452 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1453 + } 1454 + if (std.mem.endsWith(u8, text, "\xd9\x85\xd9\x85\xc2\xb3") and text.len > 6) { 1455 + const before = lastCodepoint(text[0 .. text.len - 6]); 1456 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1457 + } 1458 + if (std.mem.endsWith(u8, text, "\xd9\x85\xd9\x84\xd8\xba") and text.len > 6) { 1459 + const before = lastCodepoint(text[0 .. 
text.len - 6]); 1460 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1461 + } 1462 + if (std.mem.endsWith(u8, text, "\xd9\x83\xd9\x88\xd8\xa8") and text.len > 6) { 1463 + const before = lastCodepoint(text[0 .. text.len - 6]); 1464 + if (before != null and matchLookbehind0(before.?.value)) return 6; 1465 + } 1466 + if (std.mem.endsWith(u8, text, "\xd0\xbc/\xd1\x81") and text.len > 5) { 1467 + const before = lastCodepoint(text[0 .. text.len - 5]); 1468 + if (before != null and matchLookbehind0(before.?.value)) return 5; 1469 + } 1470 + if (std.mem.endsWith(u8, text, "km\xc2\xb2") and text.len > 4) { 1471 + const before = lastCodepoint(text[0 .. text.len - 4]); 1472 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1473 + } 1474 + if (std.mem.endsWith(u8, text, "km\xc2\xb3") and text.len > 4) { 1475 + const before = lastCodepoint(text[0 .. text.len - 4]); 1476 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1477 + } 1478 + if (std.mem.endsWith(u8, text, "dm\xc2\xb2") and text.len > 4) { 1479 + const before = lastCodepoint(text[0 .. text.len - 4]); 1480 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1481 + } 1482 + if (std.mem.endsWith(u8, text, "dm\xc2\xb3") and text.len > 4) { 1483 + const before = lastCodepoint(text[0 .. text.len - 4]); 1484 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1485 + } 1486 + if (std.mem.endsWith(u8, text, "cm\xc2\xb2") and text.len > 4) { 1487 + const before = lastCodepoint(text[0 .. text.len - 4]); 1488 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1489 + } 1490 + if (std.mem.endsWith(u8, text, "cm\xc2\xb3") and text.len > 4) { 1491 + const before = lastCodepoint(text[0 .. text.len - 4]); 1492 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1493 + } 1494 + if (std.mem.endsWith(u8, text, "mm\xc2\xb2") and text.len > 4) { 1495 + const before = lastCodepoint(text[0 .. 
text.len - 4]); 1496 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1497 + } 1498 + if (std.mem.endsWith(u8, text, "mm\xc2\xb3") and text.len > 4) { 1499 + const before = lastCodepoint(text[0 .. text.len - 4]); 1500 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1501 + } 1502 + if (std.mem.endsWith(u8, text, "km/h") and text.len > 4) { 1503 + const before = lastCodepoint(text[0 .. text.len - 4]); 1504 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1505 + } 1506 + if (std.mem.endsWith(u8, text, "mbar") and text.len > 4) { 1507 + const before = lastCodepoint(text[0 .. text.len - 4]); 1508 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1509 + } 1510 + if (std.mem.endsWith(u8, text, "\xd0\xba\xd0\xbc") and text.len > 4) { 1511 + const before = lastCodepoint(text[0 .. text.len - 4]); 1512 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1513 + } 1514 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xc2\xb2") and text.len > 4) { 1515 + const before = lastCodepoint(text[0 .. text.len - 4]); 1516 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1517 + } 1518 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xc2\xb3") and text.len > 4) { 1519 + const before = lastCodepoint(text[0 .. text.len - 4]); 1520 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1521 + } 1522 + if (std.mem.endsWith(u8, text, "\xd0\xb4\xd0\xbc") and text.len > 4) { 1523 + const before = lastCodepoint(text[0 .. text.len - 4]); 1524 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1525 + } 1526 + if (std.mem.endsWith(u8, text, "\xd1\x81\xd0\xbc") and text.len > 4) { 1527 + const before = lastCodepoint(text[0 .. text.len - 4]); 1528 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1529 + } 1530 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xd0\xbc") and text.len > 4) { 1531 + const before = lastCodepoint(text[0 .. 
text.len - 4]); 1532 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1533 + } 1534 + if (std.mem.endsWith(u8, text, "\xd0\xbd\xd0\xbc") and text.len > 4) { 1535 + const before = lastCodepoint(text[0 .. text.len - 4]); 1536 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1537 + } 1538 + if (std.mem.endsWith(u8, text, "\xd0\xba\xd0\xb3") and text.len > 4) { 1539 + const before = lastCodepoint(text[0 .. text.len - 4]); 1540 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1541 + } 1542 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xd0\xb3") and text.len > 4) { 1543 + const before = lastCodepoint(text[0 .. text.len - 4]); 1544 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1545 + } 1546 + if (std.mem.endsWith(u8, text, "\xd0\x9f\xd0\xb0") and text.len > 4) { 1547 + const before = lastCodepoint(text[0 .. text.len - 4]); 1548 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1549 + } 1550 + if (std.mem.endsWith(u8, text, "\xd0\x9a\xd0\xb1") and text.len > 4) { 1551 + const before = lastCodepoint(text[0 .. text.len - 4]); 1552 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1553 + } 1554 + if (std.mem.endsWith(u8, text, "\xd0\x9a\xd0\x91") and text.len > 4) { 1555 + const before = lastCodepoint(text[0 .. text.len - 4]); 1556 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1557 + } 1558 + if (std.mem.endsWith(u8, text, "\xd0\xba\xd0\xb1") and text.len > 4) { 1559 + const before = lastCodepoint(text[0 .. text.len - 4]); 1560 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1561 + } 1562 + if (std.mem.endsWith(u8, text, "\xd0\x9c\xd0\xb1") and text.len > 4) { 1563 + const before = lastCodepoint(text[0 .. 
text.len - 4]); 1564 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1565 + } 1566 + if (std.mem.endsWith(u8, text, "\xd0\x9c\xd0\x91") and text.len > 4) { 1567 + const before = lastCodepoint(text[0 .. text.len - 4]); 1568 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1569 + } 1570 + if (std.mem.endsWith(u8, text, "\xd0\xbc\xd0\xb1") and text.len > 4) { 1571 + const before = lastCodepoint(text[0 .. text.len - 4]); 1572 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1573 + } 1574 + if (std.mem.endsWith(u8, text, "\xd0\x93\xd0\xb1") and text.len > 4) { 1575 + const before = lastCodepoint(text[0 .. text.len - 4]); 1576 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1577 + } 1578 + if (std.mem.endsWith(u8, text, "\xd0\x93\xd0\x91") and text.len > 4) { 1579 + const before = lastCodepoint(text[0 .. text.len - 4]); 1580 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1581 + } 1582 + if (std.mem.endsWith(u8, text, "\xd0\xb3\xd0\xb1") and text.len > 4) { 1583 + const before = lastCodepoint(text[0 .. text.len - 4]); 1584 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1585 + } 1586 + if (std.mem.endsWith(u8, text, "\xd0\xa2\xd0\xb1") and text.len > 4) { 1587 + const before = lastCodepoint(text[0 .. text.len - 4]); 1588 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1589 + } 1590 + if (std.mem.endsWith(u8, text, "\xd0\xa2\xd0\x91") and text.len > 4) { 1591 + const before = lastCodepoint(text[0 .. text.len - 4]); 1592 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1593 + } 1594 + if (std.mem.endsWith(u8, text, "\xd9\x85\xc2\xb2") and text.len > 4) { 1595 + const before = lastCodepoint(text[0 .. 
text.len - 4]); 1596 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1597 + } 1598 + if (std.mem.endsWith(u8, text, "\xd9\x85\xc2\xb3") and text.len > 4) { 1599 + const before = lastCodepoint(text[0 .. text.len - 4]); 1600 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1601 + } 1602 + if (std.mem.endsWith(u8, text, "\xd8\xb3\xd9\x85") and text.len > 4) { 1603 + const before = lastCodepoint(text[0 .. text.len - 4]); 1604 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1605 + } 1606 + if (std.mem.endsWith(u8, text, "\xd9\x85\xd9\x85") and text.len > 4) { 1607 + const before = lastCodepoint(text[0 .. text.len - 4]); 1608 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1609 + } 1610 + if (std.mem.endsWith(u8, text, "\xd9\x83\xd9\x85") and text.len > 4) { 1611 + const before = lastCodepoint(text[0 .. text.len - 4]); 1612 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1613 + } 1614 + if (std.mem.endsWith(u8, text, "\xd8\xac\xd9\x85") and text.len > 4) { 1615 + const before = lastCodepoint(text[0 .. text.len - 4]); 1616 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1617 + } 1618 + if (std.mem.endsWith(u8, text, "\xd9\x83\xd8\xba") and text.len > 4) { 1619 + const before = lastCodepoint(text[0 .. text.len - 4]); 1620 + if (before != null and matchLookbehind0(before.?.value)) return 4; 1621 + } 1622 + if (std.mem.endsWith(u8, text, "m\xc2\xb2") and text.len > 3) { 1623 + const before = lastCodepoint(text[0 .. text.len - 3]); 1624 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1625 + } 1626 + if (std.mem.endsWith(u8, text, "m\xc2\xb3") and text.len > 3) { 1627 + const before = lastCodepoint(text[0 .. text.len - 3]); 1628 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1629 + } 1630 + if (std.mem.endsWith(u8, text, "\xc2\xb5m") and text.len > 3) { 1631 + const before = lastCodepoint(text[0 .. 
text.len - 3]); 1632 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1633 + } 1634 + if (std.mem.endsWith(u8, text, "\xc2\xb5g") and text.len > 3) { 1635 + const before = lastCodepoint(text[0 .. text.len - 3]); 1636 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1637 + } 1638 + if (std.mem.endsWith(u8, text, "m/s") and text.len > 3) { 1639 + const before = lastCodepoint(text[0 .. text.len - 3]); 1640 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1641 + } 1642 + if (std.mem.endsWith(u8, text, "kmh") and text.len > 3) { 1643 + const before = lastCodepoint(text[0 .. text.len - 3]); 1644 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1645 + } 1646 + if (std.mem.endsWith(u8, text, "mph") and text.len > 3) { 1647 + const before = lastCodepoint(text[0 .. text.len - 3]); 1648 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1649 + } 1650 + if (std.mem.endsWith(u8, text, "hPa") and text.len > 3) { 1651 + const before = lastCodepoint(text[0 .. text.len - 3]); 1652 + if (before != null and matchLookbehind0(before.?.value)) return 3; 1653 + } 1654 + if (std.mem.endsWith(u8, text, "km") and text.len > 2) { 1655 + const before = lastCodepoint(text[0 .. text.len - 2]); 1656 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1657 + } 1658 + if (std.mem.endsWith(u8, text, "dm") and text.len > 2) { 1659 + const before = lastCodepoint(text[0 .. text.len - 2]); 1660 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1661 + } 1662 + if (std.mem.endsWith(u8, text, "cm") and text.len > 2) { 1663 + const before = lastCodepoint(text[0 .. text.len - 2]); 1664 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1665 + } 1666 + if (std.mem.endsWith(u8, text, "mm") and text.len > 2) { 1667 + const before = lastCodepoint(text[0 .. 
text.len - 2]); 1668 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1669 + } 1670 + if (std.mem.endsWith(u8, text, "ha") and text.len > 2) { 1671 + const before = lastCodepoint(text[0 .. text.len - 2]); 1672 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1673 + } 1674 + if (std.mem.endsWith(u8, text, "nm") and text.len > 2) { 1675 + const before = lastCodepoint(text[0 .. text.len - 2]); 1676 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1677 + } 1678 + if (std.mem.endsWith(u8, text, "yd") and text.len > 2) { 1679 + const before = lastCodepoint(text[0 .. text.len - 2]); 1680 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1681 + } 1682 + if (std.mem.endsWith(u8, text, "in") and text.len > 2) { 1683 + const before = lastCodepoint(text[0 .. text.len - 2]); 1684 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1685 + } 1686 + if (std.mem.endsWith(u8, text, "ft") and text.len > 2) { 1687 + const before = lastCodepoint(text[0 .. text.len - 2]); 1688 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1689 + } 1690 + if (std.mem.endsWith(u8, text, "kg") and text.len > 2) { 1691 + const before = lastCodepoint(text[0 .. text.len - 2]); 1692 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1693 + } 1694 + if (std.mem.endsWith(u8, text, "mg") and text.len > 2) { 1695 + const before = lastCodepoint(text[0 .. text.len - 2]); 1696 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1697 + } 1698 + if (std.mem.endsWith(u8, text, "lb") and text.len > 2) { 1699 + const before = lastCodepoint(text[0 .. text.len - 2]); 1700 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1701 + } 1702 + if (std.mem.endsWith(u8, text, "oz") and text.len > 2) { 1703 + const before = lastCodepoint(text[0 .. 
text.len - 2]); 1704 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1705 + } 1706 + if (std.mem.endsWith(u8, text, "Pa") and text.len > 2) { 1707 + const before = lastCodepoint(text[0 .. text.len - 2]); 1708 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1709 + } 1710 + if (std.mem.endsWith(u8, text, "mb") and text.len > 2) { 1711 + const before = lastCodepoint(text[0 .. text.len - 2]); 1712 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1713 + } 1714 + if (std.mem.endsWith(u8, text, "MB") and text.len > 2) { 1715 + const before = lastCodepoint(text[0 .. text.len - 2]); 1716 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1717 + } 1718 + if (std.mem.endsWith(u8, text, "kb") and text.len > 2) { 1719 + const before = lastCodepoint(text[0 .. text.len - 2]); 1720 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1721 + } 1722 + if (std.mem.endsWith(u8, text, "KB") and text.len > 2) { 1723 + const before = lastCodepoint(text[0 .. text.len - 2]); 1724 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1725 + } 1726 + if (std.mem.endsWith(u8, text, "gb") and text.len > 2) { 1727 + const before = lastCodepoint(text[0 .. text.len - 2]); 1728 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1729 + } 1730 + if (std.mem.endsWith(u8, text, "GB") and text.len > 2) { 1731 + const before = lastCodepoint(text[0 .. text.len - 2]); 1732 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1733 + } 1734 + if (std.mem.endsWith(u8, text, "tb") and text.len > 2) { 1735 + const before = lastCodepoint(text[0 .. text.len - 2]); 1736 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1737 + } 1738 + if (std.mem.endsWith(u8, text, "TB") and text.len > 2) { 1739 + const before = lastCodepoint(text[0 .. 
text.len - 2]); 1740 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1741 + } 1742 + if (std.mem.endsWith(u8, text, "\xd0\xbc") and text.len > 2) { 1743 + const before = lastCodepoint(text[0 .. text.len - 2]); 1744 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1745 + } 1746 + if (std.mem.endsWith(u8, text, "\xd0\xb3") and text.len > 2) { 1747 + const before = lastCodepoint(text[0 .. text.len - 2]); 1748 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1749 + } 1750 + if (std.mem.endsWith(u8, text, "\xd9\x85") and text.len > 2) { 1751 + const before = lastCodepoint(text[0 .. text.len - 2]); 1752 + if (before != null and matchLookbehind0(before.?.value)) return 2; 1753 + } 1754 + if (std.mem.endsWith(u8, text, "m") and text.len > 1) { 1755 + const before = lastCodepoint(text[0 .. text.len - 1]); 1756 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1757 + } 1758 + if (std.mem.endsWith(u8, text, "g") and text.len > 1) { 1759 + const before = lastCodepoint(text[0 .. text.len - 1]); 1760 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1761 + } 1762 + if (std.mem.endsWith(u8, text, "t") and text.len > 1) { 1763 + const before = lastCodepoint(text[0 .. text.len - 1]); 1764 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1765 + } 1766 + if (std.mem.endsWith(u8, text, "T") and text.len > 1) { 1767 + const before = lastCodepoint(text[0 .. text.len - 1]); 1768 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1769 + } 1770 + if (std.mem.endsWith(u8, text, "G") and text.len > 1) { 1771 + const before = lastCodepoint(text[0 .. text.len - 1]); 1772 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1773 + } 1774 + if (std.mem.endsWith(u8, text, "M") and text.len > 1) { 1775 + const before = lastCodepoint(text[0 .. 
text.len - 1]); 1776 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1777 + } 1778 + if (std.mem.endsWith(u8, text, "K") and text.len > 1) { 1779 + const before = lastCodepoint(text[0 .. text.len - 1]); 1780 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1781 + } 1782 + if (std.mem.endsWith(u8, text, "%") and text.len > 1) { 1783 + const before = lastCodepoint(text[0 .. text.len - 1]); 1784 + if (before != null and matchLookbehind0(before.?.value)) return 1; 1785 + } 1786 + if (std.mem.endsWith(u8, text, ".") and text.len > 1) { 1787 + const before = lastCodepoint(text[0 .. text.len - 1]); 1788 + if (before != null and matchLookbehind2(before.?.value)) return 1; 1789 + } 1790 + if (std.mem.endsWith(u8, text, ".") and text.len > 1) { 1791 + const b1 = lastCodepoint(text[0 .. text.len - 1]); 1792 + if (b1) |bp1| { 1793 + const b2 = lastCodepoint(text[0 .. text.len - 1 - bp1.len]); 1794 + if (matchLookbehind3(bp1.value)) { 1795 + if (b2) |b2p| { 1796 + if (matchLookbehind3(b2p.value)) return 1; 1797 + } 1798 + } 1799 + } 1800 + } 1801 + return 0; 1802 + } 1803 + 1804 + // ── infix character classes ── 1805 + 1806 + pub const is_infix_3_ahead_ranges = [_][2]u21{ 1807 + .{ 0x002D, 0x002D }, 1808 + .{ 0x0030, 0x0039 }, 1809 + }; 1810 + 1811 + pub fn is_infix_3_ahead(c: u21) bool { 1812 + return rangeContains(&is_infix_3_ahead_ranges, c); 1813 + } 1814 + 1815 + pub const is_infix_3_behind_ranges = [_][2]u21{ 1816 + .{ 0x0030, 0x0039 }, 1817 + }; 1818 + 1819 + pub fn is_infix_3_behind(c: u21) bool { 1820 + return rangeContains(&is_infix_3_behind_ranges, c); 1821 + } 1822 + 1823 + pub const is_infix_4_ahead_ranges = [_][2]u21{ 1824 + .{ 0x0022, 0x0022 }, 1825 + .{ 0x0027, 0x0027 }, 1826 + .{ 0x002C, 0x002C }, 1827 + .{ 0x0041, 0x005A }, 1828 + .{ 0x0060, 0x0060 }, 1829 + .{ 0x00AB, 0x00AB }, 1830 + .{ 0x00B4, 0x00B4 }, 1831 + .{ 0x00BB, 0x00BB }, 1832 + .{ 0x00C0, 0x00D6 }, 1833 + .{ 0x00D8, 0x00DE }, 1834 + .{ 0x0100, 0x0100 }, 
1835 + .{ 0x0102, 0x0102 }, 1836 + .{ 0x0104, 0x0104 }, 1837 + .{ 0x0106, 0x0106 }, 1838 + .{ 0x0108, 0x0108 }, 1839 + .{ 0x010A, 0x010A }, 1840 + .{ 0x010C, 0x010C }, 1841 + .{ 0x010E, 0x010E }, 1842 + .{ 0x0110, 0x0110 }, 1843 + .{ 0x0112, 0x0112 }, 1844 + .{ 0x0114, 0x0114 }, 1845 + .{ 0x0116, 0x0116 }, 1846 + .{ 0x0118, 0x0118 }, 1847 + .{ 0x011A, 0x011A }, 1848 + .{ 0x011C, 0x011C }, 1849 + .{ 0x011E, 0x011E }, 1850 + .{ 0x0120, 0x0120 }, 1851 + .{ 0x0122, 0x0122 }, 1852 + .{ 0x0124, 0x0124 }, 1853 + .{ 0x0126, 0x0126 }, 1854 + .{ 0x0128, 0x0128 }, 1855 + .{ 0x012A, 0x012A }, 1856 + .{ 0x012C, 0x012C }, 1857 + .{ 0x012E, 0x012E }, 1858 + .{ 0x0130, 0x0130 }, 1859 + .{ 0x0132, 0x0132 }, 1860 + .{ 0x0134, 0x0134 }, 1861 + .{ 0x0136, 0x0136 }, 1862 + .{ 0x0139, 0x0139 }, 1863 + .{ 0x013B, 0x013B }, 1864 + .{ 0x013D, 0x013D }, 1865 + .{ 0x013F, 0x013F }, 1866 + .{ 0x0141, 0x0141 }, 1867 + .{ 0x0143, 0x0143 }, 1868 + .{ 0x0145, 0x0145 }, 1869 + .{ 0x0147, 0x0147 }, 1870 + .{ 0x014A, 0x014A }, 1871 + .{ 0x014C, 0x014C }, 1872 + .{ 0x014E, 0x014E }, 1873 + .{ 0x0150, 0x0150 }, 1874 + .{ 0x0152, 0x0152 }, 1875 + .{ 0x0154, 0x0154 }, 1876 + .{ 0x0156, 0x0156 }, 1877 + .{ 0x0158, 0x0158 }, 1878 + .{ 0x015A, 0x015A }, 1879 + .{ 0x015C, 0x015C }, 1880 + .{ 0x015E, 0x015E }, 1881 + .{ 0x0160, 0x0160 }, 1882 + .{ 0x0162, 0x0162 }, 1883 + .{ 0x0164, 0x0164 }, 1884 + .{ 0x0166, 0x0166 }, 1885 + .{ 0x0168, 0x0168 }, 1886 + .{ 0x016A, 0x016A }, 1887 + .{ 0x016C, 0x016C }, 1888 + .{ 0x016E, 0x016E }, 1889 + .{ 0x0170, 0x0170 }, 1890 + .{ 0x0172, 0x0172 }, 1891 + .{ 0x0174, 0x0174 }, 1892 + .{ 0x0176, 0x0176 }, 1893 + .{ 0x0178, 0x0179 }, 1894 + .{ 0x017B, 0x017B }, 1895 + .{ 0x017D, 0x017D }, 1896 + .{ 0x0181, 0x0182 }, 1897 + .{ 0x0184, 0x0184 }, 1898 + .{ 0x0186, 0x0187 }, 1899 + .{ 0x0189, 0x018B }, 1900 + .{ 0x018E, 0x0191 }, 1901 + .{ 0x0193, 0x0194 }, 1902 + .{ 0x0196, 0x0198 }, 1903 + .{ 0x019C, 0x019D }, 1904 + .{ 0x019F, 0x01A0 }, 1905 + .{ 0x01A2, 0x01A2 }, 1906 + .{ 
0x01A4, 0x01A4 }, 1907 + .{ 0x01A6, 0x01A7 }, 1908 + .{ 0x01A9, 0x01A9 }, 1909 + .{ 0x01AC, 0x01AC }, 1910 + .{ 0x01AE, 0x01AF }, 1911 + .{ 0x01B1, 0x01B3 }, 1912 + .{ 0x01B5, 0x01B5 }, 1913 + .{ 0x01B7, 0x01B8 }, 1914 + .{ 0x01BC, 0x01BC }, 1915 + .{ 0x01C4, 0x01C4 }, 1916 + .{ 0x01C7, 0x01C7 }, 1917 + .{ 0x01CA, 0x01CA }, 1918 + .{ 0x01CD, 0x01CD }, 1919 + .{ 0x01CF, 0x01CF }, 1920 + .{ 0x01D1, 0x01D1 }, 1921 + .{ 0x01D3, 0x01D3 }, 1922 + .{ 0x01D5, 0x01D5 }, 1923 + .{ 0x01D7, 0x01D7 }, 1924 + .{ 0x01D9, 0x01D9 }, 1925 + .{ 0x01DB, 0x01DB }, 1926 + .{ 0x01DE, 0x01DE }, 1927 + .{ 0x01E0, 0x01E0 }, 1928 + .{ 0x01E2, 0x01E2 }, 1929 + .{ 0x01E4, 0x01E4 }, 1930 + .{ 0x01E6, 0x01E6 }, 1931 + .{ 0x01E8, 0x01E8 }, 1932 + .{ 0x01EA, 0x01EA }, 1933 + .{ 0x01EC, 0x01EC }, 1934 + .{ 0x01EE, 0x01EE }, 1935 + .{ 0x01F1, 0x01F1 }, 1936 + .{ 0x01F4, 0x01F4 }, 1937 + .{ 0x01F6, 0x01F8 }, 1938 + .{ 0x01FA, 0x01FA }, 1939 + .{ 0x01FC, 0x01FC }, 1940 + .{ 0x01FE, 0x01FE }, 1941 + .{ 0x0200, 0x0200 }, 1942 + .{ 0x0202, 0x0202 }, 1943 + .{ 0x0204, 0x0204 }, 1944 + .{ 0x0206, 0x0206 }, 1945 + .{ 0x0208, 0x0208 }, 1946 + .{ 0x020A, 0x020A }, 1947 + .{ 0x020C, 0x020C }, 1948 + .{ 0x020E, 0x020E }, 1949 + .{ 0x0210, 0x0210 }, 1950 + .{ 0x0212, 0x0212 }, 1951 + .{ 0x0214, 0x0214 }, 1952 + .{ 0x0216, 0x0216 }, 1953 + .{ 0x0218, 0x0218 }, 1954 + .{ 0x021A, 0x021A }, 1955 + .{ 0x021C, 0x021C }, 1956 + .{ 0x021E, 0x021E }, 1957 + .{ 0x0220, 0x0220 }, 1958 + .{ 0x0222, 0x0222 }, 1959 + .{ 0x0224, 0x0224 }, 1960 + .{ 0x0226, 0x0226 }, 1961 + .{ 0x0228, 0x0228 }, 1962 + .{ 0x022A, 0x022A }, 1963 + .{ 0x022C, 0x022C }, 1964 + .{ 0x022E, 0x022E }, 1965 + .{ 0x0230, 0x0230 }, 1966 + .{ 0x0232, 0x0232 }, 1967 + .{ 0x023A, 0x023B }, 1968 + .{ 0x023D, 0x023E }, 1969 + .{ 0x0241, 0x0241 }, 1970 + .{ 0x0243, 0x0246 }, 1971 + .{ 0x0248, 0x0248 }, 1972 + .{ 0x024A, 0x024A }, 1973 + .{ 0x024C, 0x024C }, 1974 + .{ 0x024E, 0x024E }, 1975 + .{ 0x0386, 0x0386 }, 1976 + .{ 0x0388, 0x038A }, 1977 + .{ 0x038C, 
0x038C }, 1978 + .{ 0x038E, 0x038F }, 1979 + .{ 0x0391, 0x03A9 }, 1980 + .{ 0x0400, 0x0401 }, 1981 + .{ 0x0403, 0x040A }, 1982 + .{ 0x040C, 0x040D }, 1983 + .{ 0x0410, 0x042F }, 1984 + .{ 0x0490, 0x0490 }, 1985 + .{ 0x0496, 0x0496 }, 1986 + .{ 0x04A2, 0x04A2 }, 1987 + .{ 0x04AE, 0x04AE }, 1988 + .{ 0x04BA, 0x04BA }, 1989 + .{ 0x04D8, 0x04D8 }, 1990 + .{ 0x04E8, 0x04E8 }, 1991 + .{ 0x0591, 0x05F4 }, 1992 + .{ 0x0620, 0x064A }, 1993 + .{ 0x066E, 0x06D5 }, 1994 + .{ 0x06E5, 0x06FF }, 1995 + .{ 0x0750, 0x077F }, 1996 + .{ 0x08A0, 0x08BD }, 1997 + .{ 0x0900, 0x09FF }, 1998 + .{ 0x0B80, 0x0CFF }, 1999 + .{ 0x0D80, 0x0DFF }, 2000 + .{ 0x1100, 0x137F }, 2001 + .{ 0x1E00, 0x1E00 }, 2002 + .{ 0x1E02, 0x1E02 }, 2003 + .{ 0x1E04, 0x1E04 }, 2004 + .{ 0x1E06, 0x1E06 }, 2005 + .{ 0x1E08, 0x1E08 }, 2006 + .{ 0x1E0A, 0x1E0A }, 2007 + .{ 0x1E0C, 0x1E0C }, 2008 + .{ 0x1E0E, 0x1E0E }, 2009 + .{ 0x1E10, 0x1E10 }, 2010 + .{ 0x1E12, 0x1E12 }, 2011 + .{ 0x1E14, 0x1E14 }, 2012 + .{ 0x1E16, 0x1E16 }, 2013 + .{ 0x1E18, 0x1E18 }, 2014 + .{ 0x1E1A, 0x1E1A }, 2015 + .{ 0x1E1C, 0x1E1C }, 2016 + .{ 0x1E1E, 0x1E1E }, 2017 + .{ 0x1E20, 0x1E20 }, 2018 + .{ 0x1E22, 0x1E22 }, 2019 + .{ 0x1E24, 0x1E24 }, 2020 + .{ 0x1E26, 0x1E26 }, 2021 + .{ 0x1E28, 0x1E28 }, 2022 + .{ 0x1E2A, 0x1E2A }, 2023 + .{ 0x1E2C, 0x1E2C }, 2024 + .{ 0x1E2E, 0x1E2E }, 2025 + .{ 0x1E30, 0x1E30 }, 2026 + .{ 0x1E32, 0x1E32 }, 2027 + .{ 0x1E34, 0x1E34 }, 2028 + .{ 0x1E36, 0x1E36 }, 2029 + .{ 0x1E38, 0x1E38 }, 2030 + .{ 0x1E3A, 0x1E3A }, 2031 + .{ 0x1E3C, 0x1E3C }, 2032 + .{ 0x1E3E, 0x1E3E }, 2033 + .{ 0x1E40, 0x1E40 }, 2034 + .{ 0x1E42, 0x1E42 }, 2035 + .{ 0x1E44, 0x1E44 }, 2036 + .{ 0x1E46, 0x1E46 }, 2037 + .{ 0x1E48, 0x1E48 }, 2038 + .{ 0x1E4A, 0x1E4A }, 2039 + .{ 0x1E4C, 0x1E4C }, 2040 + .{ 0x1E4E, 0x1E4E }, 2041 + .{ 0x1E50, 0x1E50 }, 2042 + .{ 0x1E52, 0x1E52 }, 2043 + .{ 0x1E54, 0x1E54 }, 2044 + .{ 0x1E56, 0x1E56 }, 2045 + .{ 0x1E58, 0x1E58 }, 2046 + .{ 0x1E5A, 0x1E5A }, 2047 + .{ 0x1E5C, 0x1E5C }, 2048 + .{ 0x1E5E, 0x1E5E }, 
2049 + .{ 0x1E60, 0x1E60 }, 2050 + .{ 0x1E62, 0x1E62 }, 2051 + .{ 0x1E64, 0x1E64 }, 2052 + .{ 0x1E66, 0x1E66 }, 2053 + .{ 0x1E68, 0x1E68 }, 2054 + .{ 0x1E6A, 0x1E6A }, 2055 + .{ 0x1E6C, 0x1E6C }, 2056 + .{ 0x1E6E, 0x1E6E }, 2057 + .{ 0x1E70, 0x1E70 }, 2058 + .{ 0x1E72, 0x1E72 }, 2059 + .{ 0x1E74, 0x1E74 }, 2060 + .{ 0x1E76, 0x1E76 }, 2061 + .{ 0x1E78, 0x1E78 }, 2062 + .{ 0x1E7A, 0x1E7A }, 2063 + .{ 0x1E7C, 0x1E7C }, 2064 + .{ 0x1E7E, 0x1E7E }, 2065 + .{ 0x1E80, 0x1E80 }, 2066 + .{ 0x1E82, 0x1E82 }, 2067 + .{ 0x1E84, 0x1E84 }, 2068 + .{ 0x1E86, 0x1E86 }, 2069 + .{ 0x1E88, 0x1E88 }, 2070 + .{ 0x1E8A, 0x1E8A }, 2071 + .{ 0x1E8C, 0x1E8C }, 2072 + .{ 0x1E8E, 0x1E8E }, 2073 + .{ 0x1E90, 0x1E90 }, 2074 + .{ 0x1E92, 0x1E92 }, 2075 + .{ 0x1E94, 0x1E94 }, 2076 + .{ 0x1E9E, 0x1E9E }, 2077 + .{ 0x1EA0, 0x1EA0 }, 2078 + .{ 0x1EA2, 0x1EA2 }, 2079 + .{ 0x1EA4, 0x1EA4 }, 2080 + .{ 0x1EA6, 0x1EA6 }, 2081 + .{ 0x1EA8, 0x1EA8 }, 2082 + .{ 0x1EAA, 0x1EAA }, 2083 + .{ 0x1EAC, 0x1EAC }, 2084 + .{ 0x1EAE, 0x1EAE }, 2085 + .{ 0x1EB0, 0x1EB0 }, 2086 + .{ 0x1EB2, 0x1EB2 }, 2087 + .{ 0x1EB4, 0x1EB4 }, 2088 + .{ 0x1EB6, 0x1EB6 }, 2089 + .{ 0x1EB8, 0x1EB8 }, 2090 + .{ 0x1EBA, 0x1EBA }, 2091 + .{ 0x1EBC, 0x1EBC }, 2092 + .{ 0x1EBE, 0x1EBE }, 2093 + .{ 0x1EC0, 0x1EC0 }, 2094 + .{ 0x1EC2, 0x1EC2 }, 2095 + .{ 0x1EC4, 0x1EC4 }, 2096 + .{ 0x1EC6, 0x1EC6 }, 2097 + .{ 0x1EC8, 0x1EC8 }, 2098 + .{ 0x1ECA, 0x1ECA }, 2099 + .{ 0x1ECC, 0x1ECC }, 2100 + .{ 0x1ECE, 0x1ECE }, 2101 + .{ 0x1ED0, 0x1ED0 }, 2102 + .{ 0x1ED2, 0x1ED2 }, 2103 + .{ 0x1ED4, 0x1ED4 }, 2104 + .{ 0x1ED6, 0x1ED6 }, 2105 + .{ 0x1ED8, 0x1ED8 }, 2106 + .{ 0x1EDA, 0x1EDA }, 2107 + .{ 0x1EDC, 0x1EDC }, 2108 + .{ 0x1EDE, 0x1EDE }, 2109 + .{ 0x1EE0, 0x1EE0 }, 2110 + .{ 0x1EE2, 0x1EE2 }, 2111 + .{ 0x1EE4, 0x1EE4 }, 2112 + .{ 0x1EE6, 0x1EE6 }, 2113 + .{ 0x1EE8, 0x1EE8 }, 2114 + .{ 0x1EEA, 0x1EEA }, 2115 + .{ 0x1EEC, 0x1EEC }, 2116 + .{ 0x1EEE, 0x1EEE }, 2117 + .{ 0x1EF0, 0x1EF0 }, 2118 + .{ 0x1EF2, 0x1EF2 }, 2119 + .{ 0x1EF4, 0x1EF4 }, 2120 + .{ 
0x1EF6, 0x1EF6 }, 2121 + .{ 0x1EF8, 0x1EF8 }, 2122 + .{ 0x1EFA, 0x1EFA }, 2123 + .{ 0x1EFC, 0x1EFC }, 2124 + .{ 0x1EFE, 0x1EFE }, 2125 + .{ 0x2018, 0x201A }, 2126 + .{ 0x201C, 0x201E }, 2127 + .{ 0x2329, 0x232A }, 2128 + .{ 0x27E6, 0x27E7 }, 2129 + .{ 0x2C60, 0x2C60 }, 2130 + .{ 0x2C62, 0x2C64 }, 2131 + .{ 0x2C67, 0x2C67 }, 2132 + .{ 0x2C69, 0x2C69 }, 2133 + .{ 0x2C6B, 0x2C6B }, 2134 + .{ 0x2C6D, 0x2C70 }, 2135 + .{ 0x2C72, 0x2C72 }, 2136 + .{ 0x2C75, 0x2C75 }, 2137 + .{ 0x2C7E, 0x2C7F }, 2138 + .{ 0x2E80, 0x2FDF }, 2139 + .{ 0x2FF0, 0x30FF }, 2140 + .{ 0x31C0, 0x31EF }, 2141 + .{ 0x3200, 0x4DBF }, 2142 + .{ 0x4E00, 0x9FFF }, 2143 + .{ 0xA722, 0xA722 }, 2144 + .{ 0xA724, 0xA724 }, 2145 + .{ 0xA726, 0xA726 }, 2146 + .{ 0xA728, 0xA728 }, 2147 + .{ 0xA72A, 0xA72A }, 2148 + .{ 0xA72C, 0xA72C }, 2149 + .{ 0xA72E, 0xA72E }, 2150 + .{ 0xA732, 0xA732 }, 2151 + .{ 0xA734, 0xA734 }, 2152 + .{ 0xA736, 0xA736 }, 2153 + .{ 0xA738, 0xA738 }, 2154 + .{ 0xA73A, 0xA73A }, 2155 + .{ 0xA73C, 0xA73C }, 2156 + .{ 0xA73E, 0xA73E }, 2157 + .{ 0xA740, 0xA740 }, 2158 + .{ 0xA742, 0xA742 }, 2159 + .{ 0xA744, 0xA744 }, 2160 + .{ 0xA746, 0xA746 }, 2161 + .{ 0xA748, 0xA748 }, 2162 + .{ 0xA74A, 0xA74A }, 2163 + .{ 0xA74C, 0xA74C }, 2164 + .{ 0xA74E, 0xA74E }, 2165 + .{ 0xA750, 0xA750 }, 2166 + .{ 0xA752, 0xA752 }, 2167 + .{ 0xA754, 0xA754 }, 2168 + .{ 0xA756, 0xA756 }, 2169 + .{ 0xA758, 0xA758 }, 2170 + .{ 0xA75A, 0xA75A }, 2171 + .{ 0xA75C, 0xA75C }, 2172 + .{ 0xA75E, 0xA75E }, 2173 + .{ 0xA760, 0xA760 }, 2174 + .{ 0xA762, 0xA762 }, 2175 + .{ 0xA764, 0xA764 }, 2176 + .{ 0xA766, 0xA766 }, 2177 + .{ 0xA768, 0xA768 }, 2178 + .{ 0xA76A, 0xA76A }, 2179 + .{ 0xA76C, 0xA76C }, 2180 + .{ 0xA76E, 0xA76E }, 2181 + .{ 0xA779, 0xA779 }, 2182 + .{ 0xA77B, 0xA77B }, 2183 + .{ 0xA77D, 0xA77E }, 2184 + .{ 0xA780, 0xA780 }, 2185 + .{ 0xA782, 0xA782 }, 2186 + .{ 0xA784, 0xA784 }, 2187 + .{ 0xA786, 0xA786 }, 2188 + .{ 0xA78B, 0xA78B }, 2189 + .{ 0xA78D, 0xA78D }, 2190 + .{ 0xA790, 0xA790 }, 2191 + .{ 0xA792, 
0xA792 }, 2192 + .{ 0xA796, 0xA796 }, 2193 + .{ 0xA798, 0xA798 }, 2194 + .{ 0xA79A, 0xA79A }, 2195 + .{ 0xA79C, 0xA79C }, 2196 + .{ 0xA79E, 0xA79E }, 2197 + .{ 0xA7A0, 0xA7A0 }, 2198 + .{ 0xA7A2, 0xA7A2 }, 2199 + .{ 0xA7A4, 0xA7A4 }, 2200 + .{ 0xA7A6, 0xA7A6 }, 2201 + .{ 0xA7A8, 0xA7A8 }, 2202 + .{ 0xA7AA, 0xA7AE }, 2203 + .{ 0xA7B0, 0xA7B4 }, 2204 + .{ 0xA7B6, 0xA7B6 }, 2205 + .{ 0xA7B8, 0xA7B8 }, 2206 + .{ 0xAC00, 0xD7AF }, 2207 + .{ 0xF900, 0xFAFF }, 2208 + .{ 0xFB1D, 0xFBB1 }, 2209 + .{ 0xFBD3, 0xFD3D }, 2210 + .{ 0xFD50, 0xFDC7 }, 2211 + .{ 0xFDF0, 0xFDFB }, 2212 + .{ 0xFE30, 0xFE4F }, 2213 + .{ 0xFE70, 0xFEFC }, 2214 + .{ 0xFF08, 0xFF09 }, 2215 + .{ 0xFF21, 0xFF3A }, 2216 + .{ 0x1EE00, 0x1EEBB }, 2217 + .{ 0x1F200, 0x1F2FF }, 2218 + .{ 0x20000, 0x2A6DF }, 2219 + .{ 0x2A700, 0x2EBEF }, 2220 + .{ 0x2F800, 0x2FA1F }, 2221 + }; 2222 + 2223 + pub fn is_infix_4_ahead(c: u21) bool { 2224 + return rangeContains(&is_infix_4_ahead_ranges, c); 2225 + } 2226 + 2227 + pub const is_infix_4_behind_ranges = [_][2]u21{ 2228 + .{ 0x0022, 0x0022 }, 2229 + .{ 0x0027, 0x0027 }, 2230 + .{ 0x002C, 0x002C }, 2231 + .{ 0x0060, 0x007A }, 2232 + .{ 0x00AB, 0x00AB }, 2233 + .{ 0x00B4, 0x00B4 }, 2234 + .{ 0x00BB, 0x00BB }, 2235 + .{ 0x00DF, 0x00F6 }, 2236 + .{ 0x00F8, 0x00FF }, 2237 + .{ 0x0101, 0x0101 }, 2238 + .{ 0x0103, 0x0103 }, 2239 + .{ 0x0105, 0x0105 }, 2240 + .{ 0x0107, 0x0107 }, 2241 + .{ 0x0109, 0x0109 }, 2242 + .{ 0x010B, 0x010B }, 2243 + .{ 0x010D, 0x010D }, 2244 + .{ 0x010F, 0x010F }, 2245 + .{ 0x0111, 0x0111 }, 2246 + .{ 0x0113, 0x0113 }, 2247 + .{ 0x0115, 0x0115 }, 2248 + .{ 0x0117, 0x0117 }, 2249 + .{ 0x0119, 0x0119 }, 2250 + .{ 0x011B, 0x011B }, 2251 + .{ 0x011D, 0x011D }, 2252 + .{ 0x011F, 0x011F }, 2253 + .{ 0x0121, 0x0121 }, 2254 + .{ 0x0123, 0x0123 }, 2255 + .{ 0x0125, 0x0125 }, 2256 + .{ 0x0127, 0x0127 }, 2257 + .{ 0x0129, 0x0129 }, 2258 + .{ 0x012B, 0x012B }, 2259 + .{ 0x012D, 0x012D }, 2260 + .{ 0x012F, 0x012F }, 2261 + .{ 0x0131, 0x0131 }, 2262 + .{ 0x0133, 
0x0133 }, 2263 + .{ 0x0135, 0x0135 }, 2264 + .{ 0x0137, 0x0138 }, 2265 + .{ 0x013A, 0x013A }, 2266 + .{ 0x013C, 0x013C }, 2267 + .{ 0x013E, 0x013E }, 2268 + .{ 0x0140, 0x0140 }, 2269 + .{ 0x0142, 0x0142 }, 2270 + .{ 0x0144, 0x0144 }, 2271 + .{ 0x0146, 0x0146 }, 2272 + .{ 0x0148, 0x0149 }, 2273 + .{ 0x014B, 0x014B }, 2274 + .{ 0x014D, 0x014D }, 2275 + .{ 0x014F, 0x014F }, 2276 + .{ 0x0151, 0x0151 }, 2277 + .{ 0x0153, 0x0153 }, 2278 + .{ 0x0155, 0x0155 }, 2279 + .{ 0x0157, 0x0157 }, 2280 + .{ 0x0159, 0x0159 }, 2281 + .{ 0x015B, 0x015B }, 2282 + .{ 0x015D, 0x015D }, 2283 + .{ 0x015F, 0x015F }, 2284 + .{ 0x0161, 0x0161 }, 2285 + .{ 0x0163, 0x0163 }, 2286 + .{ 0x0165, 0x0165 }, 2287 + .{ 0x0167, 0x0167 }, 2288 + .{ 0x0169, 0x0169 }, 2289 + .{ 0x016B, 0x016B }, 2290 + .{ 0x016D, 0x016D }, 2291 + .{ 0x016F, 0x016F }, 2292 + .{ 0x0171, 0x0171 }, 2293 + .{ 0x0173, 0x0173 }, 2294 + .{ 0x0175, 0x0175 }, 2295 + .{ 0x0177, 0x0177 }, 2296 + .{ 0x017A, 0x017A }, 2297 + .{ 0x017C, 0x017C }, 2298 + .{ 0x017E, 0x0180 }, 2299 + .{ 0x0183, 0x0183 }, 2300 + .{ 0x0185, 0x0185 }, 2301 + .{ 0x0188, 0x0188 }, 2302 + .{ 0x018C, 0x018D }, 2303 + .{ 0x0192, 0x0192 }, 2304 + .{ 0x0195, 0x0195 }, 2305 + .{ 0x0199, 0x019B }, 2306 + .{ 0x019E, 0x019E }, 2307 + .{ 0x01A1, 0x01A1 }, 2308 + .{ 0x01A3, 0x01A3 }, 2309 + .{ 0x01A5, 0x01A5 }, 2310 + .{ 0x01A8, 0x01A8 }, 2311 + .{ 0x01AA, 0x01AB }, 2312 + .{ 0x01AD, 0x01AD }, 2313 + .{ 0x01B0, 0x01B0 }, 2314 + .{ 0x01B4, 0x01B4 }, 2315 + .{ 0x01B6, 0x01B6 }, 2316 + .{ 0x01B9, 0x01BA }, 2317 + .{ 0x01BD, 0x01BF }, 2318 + .{ 0x01C6, 0x01C6 }, 2319 + .{ 0x01C9, 0x01C9 }, 2320 + .{ 0x01CC, 0x01CC }, 2321 + .{ 0x01CE, 0x01CE }, 2322 + .{ 0x01D0, 0x01D0 }, 2323 + .{ 0x01D2, 0x01D2 }, 2324 + .{ 0x01D4, 0x01D4 }, 2325 + .{ 0x01D6, 0x01D6 }, 2326 + .{ 0x01D8, 0x01D8 }, 2327 + .{ 0x01DA, 0x01DA }, 2328 + .{ 0x01DC, 0x01DD }, 2329 + .{ 0x01DF, 0x01DF }, 2330 + .{ 0x01E1, 0x01E1 }, 2331 + .{ 0x01E3, 0x01E3 }, 2332 + .{ 0x01E5, 0x01E5 }, 2333 + .{ 0x01E7, 0x01E7 }, 
2334 + .{ 0x01E9, 0x01E9 }, 2335 + .{ 0x01EB, 0x01EB }, 2336 + .{ 0x01ED, 0x01ED }, 2337 + .{ 0x01EF, 0x01F0 }, 2338 + .{ 0x01F3, 0x01F3 }, 2339 + .{ 0x01F5, 0x01F5 }, 2340 + .{ 0x01F9, 0x01F9 }, 2341 + .{ 0x01FB, 0x01FB }, 2342 + .{ 0x01FD, 0x01FD }, 2343 + .{ 0x01FF, 0x01FF }, 2344 + .{ 0x0201, 0x0201 }, 2345 + .{ 0x0203, 0x0203 }, 2346 + .{ 0x0205, 0x0205 }, 2347 + .{ 0x0207, 0x0207 }, 2348 + .{ 0x0209, 0x0209 }, 2349 + .{ 0x020B, 0x020B }, 2350 + .{ 0x020D, 0x020D }, 2351 + .{ 0x020F, 0x020F }, 2352 + .{ 0x0211, 0x0211 }, 2353 + .{ 0x0213, 0x0213 }, 2354 + .{ 0x0215, 0x0215 }, 2355 + .{ 0x0217, 0x0217 }, 2356 + .{ 0x0219, 0x0219 }, 2357 + .{ 0x021B, 0x021B }, 2358 + .{ 0x021D, 0x021D }, 2359 + .{ 0x021F, 0x021F }, 2360 + .{ 0x0221, 0x0221 }, 2361 + .{ 0x0223, 0x0223 }, 2362 + .{ 0x0225, 0x0225 }, 2363 + .{ 0x0227, 0x0227 }, 2364 + .{ 0x0229, 0x0229 }, 2365 + .{ 0x022B, 0x022B }, 2366 + .{ 0x022D, 0x022D }, 2367 + .{ 0x022F, 0x022F }, 2368 + .{ 0x0231, 0x0231 }, 2369 + .{ 0x0233, 0x0239 }, 2370 + .{ 0x023C, 0x023C }, 2371 + .{ 0x023F, 0x0240 }, 2372 + .{ 0x0242, 0x0242 }, 2373 + .{ 0x0247, 0x0247 }, 2374 + .{ 0x0249, 0x0249 }, 2375 + .{ 0x024B, 0x024B }, 2376 + .{ 0x024D, 0x024D }, 2377 + .{ 0x024F, 0x02AF }, 2378 + .{ 0x03AC, 0x03AF }, 2379 + .{ 0x03B1, 0x03C9 }, 2380 + .{ 0x03CC, 0x03CE }, 2381 + .{ 0x0430, 0x0451 }, 2382 + .{ 0x0453, 0x045A }, 2383 + .{ 0x045C, 0x045D }, 2384 + .{ 0x0491, 0x0491 }, 2385 + .{ 0x0497, 0x0497 }, 2386 + .{ 0x04A3, 0x04A3 }, 2387 + .{ 0x04AF, 0x04AF }, 2388 + .{ 0x04BB, 0x04BB }, 2389 + .{ 0x04D9, 0x04D9 }, 2390 + .{ 0x04E9, 0x04E9 }, 2391 + .{ 0x0591, 0x05F4 }, 2392 + .{ 0x0620, 0x064A }, 2393 + .{ 0x066E, 0x06D5 }, 2394 + .{ 0x06E5, 0x06FF }, 2395 + .{ 0x0750, 0x077F }, 2396 + .{ 0x08A0, 0x08BD }, 2397 + .{ 0x0900, 0x09FF }, 2398 + .{ 0x0B80, 0x0CFF }, 2399 + .{ 0x0D80, 0x0DFF }, 2400 + .{ 0x1100, 0x137F }, 2401 + .{ 0x1D00, 0x1D25 }, 2402 + .{ 0x1D6B, 0x1D77 }, 2403 + .{ 0x1D79, 0x1D9A }, 2404 + .{ 0x1E01, 0x1E01 }, 2405 + .{ 
0x1E03, 0x1E03 }, 2406 + .{ 0x1E05, 0x1E05 }, 2407 + .{ 0x1E07, 0x1E07 }, 2408 + .{ 0x1E09, 0x1E09 }, 2409 + .{ 0x1E0B, 0x1E0B }, 2410 + .{ 0x1E0D, 0x1E0D }, 2411 + .{ 0x1E0F, 0x1E0F }, 2412 + .{ 0x1E11, 0x1E11 }, 2413 + .{ 0x1E13, 0x1E13 }, 2414 + .{ 0x1E15, 0x1E15 }, 2415 + .{ 0x1E17, 0x1E17 }, 2416 + .{ 0x1E19, 0x1E19 }, 2417 + .{ 0x1E1B, 0x1E1B }, 2418 + .{ 0x1E1D, 0x1E1D }, 2419 + .{ 0x1E1F, 0x1E1F }, 2420 + .{ 0x1E21, 0x1E21 }, 2421 + .{ 0x1E23, 0x1E23 }, 2422 + .{ 0x1E25, 0x1E25 }, 2423 + .{ 0x1E27, 0x1E27 }, 2424 + .{ 0x1E29, 0x1E29 }, 2425 + .{ 0x1E2B, 0x1E2B }, 2426 + .{ 0x1E2D, 0x1E2D }, 2427 + .{ 0x1E2F, 0x1E2F }, 2428 + .{ 0x1E31, 0x1E31 }, 2429 + .{ 0x1E33, 0x1E33 }, 2430 + .{ 0x1E35, 0x1E35 }, 2431 + .{ 0x1E37, 0x1E37 }, 2432 + .{ 0x1E39, 0x1E39 }, 2433 + .{ 0x1E3B, 0x1E3B }, 2434 + .{ 0x1E3D, 0x1E3D }, 2435 + .{ 0x1E3F, 0x1E3F }, 2436 + .{ 0x1E41, 0x1E41 }, 2437 + .{ 0x1E43, 0x1E43 }, 2438 + .{ 0x1E45, 0x1E45 }, 2439 + .{ 0x1E47, 0x1E47 }, 2440 + .{ 0x1E49, 0x1E49 }, 2441 + .{ 0x1E4B, 0x1E4B }, 2442 + .{ 0x1E4D, 0x1E4D }, 2443 + .{ 0x1E4F, 0x1E4F }, 2444 + .{ 0x1E51, 0x1E51 }, 2445 + .{ 0x1E53, 0x1E53 }, 2446 + .{ 0x1E55, 0x1E55 }, 2447 + .{ 0x1E57, 0x1E57 }, 2448 + .{ 0x1E59, 0x1E59 }, 2449 + .{ 0x1E5B, 0x1E5B }, 2450 + .{ 0x1E5D, 0x1E5D }, 2451 + .{ 0x1E5F, 0x1E5F }, 2452 + .{ 0x1E61, 0x1E61 }, 2453 + .{ 0x1E63, 0x1E63 }, 2454 + .{ 0x1E65, 0x1E65 }, 2455 + .{ 0x1E67, 0x1E67 }, 2456 + .{ 0x1E69, 0x1E69 }, 2457 + .{ 0x1E6B, 0x1E6B }, 2458 + .{ 0x1E6D, 0x1E6D }, 2459 + .{ 0x1E6F, 0x1E6F }, 2460 + .{ 0x1E71, 0x1E71 }, 2461 + .{ 0x1E73, 0x1E73 }, 2462 + .{ 0x1E75, 0x1E75 }, 2463 + .{ 0x1E77, 0x1E77 }, 2464 + .{ 0x1E79, 0x1E79 }, 2465 + .{ 0x1E7B, 0x1E7B }, 2466 + .{ 0x1E7D, 0x1E7D }, 2467 + .{ 0x1E7F, 0x1E7F }, 2468 + .{ 0x1E81, 0x1E81 }, 2469 + .{ 0x1E83, 0x1E83 }, 2470 + .{ 0x1E85, 0x1E85 }, 2471 + .{ 0x1E87, 0x1E87 }, 2472 + .{ 0x1E89, 0x1E89 }, 2473 + .{ 0x1E8B, 0x1E8B }, 2474 + .{ 0x1E8D, 0x1E8D }, 2475 + .{ 0x1E8F, 0x1E8F }, 2476 + .{ 0x1E91, 
0x1E91 }, 2477 + .{ 0x1E93, 0x1E93 }, 2478 + .{ 0x1E95, 0x1E9D }, 2479 + .{ 0x1E9F, 0x1E9F }, 2480 + .{ 0x1EA1, 0x1EA1 }, 2481 + .{ 0x1EA3, 0x1EA3 }, 2482 + .{ 0x1EA5, 0x1EA5 }, 2483 + .{ 0x1EA7, 0x1EA7 }, 2484 + .{ 0x1EA9, 0x1EA9 }, 2485 + .{ 0x1EAB, 0x1EAB }, 2486 + .{ 0x1EAD, 0x1EAD }, 2487 + .{ 0x1EAF, 0x1EAF }, 2488 + .{ 0x1EB1, 0x1EB1 }, 2489 + .{ 0x1EB3, 0x1EB3 }, 2490 + .{ 0x1EB5, 0x1EB5 }, 2491 + .{ 0x1EB7, 0x1EB7 }, 2492 + .{ 0x1EB9, 0x1EB9 }, 2493 + .{ 0x1EBB, 0x1EBB }, 2494 + .{ 0x1EBD, 0x1EBD }, 2495 + .{ 0x1EBF, 0x1EBF }, 2496 + .{ 0x1EC1, 0x1EC1 }, 2497 + .{ 0x1EC3, 0x1EC3 }, 2498 + .{ 0x1EC5, 0x1EC5 }, 2499 + .{ 0x1EC7, 0x1EC7 }, 2500 + .{ 0x1EC9, 0x1EC9 }, 2501 + .{ 0x1ECB, 0x1ECB }, 2502 + .{ 0x1ECD, 0x1ECD }, 2503 + .{ 0x1ECF, 0x1ECF }, 2504 + .{ 0x1ED1, 0x1ED1 }, 2505 + .{ 0x1ED3, 0x1ED3 }, 2506 + .{ 0x1ED5, 0x1ED5 }, 2507 + .{ 0x1ED7, 0x1ED7 }, 2508 + .{ 0x1ED9, 0x1ED9 }, 2509 + .{ 0x1EDB, 0x1EDB }, 2510 + .{ 0x1EDD, 0x1EDD }, 2511 + .{ 0x1EDF, 0x1EDF }, 2512 + .{ 0x1EE1, 0x1EE1 }, 2513 + .{ 0x1EE3, 0x1EE3 }, 2514 + .{ 0x1EE5, 0x1EE5 }, 2515 + .{ 0x1EE7, 0x1EE7 }, 2516 + .{ 0x1EE9, 0x1EE9 }, 2517 + .{ 0x1EEB, 0x1EEB }, 2518 + .{ 0x1EED, 0x1EED }, 2519 + .{ 0x1EEF, 0x1EEF }, 2520 + .{ 0x1EF1, 0x1EF1 }, 2521 + .{ 0x1EF3, 0x1EF3 }, 2522 + .{ 0x1EF5, 0x1EF5 }, 2523 + .{ 0x1EF7, 0x1EF7 }, 2524 + .{ 0x1EF9, 0x1EF9 }, 2525 + .{ 0x1EFB, 0x1EFB }, 2526 + .{ 0x1EFD, 0x1EFD }, 2527 + .{ 0x1EFF, 0x1EFF }, 2528 + .{ 0x2018, 0x201A }, 2529 + .{ 0x201C, 0x201E }, 2530 + .{ 0x2329, 0x232A }, 2531 + .{ 0x27E6, 0x27E7 }, 2532 + .{ 0x2C61, 0x2C61 }, 2533 + .{ 0x2C65, 0x2C66 }, 2534 + .{ 0x2C68, 0x2C68 }, 2535 + .{ 0x2C6A, 0x2C6A }, 2536 + .{ 0x2C6C, 0x2C6C }, 2537 + .{ 0x2C71, 0x2C71 }, 2538 + .{ 0x2C73, 0x2C74 }, 2539 + .{ 0x2C76, 0x2C7B }, 2540 + .{ 0x2E80, 0x2FDF }, 2541 + .{ 0x2FF0, 0x30FF }, 2542 + .{ 0x31C0, 0x31EF }, 2543 + .{ 0x3200, 0x4DBF }, 2544 + .{ 0x4E00, 0x9FFF }, 2545 + .{ 0xA723, 0xA723 }, 2546 + .{ 0xA725, 0xA725 }, 2547 + .{ 0xA727, 0xA727 }, 
2548 + .{ 0xA729, 0xA729 }, 2549 + .{ 0xA72B, 0xA72B }, 2550 + .{ 0xA72D, 0xA72D }, 2551 + .{ 0xA72F, 0xA731 }, 2552 + .{ 0xA733, 0xA733 }, 2553 + .{ 0xA735, 0xA735 }, 2554 + .{ 0xA737, 0xA737 }, 2555 + .{ 0xA739, 0xA739 }, 2556 + .{ 0xA73B, 0xA73B }, 2557 + .{ 0xA73D, 0xA73D }, 2558 + .{ 0xA73F, 0xA73F }, 2559 + .{ 0xA741, 0xA741 }, 2560 + .{ 0xA743, 0xA743 }, 2561 + .{ 0xA745, 0xA745 }, 2562 + .{ 0xA747, 0xA747 }, 2563 + .{ 0xA749, 0xA749 }, 2564 + .{ 0xA74B, 0xA74B }, 2565 + .{ 0xA74D, 0xA74D }, 2566 + .{ 0xA74F, 0xA74F }, 2567 + .{ 0xA751, 0xA751 }, 2568 + .{ 0xA753, 0xA753 }, 2569 + .{ 0xA755, 0xA755 }, 2570 + .{ 0xA757, 0xA757 }, 2571 + .{ 0xA759, 0xA759 }, 2572 + .{ 0xA75B, 0xA75B }, 2573 + .{ 0xA75D, 0xA75D }, 2574 + .{ 0xA75F, 0xA75F }, 2575 + .{ 0xA761, 0xA761 }, 2576 + .{ 0xA763, 0xA763 }, 2577 + .{ 0xA765, 0xA765 }, 2578 + .{ 0xA767, 0xA767 }, 2579 + .{ 0xA769, 0xA769 }, 2580 + .{ 0xA76B, 0xA76B }, 2581 + .{ 0xA76D, 0xA76D }, 2582 + .{ 0xA76F, 0xA76F }, 2583 + .{ 0xA771, 0xA778 }, 2584 + .{ 0xA77A, 0xA77A }, 2585 + .{ 0xA77C, 0xA77C }, 2586 + .{ 0xA77F, 0xA77F }, 2587 + .{ 0xA781, 0xA781 }, 2588 + .{ 0xA783, 0xA783 }, 2589 + .{ 0xA785, 0xA785 }, 2590 + .{ 0xA787, 0xA787 }, 2591 + .{ 0xA78C, 0xA78C }, 2592 + .{ 0xA78E, 0xA78E }, 2593 + .{ 0xA791, 0xA791 }, 2594 + .{ 0xA793, 0xA795 }, 2595 + .{ 0xA797, 0xA797 }, 2596 + .{ 0xA799, 0xA799 }, 2597 + .{ 0xA79B, 0xA79B }, 2598 + .{ 0xA79D, 0xA79D }, 2599 + .{ 0xA79F, 0xA79F }, 2600 + .{ 0xA7A1, 0xA7A1 }, 2601 + .{ 0xA7A3, 0xA7A3 }, 2602 + .{ 0xA7A5, 0xA7A5 }, 2603 + .{ 0xA7A7, 0xA7A7 }, 2604 + .{ 0xA7A9, 0xA7A9 }, 2605 + .{ 0xA7AF, 0xA7AF }, 2606 + .{ 0xA7B5, 0xA7B5 }, 2607 + .{ 0xA7B7, 0xA7B7 }, 2608 + .{ 0xA7B9, 0xA7B9 }, 2609 + .{ 0xA7FA, 0xA7FA }, 2610 + .{ 0xAB30, 0xAB5A }, 2611 + .{ 0xAB60, 0xAB64 }, 2612 + .{ 0xAC00, 0xD7AF }, 2613 + .{ 0xF900, 0xFAFF }, 2614 + .{ 0xFB1D, 0xFBB1 }, 2615 + .{ 0xFBD3, 0xFD3D }, 2616 + .{ 0xFD50, 0xFDC7 }, 2617 + .{ 0xFDF0, 0xFDFB }, 2618 + .{ 0xFE30, 0xFE4F }, 2619 + .{ 
0xFE70, 0xFEFC }, 2620 + .{ 0xFF08, 0xFF09 }, 2621 + .{ 0xFF41, 0xFF5A }, 2622 + .{ 0x1EE00, 0x1EEBB }, 2623 + .{ 0x1F200, 0x1F2FF }, 2624 + .{ 0x20000, 0x2A6DF }, 2625 + .{ 0x2A700, 0x2EBEF }, 2626 + .{ 0x2F800, 0x2FA1F }, 2627 + }; 2628 + 2629 + pub fn is_infix_4_behind(c: u21) bool { 2630 + return rangeContains(&is_infix_4_behind_ranges, c); 2631 + } 2632 + 2633 + pub const is_infix_5_ahead_ranges = [_][2]u21{ 2634 + .{ 0x0041, 0x005A }, 2635 + .{ 0x0061, 0x007A }, 2636 + .{ 0x00C0, 0x00D6 }, 2637 + .{ 0x00D8, 0x00F6 }, 2638 + .{ 0x00F8, 0x01BF }, 2639 + .{ 0x01C4, 0x02AF }, 2640 + .{ 0x0386, 0x0386 }, 2641 + .{ 0x0388, 0x038A }, 2642 + .{ 0x038C, 0x038C }, 2643 + .{ 0x038E, 0x038F }, 2644 + .{ 0x0391, 0x03A9 }, 2645 + .{ 0x03AC, 0x03AF }, 2646 + .{ 0x03B1, 0x03C9 }, 2647 + .{ 0x03CC, 0x03CE }, 2648 + .{ 0x0400, 0x0401 }, 2649 + .{ 0x0403, 0x040A }, 2650 + .{ 0x040C, 0x040D }, 2651 + .{ 0x0410, 0x0451 }, 2652 + .{ 0x0453, 0x045A }, 2653 + .{ 0x045C, 0x045D }, 2654 + .{ 0x0490, 0x0491 }, 2655 + .{ 0x0496, 0x0497 }, 2656 + .{ 0x04A2, 0x04A3 }, 2657 + .{ 0x04AE, 0x04AF }, 2658 + .{ 0x04BA, 0x04BB }, 2659 + .{ 0x04D8, 0x04D9 }, 2660 + .{ 0x04E8, 0x04E9 }, 2661 + .{ 0x0591, 0x05F4 }, 2662 + .{ 0x0620, 0x064A }, 2663 + .{ 0x066E, 0x06D5 }, 2664 + .{ 0x06E5, 0x06FF }, 2665 + .{ 0x0750, 0x077F }, 2666 + .{ 0x08A0, 0x08BD }, 2667 + .{ 0x0900, 0x09FF }, 2668 + .{ 0x0B80, 0x0CFF }, 2669 + .{ 0x0D80, 0x0DFF }, 2670 + .{ 0x1100, 0x137F }, 2671 + .{ 0x1D00, 0x1D25 }, 2672 + .{ 0x1D6B, 0x1D77 }, 2673 + .{ 0x1D79, 0x1D9A }, 2674 + .{ 0x1E00, 0x1EFF }, 2675 + .{ 0x2C60, 0x2C7B }, 2676 + .{ 0x2C7E, 0x2C7F }, 2677 + .{ 0x2E80, 0x2FDF }, 2678 + .{ 0x2FF0, 0x30FF }, 2679 + .{ 0x31C0, 0x31EF }, 2680 + .{ 0x3200, 0x4DBF }, 2681 + .{ 0x4E00, 0x9FFF }, 2682 + .{ 0xA722, 0xA76F }, 2683 + .{ 0xA771, 0xA787 }, 2684 + .{ 0xA78B, 0xA78E }, 2685 + .{ 0xA790, 0xA7B9 }, 2686 + .{ 0xA7FA, 0xA7FA }, 2687 + .{ 0xAB30, 0xAB5A }, 2688 + .{ 0xAB60, 0xAB64 }, 2689 + .{ 0xAC00, 0xD7AF }, 2690 + .{ 
0xF900, 0xFAFF }, 2691 + .{ 0xFB1D, 0xFBB1 }, 2692 + .{ 0xFBD3, 0xFD3D }, 2693 + .{ 0xFD50, 0xFDC7 }, 2694 + .{ 0xFDF0, 0xFDFB }, 2695 + .{ 0xFE30, 0xFE4F }, 2696 + .{ 0xFE70, 0xFEFC }, 2697 + .{ 0xFF21, 0xFF3A }, 2698 + .{ 0xFF41, 0xFF5A }, 2699 + .{ 0x1EE00, 0x1EEBB }, 2700 + .{ 0x1F200, 0x1F2FF }, 2701 + .{ 0x20000, 0x2A6DF }, 2702 + .{ 0x2A700, 0x2EBEF }, 2703 + .{ 0x2F800, 0x2FA1F }, 2704 + }; 2705 + 2706 + pub fn is_infix_5_ahead(c: u21) bool { 2707 + return rangeContains(&is_infix_5_ahead_ranges, c); 2708 + } 2709 + 2710 + pub const is_infix_5_behind_ranges = [_][2]u21{ 2711 + .{ 0x0041, 0x005A }, 2712 + .{ 0x0061, 0x007A }, 2713 + .{ 0x00C0, 0x00D6 }, 2714 + .{ 0x00D8, 0x00F6 }, 2715 + .{ 0x00F8, 0x01BF }, 2716 + .{ 0x01C4, 0x02AF }, 2717 + .{ 0x0386, 0x0386 }, 2718 + .{ 0x0388, 0x038A }, 2719 + .{ 0x038C, 0x038C }, 2720 + .{ 0x038E, 0x038F }, 2721 + .{ 0x0391, 0x03A9 }, 2722 + .{ 0x03AC, 0x03AF }, 2723 + .{ 0x03B1, 0x03C9 }, 2724 + .{ 0x03CC, 0x03CE }, 2725 + .{ 0x0400, 0x0401 }, 2726 + .{ 0x0403, 0x040A }, 2727 + .{ 0x040C, 0x040D }, 2728 + .{ 0x0410, 0x0451 }, 2729 + .{ 0x0453, 0x045A }, 2730 + .{ 0x045C, 0x045D }, 2731 + .{ 0x0490, 0x0491 }, 2732 + .{ 0x0496, 0x0497 }, 2733 + .{ 0x04A2, 0x04A3 }, 2734 + .{ 0x04AE, 0x04AF }, 2735 + .{ 0x04BA, 0x04BB }, 2736 + .{ 0x04D8, 0x04D9 }, 2737 + .{ 0x04E8, 0x04E9 }, 2738 + .{ 0x0591, 0x05F4 }, 2739 + .{ 0x0620, 0x064A }, 2740 + .{ 0x066E, 0x06D5 }, 2741 + .{ 0x06E5, 0x06FF }, 2742 + .{ 0x0750, 0x077F }, 2743 + .{ 0x08A0, 0x08BD }, 2744 + .{ 0x0900, 0x09FF }, 2745 + .{ 0x0B80, 0x0CFF }, 2746 + .{ 0x0D80, 0x0DFF }, 2747 + .{ 0x1100, 0x137F }, 2748 + .{ 0x1D00, 0x1D25 }, 2749 + .{ 0x1D6B, 0x1D77 }, 2750 + .{ 0x1D79, 0x1D9A }, 2751 + .{ 0x1E00, 0x1EFF }, 2752 + .{ 0x2C60, 0x2C7B }, 2753 + .{ 0x2C7E, 0x2C7F }, 2754 + .{ 0x2E80, 0x2FDF }, 2755 + .{ 0x2FF0, 0x30FF }, 2756 + .{ 0x31C0, 0x31EF }, 2757 + .{ 0x3200, 0x4DBF }, 2758 + .{ 0x4E00, 0x9FFF }, 2759 + .{ 0xA722, 0xA76F }, 2760 + .{ 0xA771, 0xA787 }, 2761 + .{ 
0xA78B, 0xA78E }, 2762 + .{ 0xA790, 0xA7B9 }, 2763 + .{ 0xA7FA, 0xA7FA }, 2764 + .{ 0xAB30, 0xAB5A }, 2765 + .{ 0xAB60, 0xAB64 }, 2766 + .{ 0xAC00, 0xD7AF }, 2767 + .{ 0xF900, 0xFAFF }, 2768 + .{ 0xFB1D, 0xFBB1 }, 2769 + .{ 0xFBD3, 0xFD3D }, 2770 + .{ 0xFD50, 0xFDC7 }, 2771 + .{ 0xFDF0, 0xFDFB }, 2772 + .{ 0xFE30, 0xFE4F }, 2773 + .{ 0xFE70, 0xFEFC }, 2774 + .{ 0xFF21, 0xFF3A }, 2775 + .{ 0xFF41, 0xFF5A }, 2776 + .{ 0x1EE00, 0x1EEBB }, 2777 + .{ 0x1F200, 0x1F2FF }, 2778 + .{ 0x20000, 0x2A6DF }, 2779 + .{ 0x2A700, 0x2EBEF }, 2780 + .{ 0x2F800, 0x2FA1F }, 2781 + }; 2782 + 2783 + pub fn is_infix_5_behind(c: u21) bool { 2784 + return rangeContains(&is_infix_5_behind_ranges, c); 2785 + } 2786 + 2787 + pub const is_infix_6_ahead_ranges = [_][2]u21{ 2788 + .{ 0x0041, 0x005A }, 2789 + .{ 0x0061, 0x007A }, 2790 + .{ 0x00C0, 0x00D6 }, 2791 + .{ 0x00D8, 0x00F6 }, 2792 + .{ 0x00F8, 0x01BF }, 2793 + .{ 0x01C4, 0x02AF }, 2794 + .{ 0x0386, 0x0386 }, 2795 + .{ 0x0388, 0x038A }, 2796 + .{ 0x038C, 0x038C }, 2797 + .{ 0x038E, 0x038F }, 2798 + .{ 0x0391, 0x03A9 }, 2799 + .{ 0x03AC, 0x03AF }, 2800 + .{ 0x03B1, 0x03C9 }, 2801 + .{ 0x03CC, 0x03CE }, 2802 + .{ 0x0400, 0x0401 }, 2803 + .{ 0x0403, 0x040A }, 2804 + .{ 0x040C, 0x040D }, 2805 + .{ 0x0410, 0x0451 }, 2806 + .{ 0x0453, 0x045A }, 2807 + .{ 0x045C, 0x045D }, 2808 + .{ 0x0490, 0x0491 }, 2809 + .{ 0x0496, 0x0497 }, 2810 + .{ 0x04A2, 0x04A3 }, 2811 + .{ 0x04AE, 0x04AF }, 2812 + .{ 0x04BA, 0x04BB }, 2813 + .{ 0x04D8, 0x04D9 }, 2814 + .{ 0x04E8, 0x04E9 }, 2815 + .{ 0x0591, 0x05F4 }, 2816 + .{ 0x0620, 0x064A }, 2817 + .{ 0x066E, 0x06D5 }, 2818 + .{ 0x06E5, 0x06FF }, 2819 + .{ 0x0750, 0x077F }, 2820 + .{ 0x08A0, 0x08BD }, 2821 + .{ 0x0900, 0x09FF }, 2822 + .{ 0x0B80, 0x0CFF }, 2823 + .{ 0x0D80, 0x0DFF }, 2824 + .{ 0x1100, 0x137F }, 2825 + .{ 0x1D00, 0x1D25 }, 2826 + .{ 0x1D6B, 0x1D77 }, 2827 + .{ 0x1D79, 0x1D9A }, 2828 + .{ 0x1E00, 0x1EFF }, 2829 + .{ 0x2C60, 0x2C7B }, 2830 + .{ 0x2C7E, 0x2C7F }, 2831 + .{ 0x2E80, 0x2FDF }, 2832 + .{ 
0x2FF0, 0x30FF }, 2833 + .{ 0x31C0, 0x31EF }, 2834 + .{ 0x3200, 0x4DBF }, 2835 + .{ 0x4E00, 0x9FFF }, 2836 + .{ 0xA722, 0xA76F }, 2837 + .{ 0xA771, 0xA787 }, 2838 + .{ 0xA78B, 0xA78E }, 2839 + .{ 0xA790, 0xA7B9 }, 2840 + .{ 0xA7FA, 0xA7FA }, 2841 + .{ 0xAB30, 0xAB5A }, 2842 + .{ 0xAB60, 0xAB64 }, 2843 + .{ 0xAC00, 0xD7AF }, 2844 + .{ 0xF900, 0xFAFF }, 2845 + .{ 0xFB1D, 0xFBB1 }, 2846 + .{ 0xFBD3, 0xFD3D }, 2847 + .{ 0xFD50, 0xFDC7 }, 2848 + .{ 0xFDF0, 0xFDFB }, 2849 + .{ 0xFE30, 0xFE4F }, 2850 + .{ 0xFE70, 0xFEFC }, 2851 + .{ 0xFF21, 0xFF3A }, 2852 + .{ 0xFF41, 0xFF5A }, 2853 + .{ 0x1EE00, 0x1EEBB }, 2854 + .{ 0x1F200, 0x1F2FF }, 2855 + .{ 0x20000, 0x2A6DF }, 2856 + .{ 0x2A700, 0x2EBEF }, 2857 + .{ 0x2F800, 0x2FA1F }, 2858 + }; 2859 + 2860 + pub fn is_infix_6_ahead(c: u21) bool { 2861 + return rangeContains(&is_infix_6_ahead_ranges, c); 2862 + } 2863 + 2864 + pub const is_infix_6_behind_ranges = [_][2]u21{ 2865 + .{ 0x0030, 0x0039 }, 2866 + .{ 0x0041, 0x005A }, 2867 + .{ 0x0061, 0x007A }, 2868 + .{ 0x00C0, 0x00D6 }, 2869 + .{ 0x00D8, 0x00F6 }, 2870 + .{ 0x00F8, 0x01BF }, 2871 + .{ 0x01C4, 0x02AF }, 2872 + .{ 0x0386, 0x0386 }, 2873 + .{ 0x0388, 0x038A }, 2874 + .{ 0x038C, 0x038C }, 2875 + .{ 0x038E, 0x038F }, 2876 + .{ 0x0391, 0x03A9 }, 2877 + .{ 0x03AC, 0x03AF }, 2878 + .{ 0x03B1, 0x03C9 }, 2879 + .{ 0x03CC, 0x03CE }, 2880 + .{ 0x0400, 0x0401 }, 2881 + .{ 0x0403, 0x040A }, 2882 + .{ 0x040C, 0x040D }, 2883 + .{ 0x0410, 0x0451 }, 2884 + .{ 0x0453, 0x045A }, 2885 + .{ 0x045C, 0x045D }, 2886 + .{ 0x0490, 0x0491 }, 2887 + .{ 0x0496, 0x0497 }, 2888 + .{ 0x04A2, 0x04A3 }, 2889 + .{ 0x04AE, 0x04AF }, 2890 + .{ 0x04BA, 0x04BB }, 2891 + .{ 0x04D8, 0x04D9 }, 2892 + .{ 0x04E8, 0x04E9 }, 2893 + .{ 0x0591, 0x05F4 }, 2894 + .{ 0x0620, 0x064A }, 2895 + .{ 0x066E, 0x06D5 }, 2896 + .{ 0x06E5, 0x06FF }, 2897 + .{ 0x0750, 0x077F }, 2898 + .{ 0x08A0, 0x08BD }, 2899 + .{ 0x0900, 0x09FF }, 2900 + .{ 0x0B80, 0x0CFF }, 2901 + .{ 0x0D80, 0x0DFF }, 2902 + .{ 0x1100, 0x137F }, 2903 + .{ 
0x1D00, 0x1D25 }, 2904 + .{ 0x1D6B, 0x1D77 }, 2905 + .{ 0x1D79, 0x1D9A }, 2906 + .{ 0x1E00, 0x1EFF }, 2907 + .{ 0x2C60, 0x2C7B }, 2908 + .{ 0x2C7E, 0x2C7F }, 2909 + .{ 0x2E80, 0x2FDF }, 2910 + .{ 0x2FF0, 0x30FF }, 2911 + .{ 0x31C0, 0x31EF }, 2912 + .{ 0x3200, 0x4DBF }, 2913 + .{ 0x4E00, 0x9FFF }, 2914 + .{ 0xA722, 0xA76F }, 2915 + .{ 0xA771, 0xA787 }, 2916 + .{ 0xA78B, 0xA78E }, 2917 + .{ 0xA790, 0xA7B9 }, 2918 + .{ 0xA7FA, 0xA7FA }, 2919 + .{ 0xAB30, 0xAB5A }, 2920 + .{ 0xAB60, 0xAB64 }, 2921 + .{ 0xAC00, 0xD7AF }, 2922 + .{ 0xF900, 0xFAFF }, 2923 + .{ 0xFB1D, 0xFBB1 }, 2924 + .{ 0xFBD3, 0xFD3D }, 2925 + .{ 0xFD50, 0xFDC7 }, 2926 + .{ 0xFDF0, 0xFDFB }, 2927 + .{ 0xFE30, 0xFE4F }, 2928 + .{ 0xFE70, 0xFEFC }, 2929 + .{ 0xFF21, 0xFF3A }, 2930 + .{ 0xFF41, 0xFF5A }, 2931 + .{ 0x1EE00, 0x1EEBB }, 2932 + .{ 0x1F200, 0x1F2FF }, 2933 + .{ 0x20000, 0x2A6DF }, 2934 + .{ 0x2A700, 0x2EBEF }, 2935 + .{ 0x2F800, 0x2FA1F }, 2936 + }; 2937 + 2938 + pub fn is_infix_6_behind(c: u21) bool { 2939 + return rangeContains(&is_infix_6_behind_ranges, c); 2940 + } 2941 + 2942 + pub const is_infix_7_ahead_ranges = [_][2]u21{ 2943 + .{ 0x0041, 0x005A }, 2944 + .{ 0x0061, 0x007A }, 2945 + .{ 0x00C0, 0x00D6 }, 2946 + .{ 0x00D8, 0x00F6 }, 2947 + .{ 0x00F8, 0x01BF }, 2948 + .{ 0x01C4, 0x02AF }, 2949 + .{ 0x0386, 0x0386 }, 2950 + .{ 0x0388, 0x038A }, 2951 + .{ 0x038C, 0x038C }, 2952 + .{ 0x038E, 0x038F }, 2953 + .{ 0x0391, 0x03A9 }, 2954 + .{ 0x03AC, 0x03AF }, 2955 + .{ 0x03B1, 0x03C9 }, 2956 + .{ 0x03CC, 0x03CE }, 2957 + .{ 0x0400, 0x0401 }, 2958 + .{ 0x0403, 0x040A }, 2959 + .{ 0x040C, 0x040D }, 2960 + .{ 0x0410, 0x0451 }, 2961 + .{ 0x0453, 0x045A }, 2962 + .{ 0x045C, 0x045D }, 2963 + .{ 0x0490, 0x0491 }, 2964 + .{ 0x0496, 0x0497 }, 2965 + .{ 0x04A2, 0x04A3 }, 2966 + .{ 0x04AE, 0x04AF }, 2967 + .{ 0x04BA, 0x04BB }, 2968 + .{ 0x04D8, 0x04D9 }, 2969 + .{ 0x04E8, 0x04E9 }, 2970 + .{ 0x0591, 0x05F4 }, 2971 + .{ 0x0620, 0x064A }, 2972 + .{ 0x066E, 0x06D5 }, 2973 + .{ 0x06E5, 0x06FF }, 2974 + .{ 
0x0750, 0x077F }, 2975 + .{ 0x08A0, 0x08BD }, 2976 + .{ 0x0900, 0x09FF }, 2977 + .{ 0x0B80, 0x0CFF }, 2978 + .{ 0x0D80, 0x0DFF }, 2979 + .{ 0x1100, 0x137F }, 2980 + .{ 0x1D00, 0x1D25 }, 2981 + .{ 0x1D6B, 0x1D77 }, 2982 + .{ 0x1D79, 0x1D9A }, 2983 + .{ 0x1E00, 0x1EFF }, 2984 + .{ 0x2C60, 0x2C7B }, 2985 + .{ 0x2C7E, 0x2C7F }, 2986 + .{ 0x2E80, 0x2FDF }, 2987 + .{ 0x2FF0, 0x30FF }, 2988 + .{ 0x31C0, 0x31EF }, 2989 + .{ 0x3200, 0x4DBF }, 2990 + .{ 0x4E00, 0x9FFF }, 2991 + .{ 0xA722, 0xA76F }, 2992 + .{ 0xA771, 0xA787 }, 2993 + .{ 0xA78B, 0xA78E }, 2994 + .{ 0xA790, 0xA7B9 }, 2995 + .{ 0xA7FA, 0xA7FA }, 2996 + .{ 0xAB30, 0xAB5A }, 2997 + .{ 0xAB60, 0xAB64 }, 2998 + .{ 0xAC00, 0xD7AF }, 2999 + .{ 0xF900, 0xFAFF }, 3000 + .{ 0xFB1D, 0xFBB1 }, 3001 + .{ 0xFBD3, 0xFD3D }, 3002 + .{ 0xFD50, 0xFDC7 }, 3003 + .{ 0xFDF0, 0xFDFB }, 3004 + .{ 0xFE30, 0xFE4F }, 3005 + .{ 0xFE70, 0xFEFC }, 3006 + .{ 0xFF21, 0xFF3A }, 3007 + .{ 0xFF41, 0xFF5A }, 3008 + .{ 0x1EE00, 0x1EEBB }, 3009 + .{ 0x1F200, 0x1F2FF }, 3010 + .{ 0x20000, 0x2A6DF }, 3011 + .{ 0x2A700, 0x2EBEF }, 3012 + .{ 0x2F800, 0x2FA1F }, 3013 + }; 3014 + 3015 + pub fn is_infix_7_ahead(c: u21) bool { 3016 + return rangeContains(&is_infix_7_ahead_ranges, c); 3017 + } 3018 + 3019 + pub const is_infix_7_behind_ranges = [_][2]u21{ 3020 + .{ 0x0030, 0x0039 }, 3021 + .{ 0x0041, 0x005A }, 3022 + .{ 0x0061, 0x007A }, 3023 + .{ 0x00C0, 0x00D6 }, 3024 + .{ 0x00D8, 0x00F6 }, 3025 + .{ 0x00F8, 0x01BF }, 3026 + .{ 0x01C4, 0x02AF }, 3027 + .{ 0x0386, 0x0386 }, 3028 + .{ 0x0388, 0x038A }, 3029 + .{ 0x038C, 0x038C }, 3030 + .{ 0x038E, 0x038F }, 3031 + .{ 0x0391, 0x03A9 }, 3032 + .{ 0x03AC, 0x03AF }, 3033 + .{ 0x03B1, 0x03C9 }, 3034 + .{ 0x03CC, 0x03CE }, 3035 + .{ 0x0400, 0x0401 }, 3036 + .{ 0x0403, 0x040A }, 3037 + .{ 0x040C, 0x040D }, 3038 + .{ 0x0410, 0x0451 }, 3039 + .{ 0x0453, 0x045A }, 3040 + .{ 0x045C, 0x045D }, 3041 + .{ 0x0490, 0x0491 }, 3042 + .{ 0x0496, 0x0497 }, 3043 + .{ 0x04A2, 0x04A3 }, 3044 + .{ 0x04AE, 0x04AF }, 3045 + .{ 
0x04BA, 0x04BB }, 3046 + .{ 0x04D8, 0x04D9 }, 3047 + .{ 0x04E8, 0x04E9 }, 3048 + .{ 0x0591, 0x05F4 }, 3049 + .{ 0x0620, 0x064A }, 3050 + .{ 0x066E, 0x06D5 }, 3051 + .{ 0x06E5, 0x06FF }, 3052 + .{ 0x0750, 0x077F }, 3053 + .{ 0x08A0, 0x08BD }, 3054 + .{ 0x0900, 0x09FF }, 3055 + .{ 0x0B80, 0x0CFF }, 3056 + .{ 0x0D80, 0x0DFF }, 3057 + .{ 0x1100, 0x137F }, 3058 + .{ 0x1D00, 0x1D25 }, 3059 + .{ 0x1D6B, 0x1D77 }, 3060 + .{ 0x1D79, 0x1D9A }, 3061 + .{ 0x1E00, 0x1EFF }, 3062 + .{ 0x2C60, 0x2C7B }, 3063 + .{ 0x2C7E, 0x2C7F }, 3064 + .{ 0x2E80, 0x2FDF }, 3065 + .{ 0x2FF0, 0x30FF }, 3066 + .{ 0x31C0, 0x31EF }, 3067 + .{ 0x3200, 0x4DBF }, 3068 + .{ 0x4E00, 0x9FFF }, 3069 + .{ 0xA722, 0xA76F }, 3070 + .{ 0xA771, 0xA787 }, 3071 + .{ 0xA78B, 0xA78E }, 3072 + .{ 0xA790, 0xA7B9 }, 3073 + .{ 0xA7FA, 0xA7FA }, 3074 + .{ 0xAB30, 0xAB5A }, 3075 + .{ 0xAB60, 0xAB64 }, 3076 + .{ 0xAC00, 0xD7AF }, 3077 + .{ 0xF900, 0xFAFF }, 3078 + .{ 0xFB1D, 0xFBB1 }, 3079 + .{ 0xFBD3, 0xFD3D }, 3080 + .{ 0xFD50, 0xFDC7 }, 3081 + .{ 0xFDF0, 0xFDFB }, 3082 + .{ 0xFE30, 0xFE4F }, 3083 + .{ 0xFE70, 0xFEFC }, 3084 + .{ 0xFF21, 0xFF3A }, 3085 + .{ 0xFF41, 0xFF5A }, 3086 + .{ 0x1EE00, 0x1EEBB }, 3087 + .{ 0x1F200, 0x1F2FF }, 3088 + .{ 0x20000, 0x2A6DF }, 3089 + .{ 0x2A700, 0x2EBEF }, 3090 + .{ 0x2F800, 0x2FA1F }, 3091 + }; 3092 + 3093 + pub fn is_infix_7_behind(c: u21) bool { 3094 + return rangeContains(&is_infix_7_behind_ranges, c); 3095 + } 3096 + 3097 + // ── special cases ── 3098 + 3099 + pub const SpecialCase = struct { 3100 + tokens: [3][]const u8, 3101 + len: u8, 3102 + }; 3103 + 3104 + pub const specials = std.StaticStringMap(SpecialCase).initComptime(.{ 3105 + .{ "\t", SpecialCase{ .tokens = .{ "\t", "", "" }, .len = 1 } }, 3106 + .{ "\n", SpecialCase{ .tokens = .{ "\n", "", "" }, .len = 1 } }, 3107 + .{ " ", SpecialCase{ .tokens = .{ " ", "", "" }, .len = 1 } }, 3108 + .{ "'", SpecialCase{ .tokens = .{ "'", "", "" }, .len = 1 } }, 3109 + .{ "''", SpecialCase{ .tokens = .{ "''", "", "" }, .len = 1 } }, 
3110 + .{ "'Cause", SpecialCase{ .tokens = .{ "'Cause", "", "" }, .len = 1 } }, 3111 + .{ "'Cos", SpecialCase{ .tokens = .{ "'Cos", "", "" }, .len = 1 } }, 3112 + .{ "'Coz", SpecialCase{ .tokens = .{ "'Coz", "", "" }, .len = 1 } }, 3113 + .{ "'Cuz", SpecialCase{ .tokens = .{ "'Cuz", "", "" }, .len = 1 } }, 3114 + .{ "'S", SpecialCase{ .tokens = .{ "'S", "", "" }, .len = 1 } }, 3115 + .{ "'bout", SpecialCase{ .tokens = .{ "'bout", "", "" }, .len = 1 } }, 3116 + .{ "'cause", SpecialCase{ .tokens = .{ "'cause", "", "" }, .len = 1 } }, 3117 + .{ "'cos", SpecialCase{ .tokens = .{ "'cos", "", "" }, .len = 1 } }, 3118 + .{ "'coz", SpecialCase{ .tokens = .{ "'coz", "", "" }, .len = 1 } }, 3119 + .{ "'cuz", SpecialCase{ .tokens = .{ "'cuz", "", "" }, .len = 1 } }, 3120 + .{ "'d", SpecialCase{ .tokens = .{ "'d", "", "" }, .len = 1 } }, 3121 + .{ "'em", SpecialCase{ .tokens = .{ "'em", "", "" }, .len = 1 } }, 3122 + .{ "'ll", SpecialCase{ .tokens = .{ "'ll", "", "" }, .len = 1 } }, 3123 + .{ "'nuff", SpecialCase{ .tokens = .{ "'nuff", "", "" }, .len = 1 } }, 3124 + .{ "'re", SpecialCase{ .tokens = .{ "'re", "", "" }, .len = 1 } }, 3125 + .{ "'s", SpecialCase{ .tokens = .{ "'s", "", "" }, .len = 1 } }, 3126 + .{ "(*_*)", SpecialCase{ .tokens = .{ "(*_*)", "", "" }, .len = 1 } }, 3127 + .{ "(-8", SpecialCase{ .tokens = .{ "(-8", "", "" }, .len = 1 } }, 3128 + .{ "(-:", SpecialCase{ .tokens = .{ "(-:", "", "" }, .len = 1 } }, 3129 + .{ "(-;", SpecialCase{ .tokens = .{ "(-;", "", "" }, .len = 1 } }, 3130 + .{ "(-_-)", SpecialCase{ .tokens = .{ "(-_-)", "", "" }, .len = 1 } }, 3131 + .{ "(._.)", SpecialCase{ .tokens = .{ "(._.)", "", "" }, .len = 1 } }, 3132 + .{ "(:", SpecialCase{ .tokens = .{ "(:", "", "" }, .len = 1 } }, 3133 + .{ "(;", SpecialCase{ .tokens = .{ "(;", "", "" }, .len = 1 } }, 3134 + .{ "(=", SpecialCase{ .tokens = .{ "(=", "", "" }, .len = 1 } }, 3135 + .{ "(>_<)", SpecialCase{ .tokens = .{ "(>_<)", "", "" }, .len = 1 } }, 3136 + .{ "(^_^)", SpecialCase{ .tokens 
= .{ "(^_^)", "", "" }, .len = 1 } }, 3137 + .{ "(o:", SpecialCase{ .tokens = .{ "(o:", "", "" }, .len = 1 } }, 3138 + .{ "(\xc2\xac_\xc2\xac)", SpecialCase{ .tokens = .{ "(\xc2\xac_\xc2\xac)", "", "" }, .len = 1 } }, 3139 + .{ "(\xe0\xb2\xa0_\xe0\xb2\xa0)", SpecialCase{ .tokens = .{ "(\xe0\xb2\xa0_\xe0\xb2\xa0)", "", "" }, .len = 1 } }, 3140 + .{ "(\xe2\x95\xaf\xc2\xb0\xe2\x96\xa1\xc2\xb0\xef\xbc\x89\xe2\x95\xaf\xef\xb8\xb5\xe2\x94\xbb\xe2\x94\x81\xe2\x94\xbb", SpecialCase{ .tokens = .{ "(\xe2\x95\xaf\xc2\xb0\xe2\x96\xa1\xc2\xb0\xef\xbc\x89\xe2\x95\xaf\xef\xb8\xb5\xe2\x94\xbb\xe2\x94\x81\xe2\x94\xbb", "", "" }, .len = 1 } }, 3141 + .{ ")-:", SpecialCase{ .tokens = .{ ")-:", "", "" }, .len = 1 } }, 3142 + .{ "):", SpecialCase{ .tokens = .{ "):", "", "" }, .len = 1 } }, 3143 + .{ "-_-", SpecialCase{ .tokens = .{ "-_-", "", "" }, .len = 1 } }, 3144 + .{ "-__-", SpecialCase{ .tokens = .{ "-__-", "", "" }, .len = 1 } }, 3145 + .{ "._.", SpecialCase{ .tokens = .{ "._.", "", "" }, .len = 1 } }, 3146 + .{ "0.0", SpecialCase{ .tokens = .{ "0.0", "", "" }, .len = 1 } }, 3147 + .{ "0.o", SpecialCase{ .tokens = .{ "0.o", "", "" }, .len = 1 } }, 3148 + .{ "0_0", SpecialCase{ .tokens = .{ "0_0", "", "" }, .len = 1 } }, 3149 + .{ "0_o", SpecialCase{ .tokens = .{ "0_o", "", "" }, .len = 1 } }, 3150 + .{ "10a.m.", SpecialCase{ .tokens = .{ "10", "a.m.", "" }, .len = 2 } }, 3151 + .{ "10am", SpecialCase{ .tokens = .{ "10", "am", "" }, .len = 2 } }, 3152 + .{ "10p.m.", SpecialCase{ .tokens = .{ "10", "p.m.", "" }, .len = 2 } }, 3153 + .{ "10pm", SpecialCase{ .tokens = .{ "10", "pm", "" }, .len = 2 } }, 3154 + .{ "11a.m.", SpecialCase{ .tokens = .{ "11", "a.m.", "" }, .len = 2 } }, 3155 + .{ "11am", SpecialCase{ .tokens = .{ "11", "am", "" }, .len = 2 } }, 3156 + .{ "11p.m.", SpecialCase{ .tokens = .{ "11", "p.m.", "" }, .len = 2 } }, 3157 + .{ "11pm", SpecialCase{ .tokens = .{ "11", "pm", "" }, .len = 2 } }, 3158 + .{ "12a.m.", SpecialCase{ .tokens = .{ "12", "a.m.", "" }, .len = 2 
} }, 3159 + .{ "12am", SpecialCase{ .tokens = .{ "12", "am", "" }, .len = 2 } }, 3160 + .{ "12p.m.", SpecialCase{ .tokens = .{ "12", "p.m.", "" }, .len = 2 } }, 3161 + .{ "12pm", SpecialCase{ .tokens = .{ "12", "pm", "" }, .len = 2 } }, 3162 + .{ "1a.m.", SpecialCase{ .tokens = .{ "1", "a.m.", "" }, .len = 2 } }, 3163 + .{ "1am", SpecialCase{ .tokens = .{ "1", "am", "" }, .len = 2 } }, 3164 + .{ "1p.m.", SpecialCase{ .tokens = .{ "1", "p.m.", "" }, .len = 2 } }, 3165 + .{ "1pm", SpecialCase{ .tokens = .{ "1", "pm", "" }, .len = 2 } }, 3166 + .{ "2a.m.", SpecialCase{ .tokens = .{ "2", "a.m.", "" }, .len = 2 } }, 3167 + .{ "2am", SpecialCase{ .tokens = .{ "2", "am", "" }, .len = 2 } }, 3168 + .{ "2p.m.", SpecialCase{ .tokens = .{ "2", "p.m.", "" }, .len = 2 } }, 3169 + .{ "2pm", SpecialCase{ .tokens = .{ "2", "pm", "" }, .len = 2 } }, 3170 + .{ "3a.m.", SpecialCase{ .tokens = .{ "3", "a.m.", "" }, .len = 2 } }, 3171 + .{ "3am", SpecialCase{ .tokens = .{ "3", "am", "" }, .len = 2 } }, 3172 + .{ "3p.m.", SpecialCase{ .tokens = .{ "3", "p.m.", "" }, .len = 2 } }, 3173 + .{ "3pm", SpecialCase{ .tokens = .{ "3", "pm", "" }, .len = 2 } }, 3174 + .{ "4a.m.", SpecialCase{ .tokens = .{ "4", "a.m.", "" }, .len = 2 } }, 3175 + .{ "4am", SpecialCase{ .tokens = .{ "4", "am", "" }, .len = 2 } }, 3176 + .{ "4p.m.", SpecialCase{ .tokens = .{ "4", "p.m.", "" }, .len = 2 } }, 3177 + .{ "4pm", SpecialCase{ .tokens = .{ "4", "pm", "" }, .len = 2 } }, 3178 + .{ "5a.m.", SpecialCase{ .tokens = .{ "5", "a.m.", "" }, .len = 2 } }, 3179 + .{ "5am", SpecialCase{ .tokens = .{ "5", "am", "" }, .len = 2 } }, 3180 + .{ "5p.m.", SpecialCase{ .tokens = .{ "5", "p.m.", "" }, .len = 2 } }, 3181 + .{ "5pm", SpecialCase{ .tokens = .{ "5", "pm", "" }, .len = 2 } }, 3182 + .{ "6a.m.", SpecialCase{ .tokens = .{ "6", "a.m.", "" }, .len = 2 } }, 3183 + .{ "6am", SpecialCase{ .tokens = .{ "6", "am", "" }, .len = 2 } }, 3184 + .{ "6p.m.", SpecialCase{ .tokens = .{ "6", "p.m.", "" }, .len = 2 } }, 3185 + .{ 
"6pm", SpecialCase{ .tokens = .{ "6", "pm", "" }, .len = 2 } }, 3186 + .{ "7a.m.", SpecialCase{ .tokens = .{ "7", "a.m.", "" }, .len = 2 } }, 3187 + .{ "7am", SpecialCase{ .tokens = .{ "7", "am", "" }, .len = 2 } }, 3188 + .{ "7p.m.", SpecialCase{ .tokens = .{ "7", "p.m.", "" }, .len = 2 } }, 3189 + .{ "7pm", SpecialCase{ .tokens = .{ "7", "pm", "" }, .len = 2 } }, 3190 + .{ "8)", SpecialCase{ .tokens = .{ "8)", "", "" }, .len = 1 } }, 3191 + .{ "8-)", SpecialCase{ .tokens = .{ "8-)", "", "" }, .len = 1 } }, 3192 + .{ "8-D", SpecialCase{ .tokens = .{ "8-D", "", "" }, .len = 1 } }, 3193 + .{ "8D", SpecialCase{ .tokens = .{ "8D", "", "" }, .len = 1 } }, 3194 + .{ "8a.m.", SpecialCase{ .tokens = .{ "8", "a.m.", "" }, .len = 2 } }, 3195 + .{ "8am", SpecialCase{ .tokens = .{ "8", "am", "" }, .len = 2 } }, 3196 + .{ "8p.m.", SpecialCase{ .tokens = .{ "8", "p.m.", "" }, .len = 2 } }, 3197 + .{ "8pm", SpecialCase{ .tokens = .{ "8", "pm", "" }, .len = 2 } }, 3198 + .{ "9a.m.", SpecialCase{ .tokens = .{ "9", "a.m.", "" }, .len = 2 } }, 3199 + .{ "9am", SpecialCase{ .tokens = .{ "9", "am", "" }, .len = 2 } }, 3200 + .{ "9p.m.", SpecialCase{ .tokens = .{ "9", "p.m.", "" }, .len = 2 } }, 3201 + .{ "9pm", SpecialCase{ .tokens = .{ "9", "pm", "" }, .len = 2 } }, 3202 + .{ ":'(", SpecialCase{ .tokens = .{ ":'(", "", "" }, .len = 1 } }, 3203 + .{ ":')", SpecialCase{ .tokens = .{ ":')", "", "" }, .len = 1 } }, 3204 + .{ ":'-(", SpecialCase{ .tokens = .{ ":'-(", "", "" }, .len = 1 } }, 3205 + .{ ":'-)", SpecialCase{ .tokens = .{ ":'-)", "", "" }, .len = 1 } }, 3206 + .{ ":(", SpecialCase{ .tokens = .{ ":(", "", "" }, .len = 1 } }, 3207 + .{ ":((", SpecialCase{ .tokens = .{ ":((", "", "" }, .len = 1 } }, 3208 + .{ ":(((", SpecialCase{ .tokens = .{ ":(((", "", "" }, .len = 1 } }, 3209 + .{ ":()", SpecialCase{ .tokens = .{ ":()", "", "" }, .len = 1 } }, 3210 + .{ ":)", SpecialCase{ .tokens = .{ ":)", "", "" }, .len = 1 } }, 3211 + .{ ":))", SpecialCase{ .tokens = .{ ":))", "", "" }, 
.len = 1 } }, 3212 + .{ ":)))", SpecialCase{ .tokens = .{ ":)))", "", "" }, .len = 1 } }, 3213 + .{ ":*", SpecialCase{ .tokens = .{ ":*", "", "" }, .len = 1 } }, 3214 + .{ ":-(", SpecialCase{ .tokens = .{ ":-(", "", "" }, .len = 1 } }, 3215 + .{ ":-((", SpecialCase{ .tokens = .{ ":-((", "", "" }, .len = 1 } }, 3216 + .{ ":-(((", SpecialCase{ .tokens = .{ ":-(((", "", "" }, .len = 1 } }, 3217 + .{ ":-)", SpecialCase{ .tokens = .{ ":-)", "", "" }, .len = 1 } }, 3218 + .{ ":-))", SpecialCase{ .tokens = .{ ":-))", "", "" }, .len = 1 } }, 3219 + .{ ":-)))", SpecialCase{ .tokens = .{ ":-)))", "", "" }, .len = 1 } }, 3220 + .{ ":-*", SpecialCase{ .tokens = .{ ":-*", "", "" }, .len = 1 } }, 3221 + .{ ":-/", SpecialCase{ .tokens = .{ ":-/", "", "" }, .len = 1 } }, 3222 + .{ ":-0", SpecialCase{ .tokens = .{ ":-0", "", "" }, .len = 1 } }, 3223 + .{ ":-3", SpecialCase{ .tokens = .{ ":-3", "", "" }, .len = 1 } }, 3224 + .{ ":->", SpecialCase{ .tokens = .{ ":->", "", "" }, .len = 1 } }, 3225 + .{ ":-D", SpecialCase{ .tokens = .{ ":-D", "", "" }, .len = 1 } }, 3226 + .{ ":-O", SpecialCase{ .tokens = .{ ":-O", "", "" }, .len = 1 } }, 3227 + .{ ":-P", SpecialCase{ .tokens = .{ ":-P", "", "" }, .len = 1 } }, 3228 + .{ ":-X", SpecialCase{ .tokens = .{ ":-X", "", "" }, .len = 1 } }, 3229 + .{ ":-]", SpecialCase{ .tokens = .{ ":-]", "", "" }, .len = 1 } }, 3230 + .{ ":-o", SpecialCase{ .tokens = .{ ":-o", "", "" }, .len = 1 } }, 3231 + .{ ":-p", SpecialCase{ .tokens = .{ ":-p", "", "" }, .len = 1 } }, 3232 + .{ ":-x", SpecialCase{ .tokens = .{ ":-x", "", "" }, .len = 1 } }, 3233 + .{ ":-|", SpecialCase{ .tokens = .{ ":-|", "", "" }, .len = 1 } }, 3234 + .{ ":-}", SpecialCase{ .tokens = .{ ":-}", "", "" }, .len = 1 } }, 3235 + .{ ":/", SpecialCase{ .tokens = .{ ":/", "", "" }, .len = 1 } }, 3236 + .{ ":0", SpecialCase{ .tokens = .{ ":0", "", "" }, .len = 1 } }, 3237 + .{ ":1", SpecialCase{ .tokens = .{ ":1", "", "" }, .len = 1 } }, 3238 + .{ ":3", SpecialCase{ .tokens = .{ ":3", "", "" 
}, .len = 1 } }, 3239 + .{ ":>", SpecialCase{ .tokens = .{ ":>", "", "" }, .len = 1 } }, 3240 + .{ ":D", SpecialCase{ .tokens = .{ ":D", "", "" }, .len = 1 } }, 3241 + .{ ":O", SpecialCase{ .tokens = .{ ":O", "", "" }, .len = 1 } }, 3242 + .{ ":P", SpecialCase{ .tokens = .{ ":P", "", "" }, .len = 1 } }, 3243 + .{ ":X", SpecialCase{ .tokens = .{ ":X", "", "" }, .len = 1 } }, 3244 + .{ ":]", SpecialCase{ .tokens = .{ ":]", "", "" }, .len = 1 } }, 3245 + .{ ":o", SpecialCase{ .tokens = .{ ":o", "", "" }, .len = 1 } }, 3246 + .{ ":o)", SpecialCase{ .tokens = .{ ":o)", "", "" }, .len = 1 } }, 3247 + .{ ":p", SpecialCase{ .tokens = .{ ":p", "", "" }, .len = 1 } }, 3248 + .{ ":x", SpecialCase{ .tokens = .{ ":x", "", "" }, .len = 1 } }, 3249 + .{ ":|", SpecialCase{ .tokens = .{ ":|", "", "" }, .len = 1 } }, 3250 + .{ ":}", SpecialCase{ .tokens = .{ ":}", "", "" }, .len = 1 } }, 3251 + .{ ":\xe2\x80\x99(", SpecialCase{ .tokens = .{ ":\xe2\x80\x99(", "", "" }, .len = 1 } }, 3252 + .{ ":\xe2\x80\x99)", SpecialCase{ .tokens = .{ ":\xe2\x80\x99)", "", "" }, .len = 1 } }, 3253 + .{ ":\xe2\x80\x99-(", SpecialCase{ .tokens = .{ ":\xe2\x80\x99-(", "", "" }, .len = 1 } }, 3254 + .{ ":\xe2\x80\x99-)", SpecialCase{ .tokens = .{ ":\xe2\x80\x99-)", "", "" }, .len = 1 } }, 3255 + .{ ";)", SpecialCase{ .tokens = .{ ";)", "", "" }, .len = 1 } }, 3256 + .{ ";-)", SpecialCase{ .tokens = .{ ";-)", "", "" }, .len = 1 } }, 3257 + .{ ";-D", SpecialCase{ .tokens = .{ ";-D", "", "" }, .len = 1 } }, 3258 + .{ ";D", SpecialCase{ .tokens = .{ ";D", "", "" }, .len = 1 } }, 3259 + .{ ";_;", SpecialCase{ .tokens = .{ ";_;", "", "" }, .len = 1 } }, 3260 + .{ "<.<", SpecialCase{ .tokens = .{ "<.<", "", "" }, .len = 1 } }, 3261 + .{ "</3", SpecialCase{ .tokens = .{ "</3", "", "" }, .len = 1 } }, 3262 + .{ "<3", SpecialCase{ .tokens = .{ "<3", "", "" }, .len = 1 } }, 3263 + .{ "<33", SpecialCase{ .tokens = .{ "<33", "", "" }, .len = 1 } }, 3264 + .{ "<333", SpecialCase{ .tokens = .{ "<333", "", "" }, .len = 
1 } }, 3265 + .{ "<space>", SpecialCase{ .tokens = .{ "<space>", "", "" }, .len = 1 } }, 3266 + .{ "=(", SpecialCase{ .tokens = .{ "=(", "", "" }, .len = 1 } }, 3267 + .{ "=)", SpecialCase{ .tokens = .{ "=)", "", "" }, .len = 1 } }, 3268 + .{ "=/", SpecialCase{ .tokens = .{ "=/", "", "" }, .len = 1 } }, 3269 + .{ "=3", SpecialCase{ .tokens = .{ "=3", "", "" }, .len = 1 } }, 3270 + .{ "=D", SpecialCase{ .tokens = .{ "=D", "", "" }, .len = 1 } }, 3271 + .{ "=[", SpecialCase{ .tokens = .{ "=[", "", "" }, .len = 1 } }, 3272 + .{ "=]", SpecialCase{ .tokens = .{ "=]", "", "" }, .len = 1 } }, 3273 + .{ "=|", SpecialCase{ .tokens = .{ "=|", "", "" }, .len = 1 } }, 3274 + .{ ">.<", SpecialCase{ .tokens = .{ ">.<", "", "" }, .len = 1 } }, 3275 + .{ ">.>", SpecialCase{ .tokens = .{ ">.>", "", "" }, .len = 1 } }, 3276 + .{ ">:(", SpecialCase{ .tokens = .{ ">:(", "", "" }, .len = 1 } }, 3277 + .{ ">:o", SpecialCase{ .tokens = .{ ">:o", "", "" }, .len = 1 } }, 3278 + .{ "><(((*>", SpecialCase{ .tokens = .{ "><(((*>", "", "" }, .len = 1 } }, 3279 + .{ "@_@", SpecialCase{ .tokens = .{ "@_@", "", "" }, .len = 1 } }, 3280 + .{ "Adm.", SpecialCase{ .tokens = .{ "Adm.", "", "" }, .len = 1 } }, 3281 + .{ "Ain't", SpecialCase{ .tokens = .{ "Ai", "n't", "" }, .len = 2 } }, 3282 + .{ "Aint", SpecialCase{ .tokens = .{ "Ai", "nt", "" }, .len = 2 } }, 3283 + .{ "Ain\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Ai", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3284 + .{ "Ak.", SpecialCase{ .tokens = .{ "Ak.", "", "" }, .len = 1 } }, 3285 + .{ "Ala.", SpecialCase{ .tokens = .{ "Ala.", "", "" }, .len = 1 } }, 3286 + .{ "Apr.", SpecialCase{ .tokens = .{ "Apr.", "", "" }, .len = 1 } }, 3287 + .{ "Aren't", SpecialCase{ .tokens = .{ "Are", "n't", "" }, .len = 2 } }, 3288 + .{ "Arent", SpecialCase{ .tokens = .{ "Are", "nt", "" }, .len = 2 } }, 3289 + .{ "Aren\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Are", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3290 + .{ "Ariz.", SpecialCase{ .tokens = .{ "Ariz.", "", "" }, 
.len = 1 } }, 3291 + .{ "Ark.", SpecialCase{ .tokens = .{ "Ark.", "", "" }, .len = 1 } }, 3292 + .{ "Aug.", SpecialCase{ .tokens = .{ "Aug.", "", "" }, .len = 1 } }, 3293 + .{ "Bros.", SpecialCase{ .tokens = .{ "Bros.", "", "" }, .len = 1 } }, 3294 + .{ "C'mon", SpecialCase{ .tokens = .{ "C'm", "on", "" }, .len = 2 } }, 3295 + .{ "C++", SpecialCase{ .tokens = .{ "C++", "", "" }, .len = 1 } }, 3296 + .{ "Calif.", SpecialCase{ .tokens = .{ "Calif.", "", "" }, .len = 1 } }, 3297 + .{ "Can't", SpecialCase{ .tokens = .{ "Ca", "n't", "" }, .len = 2 } }, 3298 + .{ "Can't've", SpecialCase{ .tokens = .{ "Ca", "n't", "'ve" }, .len = 3 } }, 3299 + .{ "Cannot", SpecialCase{ .tokens = .{ "Can", "not", "" }, .len = 2 } }, 3300 + .{ "Cant", SpecialCase{ .tokens = .{ "Ca", "nt", "" }, .len = 2 } }, 3301 + .{ "Cantve", SpecialCase{ .tokens = .{ "Ca", "nt", "ve" }, .len = 3 } }, 3302 + .{ "Can\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Ca", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3303 + .{ "Can\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Ca", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3304 + .{ "Co.", SpecialCase{ .tokens = .{ "Co.", "", "" }, .len = 1 } }, 3305 + .{ "Colo.", SpecialCase{ .tokens = .{ "Colo.", "", "" }, .len = 1 } }, 3306 + .{ "Conn.", SpecialCase{ .tokens = .{ "Conn.", "", "" }, .len = 1 } }, 3307 + .{ "Corp.", SpecialCase{ .tokens = .{ "Corp.", "", "" }, .len = 1 } }, 3308 + .{ "Could've", SpecialCase{ .tokens = .{ "Could", "'ve", "" }, .len = 2 } }, 3309 + .{ "Couldn't", SpecialCase{ .tokens = .{ "Could", "n't", "" }, .len = 2 } }, 3310 + .{ "Couldn't've", SpecialCase{ .tokens = .{ "Could", "n't", "'ve" }, .len = 3 } }, 3311 + .{ "Couldnt", SpecialCase{ .tokens = .{ "Could", "nt", "" }, .len = 2 } }, 3312 + .{ "Couldntve", SpecialCase{ .tokens = .{ "Could", "nt", "ve" }, .len = 3 } }, 3313 + .{ "Couldn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Could", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3314 + .{ "Couldn\xe2\x80\x99t\xe2\x80\x99ve", 
SpecialCase{ .tokens = .{ "Could", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3315 + .{ "Couldve", SpecialCase{ .tokens = .{ "Could", "ve", "" }, .len = 2 } }, 3316 + .{ "Could\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Could", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3317 + .{ "C\xe2\x80\x99mon", SpecialCase{ .tokens = .{ "C\xe2\x80\x99m", "on", "" }, .len = 2 } }, 3318 + .{ "D.C.", SpecialCase{ .tokens = .{ "D.C.", "", "" }, .len = 1 } }, 3319 + .{ "Daren't", SpecialCase{ .tokens = .{ "Dare", "n't", "" }, .len = 2 } }, 3320 + .{ "Darent", SpecialCase{ .tokens = .{ "Dare", "nt", "" }, .len = 2 } }, 3321 + .{ "Daren\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Dare", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3322 + .{ "Dec.", SpecialCase{ .tokens = .{ "Dec.", "", "" }, .len = 1 } }, 3323 + .{ "Del.", SpecialCase{ .tokens = .{ "Del.", "", "" }, .len = 1 } }, 3324 + .{ "Didn't", SpecialCase{ .tokens = .{ "Did", "n't", "" }, .len = 2 } }, 3325 + .{ "Didn't've", SpecialCase{ .tokens = .{ "Did", "n't", "'ve" }, .len = 3 } }, 3326 + .{ "Didnt", SpecialCase{ .tokens = .{ "Did", "nt", "" }, .len = 2 } }, 3327 + .{ "Didntve", SpecialCase{ .tokens = .{ "Did", "nt", "ve" }, .len = 3 } }, 3328 + .{ "Didn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Did", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3329 + .{ "Didn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Did", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3330 + .{ "Doesn't", SpecialCase{ .tokens = .{ "Does", "n't", "" }, .len = 2 } }, 3331 + .{ "Doesn't've", SpecialCase{ .tokens = .{ "Does", "n't", "'ve" }, .len = 3 } }, 3332 + .{ "Doesnt", SpecialCase{ .tokens = .{ "Does", "nt", "" }, .len = 2 } }, 3333 + .{ "Doesntve", SpecialCase{ .tokens = .{ "Does", "nt", "ve" }, .len = 3 } }, 3334 + .{ "Doesn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Does", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3335 + .{ "Doesn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Does", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 
3336 + .{ "Doin", SpecialCase{ .tokens = .{ "Doin", "", "" }, .len = 1 } }, 3337 + .{ "Doin'", SpecialCase{ .tokens = .{ "Doin'", "", "" }, .len = 1 } }, 3338 + .{ "Doin\xe2\x80\x99", SpecialCase{ .tokens = .{ "Doin\xe2\x80\x99", "", "" }, .len = 1 } }, 3339 + .{ "Don't", SpecialCase{ .tokens = .{ "Do", "n't", "" }, .len = 2 } }, 3340 + .{ "Don't've", SpecialCase{ .tokens = .{ "Do", "n't", "'ve" }, .len = 3 } }, 3341 + .{ "Dont", SpecialCase{ .tokens = .{ "Do", "nt", "" }, .len = 2 } }, 3342 + .{ "Dontve", SpecialCase{ .tokens = .{ "Do", "nt", "ve" }, .len = 3 } }, 3343 + .{ "Don\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Do", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3344 + .{ "Don\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Do", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3345 + .{ "Dr.", SpecialCase{ .tokens = .{ "Dr.", "", "" }, .len = 1 } }, 3346 + .{ "E.G.", SpecialCase{ .tokens = .{ "E.G.", "", "" }, .len = 1 } }, 3347 + .{ "E.g.", SpecialCase{ .tokens = .{ "E.g.", "", "" }, .len = 1 } }, 3348 + .{ "Feb.", SpecialCase{ .tokens = .{ "Feb.", "", "" }, .len = 1 } }, 3349 + .{ "Fla.", SpecialCase{ .tokens = .{ "Fla.", "", "" }, .len = 1 } }, 3350 + .{ "Ga.", SpecialCase{ .tokens = .{ "Ga.", "", "" }, .len = 1 } }, 3351 + .{ "Gen.", SpecialCase{ .tokens = .{ "Gen.", "", "" }, .len = 1 } }, 3352 + .{ "Goin", SpecialCase{ .tokens = .{ "Goin", "", "" }, .len = 1 } }, 3353 + .{ "Goin'", SpecialCase{ .tokens = .{ "Goin'", "", "" }, .len = 1 } }, 3354 + .{ "Goin\xe2\x80\x99", SpecialCase{ .tokens = .{ "Goin\xe2\x80\x99", "", "" }, .len = 1 } }, 3355 + .{ "Gonna", SpecialCase{ .tokens = .{ "Gon", "na", "" }, .len = 2 } }, 3356 + .{ "Gotta", SpecialCase{ .tokens = .{ "Got", "ta", "" }, .len = 2 } }, 3357 + .{ "Gov.", SpecialCase{ .tokens = .{ "Gov.", "", "" }, .len = 1 } }, 3358 + .{ "Hadn't", SpecialCase{ .tokens = .{ "Had", "n't", "" }, .len = 2 } }, 3359 + .{ "Hadn't've", SpecialCase{ .tokens = .{ "Had", "n't", "'ve" }, .len = 3 } }, 3360 + .{ "Hadnt", 
SpecialCase{ .tokens = .{ "Had", "nt", "" }, .len = 2 } }, 3361 + .{ "Hadntve", SpecialCase{ .tokens = .{ "Had", "nt", "ve" }, .len = 3 } }, 3362 + .{ "Hadn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Had", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3363 + .{ "Hadn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Had", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3364 + .{ "Hasn't", SpecialCase{ .tokens = .{ "Has", "n't", "" }, .len = 2 } }, 3365 + .{ "Hasnt", SpecialCase{ .tokens = .{ "Has", "nt", "" }, .len = 2 } }, 3366 + .{ "Hasn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Has", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3367 + .{ "Haven't", SpecialCase{ .tokens = .{ "Have", "n't", "" }, .len = 2 } }, 3368 + .{ "Havent", SpecialCase{ .tokens = .{ "Have", "nt", "" }, .len = 2 } }, 3369 + .{ "Haven\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Have", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3370 + .{ "Havin", SpecialCase{ .tokens = .{ "Havin", "", "" }, .len = 1 } }, 3371 + .{ "Havin'", SpecialCase{ .tokens = .{ "Havin'", "", "" }, .len = 1 } }, 3372 + .{ "Havin\xe2\x80\x99", SpecialCase{ .tokens = .{ "Havin\xe2\x80\x99", "", "" }, .len = 1 } }, 3373 + .{ "He'd", SpecialCase{ .tokens = .{ "He", "'d", "" }, .len = 2 } }, 3374 + .{ "He'd've", SpecialCase{ .tokens = .{ "He", "'d", "'ve" }, .len = 3 } }, 3375 + .{ "He'll", SpecialCase{ .tokens = .{ "He", "'ll", "" }, .len = 2 } }, 3376 + .{ "He'll've", SpecialCase{ .tokens = .{ "He", "'ll", "'ve" }, .len = 3 } }, 3377 + .{ "He's", SpecialCase{ .tokens = .{ "He", "'s", "" }, .len = 2 } }, 3378 + .{ "Hed", SpecialCase{ .tokens = .{ "He", "d", "" }, .len = 2 } }, 3379 + .{ "Hedve", SpecialCase{ .tokens = .{ "He", "d", "ve" }, .len = 3 } }, 3380 + .{ "Hellve", SpecialCase{ .tokens = .{ "He", "ll", "ve" }, .len = 3 } }, 3381 + .{ "Hes", SpecialCase{ .tokens = .{ "He", "s", "" }, .len = 2 } }, 3382 + .{ "He\xe2\x80\x99d", SpecialCase{ .tokens = .{ "He", "\xe2\x80\x99d", "" }, .len = 2 } }, 3383 + .{ 
"He\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "He", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3384 + .{ "He\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "He", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3385 + .{ "He\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "He", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3386 + .{ "He\xe2\x80\x99s", SpecialCase{ .tokens = .{ "He", "\xe2\x80\x99s", "" }, .len = 2 } }, 3387 + .{ "How'd", SpecialCase{ .tokens = .{ "How", "'d", "" }, .len = 2 } }, 3388 + .{ "How'd've", SpecialCase{ .tokens = .{ "How", "'d", "'ve" }, .len = 3 } }, 3389 + .{ "How'd'y", SpecialCase{ .tokens = .{ "How", "'d", "'y" }, .len = 3 } }, 3390 + .{ "How'll", SpecialCase{ .tokens = .{ "How", "'ll", "" }, .len = 2 } }, 3391 + .{ "How'll've", SpecialCase{ .tokens = .{ "How", "'ll", "'ve" }, .len = 3 } }, 3392 + .{ "How're", SpecialCase{ .tokens = .{ "How", "'re", "" }, .len = 2 } }, 3393 + .{ "How's", SpecialCase{ .tokens = .{ "How", "'s", "" }, .len = 2 } }, 3394 + .{ "How've", SpecialCase{ .tokens = .{ "How", "'ve", "" }, .len = 2 } }, 3395 + .{ "Howd", SpecialCase{ .tokens = .{ "How", "d", "" }, .len = 2 } }, 3396 + .{ "Howdve", SpecialCase{ .tokens = .{ "How", "d", "ve" }, .len = 3 } }, 3397 + .{ "Howll", SpecialCase{ .tokens = .{ "How", "ll", "" }, .len = 2 } }, 3398 + .{ "Howllve", SpecialCase{ .tokens = .{ "How", "ll", "ve" }, .len = 3 } }, 3399 + .{ "Howre", SpecialCase{ .tokens = .{ "How", "re", "" }, .len = 2 } }, 3400 + .{ "Hows", SpecialCase{ .tokens = .{ "How", "s", "" }, .len = 2 } }, 3401 + .{ "Howve", SpecialCase{ .tokens = .{ "How", "ve", "" }, .len = 2 } }, 3402 + .{ "How\xe2\x80\x99d", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99d", "" }, .len = 2 } }, 3403 + .{ "How\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3404 + .{ "How\xe2\x80\x99d\xe2\x80\x99y", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99d", "\xe2\x80\x99y" }, .len = 3 } }, 3405 + 
.{ "How\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3406 + .{ "How\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3407 + .{ "How\xe2\x80\x99re", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99re", "" }, .len = 2 } }, 3408 + .{ "How\xe2\x80\x99s", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99s", "" }, .len = 2 } }, 3409 + .{ "How\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "How", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3410 + .{ "I'd", SpecialCase{ .tokens = .{ "I", "'d", "" }, .len = 2 } }, 3411 + .{ "I'd've", SpecialCase{ .tokens = .{ "I", "'d", "'ve" }, .len = 3 } }, 3412 + .{ "I'll", SpecialCase{ .tokens = .{ "I", "'ll", "" }, .len = 2 } }, 3413 + .{ "I'll've", SpecialCase{ .tokens = .{ "I", "'ll", "'ve" }, .len = 3 } }, 3414 + .{ "I'm", SpecialCase{ .tokens = .{ "I", "'m", "" }, .len = 2 } }, 3415 + .{ "I'ma", SpecialCase{ .tokens = .{ "I", "'m", "a" }, .len = 3 } }, 3416 + .{ "I've", SpecialCase{ .tokens = .{ "I", "'ve", "" }, .len = 2 } }, 3417 + .{ "I.E.", SpecialCase{ .tokens = .{ "I.E.", "", "" }, .len = 1 } }, 3418 + .{ "I.e.", SpecialCase{ .tokens = .{ "I.e.", "", "" }, .len = 1 } }, 3419 + .{ "Ia.", SpecialCase{ .tokens = .{ "Ia.", "", "" }, .len = 1 } }, 3420 + .{ "Id", SpecialCase{ .tokens = .{ "I", "d", "" }, .len = 2 } }, 3421 + .{ "Id.", SpecialCase{ .tokens = .{ "Id.", "", "" }, .len = 1 } }, 3422 + .{ "Idve", SpecialCase{ .tokens = .{ "I", "d", "ve" }, .len = 3 } }, 3423 + .{ "Ill.", SpecialCase{ .tokens = .{ "Ill.", "", "" }, .len = 1 } }, 3424 + .{ "Illve", SpecialCase{ .tokens = .{ "I", "ll", "ve" }, .len = 3 } }, 3425 + .{ "Im", SpecialCase{ .tokens = .{ "I", "m", "" }, .len = 2 } }, 3426 + .{ "Ima", SpecialCase{ .tokens = .{ "I", "m", "a" }, .len = 3 } }, 3427 + .{ "Inc.", SpecialCase{ .tokens = .{ "Inc.", "", "" }, .len = 1 } }, 3428 + .{ "Ind.", SpecialCase{ .tokens = .{ "Ind.", "", "" }, .len = 1 } }, 3429 + .{ "Isn't", SpecialCase{ 
.tokens = .{ "Is", "n't", "" }, .len = 2 } }, 3430 + .{ "Isnt", SpecialCase{ .tokens = .{ "Is", "nt", "" }, .len = 2 } }, 3431 + .{ "Isn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Is", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3432 + .{ "It'd", SpecialCase{ .tokens = .{ "It", "'d", "" }, .len = 2 } }, 3433 + .{ "It'd've", SpecialCase{ .tokens = .{ "It", "'d", "'ve" }, .len = 3 } }, 3434 + .{ "It'll", SpecialCase{ .tokens = .{ "It", "'ll", "" }, .len = 2 } }, 3435 + .{ "It'll've", SpecialCase{ .tokens = .{ "It", "'ll", "'ve" }, .len = 3 } }, 3436 + .{ "It's", SpecialCase{ .tokens = .{ "It", "'s", "" }, .len = 2 } }, 3437 + .{ "Itd", SpecialCase{ .tokens = .{ "It", "d", "" }, .len = 2 } }, 3438 + .{ "Itdve", SpecialCase{ .tokens = .{ "It", "d", "ve" }, .len = 3 } }, 3439 + .{ "Itll", SpecialCase{ .tokens = .{ "It", "ll", "" }, .len = 2 } }, 3440 + .{ "Itllve", SpecialCase{ .tokens = .{ "It", "ll", "ve" }, .len = 3 } }, 3441 + .{ "It\xe2\x80\x99d", SpecialCase{ .tokens = .{ "It", "\xe2\x80\x99d", "" }, .len = 2 } }, 3442 + .{ "It\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "It", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3443 + .{ "It\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "It", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3444 + .{ "It\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "It", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3445 + .{ "It\xe2\x80\x99s", SpecialCase{ .tokens = .{ "It", "\xe2\x80\x99s", "" }, .len = 2 } }, 3446 + .{ "Ive", SpecialCase{ .tokens = .{ "I", "ve", "" }, .len = 2 } }, 3447 + .{ "I\xe2\x80\x99d", SpecialCase{ .tokens = .{ "I", "\xe2\x80\x99d", "" }, .len = 2 } }, 3448 + .{ "I\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "I", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3449 + .{ "I\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "I", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3450 + .{ "I\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "I", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3451 
+ .{ "I\xe2\x80\x99m", SpecialCase{ .tokens = .{ "I", "\xe2\x80\x99m", "" }, .len = 2 } }, 3452 + .{ "I\xe2\x80\x99ma", SpecialCase{ .tokens = .{ "I", "\xe2\x80\x99m", "a" }, .len = 3 } }, 3453 + .{ "I\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "I", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3454 + .{ "Jan.", SpecialCase{ .tokens = .{ "Jan.", "", "" }, .len = 1 } }, 3455 + .{ "Jr.", SpecialCase{ .tokens = .{ "Jr.", "", "" }, .len = 1 } }, 3456 + .{ "Jul.", SpecialCase{ .tokens = .{ "Jul.", "", "" }, .len = 1 } }, 3457 + .{ "Jun.", SpecialCase{ .tokens = .{ "Jun.", "", "" }, .len = 1 } }, 3458 + .{ "Kan.", SpecialCase{ .tokens = .{ "Kan.", "", "" }, .len = 1 } }, 3459 + .{ "Kans.", SpecialCase{ .tokens = .{ "Kans.", "", "" }, .len = 1 } }, 3460 + .{ "Ky.", SpecialCase{ .tokens = .{ "Ky.", "", "" }, .len = 1 } }, 3461 + .{ "La.", SpecialCase{ .tokens = .{ "La.", "", "" }, .len = 1 } }, 3462 + .{ "Let's", SpecialCase{ .tokens = .{ "Let", "'s", "" }, .len = 2 } }, 3463 + .{ "Let\xe2\x80\x99s", SpecialCase{ .tokens = .{ "Let", "\xe2\x80\x99s", "" }, .len = 2 } }, 3464 + .{ "Lovin", SpecialCase{ .tokens = .{ "Lovin", "", "" }, .len = 1 } }, 3465 + .{ "Lovin'", SpecialCase{ .tokens = .{ "Lovin'", "", "" }, .len = 1 } }, 3466 + .{ "Lovin\xe2\x80\x99", SpecialCase{ .tokens = .{ "Lovin\xe2\x80\x99", "", "" }, .len = 1 } }, 3467 + .{ "Ltd.", SpecialCase{ .tokens = .{ "Ltd.", "", "" }, .len = 1 } }, 3468 + .{ "Ma'am", SpecialCase{ .tokens = .{ "Ma'am", "", "" }, .len = 1 } }, 3469 + .{ "Mar.", SpecialCase{ .tokens = .{ "Mar.", "", "" }, .len = 1 } }, 3470 + .{ "Mass.", SpecialCase{ .tokens = .{ "Mass.", "", "" }, .len = 1 } }, 3471 + .{ "Mayn't", SpecialCase{ .tokens = .{ "May", "n't", "" }, .len = 2 } }, 3472 + .{ "Mayn't've", SpecialCase{ .tokens = .{ "May", "n't", "'ve" }, .len = 3 } }, 3473 + .{ "Maynt", SpecialCase{ .tokens = .{ "May", "nt", "" }, .len = 2 } }, 3474 + .{ "Mayntve", SpecialCase{ .tokens = .{ "May", "nt", "ve" }, .len = 3 } }, 3475 + .{ "Mayn\xe2\x80\x99t", 
SpecialCase{ .tokens = .{ "May", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3476 + .{ "Mayn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "May", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3477 + .{ "Ma\xe2\x80\x99am", SpecialCase{ .tokens = .{ "Ma\xe2\x80\x99am", "", "" }, .len = 1 } }, 3478 + .{ "Md.", SpecialCase{ .tokens = .{ "Md.", "", "" }, .len = 1 } }, 3479 + .{ "Messrs.", SpecialCase{ .tokens = .{ "Messrs.", "", "" }, .len = 1 } }, 3480 + .{ "Mich.", SpecialCase{ .tokens = .{ "Mich.", "", "" }, .len = 1 } }, 3481 + .{ "Might've", SpecialCase{ .tokens = .{ "Might", "'ve", "" }, .len = 2 } }, 3482 + .{ "Mightn't", SpecialCase{ .tokens = .{ "Might", "n't", "" }, .len = 2 } }, 3483 + .{ "Mightn't've", SpecialCase{ .tokens = .{ "Might", "n't", "'ve" }, .len = 3 } }, 3484 + .{ "Mightnt", SpecialCase{ .tokens = .{ "Might", "nt", "" }, .len = 2 } }, 3485 + .{ "Mightntve", SpecialCase{ .tokens = .{ "Might", "nt", "ve" }, .len = 3 } }, 3486 + .{ "Mightn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Might", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3487 + .{ "Mightn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Might", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3488 + .{ "Mightve", SpecialCase{ .tokens = .{ "Might", "ve", "" }, .len = 2 } }, 3489 + .{ "Might\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Might", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3490 + .{ "Minn.", SpecialCase{ .tokens = .{ "Minn.", "", "" }, .len = 1 } }, 3491 + .{ "Miss.", SpecialCase{ .tokens = .{ "Miss.", "", "" }, .len = 1 } }, 3492 + .{ "Mo.", SpecialCase{ .tokens = .{ "Mo.", "", "" }, .len = 1 } }, 3493 + .{ "Mont.", SpecialCase{ .tokens = .{ "Mont.", "", "" }, .len = 1 } }, 3494 + .{ "Mr.", SpecialCase{ .tokens = .{ "Mr.", "", "" }, .len = 1 } }, 3495 + .{ "Mrs.", SpecialCase{ .tokens = .{ "Mrs.", "", "" }, .len = 1 } }, 3496 + .{ "Ms.", SpecialCase{ .tokens = .{ "Ms.", "", "" }, .len = 1 } }, 3497 + .{ "Mt.", SpecialCase{ .tokens = .{ "Mt.", "", "" }, .len = 1 } }, 3498 
+ .{ "Must've", SpecialCase{ .tokens = .{ "Must", "'ve", "" }, .len = 2 } }, 3499 + .{ "Mustn't", SpecialCase{ .tokens = .{ "Must", "n't", "" }, .len = 2 } }, 3500 + .{ "Mustn't've", SpecialCase{ .tokens = .{ "Must", "n't", "'ve" }, .len = 3 } }, 3501 + .{ "Mustnt", SpecialCase{ .tokens = .{ "Must", "nt", "" }, .len = 2 } }, 3502 + .{ "Mustntve", SpecialCase{ .tokens = .{ "Must", "nt", "ve" }, .len = 3 } }, 3503 + .{ "Mustn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Must", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3504 + .{ "Mustn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Must", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3505 + .{ "Mustve", SpecialCase{ .tokens = .{ "Must", "ve", "" }, .len = 2 } }, 3506 + .{ "Must\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Must", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3507 + .{ "N.C.", SpecialCase{ .tokens = .{ "N.C.", "", "" }, .len = 1 } }, 3508 + .{ "N.D.", SpecialCase{ .tokens = .{ "N.D.", "", "" }, .len = 1 } }, 3509 + .{ "N.H.", SpecialCase{ .tokens = .{ "N.H.", "", "" }, .len = 1 } }, 3510 + .{ "N.J.", SpecialCase{ .tokens = .{ "N.J.", "", "" }, .len = 1 } }, 3511 + .{ "N.M.", SpecialCase{ .tokens = .{ "N.M.", "", "" }, .len = 1 } }, 3512 + .{ "N.Y.", SpecialCase{ .tokens = .{ "N.Y.", "", "" }, .len = 1 } }, 3513 + .{ "Neb.", SpecialCase{ .tokens = .{ "Neb.", "", "" }, .len = 1 } }, 3514 + .{ "Nebr.", SpecialCase{ .tokens = .{ "Nebr.", "", "" }, .len = 1 } }, 3515 + .{ "Needn't", SpecialCase{ .tokens = .{ "Need", "n't", "" }, .len = 2 } }, 3516 + .{ "Needn't've", SpecialCase{ .tokens = .{ "Need", "n't", "'ve" }, .len = 3 } }, 3517 + .{ "Neednt", SpecialCase{ .tokens = .{ "Need", "nt", "" }, .len = 2 } }, 3518 + .{ "Needntve", SpecialCase{ .tokens = .{ "Need", "nt", "ve" }, .len = 3 } }, 3519 + .{ "Needn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Need", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3520 + .{ "Needn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Need", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, 
.len = 3 } }, 3521 + .{ "Nev.", SpecialCase{ .tokens = .{ "Nev.", "", "" }, .len = 1 } }, 3522 + .{ "Not've", SpecialCase{ .tokens = .{ "Not", "'ve", "" }, .len = 2 } }, 3523 + .{ "Nothin", SpecialCase{ .tokens = .{ "Nothin", "", "" }, .len = 1 } }, 3524 + .{ "Nothin'", SpecialCase{ .tokens = .{ "Nothin'", "", "" }, .len = 1 } }, 3525 + .{ "Nothin\xe2\x80\x99", SpecialCase{ .tokens = .{ "Nothin\xe2\x80\x99", "", "" }, .len = 1 } }, 3526 + .{ "Notve", SpecialCase{ .tokens = .{ "Not", "ve", "" }, .len = 2 } }, 3527 + .{ "Not\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Not", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3528 + .{ "Nov.", SpecialCase{ .tokens = .{ "Nov.", "", "" }, .len = 1 } }, 3529 + .{ "Nuthin", SpecialCase{ .tokens = .{ "Nuthin", "", "" }, .len = 1 } }, 3530 + .{ "Nuthin'", SpecialCase{ .tokens = .{ "Nuthin'", "", "" }, .len = 1 } }, 3531 + .{ "Nuthin\xe2\x80\x99", SpecialCase{ .tokens = .{ "Nuthin\xe2\x80\x99", "", "" }, .len = 1 } }, 3532 + .{ "O'clock", SpecialCase{ .tokens = .{ "O'clock", "", "" }, .len = 1 } }, 3533 + .{ "O.O", SpecialCase{ .tokens = .{ "O.O", "", "" }, .len = 1 } }, 3534 + .{ "O.o", SpecialCase{ .tokens = .{ "O.o", "", "" }, .len = 1 } }, 3535 + .{ "O_O", SpecialCase{ .tokens = .{ "O_O", "", "" }, .len = 1 } }, 3536 + .{ "O_o", SpecialCase{ .tokens = .{ "O_o", "", "" }, .len = 1 } }, 3537 + .{ "Oct.", SpecialCase{ .tokens = .{ "Oct.", "", "" }, .len = 1 } }, 3538 + .{ "Okla.", SpecialCase{ .tokens = .{ "Okla.", "", "" }, .len = 1 } }, 3539 + .{ "Ol", SpecialCase{ .tokens = .{ "Ol", "", "" }, .len = 1 } }, 3540 + .{ "Ol'", SpecialCase{ .tokens = .{ "Ol'", "", "" }, .len = 1 } }, 3541 + .{ "Ol\xe2\x80\x99", SpecialCase{ .tokens = .{ "Ol\xe2\x80\x99", "", "" }, .len = 1 } }, 3542 + .{ "Ore.", SpecialCase{ .tokens = .{ "Ore.", "", "" }, .len = 1 } }, 3543 + .{ "Oughtn't", SpecialCase{ .tokens = .{ "Ought", "n't", "" }, .len = 2 } }, 3544 + .{ "Oughtn't've", SpecialCase{ .tokens = .{ "Ought", "n't", "'ve" }, .len = 3 } }, 3545 + .{ 
"Oughtnt", SpecialCase{ .tokens = .{ "Ought", "nt", "" }, .len = 2 } }, 3546 + .{ "Oughtntve", SpecialCase{ .tokens = .{ "Ought", "nt", "ve" }, .len = 3 } }, 3547 + .{ "Oughtn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Ought", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3548 + .{ "Oughtn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Ought", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3549 + .{ "O\xe2\x80\x99clock", SpecialCase{ .tokens = .{ "O\xe2\x80\x99clock", "", "" }, .len = 1 } }, 3550 + .{ "Pa.", SpecialCase{ .tokens = .{ "Pa.", "", "" }, .len = 1 } }, 3551 + .{ "Ph.D.", SpecialCase{ .tokens = .{ "Ph.D.", "", "" }, .len = 1 } }, 3552 + .{ "Prof.", SpecialCase{ .tokens = .{ "Prof.", "", "" }, .len = 1 } }, 3553 + .{ "Rep.", SpecialCase{ .tokens = .{ "Rep.", "", "" }, .len = 1 } }, 3554 + .{ "Rev.", SpecialCase{ .tokens = .{ "Rev.", "", "" }, .len = 1 } }, 3555 + .{ "S.C.", SpecialCase{ .tokens = .{ "S.C.", "", "" }, .len = 1 } }, 3556 + .{ "Sen.", SpecialCase{ .tokens = .{ "Sen.", "", "" }, .len = 1 } }, 3557 + .{ "Sep.", SpecialCase{ .tokens = .{ "Sep.", "", "" }, .len = 1 } }, 3558 + .{ "Sept.", SpecialCase{ .tokens = .{ "Sept.", "", "" }, .len = 1 } }, 3559 + .{ "Shan't", SpecialCase{ .tokens = .{ "Sha", "n't", "" }, .len = 2 } }, 3560 + .{ "Shan't've", SpecialCase{ .tokens = .{ "Sha", "n't", "'ve" }, .len = 3 } }, 3561 + .{ "Shant", SpecialCase{ .tokens = .{ "Sha", "nt", "" }, .len = 2 } }, 3562 + .{ "Shantve", SpecialCase{ .tokens = .{ "Sha", "nt", "ve" }, .len = 3 } }, 3563 + .{ "Shan\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Sha", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3564 + .{ "Shan\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Sha", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3565 + .{ "She'd", SpecialCase{ .tokens = .{ "She", "'d", "" }, .len = 2 } }, 3566 + .{ "She'd've", SpecialCase{ .tokens = .{ "She", "'d", "'ve" }, .len = 3 } }, 3567 + .{ "She'll", SpecialCase{ .tokens = .{ "She", "'ll", "" }, .len = 2 } }, 3568 + .{ 
"She'll've", SpecialCase{ .tokens = .{ "She", "'ll", "'ve" }, .len = 3 } }, 3569 + .{ "She's", SpecialCase{ .tokens = .{ "She", "'s", "" }, .len = 2 } }, 3570 + .{ "Shedve", SpecialCase{ .tokens = .{ "She", "d", "ve" }, .len = 3 } }, 3571 + .{ "Shellve", SpecialCase{ .tokens = .{ "She", "ll", "ve" }, .len = 3 } }, 3572 + .{ "Shes", SpecialCase{ .tokens = .{ "She", "s", "" }, .len = 2 } }, 3573 + .{ "She\xe2\x80\x99d", SpecialCase{ .tokens = .{ "She", "\xe2\x80\x99d", "" }, .len = 2 } }, 3574 + .{ "She\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "She", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3575 + .{ "She\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "She", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3576 + .{ "She\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "She", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3577 + .{ "She\xe2\x80\x99s", SpecialCase{ .tokens = .{ "She", "\xe2\x80\x99s", "" }, .len = 2 } }, 3578 + .{ "Should've", SpecialCase{ .tokens = .{ "Should", "'ve", "" }, .len = 2 } }, 3579 + .{ "Shouldn't", SpecialCase{ .tokens = .{ "Should", "n't", "" }, .len = 2 } }, 3580 + .{ "Shouldn't've", SpecialCase{ .tokens = .{ "Should", "n't", "'ve" }, .len = 3 } }, 3581 + .{ "Shouldnt", SpecialCase{ .tokens = .{ "Should", "nt", "" }, .len = 2 } }, 3582 + .{ "Shouldntve", SpecialCase{ .tokens = .{ "Should", "nt", "ve" }, .len = 3 } }, 3583 + .{ "Shouldn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Should", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3584 + .{ "Shouldn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Should", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3585 + .{ "Shouldve", SpecialCase{ .tokens = .{ "Should", "ve", "" }, .len = 2 } }, 3586 + .{ "Should\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Should", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3587 + .{ "Somethin", SpecialCase{ .tokens = .{ "Somethin", "", "" }, .len = 1 } }, 3588 + .{ "Somethin'", SpecialCase{ .tokens = .{ "Somethin'", "", "" }, .len = 1 } }, 3589 
+ .{ "Somethin\xe2\x80\x99", SpecialCase{ .tokens = .{ "Somethin\xe2\x80\x99", "", "" }, .len = 1 } }, 3590 + .{ "St.", SpecialCase{ .tokens = .{ "St.", "", "" }, .len = 1 } }, 3591 + .{ "Tenn.", SpecialCase{ .tokens = .{ "Tenn.", "", "" }, .len = 1 } }, 3592 + .{ "That'd", SpecialCase{ .tokens = .{ "That", "'d", "" }, .len = 2 } }, 3593 + .{ "That'd've", SpecialCase{ .tokens = .{ "That", "'d", "'ve" }, .len = 3 } }, 3594 + .{ "That'll", SpecialCase{ .tokens = .{ "That", "'ll", "" }, .len = 2 } }, 3595 + .{ "That'll've", SpecialCase{ .tokens = .{ "That", "'ll", "'ve" }, .len = 3 } }, 3596 + .{ "That's", SpecialCase{ .tokens = .{ "That", "'s", "" }, .len = 2 } }, 3597 + .{ "Thatd", SpecialCase{ .tokens = .{ "That", "d", "" }, .len = 2 } }, 3598 + .{ "Thatdve", SpecialCase{ .tokens = .{ "That", "d", "ve" }, .len = 3 } }, 3599 + .{ "Thatll", SpecialCase{ .tokens = .{ "That", "ll", "" }, .len = 2 } }, 3600 + .{ "Thatllve", SpecialCase{ .tokens = .{ "That", "ll", "ve" }, .len = 3 } }, 3601 + .{ "Thats", SpecialCase{ .tokens = .{ "That", "s", "" }, .len = 2 } }, 3602 + .{ "That\xe2\x80\x99d", SpecialCase{ .tokens = .{ "That", "\xe2\x80\x99d", "" }, .len = 2 } }, 3603 + .{ "That\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "That", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3604 + .{ "That\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "That", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3605 + .{ "That\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "That", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3606 + .{ "That\xe2\x80\x99s", SpecialCase{ .tokens = .{ "That", "\xe2\x80\x99s", "" }, .len = 2 } }, 3607 + .{ "There'd", SpecialCase{ .tokens = .{ "There", "'d", "" }, .len = 2 } }, 3608 + .{ "There'd've", SpecialCase{ .tokens = .{ "There", "'d", "'ve" }, .len = 3 } }, 3609 + .{ "There'll", SpecialCase{ .tokens = .{ "There", "'ll", "" }, .len = 2 } }, 3610 + .{ "There'll've", SpecialCase{ .tokens = .{ "There", "'ll", "'ve" }, .len = 3 } }, 3611 + .{ 
"There're", SpecialCase{ .tokens = .{ "There", "'re", "" }, .len = 2 } }, 3612 + .{ "There's", SpecialCase{ .tokens = .{ "There", "'s", "" }, .len = 2 } }, 3613 + .{ "There've", SpecialCase{ .tokens = .{ "There", "'ve", "" }, .len = 2 } }, 3614 + .{ "Thered", SpecialCase{ .tokens = .{ "There", "d", "" }, .len = 2 } }, 3615 + .{ "Theredve", SpecialCase{ .tokens = .{ "There", "d", "ve" }, .len = 3 } }, 3616 + .{ "Therell", SpecialCase{ .tokens = .{ "There", "ll", "" }, .len = 2 } }, 3617 + .{ "Therellve", SpecialCase{ .tokens = .{ "There", "ll", "ve" }, .len = 3 } }, 3618 + .{ "Therere", SpecialCase{ .tokens = .{ "There", "re", "" }, .len = 2 } }, 3619 + .{ "Theres", SpecialCase{ .tokens = .{ "There", "s", "" }, .len = 2 } }, 3620 + .{ "Thereve", SpecialCase{ .tokens = .{ "There", "ve", "" }, .len = 2 } }, 3621 + .{ "There\xe2\x80\x99d", SpecialCase{ .tokens = .{ "There", "\xe2\x80\x99d", "" }, .len = 2 } }, 3622 + .{ "There\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "There", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3623 + .{ "There\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "There", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3624 + .{ "There\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "There", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3625 + .{ "There\xe2\x80\x99re", SpecialCase{ .tokens = .{ "There", "\xe2\x80\x99re", "" }, .len = 2 } }, 3626 + .{ "There\xe2\x80\x99s", SpecialCase{ .tokens = .{ "There", "\xe2\x80\x99s", "" }, .len = 2 } }, 3627 + .{ "There\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "There", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3628 + .{ "These'd", SpecialCase{ .tokens = .{ "These", "'d", "" }, .len = 2 } }, 3629 + .{ "These'd've", SpecialCase{ .tokens = .{ "These", "'d", "'ve" }, .len = 3 } }, 3630 + .{ "These'll", SpecialCase{ .tokens = .{ "These", "'ll", "" }, .len = 2 } }, 3631 + .{ "These'll've", SpecialCase{ .tokens = .{ "These", "'ll", "'ve" }, .len = 3 } }, 3632 + .{ "These're", SpecialCase{ .tokens = .{ 
"These", "'re", "" }, .len = 2 } }, 3633 + .{ "These've", SpecialCase{ .tokens = .{ "These", "'ve", "" }, .len = 2 } }, 3634 + .{ "Thesed", SpecialCase{ .tokens = .{ "These", "d", "" }, .len = 2 } }, 3635 + .{ "Thesedve", SpecialCase{ .tokens = .{ "These", "d", "ve" }, .len = 3 } }, 3636 + .{ "Thesell", SpecialCase{ .tokens = .{ "These", "ll", "" }, .len = 2 } }, 3637 + .{ "Thesellve", SpecialCase{ .tokens = .{ "These", "ll", "ve" }, .len = 3 } }, 3638 + .{ "Thesere", SpecialCase{ .tokens = .{ "These", "re", "" }, .len = 2 } }, 3639 + .{ "Theseve", SpecialCase{ .tokens = .{ "These", "ve", "" }, .len = 2 } }, 3640 + .{ "These\xe2\x80\x99d", SpecialCase{ .tokens = .{ "These", "\xe2\x80\x99d", "" }, .len = 2 } }, 3641 + .{ "These\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "These", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3642 + .{ "These\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "These", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3643 + .{ "These\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "These", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3644 + .{ "These\xe2\x80\x99re", SpecialCase{ .tokens = .{ "These", "\xe2\x80\x99re", "" }, .len = 2 } }, 3645 + .{ "These\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "These", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3646 + .{ "They'd", SpecialCase{ .tokens = .{ "They", "'d", "" }, .len = 2 } }, 3647 + .{ "They'd've", SpecialCase{ .tokens = .{ "They", "'d", "'ve" }, .len = 3 } }, 3648 + .{ "They'll", SpecialCase{ .tokens = .{ "They", "'ll", "" }, .len = 2 } }, 3649 + .{ "They'll've", SpecialCase{ .tokens = .{ "They", "'ll", "'ve" }, .len = 3 } }, 3650 + .{ "They're", SpecialCase{ .tokens = .{ "They", "'re", "" }, .len = 2 } }, 3651 + .{ "They've", SpecialCase{ .tokens = .{ "They", "'ve", "" }, .len = 2 } }, 3652 + .{ "Theyd", SpecialCase{ .tokens = .{ "They", "d", "" }, .len = 2 } }, 3653 + .{ "Theydve", SpecialCase{ .tokens = .{ "They", "d", "ve" }, .len = 3 } }, 3654 + .{ "Theyll", SpecialCase{ 
.tokens = .{ "They", "ll", "" }, .len = 2 } }, 3655 + .{ "Theyllve", SpecialCase{ .tokens = .{ "They", "ll", "ve" }, .len = 3 } }, 3656 + .{ "Theyre", SpecialCase{ .tokens = .{ "They", "re", "" }, .len = 2 } }, 3657 + .{ "Theyve", SpecialCase{ .tokens = .{ "They", "ve", "" }, .len = 2 } }, 3658 + .{ "They\xe2\x80\x99d", SpecialCase{ .tokens = .{ "They", "\xe2\x80\x99d", "" }, .len = 2 } }, 3659 + .{ "They\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "They", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3660 + .{ "They\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "They", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3661 + .{ "They\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "They", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3662 + .{ "They\xe2\x80\x99re", SpecialCase{ .tokens = .{ "They", "\xe2\x80\x99re", "" }, .len = 2 } }, 3663 + .{ "They\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "They", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3664 + .{ "This'd", SpecialCase{ .tokens = .{ "This", "'d", "" }, .len = 2 } }, 3665 + .{ "This'd've", SpecialCase{ .tokens = .{ "This", "'d", "'ve" }, .len = 3 } }, 3666 + .{ "This'll", SpecialCase{ .tokens = .{ "This", "'ll", "" }, .len = 2 } }, 3667 + .{ "This'll've", SpecialCase{ .tokens = .{ "This", "'ll", "'ve" }, .len = 3 } }, 3668 + .{ "This's", SpecialCase{ .tokens = .{ "This", "'s", "" }, .len = 2 } }, 3669 + .{ "Thisd", SpecialCase{ .tokens = .{ "This", "d", "" }, .len = 2 } }, 3670 + .{ "Thisdve", SpecialCase{ .tokens = .{ "This", "d", "ve" }, .len = 3 } }, 3671 + .{ "Thisll", SpecialCase{ .tokens = .{ "This", "ll", "" }, .len = 2 } }, 3672 + .{ "Thisllve", SpecialCase{ .tokens = .{ "This", "ll", "ve" }, .len = 3 } }, 3673 + .{ "Thiss", SpecialCase{ .tokens = .{ "This", "s", "" }, .len = 2 } }, 3674 + .{ "This\xe2\x80\x99d", SpecialCase{ .tokens = .{ "This", "\xe2\x80\x99d", "" }, .len = 2 } }, 3675 + .{ "This\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "This", "\xe2\x80\x99d", "\xe2\x80\x99ve" 
}, .len = 3 } }, 3676 + .{ "This\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "This", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3677 + .{ "This\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "This", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3678 + .{ "This\xe2\x80\x99s", SpecialCase{ .tokens = .{ "This", "\xe2\x80\x99s", "" }, .len = 2 } }, 3679 + .{ "Those'd", SpecialCase{ .tokens = .{ "Those", "'d", "" }, .len = 2 } }, 3680 + .{ "Those'd've", SpecialCase{ .tokens = .{ "Those", "'d", "'ve" }, .len = 3 } }, 3681 + .{ "Those'll", SpecialCase{ .tokens = .{ "Those", "'ll", "" }, .len = 2 } }, 3682 + .{ "Those'll've", SpecialCase{ .tokens = .{ "Those", "'ll", "'ve" }, .len = 3 } }, 3683 + .{ "Those're", SpecialCase{ .tokens = .{ "Those", "'re", "" }, .len = 2 } }, 3684 + .{ "Those've", SpecialCase{ .tokens = .{ "Those", "'ve", "" }, .len = 2 } }, 3685 + .{ "Thosed", SpecialCase{ .tokens = .{ "Those", "d", "" }, .len = 2 } }, 3686 + .{ "Thosedve", SpecialCase{ .tokens = .{ "Those", "d", "ve" }, .len = 3 } }, 3687 + .{ "Thosell", SpecialCase{ .tokens = .{ "Those", "ll", "" }, .len = 2 } }, 3688 + .{ "Thosellve", SpecialCase{ .tokens = .{ "Those", "ll", "ve" }, .len = 3 } }, 3689 + .{ "Thosere", SpecialCase{ .tokens = .{ "Those", "re", "" }, .len = 2 } }, 3690 + .{ "Thoseve", SpecialCase{ .tokens = .{ "Those", "ve", "" }, .len = 2 } }, 3691 + .{ "Those\xe2\x80\x99d", SpecialCase{ .tokens = .{ "Those", "\xe2\x80\x99d", "" }, .len = 2 } }, 3692 + .{ "Those\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Those", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3693 + .{ "Those\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "Those", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3694 + .{ "Those\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Those", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3695 + .{ "Those\xe2\x80\x99re", SpecialCase{ .tokens = .{ "Those", "\xe2\x80\x99re", "" }, .len = 2 } }, 3696 + .{ "Those\xe2\x80\x99ve", SpecialCase{ .tokens 
= .{ "Those", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3697 + .{ "V.V", SpecialCase{ .tokens = .{ "V.V", "", "" }, .len = 1 } }, 3698 + .{ "V_V", SpecialCase{ .tokens = .{ "V_V", "", "" }, .len = 1 } }, 3699 + .{ "Va.", SpecialCase{ .tokens = .{ "Va.", "", "" }, .len = 1 } }, 3700 + .{ "Wash.", SpecialCase{ .tokens = .{ "Wash.", "", "" }, .len = 1 } }, 3701 + .{ "Wasn't", SpecialCase{ .tokens = .{ "Was", "n't", "" }, .len = 2 } }, 3702 + .{ "Wasnt", SpecialCase{ .tokens = .{ "Was", "nt", "" }, .len = 2 } }, 3703 + .{ "Wasn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Was", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3704 + .{ "We'd", SpecialCase{ .tokens = .{ "We", "'d", "" }, .len = 2 } }, 3705 + .{ "We'd've", SpecialCase{ .tokens = .{ "We", "'d", "'ve" }, .len = 3 } }, 3706 + .{ "We'll", SpecialCase{ .tokens = .{ "We", "'ll", "" }, .len = 2 } }, 3707 + .{ "We'll've", SpecialCase{ .tokens = .{ "We", "'ll", "'ve" }, .len = 3 } }, 3708 + .{ "We're", SpecialCase{ .tokens = .{ "We", "'re", "" }, .len = 2 } }, 3709 + .{ "We've", SpecialCase{ .tokens = .{ "We", "'ve", "" }, .len = 2 } }, 3710 + .{ "Wed", SpecialCase{ .tokens = .{ "We", "d", "" }, .len = 2 } }, 3711 + .{ "Wedve", SpecialCase{ .tokens = .{ "We", "d", "ve" }, .len = 3 } }, 3712 + .{ "Wellve", SpecialCase{ .tokens = .{ "We", "ll", "ve" }, .len = 3 } }, 3713 + .{ "Weren't", SpecialCase{ .tokens = .{ "Were", "n't", "" }, .len = 2 } }, 3714 + .{ "Werent", SpecialCase{ .tokens = .{ "Were", "nt", "" }, .len = 2 } }, 3715 + .{ "Weren\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Were", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3716 + .{ "Weve", SpecialCase{ .tokens = .{ "We", "ve", "" }, .len = 2 } }, 3717 + .{ "We\xe2\x80\x99d", SpecialCase{ .tokens = .{ "We", "\xe2\x80\x99d", "" }, .len = 2 } }, 3718 + .{ "We\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "We", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3719 + .{ "We\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "We", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3720 + .{ 
"We\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "We", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3721 + .{ "We\xe2\x80\x99re", SpecialCase{ .tokens = .{ "We", "\xe2\x80\x99re", "" }, .len = 2 } }, 3722 + .{ "We\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "We", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3723 + .{ "What'd", SpecialCase{ .tokens = .{ "What", "'d", "" }, .len = 2 } }, 3724 + .{ "What'd've", SpecialCase{ .tokens = .{ "What", "'d", "'ve" }, .len = 3 } }, 3725 + .{ "What'll", SpecialCase{ .tokens = .{ "What", "'ll", "" }, .len = 2 } }, 3726 + .{ "What'll've", SpecialCase{ .tokens = .{ "What", "'ll", "'ve" }, .len = 3 } }, 3727 + .{ "What're", SpecialCase{ .tokens = .{ "What", "'re", "" }, .len = 2 } }, 3728 + .{ "What's", SpecialCase{ .tokens = .{ "What", "'s", "" }, .len = 2 } }, 3729 + .{ "What've", SpecialCase{ .tokens = .{ "What", "'ve", "" }, .len = 2 } }, 3730 + .{ "Whatd", SpecialCase{ .tokens = .{ "What", "d", "" }, .len = 2 } }, 3731 + .{ "Whatdve", SpecialCase{ .tokens = .{ "What", "d", "ve" }, .len = 3 } }, 3732 + .{ "Whatll", SpecialCase{ .tokens = .{ "What", "ll", "" }, .len = 2 } }, 3733 + .{ "Whatllve", SpecialCase{ .tokens = .{ "What", "ll", "ve" }, .len = 3 } }, 3734 + .{ "Whatre", SpecialCase{ .tokens = .{ "What", "re", "" }, .len = 2 } }, 3735 + .{ "Whats", SpecialCase{ .tokens = .{ "What", "s", "" }, .len = 2 } }, 3736 + .{ "Whatve", SpecialCase{ .tokens = .{ "What", "ve", "" }, .len = 2 } }, 3737 + .{ "What\xe2\x80\x99d", SpecialCase{ .tokens = .{ "What", "\xe2\x80\x99d", "" }, .len = 2 } }, 3738 + .{ "What\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "What", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3739 + .{ "What\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "What", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3740 + .{ "What\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "What", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3741 + .{ "What\xe2\x80\x99re", SpecialCase{ .tokens = .{ "What", 
"\xe2\x80\x99re", "" }, .len = 2 } }, 3742 + .{ "What\xe2\x80\x99s", SpecialCase{ .tokens = .{ "What", "\xe2\x80\x99s", "" }, .len = 2 } }, 3743 + .{ "What\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "What", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3744 + .{ "When'd", SpecialCase{ .tokens = .{ "When", "'d", "" }, .len = 2 } }, 3745 + .{ "When'd've", SpecialCase{ .tokens = .{ "When", "'d", "'ve" }, .len = 3 } }, 3746 + .{ "When'll", SpecialCase{ .tokens = .{ "When", "'ll", "" }, .len = 2 } }, 3747 + .{ "When'll've", SpecialCase{ .tokens = .{ "When", "'ll", "'ve" }, .len = 3 } }, 3748 + .{ "When're", SpecialCase{ .tokens = .{ "When", "'re", "" }, .len = 2 } }, 3749 + .{ "When's", SpecialCase{ .tokens = .{ "When", "'s", "" }, .len = 2 } }, 3750 + .{ "When've", SpecialCase{ .tokens = .{ "When", "'ve", "" }, .len = 2 } }, 3751 + .{ "Whend", SpecialCase{ .tokens = .{ "When", "d", "" }, .len = 2 } }, 3752 + .{ "Whendve", SpecialCase{ .tokens = .{ "When", "d", "ve" }, .len = 3 } }, 3753 + .{ "Whenll", SpecialCase{ .tokens = .{ "When", "ll", "" }, .len = 2 } }, 3754 + .{ "Whenllve", SpecialCase{ .tokens = .{ "When", "ll", "ve" }, .len = 3 } }, 3755 + .{ "Whenre", SpecialCase{ .tokens = .{ "When", "re", "" }, .len = 2 } }, 3756 + .{ "Whens", SpecialCase{ .tokens = .{ "When", "s", "" }, .len = 2 } }, 3757 + .{ "Whenve", SpecialCase{ .tokens = .{ "When", "ve", "" }, .len = 2 } }, 3758 + .{ "When\xe2\x80\x99d", SpecialCase{ .tokens = .{ "When", "\xe2\x80\x99d", "" }, .len = 2 } }, 3759 + .{ "When\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "When", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3760 + .{ "When\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "When", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3761 + .{ "When\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "When", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3762 + .{ "When\xe2\x80\x99re", SpecialCase{ .tokens = .{ "When", "\xe2\x80\x99re", "" }, .len = 2 } }, 3763 + .{ "When\xe2\x80\x99s", 
SpecialCase{ .tokens = .{ "When", "\xe2\x80\x99s", "" }, .len = 2 } }, 3764 + .{ "When\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "When", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3765 + .{ "Where'd", SpecialCase{ .tokens = .{ "Where", "'d", "" }, .len = 2 } }, 3766 + .{ "Where'd've", SpecialCase{ .tokens = .{ "Where", "'d", "'ve" }, .len = 3 } }, 3767 + .{ "Where'll", SpecialCase{ .tokens = .{ "Where", "'ll", "" }, .len = 2 } }, 3768 + .{ "Where'll've", SpecialCase{ .tokens = .{ "Where", "'ll", "'ve" }, .len = 3 } }, 3769 + .{ "Where're", SpecialCase{ .tokens = .{ "Where", "'re", "" }, .len = 2 } }, 3770 + .{ "Where's", SpecialCase{ .tokens = .{ "Where", "'s", "" }, .len = 2 } }, 3771 + .{ "Where've", SpecialCase{ .tokens = .{ "Where", "'ve", "" }, .len = 2 } }, 3772 + .{ "Whered", SpecialCase{ .tokens = .{ "Where", "d", "" }, .len = 2 } }, 3773 + .{ "Wheredve", SpecialCase{ .tokens = .{ "Where", "d", "ve" }, .len = 3 } }, 3774 + .{ "Wherell", SpecialCase{ .tokens = .{ "Where", "ll", "" }, .len = 2 } }, 3775 + .{ "Wherellve", SpecialCase{ .tokens = .{ "Where", "ll", "ve" }, .len = 3 } }, 3776 + .{ "Wherere", SpecialCase{ .tokens = .{ "Where", "re", "" }, .len = 2 } }, 3777 + .{ "Wheres", SpecialCase{ .tokens = .{ "Where", "s", "" }, .len = 2 } }, 3778 + .{ "Whereve", SpecialCase{ .tokens = .{ "Where", "ve", "" }, .len = 2 } }, 3779 + .{ "Where\xe2\x80\x99d", SpecialCase{ .tokens = .{ "Where", "\xe2\x80\x99d", "" }, .len = 2 } }, 3780 + .{ "Where\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Where", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3781 + .{ "Where\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "Where", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3782 + .{ "Where\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Where", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3783 + .{ "Where\xe2\x80\x99re", SpecialCase{ .tokens = .{ "Where", "\xe2\x80\x99re", "" }, .len = 2 } }, 3784 + .{ "Where\xe2\x80\x99s", SpecialCase{ .tokens = .{ "Where", 
"\xe2\x80\x99s", "" }, .len = 2 } }, 3785 + .{ "Where\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Where", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3786 + .{ "Who'd", SpecialCase{ .tokens = .{ "Who", "'d", "" }, .len = 2 } }, 3787 + .{ "Who'd've", SpecialCase{ .tokens = .{ "Who", "'d", "'ve" }, .len = 3 } }, 3788 + .{ "Who'll", SpecialCase{ .tokens = .{ "Who", "'ll", "" }, .len = 2 } }, 3789 + .{ "Who'll've", SpecialCase{ .tokens = .{ "Who", "'ll", "'ve" }, .len = 3 } }, 3790 + .{ "Who're", SpecialCase{ .tokens = .{ "Who", "'re", "" }, .len = 2 } }, 3791 + .{ "Who's", SpecialCase{ .tokens = .{ "Who", "'s", "" }, .len = 2 } }, 3792 + .{ "Who've", SpecialCase{ .tokens = .{ "Who", "'ve", "" }, .len = 2 } }, 3793 + .{ "Whod", SpecialCase{ .tokens = .{ "Who", "d", "" }, .len = 2 } }, 3794 + .{ "Whodve", SpecialCase{ .tokens = .{ "Who", "d", "ve" }, .len = 3 } }, 3795 + .{ "Wholl", SpecialCase{ .tokens = .{ "Who", "ll", "" }, .len = 2 } }, 3796 + .{ "Whollve", SpecialCase{ .tokens = .{ "Who", "ll", "ve" }, .len = 3 } }, 3797 + .{ "Whos", SpecialCase{ .tokens = .{ "Who", "s", "" }, .len = 2 } }, 3798 + .{ "Whove", SpecialCase{ .tokens = .{ "Who", "ve", "" }, .len = 2 } }, 3799 + .{ "Who\xe2\x80\x99d", SpecialCase{ .tokens = .{ "Who", "\xe2\x80\x99d", "" }, .len = 2 } }, 3800 + .{ "Who\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Who", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3801 + .{ "Who\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "Who", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3802 + .{ "Who\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Who", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3803 + .{ "Who\xe2\x80\x99re", SpecialCase{ .tokens = .{ "Who", "\xe2\x80\x99re", "" }, .len = 2 } }, 3804 + .{ "Who\xe2\x80\x99s", SpecialCase{ .tokens = .{ "Who", "\xe2\x80\x99s", "" }, .len = 2 } }, 3805 + .{ "Who\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Who", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3806 + .{ "Why'd", SpecialCase{ .tokens = .{ "Why", 
"'d", "" }, .len = 2 } }, 3807 + .{ "Why'd've", SpecialCase{ .tokens = .{ "Why", "'d", "'ve" }, .len = 3 } }, 3808 + .{ "Why'll", SpecialCase{ .tokens = .{ "Why", "'ll", "" }, .len = 2 } }, 3809 + .{ "Why'll've", SpecialCase{ .tokens = .{ "Why", "'ll", "'ve" }, .len = 3 } }, 3810 + .{ "Why're", SpecialCase{ .tokens = .{ "Why", "'re", "" }, .len = 2 } }, 3811 + .{ "Why's", SpecialCase{ .tokens = .{ "Why", "'s", "" }, .len = 2 } }, 3812 + .{ "Why've", SpecialCase{ .tokens = .{ "Why", "'ve", "" }, .len = 2 } }, 3813 + .{ "Whyd", SpecialCase{ .tokens = .{ "Why", "d", "" }, .len = 2 } }, 3814 + .{ "Whydve", SpecialCase{ .tokens = .{ "Why", "d", "ve" }, .len = 3 } }, 3815 + .{ "Whyll", SpecialCase{ .tokens = .{ "Why", "ll", "" }, .len = 2 } }, 3816 + .{ "Whyllve", SpecialCase{ .tokens = .{ "Why", "ll", "ve" }, .len = 3 } }, 3817 + .{ "Whyre", SpecialCase{ .tokens = .{ "Why", "re", "" }, .len = 2 } }, 3818 + .{ "Whys", SpecialCase{ .tokens = .{ "Why", "s", "" }, .len = 2 } }, 3819 + .{ "Whyve", SpecialCase{ .tokens = .{ "Why", "ve", "" }, .len = 2 } }, 3820 + .{ "Why\xe2\x80\x99d", SpecialCase{ .tokens = .{ "Why", "\xe2\x80\x99d", "" }, .len = 2 } }, 3821 + .{ "Why\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Why", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3822 + .{ "Why\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "Why", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3823 + .{ "Why\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Why", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3824 + .{ "Why\xe2\x80\x99re", SpecialCase{ .tokens = .{ "Why", "\xe2\x80\x99re", "" }, .len = 2 } }, 3825 + .{ "Why\xe2\x80\x99s", SpecialCase{ .tokens = .{ "Why", "\xe2\x80\x99s", "" }, .len = 2 } }, 3826 + .{ "Why\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Why", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3827 + .{ "Wis.", SpecialCase{ .tokens = .{ "Wis.", "", "" }, .len = 1 } }, 3828 + .{ "Won't", SpecialCase{ .tokens = .{ "Wo", "n't", "" }, .len = 2 } }, 3829 + .{ 
"Won't've", SpecialCase{ .tokens = .{ "Wo", "n't", "'ve" }, .len = 3 } }, 3830 + .{ "Wont", SpecialCase{ .tokens = .{ "Wo", "nt", "" }, .len = 2 } }, 3831 + .{ "Wontve", SpecialCase{ .tokens = .{ "Wo", "nt", "ve" }, .len = 3 } }, 3832 + .{ "Won\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Wo", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3833 + .{ "Won\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Wo", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3834 + .{ "Would've", SpecialCase{ .tokens = .{ "Would", "'ve", "" }, .len = 2 } }, 3835 + .{ "Wouldn't", SpecialCase{ .tokens = .{ "Would", "n't", "" }, .len = 2 } }, 3836 + .{ "Wouldn't've", SpecialCase{ .tokens = .{ "Would", "n't", "'ve" }, .len = 3 } }, 3837 + .{ "Wouldnt", SpecialCase{ .tokens = .{ "Would", "nt", "" }, .len = 2 } }, 3838 + .{ "Wouldntve", SpecialCase{ .tokens = .{ "Would", "nt", "ve" }, .len = 3 } }, 3839 + .{ "Wouldn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "Would", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3840 + .{ "Wouldn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Would", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3841 + .{ "Wouldve", SpecialCase{ .tokens = .{ "Would", "ve", "" }, .len = 2 } }, 3842 + .{ "Would\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "Would", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3843 + .{ "XD", SpecialCase{ .tokens = .{ "XD", "", "" }, .len = 1 } }, 3844 + .{ "XDD", SpecialCase{ .tokens = .{ "XDD", "", "" }, .len = 1 } }, 3845 + .{ "You'd", SpecialCase{ .tokens = .{ "You", "'d", "" }, .len = 2 } }, 3846 + .{ "You'd've", SpecialCase{ .tokens = .{ "You", "'d", "'ve" }, .len = 3 } }, 3847 + .{ "You'll", SpecialCase{ .tokens = .{ "You", "'ll", "" }, .len = 2 } }, 3848 + .{ "You'll've", SpecialCase{ .tokens = .{ "You", "'ll", "'ve" }, .len = 3 } }, 3849 + .{ "You're", SpecialCase{ .tokens = .{ "You", "'re", "" }, .len = 2 } }, 3850 + .{ "You've", SpecialCase{ .tokens = .{ "You", "'ve", "" }, .len = 2 } }, 3851 + .{ "Youd", SpecialCase{ .tokens = .{ "You", 
"d", "" }, .len = 2 } }, 3852 + .{ "Youdve", SpecialCase{ .tokens = .{ "You", "d", "ve" }, .len = 3 } }, 3853 + .{ "Youll", SpecialCase{ .tokens = .{ "You", "ll", "" }, .len = 2 } }, 3854 + .{ "Youllve", SpecialCase{ .tokens = .{ "You", "ll", "ve" }, .len = 3 } }, 3855 + .{ "Youre", SpecialCase{ .tokens = .{ "You", "re", "" }, .len = 2 } }, 3856 + .{ "Youve", SpecialCase{ .tokens = .{ "You", "ve", "" }, .len = 2 } }, 3857 + .{ "You\xe2\x80\x99d", SpecialCase{ .tokens = .{ "You", "\xe2\x80\x99d", "" }, .len = 2 } }, 3858 + .{ "You\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "You", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3859 + .{ "You\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "You", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3860 + .{ "You\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "You", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3861 + .{ "You\xe2\x80\x99re", SpecialCase{ .tokens = .{ "You", "\xe2\x80\x99re", "" }, .len = 2 } }, 3862 + .{ "You\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "You", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3863 + .{ "[-:", SpecialCase{ .tokens = .{ "[-:", "", "" }, .len = 1 } }, 3864 + .{ "[:", SpecialCase{ .tokens = .{ "[:", "", "" }, .len = 1 } }, 3865 + .{ "[=", SpecialCase{ .tokens = .{ "[=", "", "" }, .len = 1 } }, 3866 + .{ "\\\")", SpecialCase{ .tokens = .{ "\\\")", "", "" }, .len = 1 } }, 3867 + .{ "\\n", SpecialCase{ .tokens = .{ "\\n", "", "" }, .len = 1 } }, 3868 + .{ "\\t", SpecialCase{ .tokens = .{ "\\t", "", "" }, .len = 1 } }, 3869 + .{ "]=", SpecialCase{ .tokens = .{ "]=", "", "" }, .len = 1 } }, 3870 + .{ "^_^", SpecialCase{ .tokens = .{ "^_^", "", "" }, .len = 1 } }, 3871 + .{ "^__^", SpecialCase{ .tokens = .{ "^__^", "", "" }, .len = 1 } }, 3872 + .{ "^___^", SpecialCase{ .tokens = .{ "^___^", "", "" }, .len = 1 } }, 3873 + .{ "a.", SpecialCase{ .tokens = .{ "a.", "", "" }, .len = 1 } }, 3874 + .{ "a.m.", SpecialCase{ .tokens = .{ "a.m.", "", "" }, .len = 1 } }, 3875 + .{ "ain't", 
SpecialCase{ .tokens = .{ "ai", "n't", "" }, .len = 2 } }, 3876 + .{ "aint", SpecialCase{ .tokens = .{ "ai", "nt", "" }, .len = 2 } }, 3877 + .{ "ain\xe2\x80\x99t", SpecialCase{ .tokens = .{ "ai", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3878 + .{ "and/or", SpecialCase{ .tokens = .{ "and/or", "", "" }, .len = 1 } }, 3879 + .{ "aren't", SpecialCase{ .tokens = .{ "are", "n't", "" }, .len = 2 } }, 3880 + .{ "arent", SpecialCase{ .tokens = .{ "are", "nt", "" }, .len = 2 } }, 3881 + .{ "aren\xe2\x80\x99t", SpecialCase{ .tokens = .{ "are", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3882 + .{ "b.", SpecialCase{ .tokens = .{ "b.", "", "" }, .len = 1 } }, 3883 + .{ "c'mon", SpecialCase{ .tokens = .{ "c'm", "on", "" }, .len = 2 } }, 3884 + .{ "c.", SpecialCase{ .tokens = .{ "c.", "", "" }, .len = 1 } }, 3885 + .{ "can't", SpecialCase{ .tokens = .{ "ca", "n't", "" }, .len = 2 } }, 3886 + .{ "can't've", SpecialCase{ .tokens = .{ "ca", "n't", "'ve" }, .len = 3 } }, 3887 + .{ "cannot", SpecialCase{ .tokens = .{ "can", "not", "" }, .len = 2 } }, 3888 + .{ "cant", SpecialCase{ .tokens = .{ "ca", "nt", "" }, .len = 2 } }, 3889 + .{ "cantve", SpecialCase{ .tokens = .{ "ca", "nt", "ve" }, .len = 3 } }, 3890 + .{ "can\xe2\x80\x99t", SpecialCase{ .tokens = .{ "ca", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3891 + .{ "can\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "ca", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3892 + .{ "co.", SpecialCase{ .tokens = .{ "co.", "", "" }, .len = 1 } }, 3893 + .{ "could've", SpecialCase{ .tokens = .{ "could", "'ve", "" }, .len = 2 } }, 3894 + .{ "couldn't", SpecialCase{ .tokens = .{ "could", "n't", "" }, .len = 2 } }, 3895 + .{ "couldn't've", SpecialCase{ .tokens = .{ "could", "n't", "'ve" }, .len = 3 } }, 3896 + .{ "couldnt", SpecialCase{ .tokens = .{ "could", "nt", "" }, .len = 2 } }, 3897 + .{ "couldntve", SpecialCase{ .tokens = .{ "could", "nt", "ve" }, .len = 3 } }, 3898 + .{ "couldn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "could", 
"n\xe2\x80\x99t", "" }, .len = 2 } }, 3899 + .{ "couldn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "could", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3900 + .{ "couldve", SpecialCase{ .tokens = .{ "could", "ve", "" }, .len = 2 } }, 3901 + .{ "could\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "could", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3902 + .{ "c\xe2\x80\x99mon", SpecialCase{ .tokens = .{ "c\xe2\x80\x99m", "on", "" }, .len = 2 } }, 3903 + .{ "d.", SpecialCase{ .tokens = .{ "d.", "", "" }, .len = 1 } }, 3904 + .{ "daren't", SpecialCase{ .tokens = .{ "dare", "n't", "" }, .len = 2 } }, 3905 + .{ "darent", SpecialCase{ .tokens = .{ "dare", "nt", "" }, .len = 2 } }, 3906 + .{ "daren\xe2\x80\x99t", SpecialCase{ .tokens = .{ "dare", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3907 + .{ "didn't", SpecialCase{ .tokens = .{ "did", "n't", "" }, .len = 2 } }, 3908 + .{ "didn't've", SpecialCase{ .tokens = .{ "did", "n't", "'ve" }, .len = 3 } }, 3909 + .{ "didnt", SpecialCase{ .tokens = .{ "did", "nt", "" }, .len = 2 } }, 3910 + .{ "didntve", SpecialCase{ .tokens = .{ "did", "nt", "ve" }, .len = 3 } }, 3911 + .{ "didn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "did", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3912 + .{ "didn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "did", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3913 + .{ "doesn't", SpecialCase{ .tokens = .{ "does", "n't", "" }, .len = 2 } }, 3914 + .{ "doesn't've", SpecialCase{ .tokens = .{ "does", "n't", "'ve" }, .len = 3 } }, 3915 + .{ "doesnt", SpecialCase{ .tokens = .{ "does", "nt", "" }, .len = 2 } }, 3916 + .{ "doesntve", SpecialCase{ .tokens = .{ "does", "nt", "ve" }, .len = 3 } }, 3917 + .{ "doesn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "does", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3918 + .{ "doesn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "does", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3919 + .{ "doin", SpecialCase{ .tokens = .{ "doin", "", "" }, .len = 1 } 
}, 3920 + .{ "doin'", SpecialCase{ .tokens = .{ "doin'", "", "" }, .len = 1 } }, 3921 + .{ "doin\xe2\x80\x99", SpecialCase{ .tokens = .{ "doin\xe2\x80\x99", "", "" }, .len = 1 } }, 3922 + .{ "don't", SpecialCase{ .tokens = .{ "do", "n't", "" }, .len = 2 } }, 3923 + .{ "don't've", SpecialCase{ .tokens = .{ "do", "n't", "'ve" }, .len = 3 } }, 3924 + .{ "dont", SpecialCase{ .tokens = .{ "do", "nt", "" }, .len = 2 } }, 3925 + .{ "dontve", SpecialCase{ .tokens = .{ "do", "nt", "ve" }, .len = 3 } }, 3926 + .{ "don\xe2\x80\x99t", SpecialCase{ .tokens = .{ "do", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3927 + .{ "don\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "do", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3928 + .{ "e.", SpecialCase{ .tokens = .{ "e.", "", "" }, .len = 1 } }, 3929 + .{ "e.g.", SpecialCase{ .tokens = .{ "e.g.", "", "" }, .len = 1 } }, 3930 + .{ "em", SpecialCase{ .tokens = .{ "em", "", "" }, .len = 1 } }, 3931 + .{ "f.", SpecialCase{ .tokens = .{ "f.", "", "" }, .len = 1 } }, 3932 + .{ "g.", SpecialCase{ .tokens = .{ "g.", "", "" }, .len = 1 } }, 3933 + .{ "goin", SpecialCase{ .tokens = .{ "goin", "", "" }, .len = 1 } }, 3934 + .{ "goin'", SpecialCase{ .tokens = .{ "goin'", "", "" }, .len = 1 } }, 3935 + .{ "goin\xe2\x80\x99", SpecialCase{ .tokens = .{ "goin\xe2\x80\x99", "", "" }, .len = 1 } }, 3936 + .{ "gonna", SpecialCase{ .tokens = .{ "gon", "na", "" }, .len = 2 } }, 3937 + .{ "gotta", SpecialCase{ .tokens = .{ "got", "ta", "" }, .len = 2 } }, 3938 + .{ "h.", SpecialCase{ .tokens = .{ "h.", "", "" }, .len = 1 } }, 3939 + .{ "hadn't", SpecialCase{ .tokens = .{ "had", "n't", "" }, .len = 2 } }, 3940 + .{ "hadn't've", SpecialCase{ .tokens = .{ "had", "n't", "'ve" }, .len = 3 } }, 3941 + .{ "hadnt", SpecialCase{ .tokens = .{ "had", "nt", "" }, .len = 2 } }, 3942 + .{ "hadntve", SpecialCase{ .tokens = .{ "had", "nt", "ve" }, .len = 3 } }, 3943 + .{ "hadn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "had", "n\xe2\x80\x99t", "" }, .len = 2 } 
}, 3944 + .{ "hadn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "had", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 3945 + .{ "hasn't", SpecialCase{ .tokens = .{ "has", "n't", "" }, .len = 2 } }, 3946 + .{ "hasnt", SpecialCase{ .tokens = .{ "has", "nt", "" }, .len = 2 } }, 3947 + .{ "hasn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "has", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3948 + .{ "haven't", SpecialCase{ .tokens = .{ "have", "n't", "" }, .len = 2 } }, 3949 + .{ "havent", SpecialCase{ .tokens = .{ "have", "nt", "" }, .len = 2 } }, 3950 + .{ "haven\xe2\x80\x99t", SpecialCase{ .tokens = .{ "have", "n\xe2\x80\x99t", "" }, .len = 2 } }, 3951 + .{ "havin", SpecialCase{ .tokens = .{ "havin", "", "" }, .len = 1 } }, 3952 + .{ "havin'", SpecialCase{ .tokens = .{ "havin'", "", "" }, .len = 1 } }, 3953 + .{ "havin\xe2\x80\x99", SpecialCase{ .tokens = .{ "havin\xe2\x80\x99", "", "" }, .len = 1 } }, 3954 + .{ "he'd", SpecialCase{ .tokens = .{ "he", "'d", "" }, .len = 2 } }, 3955 + .{ "he'd've", SpecialCase{ .tokens = .{ "he", "'d", "'ve" }, .len = 3 } }, 3956 + .{ "he'll", SpecialCase{ .tokens = .{ "he", "'ll", "" }, .len = 2 } }, 3957 + .{ "he'll've", SpecialCase{ .tokens = .{ "he", "'ll", "'ve" }, .len = 3 } }, 3958 + .{ "he's", SpecialCase{ .tokens = .{ "he", "'s", "" }, .len = 2 } }, 3959 + .{ "hed", SpecialCase{ .tokens = .{ "he", "d", "" }, .len = 2 } }, 3960 + .{ "hedve", SpecialCase{ .tokens = .{ "he", "d", "ve" }, .len = 3 } }, 3961 + .{ "hellve", SpecialCase{ .tokens = .{ "he", "ll", "ve" }, .len = 3 } }, 3962 + .{ "hes", SpecialCase{ .tokens = .{ "he", "s", "" }, .len = 2 } }, 3963 + .{ "he\xe2\x80\x99d", SpecialCase{ .tokens = .{ "he", "\xe2\x80\x99d", "" }, .len = 2 } }, 3964 + .{ "he\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "he", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3965 + .{ "he\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "he", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3966 + .{ "he\xe2\x80\x99ll\xe2\x80\x99ve", 
SpecialCase{ .tokens = .{ "he", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3967 + .{ "he\xe2\x80\x99s", SpecialCase{ .tokens = .{ "he", "\xe2\x80\x99s", "" }, .len = 2 } }, 3968 + .{ "how'd", SpecialCase{ .tokens = .{ "how", "'d", "" }, .len = 2 } }, 3969 + .{ "how'd've", SpecialCase{ .tokens = .{ "how", "'d", "'ve" }, .len = 3 } }, 3970 + .{ "how'd'y", SpecialCase{ .tokens = .{ "how", "'d", "'y" }, .len = 3 } }, 3971 + .{ "how'll", SpecialCase{ .tokens = .{ "how", "'ll", "" }, .len = 2 } }, 3972 + .{ "how'll've", SpecialCase{ .tokens = .{ "how", "'ll", "'ve" }, .len = 3 } }, 3973 + .{ "how're", SpecialCase{ .tokens = .{ "how", "'re", "" }, .len = 2 } }, 3974 + .{ "how's", SpecialCase{ .tokens = .{ "how", "'s", "" }, .len = 2 } }, 3975 + .{ "how've", SpecialCase{ .tokens = .{ "how", "'ve", "" }, .len = 2 } }, 3976 + .{ "howd", SpecialCase{ .tokens = .{ "how", "d", "" }, .len = 2 } }, 3977 + .{ "howdve", SpecialCase{ .tokens = .{ "how", "d", "ve" }, .len = 3 } }, 3978 + .{ "howll", SpecialCase{ .tokens = .{ "how", "ll", "" }, .len = 2 } }, 3979 + .{ "howllve", SpecialCase{ .tokens = .{ "how", "ll", "ve" }, .len = 3 } }, 3980 + .{ "howre", SpecialCase{ .tokens = .{ "how", "re", "" }, .len = 2 } }, 3981 + .{ "hows", SpecialCase{ .tokens = .{ "how", "s", "" }, .len = 2 } }, 3982 + .{ "howve", SpecialCase{ .tokens = .{ "how", "ve", "" }, .len = 2 } }, 3983 + .{ "how\xe2\x80\x99d", SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99d", "" }, .len = 2 } }, 3984 + .{ "how\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 3985 + .{ "how\xe2\x80\x99d\xe2\x80\x99y", SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99d", "\xe2\x80\x99y" }, .len = 3 } }, 3986 + .{ "how\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99ll", "" }, .len = 2 } }, 3987 + .{ "how\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 3988 + .{ "how\xe2\x80\x99re", 
SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99re", "" }, .len = 2 } }, 3989 + .{ "how\xe2\x80\x99s", SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99s", "" }, .len = 2 } }, 3990 + .{ "how\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "how", "\xe2\x80\x99ve", "" }, .len = 2 } }, 3991 + .{ "i'd", SpecialCase{ .tokens = .{ "i", "'d", "" }, .len = 2 } }, 3992 + .{ "i'd've", SpecialCase{ .tokens = .{ "i", "'d", "'ve" }, .len = 3 } }, 3993 + .{ "i'll", SpecialCase{ .tokens = .{ "i", "'ll", "" }, .len = 2 } }, 3994 + .{ "i'll've", SpecialCase{ .tokens = .{ "i", "'ll", "'ve" }, .len = 3 } }, 3995 + .{ "i'm", SpecialCase{ .tokens = .{ "i", "'m", "" }, .len = 2 } }, 3996 + .{ "i'ma", SpecialCase{ .tokens = .{ "i", "'m", "a" }, .len = 3 } }, 3997 + .{ "i've", SpecialCase{ .tokens = .{ "i", "'ve", "" }, .len = 2 } }, 3998 + .{ "i.", SpecialCase{ .tokens = .{ "i.", "", "" }, .len = 1 } }, 3999 + .{ "i.e.", SpecialCase{ .tokens = .{ "i.e.", "", "" }, .len = 1 } }, 4000 + .{ "id", SpecialCase{ .tokens = .{ "i", "d", "" }, .len = 2 } }, 4001 + .{ "idve", SpecialCase{ .tokens = .{ "i", "d", "ve" }, .len = 3 } }, 4002 + .{ "illve", SpecialCase{ .tokens = .{ "i", "ll", "ve" }, .len = 3 } }, 4003 + .{ "im", SpecialCase{ .tokens = .{ "i", "m", "" }, .len = 2 } }, 4004 + .{ "ima", SpecialCase{ .tokens = .{ "i", "m", "a" }, .len = 3 } }, 4005 + .{ "isn't", SpecialCase{ .tokens = .{ "is", "n't", "" }, .len = 2 } }, 4006 + .{ "isnt", SpecialCase{ .tokens = .{ "is", "nt", "" }, .len = 2 } }, 4007 + .{ "isn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "is", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4008 + .{ "it'd", SpecialCase{ .tokens = .{ "it", "'d", "" }, .len = 2 } }, 4009 + .{ "it'd've", SpecialCase{ .tokens = .{ "it", "'d", "'ve" }, .len = 3 } }, 4010 + .{ "it'll", SpecialCase{ .tokens = .{ "it", "'ll", "" }, .len = 2 } }, 4011 + .{ "it'll've", SpecialCase{ .tokens = .{ "it", "'ll", "'ve" }, .len = 3 } }, 4012 + .{ "it's", SpecialCase{ .tokens = .{ "it", "'s", "" }, .len = 2 } }, 4013 + .{ "itd", 
SpecialCase{ .tokens = .{ "it", "d", "" }, .len = 2 } }, 4014 + .{ "itdve", SpecialCase{ .tokens = .{ "it", "d", "ve" }, .len = 3 } }, 4015 + .{ "itll", SpecialCase{ .tokens = .{ "it", "ll", "" }, .len = 2 } }, 4016 + .{ "itllve", SpecialCase{ .tokens = .{ "it", "ll", "ve" }, .len = 3 } }, 4017 + .{ "it\xe2\x80\x99d", SpecialCase{ .tokens = .{ "it", "\xe2\x80\x99d", "" }, .len = 2 } }, 4018 + .{ "it\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "it", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4019 + .{ "it\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "it", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4020 + .{ "it\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "it", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4021 + .{ "it\xe2\x80\x99s", SpecialCase{ .tokens = .{ "it", "\xe2\x80\x99s", "" }, .len = 2 } }, 4022 + .{ "ive", SpecialCase{ .tokens = .{ "i", "ve", "" }, .len = 2 } }, 4023 + .{ "i\xe2\x80\x99d", SpecialCase{ .tokens = .{ "i", "\xe2\x80\x99d", "" }, .len = 2 } }, 4024 + .{ "i\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "i", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4025 + .{ "i\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "i", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4026 + .{ "i\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "i", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4027 + .{ "i\xe2\x80\x99m", SpecialCase{ .tokens = .{ "i", "\xe2\x80\x99m", "" }, .len = 2 } }, 4028 + .{ "i\xe2\x80\x99ma", SpecialCase{ .tokens = .{ "i", "\xe2\x80\x99m", "a" }, .len = 3 } }, 4029 + .{ "i\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "i", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4030 + .{ "j.", SpecialCase{ .tokens = .{ "j.", "", "" }, .len = 1 } }, 4031 + .{ "k.", SpecialCase{ .tokens = .{ "k.", "", "" }, .len = 1 } }, 4032 + .{ "l.", SpecialCase{ .tokens = .{ "l.", "", "" }, .len = 1 } }, 4033 + .{ "let's", SpecialCase{ .tokens = .{ "let", "'s", "" }, .len = 2 } }, 4034 + .{ "let\xe2\x80\x99s", SpecialCase{ .tokens = 
.{ "let", "\xe2\x80\x99s", "" }, .len = 2 } }, 4035 + .{ "ll", SpecialCase{ .tokens = .{ "ll", "", "" }, .len = 1 } }, 4036 + .{ "lovin", SpecialCase{ .tokens = .{ "lovin", "", "" }, .len = 1 } }, 4037 + .{ "lovin'", SpecialCase{ .tokens = .{ "lovin'", "", "" }, .len = 1 } }, 4038 + .{ "lovin\xe2\x80\x99", SpecialCase{ .tokens = .{ "lovin\xe2\x80\x99", "", "" }, .len = 1 } }, 4039 + .{ "m.", SpecialCase{ .tokens = .{ "m.", "", "" }, .len = 1 } }, 4040 + .{ "ma'am", SpecialCase{ .tokens = .{ "ma'am", "", "" }, .len = 1 } }, 4041 + .{ "mayn't", SpecialCase{ .tokens = .{ "may", "n't", "" }, .len = 2 } }, 4042 + .{ "mayn't've", SpecialCase{ .tokens = .{ "may", "n't", "'ve" }, .len = 3 } }, 4043 + .{ "maynt", SpecialCase{ .tokens = .{ "may", "nt", "" }, .len = 2 } }, 4044 + .{ "mayntve", SpecialCase{ .tokens = .{ "may", "nt", "ve" }, .len = 3 } }, 4045 + .{ "mayn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "may", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4046 + .{ "mayn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "may", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4047 + .{ "ma\xe2\x80\x99am", SpecialCase{ .tokens = .{ "ma\xe2\x80\x99am", "", "" }, .len = 1 } }, 4048 + .{ "might've", SpecialCase{ .tokens = .{ "might", "'ve", "" }, .len = 2 } }, 4049 + .{ "mightn't", SpecialCase{ .tokens = .{ "might", "n't", "" }, .len = 2 } }, 4050 + .{ "mightn't've", SpecialCase{ .tokens = .{ "might", "n't", "'ve" }, .len = 3 } }, 4051 + .{ "mightnt", SpecialCase{ .tokens = .{ "might", "nt", "" }, .len = 2 } }, 4052 + .{ "mightntve", SpecialCase{ .tokens = .{ "might", "nt", "ve" }, .len = 3 } }, 4053 + .{ "mightn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "might", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4054 + .{ "mightn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "might", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4055 + .{ "mightve", SpecialCase{ .tokens = .{ "might", "ve", "" }, .len = 2 } }, 4056 + .{ "might\xe2\x80\x99ve", SpecialCase{ .tokens = .{ 
"might", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4057 + .{ "must've", SpecialCase{ .tokens = .{ "must", "'ve", "" }, .len = 2 } }, 4058 + .{ "mustn't", SpecialCase{ .tokens = .{ "must", "n't", "" }, .len = 2 } }, 4059 + .{ "mustn't've", SpecialCase{ .tokens = .{ "must", "n't", "'ve" }, .len = 3 } }, 4060 + .{ "mustnt", SpecialCase{ .tokens = .{ "must", "nt", "" }, .len = 2 } }, 4061 + .{ "mustntve", SpecialCase{ .tokens = .{ "must", "nt", "ve" }, .len = 3 } }, 4062 + .{ "mustn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "must", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4063 + .{ "mustn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "must", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4064 + .{ "mustve", SpecialCase{ .tokens = .{ "must", "ve", "" }, .len = 2 } }, 4065 + .{ "must\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "must", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4066 + .{ "n.", SpecialCase{ .tokens = .{ "n.", "", "" }, .len = 1 } }, 4067 + .{ "needn't", SpecialCase{ .tokens = .{ "need", "n't", "" }, .len = 2 } }, 4068 + .{ "needn't've", SpecialCase{ .tokens = .{ "need", "n't", "'ve" }, .len = 3 } }, 4069 + .{ "neednt", SpecialCase{ .tokens = .{ "need", "nt", "" }, .len = 2 } }, 4070 + .{ "needntve", SpecialCase{ .tokens = .{ "need", "nt", "ve" }, .len = 3 } }, 4071 + .{ "needn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "need", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4072 + .{ "needn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "need", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4073 + .{ "not've", SpecialCase{ .tokens = .{ "not", "'ve", "" }, .len = 2 } }, 4074 + .{ "nothin", SpecialCase{ .tokens = .{ "nothin", "", "" }, .len = 1 } }, 4075 + .{ "nothin'", SpecialCase{ .tokens = .{ "nothin'", "", "" }, .len = 1 } }, 4076 + .{ "nothin\xe2\x80\x99", SpecialCase{ .tokens = .{ "nothin\xe2\x80\x99", "", "" }, .len = 1 } }, 4077 + .{ "notve", SpecialCase{ .tokens = .{ "not", "ve", "" }, .len = 2 } }, 4078 + .{ "not\xe2\x80\x99ve", SpecialCase{ 
.tokens = .{ "not", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4079 + .{ "nuff", SpecialCase{ .tokens = .{ "nuff", "", "" }, .len = 1 } }, 4080 + .{ "nuthin", SpecialCase{ .tokens = .{ "nuthin", "", "" }, .len = 1 } }, 4081 + .{ "nuthin'", SpecialCase{ .tokens = .{ "nuthin'", "", "" }, .len = 1 } }, 4082 + .{ "nuthin\xe2\x80\x99", SpecialCase{ .tokens = .{ "nuthin\xe2\x80\x99", "", "" }, .len = 1 } }, 4083 + .{ "o'clock", SpecialCase{ .tokens = .{ "o'clock", "", "" }, .len = 1 } }, 4084 + .{ "o.", SpecialCase{ .tokens = .{ "o.", "", "" }, .len = 1 } }, 4085 + .{ "o.0", SpecialCase{ .tokens = .{ "o.0", "", "" }, .len = 1 } }, 4086 + .{ "o.O", SpecialCase{ .tokens = .{ "o.O", "", "" }, .len = 1 } }, 4087 + .{ "o.o", SpecialCase{ .tokens = .{ "o.o", "", "" }, .len = 1 } }, 4088 + .{ "o_0", SpecialCase{ .tokens = .{ "o_0", "", "" }, .len = 1 } }, 4089 + .{ "o_O", SpecialCase{ .tokens = .{ "o_O", "", "" }, .len = 1 } }, 4090 + .{ "o_o", SpecialCase{ .tokens = .{ "o_o", "", "" }, .len = 1 } }, 4091 + .{ "ol", SpecialCase{ .tokens = .{ "ol", "", "" }, .len = 1 } }, 4092 + .{ "ol'", SpecialCase{ .tokens = .{ "ol'", "", "" }, .len = 1 } }, 4093 + .{ "ol\xe2\x80\x99", SpecialCase{ .tokens = .{ "ol\xe2\x80\x99", "", "" }, .len = 1 } }, 4094 + .{ "oughtn't", SpecialCase{ .tokens = .{ "ought", "n't", "" }, .len = 2 } }, 4095 + .{ "oughtn't've", SpecialCase{ .tokens = .{ "ought", "n't", "'ve" }, .len = 3 } }, 4096 + .{ "oughtnt", SpecialCase{ .tokens = .{ "ought", "nt", "" }, .len = 2 } }, 4097 + .{ "oughtntve", SpecialCase{ .tokens = .{ "ought", "nt", "ve" }, .len = 3 } }, 4098 + .{ "oughtn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "ought", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4099 + .{ "oughtn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "ought", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4100 + .{ "o\xe2\x80\x99clock", SpecialCase{ .tokens = .{ "o\xe2\x80\x99clock", "", "" }, .len = 1 } }, 4101 + .{ "p.", SpecialCase{ .tokens = .{ "p.", "", "" }, .len = 1 } }, 
4102 + .{ "p.m.", SpecialCase{ .tokens = .{ "p.m.", "", "" }, .len = 1 } }, 4103 + .{ "q.", SpecialCase{ .tokens = .{ "q.", "", "" }, .len = 1 } }, 4104 + .{ "r.", SpecialCase{ .tokens = .{ "r.", "", "" }, .len = 1 } }, 4105 + .{ "s.", SpecialCase{ .tokens = .{ "s.", "", "" }, .len = 1 } }, 4106 + .{ "shan't", SpecialCase{ .tokens = .{ "sha", "n't", "" }, .len = 2 } }, 4107 + .{ "shan't've", SpecialCase{ .tokens = .{ "sha", "n't", "'ve" }, .len = 3 } }, 4108 + .{ "shant", SpecialCase{ .tokens = .{ "sha", "nt", "" }, .len = 2 } }, 4109 + .{ "shantve", SpecialCase{ .tokens = .{ "sha", "nt", "ve" }, .len = 3 } }, 4110 + .{ "shan\xe2\x80\x99t", SpecialCase{ .tokens = .{ "sha", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4111 + .{ "shan\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "sha", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4112 + .{ "she'd", SpecialCase{ .tokens = .{ "she", "'d", "" }, .len = 2 } }, 4113 + .{ "she'd've", SpecialCase{ .tokens = .{ "she", "'d", "'ve" }, .len = 3 } }, 4114 + .{ "she'll", SpecialCase{ .tokens = .{ "she", "'ll", "" }, .len = 2 } }, 4115 + .{ "she'll've", SpecialCase{ .tokens = .{ "she", "'ll", "'ve" }, .len = 3 } }, 4116 + .{ "she's", SpecialCase{ .tokens = .{ "she", "'s", "" }, .len = 2 } }, 4117 + .{ "shedve", SpecialCase{ .tokens = .{ "she", "d", "ve" }, .len = 3 } }, 4118 + .{ "shellve", SpecialCase{ .tokens = .{ "she", "ll", "ve" }, .len = 3 } }, 4119 + .{ "shes", SpecialCase{ .tokens = .{ "she", "s", "" }, .len = 2 } }, 4120 + .{ "she\xe2\x80\x99d", SpecialCase{ .tokens = .{ "she", "\xe2\x80\x99d", "" }, .len = 2 } }, 4121 + .{ "she\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "she", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4122 + .{ "she\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "she", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4123 + .{ "she\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "she", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4124 + .{ "she\xe2\x80\x99s", SpecialCase{ 
.tokens = .{ "she", "\xe2\x80\x99s", "" }, .len = 2 } }, 4125 + .{ "should've", SpecialCase{ .tokens = .{ "should", "'ve", "" }, .len = 2 } }, 4126 + .{ "shouldn't", SpecialCase{ .tokens = .{ "should", "n't", "" }, .len = 2 } }, 4127 + .{ "shouldn't've", SpecialCase{ .tokens = .{ "should", "n't", "'ve" }, .len = 3 } }, 4128 + .{ "shouldnt", SpecialCase{ .tokens = .{ "should", "nt", "" }, .len = 2 } }, 4129 + .{ "shouldntve", SpecialCase{ .tokens = .{ "should", "nt", "ve" }, .len = 3 } }, 4130 + .{ "shouldn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "should", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4131 + .{ "shouldn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "should", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4132 + .{ "shouldve", SpecialCase{ .tokens = .{ "should", "ve", "" }, .len = 2 } }, 4133 + .{ "should\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "should", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4134 + .{ "somethin", SpecialCase{ .tokens = .{ "somethin", "", "" }, .len = 1 } }, 4135 + .{ "somethin'", SpecialCase{ .tokens = .{ "somethin'", "", "" }, .len = 1 } }, 4136 + .{ "somethin\xe2\x80\x99", SpecialCase{ .tokens = .{ "somethin\xe2\x80\x99", "", "" }, .len = 1 } }, 4137 + .{ "t.", SpecialCase{ .tokens = .{ "t.", "", "" }, .len = 1 } }, 4138 + .{ "that'd", SpecialCase{ .tokens = .{ "that", "'d", "" }, .len = 2 } }, 4139 + .{ "that'd've", SpecialCase{ .tokens = .{ "that", "'d", "'ve" }, .len = 3 } }, 4140 + .{ "that'll", SpecialCase{ .tokens = .{ "that", "'ll", "" }, .len = 2 } }, 4141 + .{ "that'll've", SpecialCase{ .tokens = .{ "that", "'ll", "'ve" }, .len = 3 } }, 4142 + .{ "that's", SpecialCase{ .tokens = .{ "that", "'s", "" }, .len = 2 } }, 4143 + .{ "thatd", SpecialCase{ .tokens = .{ "that", "d", "" }, .len = 2 } }, 4144 + .{ "thatdve", SpecialCase{ .tokens = .{ "that", "d", "ve" }, .len = 3 } }, 4145 + .{ "thatll", SpecialCase{ .tokens = .{ "that", "ll", "" }, .len = 2 } }, 4146 + .{ "thatllve", SpecialCase{ .tokens = .{ "that", "ll", 
"ve" }, .len = 3 } }, 4147 + .{ "thats", SpecialCase{ .tokens = .{ "that", "s", "" }, .len = 2 } }, 4148 + .{ "that\xe2\x80\x99d", SpecialCase{ .tokens = .{ "that", "\xe2\x80\x99d", "" }, .len = 2 } }, 4149 + .{ "that\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "that", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4150 + .{ "that\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "that", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4151 + .{ "that\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "that", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4152 + .{ "that\xe2\x80\x99s", SpecialCase{ .tokens = .{ "that", "\xe2\x80\x99s", "" }, .len = 2 } }, 4153 + .{ "there'd", SpecialCase{ .tokens = .{ "there", "'d", "" }, .len = 2 } }, 4154 + .{ "there'd've", SpecialCase{ .tokens = .{ "there", "'d", "'ve" }, .len = 3 } }, 4155 + .{ "there'll", SpecialCase{ .tokens = .{ "there", "'ll", "" }, .len = 2 } }, 4156 + .{ "there'll've", SpecialCase{ .tokens = .{ "there", "'ll", "'ve" }, .len = 3 } }, 4157 + .{ "there're", SpecialCase{ .tokens = .{ "there", "'re", "" }, .len = 2 } }, 4158 + .{ "there's", SpecialCase{ .tokens = .{ "there", "'s", "" }, .len = 2 } }, 4159 + .{ "there've", SpecialCase{ .tokens = .{ "there", "'ve", "" }, .len = 2 } }, 4160 + .{ "thered", SpecialCase{ .tokens = .{ "there", "d", "" }, .len = 2 } }, 4161 + .{ "theredve", SpecialCase{ .tokens = .{ "there", "d", "ve" }, .len = 3 } }, 4162 + .{ "therell", SpecialCase{ .tokens = .{ "there", "ll", "" }, .len = 2 } }, 4163 + .{ "therellve", SpecialCase{ .tokens = .{ "there", "ll", "ve" }, .len = 3 } }, 4164 + .{ "therere", SpecialCase{ .tokens = .{ "there", "re", "" }, .len = 2 } }, 4165 + .{ "theres", SpecialCase{ .tokens = .{ "there", "s", "" }, .len = 2 } }, 4166 + .{ "thereve", SpecialCase{ .tokens = .{ "there", "ve", "" }, .len = 2 } }, 4167 + .{ "there\xe2\x80\x99d", SpecialCase{ .tokens = .{ "there", "\xe2\x80\x99d", "" }, .len = 2 } }, 4168 + .{ "there\xe2\x80\x99d\xe2\x80\x99ve", 
SpecialCase{ .tokens = .{ "there", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4169 + .{ "there\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "there", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4170 + .{ "there\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "there", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4171 + .{ "there\xe2\x80\x99re", SpecialCase{ .tokens = .{ "there", "\xe2\x80\x99re", "" }, .len = 2 } }, 4172 + .{ "there\xe2\x80\x99s", SpecialCase{ .tokens = .{ "there", "\xe2\x80\x99s", "" }, .len = 2 } }, 4173 + .{ "there\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "there", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4174 + .{ "these'd", SpecialCase{ .tokens = .{ "these", "'d", "" }, .len = 2 } }, 4175 + .{ "these'd've", SpecialCase{ .tokens = .{ "these", "'d", "'ve" }, .len = 3 } }, 4176 + .{ "these'll", SpecialCase{ .tokens = .{ "these", "'ll", "" }, .len = 2 } }, 4177 + .{ "these'll've", SpecialCase{ .tokens = .{ "these", "'ll", "'ve" }, .len = 3 } }, 4178 + .{ "these're", SpecialCase{ .tokens = .{ "these", "'re", "" }, .len = 2 } }, 4179 + .{ "these've", SpecialCase{ .tokens = .{ "these", "'ve", "" }, .len = 2 } }, 4180 + .{ "thesed", SpecialCase{ .tokens = .{ "these", "d", "" }, .len = 2 } }, 4181 + .{ "thesedve", SpecialCase{ .tokens = .{ "these", "d", "ve" }, .len = 3 } }, 4182 + .{ "thesell", SpecialCase{ .tokens = .{ "these", "ll", "" }, .len = 2 } }, 4183 + .{ "thesellve", SpecialCase{ .tokens = .{ "these", "ll", "ve" }, .len = 3 } }, 4184 + .{ "thesere", SpecialCase{ .tokens = .{ "these", "re", "" }, .len = 2 } }, 4185 + .{ "theseve", SpecialCase{ .tokens = .{ "these", "ve", "" }, .len = 2 } }, 4186 + .{ "these\xe2\x80\x99d", SpecialCase{ .tokens = .{ "these", "\xe2\x80\x99d", "" }, .len = 2 } }, 4187 + .{ "these\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "these", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4188 + .{ "these\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "these", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4189 + 
.{ "these\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "these", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4190 + .{ "these\xe2\x80\x99re", SpecialCase{ .tokens = .{ "these", "\xe2\x80\x99re", "" }, .len = 2 } }, 4191 + .{ "these\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "these", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4192 + .{ "they'd", SpecialCase{ .tokens = .{ "they", "'d", "" }, .len = 2 } }, 4193 + .{ "they'd've", SpecialCase{ .tokens = .{ "they", "'d", "'ve" }, .len = 3 } }, 4194 + .{ "they'll", SpecialCase{ .tokens = .{ "they", "'ll", "" }, .len = 2 } }, 4195 + .{ "they'll've", SpecialCase{ .tokens = .{ "they", "'ll", "'ve" }, .len = 3 } }, 4196 + .{ "they're", SpecialCase{ .tokens = .{ "they", "'re", "" }, .len = 2 } }, 4197 + .{ "they've", SpecialCase{ .tokens = .{ "they", "'ve", "" }, .len = 2 } }, 4198 + .{ "theyd", SpecialCase{ .tokens = .{ "they", "d", "" }, .len = 2 } }, 4199 + .{ "theydve", SpecialCase{ .tokens = .{ "they", "d", "ve" }, .len = 3 } }, 4200 + .{ "theyll", SpecialCase{ .tokens = .{ "they", "ll", "" }, .len = 2 } }, 4201 + .{ "theyllve", SpecialCase{ .tokens = .{ "they", "ll", "ve" }, .len = 3 } }, 4202 + .{ "theyre", SpecialCase{ .tokens = .{ "they", "re", "" }, .len = 2 } }, 4203 + .{ "theyve", SpecialCase{ .tokens = .{ "they", "ve", "" }, .len = 2 } }, 4204 + .{ "they\xe2\x80\x99d", SpecialCase{ .tokens = .{ "they", "\xe2\x80\x99d", "" }, .len = 2 } }, 4205 + .{ "they\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "they", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4206 + .{ "they\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "they", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4207 + .{ "they\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "they", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4208 + .{ "they\xe2\x80\x99re", SpecialCase{ .tokens = .{ "they", "\xe2\x80\x99re", "" }, .len = 2 } }, 4209 + .{ "they\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "they", "\xe2\x80\x99ve", "" }, .len = 2 } }, 
4210 + .{ "this'd", SpecialCase{ .tokens = .{ "this", "'d", "" }, .len = 2 } }, 4211 + .{ "this'd've", SpecialCase{ .tokens = .{ "this", "'d", "'ve" }, .len = 3 } }, 4212 + .{ "this'll", SpecialCase{ .tokens = .{ "this", "'ll", "" }, .len = 2 } }, 4213 + .{ "this'll've", SpecialCase{ .tokens = .{ "this", "'ll", "'ve" }, .len = 3 } }, 4214 + .{ "this's", SpecialCase{ .tokens = .{ "this", "'s", "" }, .len = 2 } }, 4215 + .{ "thisd", SpecialCase{ .tokens = .{ "this", "d", "" }, .len = 2 } }, 4216 + .{ "thisdve", SpecialCase{ .tokens = .{ "this", "d", "ve" }, .len = 3 } }, 4217 + .{ "thisll", SpecialCase{ .tokens = .{ "this", "ll", "" }, .len = 2 } }, 4218 + .{ "thisllve", SpecialCase{ .tokens = .{ "this", "ll", "ve" }, .len = 3 } }, 4219 + .{ "thiss", SpecialCase{ .tokens = .{ "this", "s", "" }, .len = 2 } }, 4220 + .{ "this\xe2\x80\x99d", SpecialCase{ .tokens = .{ "this", "\xe2\x80\x99d", "" }, .len = 2 } }, 4221 + .{ "this\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "this", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4222 + .{ "this\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "this", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4223 + .{ "this\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "this", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4224 + .{ "this\xe2\x80\x99s", SpecialCase{ .tokens = .{ "this", "\xe2\x80\x99s", "" }, .len = 2 } }, 4225 + .{ "those'd", SpecialCase{ .tokens = .{ "those", "'d", "" }, .len = 2 } }, 4226 + .{ "those'd've", SpecialCase{ .tokens = .{ "those", "'d", "'ve" }, .len = 3 } }, 4227 + .{ "those'll", SpecialCase{ .tokens = .{ "those", "'ll", "" }, .len = 2 } }, 4228 + .{ "those'll've", SpecialCase{ .tokens = .{ "those", "'ll", "'ve" }, .len = 3 } }, 4229 + .{ "those're", SpecialCase{ .tokens = .{ "those", "'re", "" }, .len = 2 } }, 4230 + .{ "those've", SpecialCase{ .tokens = .{ "those", "'ve", "" }, .len = 2 } }, 4231 + .{ "thosed", SpecialCase{ .tokens = .{ "those", "d", "" }, .len = 2 } }, 4232 + .{ 
"thosedve", SpecialCase{ .tokens = .{ "those", "d", "ve" }, .len = 3 } }, 4233 + .{ "thosell", SpecialCase{ .tokens = .{ "those", "ll", "" }, .len = 2 } }, 4234 + .{ "thosellve", SpecialCase{ .tokens = .{ "those", "ll", "ve" }, .len = 3 } }, 4235 + .{ "thosere", SpecialCase{ .tokens = .{ "those", "re", "" }, .len = 2 } }, 4236 + .{ "thoseve", SpecialCase{ .tokens = .{ "those", "ve", "" }, .len = 2 } }, 4237 + .{ "those\xe2\x80\x99d", SpecialCase{ .tokens = .{ "those", "\xe2\x80\x99d", "" }, .len = 2 } }, 4238 + .{ "those\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "those", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4239 + .{ "those\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "those", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4240 + .{ "those\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "those", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4241 + .{ "those\xe2\x80\x99re", SpecialCase{ .tokens = .{ "those", "\xe2\x80\x99re", "" }, .len = 2 } }, 4242 + .{ "those\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "those", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4243 + .{ "u.", SpecialCase{ .tokens = .{ "u.", "", "" }, .len = 1 } }, 4244 + .{ "v.", SpecialCase{ .tokens = .{ "v.", "", "" }, .len = 1 } }, 4245 + .{ "v.s.", SpecialCase{ .tokens = .{ "v.s.", "", "" }, .len = 1 } }, 4246 + .{ "v.v", SpecialCase{ .tokens = .{ "v.v", "", "" }, .len = 1 } }, 4247 + .{ "v_v", SpecialCase{ .tokens = .{ "v_v", "", "" }, .len = 1 } }, 4248 + .{ "vs.", SpecialCase{ .tokens = .{ "vs.", "", "" }, .len = 1 } }, 4249 + .{ "w.", SpecialCase{ .tokens = .{ "w.", "", "" }, .len = 1 } }, 4250 + .{ "w/o", SpecialCase{ .tokens = .{ "w/o", "", "" }, .len = 1 } }, 4251 + .{ "wasn't", SpecialCase{ .tokens = .{ "was", "n't", "" }, .len = 2 } }, 4252 + .{ "wasnt", SpecialCase{ .tokens = .{ "was", "nt", "" }, .len = 2 } }, 4253 + .{ "wasn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "was", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4254 + .{ "we'd", SpecialCase{ .tokens = .{ "we", "'d", 
"" }, .len = 2 } }, 4255 + .{ "we'd've", SpecialCase{ .tokens = .{ "we", "'d", "'ve" }, .len = 3 } }, 4256 + .{ "we'll", SpecialCase{ .tokens = .{ "we", "'ll", "" }, .len = 2 } }, 4257 + .{ "we'll've", SpecialCase{ .tokens = .{ "we", "'ll", "'ve" }, .len = 3 } }, 4258 + .{ "we're", SpecialCase{ .tokens = .{ "we", "'re", "" }, .len = 2 } }, 4259 + .{ "we've", SpecialCase{ .tokens = .{ "we", "'ve", "" }, .len = 2 } }, 4260 + .{ "wed", SpecialCase{ .tokens = .{ "we", "d", "" }, .len = 2 } }, 4261 + .{ "wedve", SpecialCase{ .tokens = .{ "we", "d", "ve" }, .len = 3 } }, 4262 + .{ "wellve", SpecialCase{ .tokens = .{ "we", "ll", "ve" }, .len = 3 } }, 4263 + .{ "weren't", SpecialCase{ .tokens = .{ "were", "n't", "" }, .len = 2 } }, 4264 + .{ "werent", SpecialCase{ .tokens = .{ "were", "nt", "" }, .len = 2 } }, 4265 + .{ "weren\xe2\x80\x99t", SpecialCase{ .tokens = .{ "were", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4266 + .{ "weve", SpecialCase{ .tokens = .{ "we", "ve", "" }, .len = 2 } }, 4267 + .{ "we\xe2\x80\x99d", SpecialCase{ .tokens = .{ "we", "\xe2\x80\x99d", "" }, .len = 2 } }, 4268 + .{ "we\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "we", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4269 + .{ "we\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "we", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4270 + .{ "we\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "we", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4271 + .{ "we\xe2\x80\x99re", SpecialCase{ .tokens = .{ "we", "\xe2\x80\x99re", "" }, .len = 2 } }, 4272 + .{ "we\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "we", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4273 + .{ "what'd", SpecialCase{ .tokens = .{ "what", "'d", "" }, .len = 2 } }, 4274 + .{ "what'd've", SpecialCase{ .tokens = .{ "what", "'d", "'ve" }, .len = 3 } }, 4275 + .{ "what'll", SpecialCase{ .tokens = .{ "what", "'ll", "" }, .len = 2 } }, 4276 + .{ "what'll've", SpecialCase{ .tokens = .{ "what", "'ll", "'ve" }, .len = 3 } }, 4277 + .{ 
"what're", SpecialCase{ .tokens = .{ "what", "'re", "" }, .len = 2 } }, 4278 + .{ "what's", SpecialCase{ .tokens = .{ "what", "'s", "" }, .len = 2 } }, 4279 + .{ "what've", SpecialCase{ .tokens = .{ "what", "'ve", "" }, .len = 2 } }, 4280 + .{ "whatd", SpecialCase{ .tokens = .{ "what", "d", "" }, .len = 2 } }, 4281 + .{ "whatdve", SpecialCase{ .tokens = .{ "what", "d", "ve" }, .len = 3 } }, 4282 + .{ "whatll", SpecialCase{ .tokens = .{ "what", "ll", "" }, .len = 2 } }, 4283 + .{ "whatllve", SpecialCase{ .tokens = .{ "what", "ll", "ve" }, .len = 3 } }, 4284 + .{ "whatre", SpecialCase{ .tokens = .{ "what", "re", "" }, .len = 2 } }, 4285 + .{ "whats", SpecialCase{ .tokens = .{ "what", "s", "" }, .len = 2 } }, 4286 + .{ "whatve", SpecialCase{ .tokens = .{ "what", "ve", "" }, .len = 2 } }, 4287 + .{ "what\xe2\x80\x99d", SpecialCase{ .tokens = .{ "what", "\xe2\x80\x99d", "" }, .len = 2 } }, 4288 + .{ "what\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "what", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4289 + .{ "what\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "what", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4290 + .{ "what\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "what", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4291 + .{ "what\xe2\x80\x99re", SpecialCase{ .tokens = .{ "what", "\xe2\x80\x99re", "" }, .len = 2 } }, 4292 + .{ "what\xe2\x80\x99s", SpecialCase{ .tokens = .{ "what", "\xe2\x80\x99s", "" }, .len = 2 } }, 4293 + .{ "what\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "what", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4294 + .{ "when'd", SpecialCase{ .tokens = .{ "when", "'d", "" }, .len = 2 } }, 4295 + .{ "when'd've", SpecialCase{ .tokens = .{ "when", "'d", "'ve" }, .len = 3 } }, 4296 + .{ "when'll", SpecialCase{ .tokens = .{ "when", "'ll", "" }, .len = 2 } }, 4297 + .{ "when'll've", SpecialCase{ .tokens = .{ "when", "'ll", "'ve" }, .len = 3 } }, 4298 + .{ "when're", SpecialCase{ .tokens = .{ "when", "'re", "" }, .len = 2 } }, 4299 + 
.{ "when's", SpecialCase{ .tokens = .{ "when", "'s", "" }, .len = 2 } }, 4300 + .{ "when've", SpecialCase{ .tokens = .{ "when", "'ve", "" }, .len = 2 } }, 4301 + .{ "whend", SpecialCase{ .tokens = .{ "when", "d", "" }, .len = 2 } }, 4302 + .{ "whendve", SpecialCase{ .tokens = .{ "when", "d", "ve" }, .len = 3 } }, 4303 + .{ "whenll", SpecialCase{ .tokens = .{ "when", "ll", "" }, .len = 2 } }, 4304 + .{ "whenllve", SpecialCase{ .tokens = .{ "when", "ll", "ve" }, .len = 3 } }, 4305 + .{ "whenre", SpecialCase{ .tokens = .{ "when", "re", "" }, .len = 2 } }, 4306 + .{ "whens", SpecialCase{ .tokens = .{ "when", "s", "" }, .len = 2 } }, 4307 + .{ "whenve", SpecialCase{ .tokens = .{ "when", "ve", "" }, .len = 2 } }, 4308 + .{ "when\xe2\x80\x99d", SpecialCase{ .tokens = .{ "when", "\xe2\x80\x99d", "" }, .len = 2 } }, 4309 + .{ "when\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "when", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4310 + .{ "when\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "when", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4311 + .{ "when\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "when", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4312 + .{ "when\xe2\x80\x99re", SpecialCase{ .tokens = .{ "when", "\xe2\x80\x99re", "" }, .len = 2 } }, 4313 + .{ "when\xe2\x80\x99s", SpecialCase{ .tokens = .{ "when", "\xe2\x80\x99s", "" }, .len = 2 } }, 4314 + .{ "when\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "when", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4315 + .{ "where'd", SpecialCase{ .tokens = .{ "where", "'d", "" }, .len = 2 } }, 4316 + .{ "where'd've", SpecialCase{ .tokens = .{ "where", "'d", "'ve" }, .len = 3 } }, 4317 + .{ "where'll", SpecialCase{ .tokens = .{ "where", "'ll", "" }, .len = 2 } }, 4318 + .{ "where'll've", SpecialCase{ .tokens = .{ "where", "'ll", "'ve" }, .len = 3 } }, 4319 + .{ "where're", SpecialCase{ .tokens = .{ "where", "'re", "" }, .len = 2 } }, 4320 + .{ "where's", SpecialCase{ .tokens = .{ "where", "'s", "" }, .len = 2 
} }, 4321 + .{ "where've", SpecialCase{ .tokens = .{ "where", "'ve", "" }, .len = 2 } }, 4322 + .{ "whered", SpecialCase{ .tokens = .{ "where", "d", "" }, .len = 2 } }, 4323 + .{ "wheredve", SpecialCase{ .tokens = .{ "where", "d", "ve" }, .len = 3 } }, 4324 + .{ "wherell", SpecialCase{ .tokens = .{ "where", "ll", "" }, .len = 2 } }, 4325 + .{ "wherellve", SpecialCase{ .tokens = .{ "where", "ll", "ve" }, .len = 3 } }, 4326 + .{ "wherere", SpecialCase{ .tokens = .{ "where", "re", "" }, .len = 2 } }, 4327 + .{ "wheres", SpecialCase{ .tokens = .{ "where", "s", "" }, .len = 2 } }, 4328 + .{ "whereve", SpecialCase{ .tokens = .{ "where", "ve", "" }, .len = 2 } }, 4329 + .{ "where\xe2\x80\x99d", SpecialCase{ .tokens = .{ "where", "\xe2\x80\x99d", "" }, .len = 2 } }, 4330 + .{ "where\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "where", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4331 + .{ "where\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "where", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4332 + .{ "where\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "where", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4333 + .{ "where\xe2\x80\x99re", SpecialCase{ .tokens = .{ "where", "\xe2\x80\x99re", "" }, .len = 2 } }, 4334 + .{ "where\xe2\x80\x99s", SpecialCase{ .tokens = .{ "where", "\xe2\x80\x99s", "" }, .len = 2 } }, 4335 + .{ "where\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "where", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4336 + .{ "who'd", SpecialCase{ .tokens = .{ "who", "'d", "" }, .len = 2 } }, 4337 + .{ "who'd've", SpecialCase{ .tokens = .{ "who", "'d", "'ve" }, .len = 3 } }, 4338 + .{ "who'll", SpecialCase{ .tokens = .{ "who", "'ll", "" }, .len = 2 } }, 4339 + .{ "who'll've", SpecialCase{ .tokens = .{ "who", "'ll", "'ve" }, .len = 3 } }, 4340 + .{ "who're", SpecialCase{ .tokens = .{ "who", "'re", "" }, .len = 2 } }, 4341 + .{ "who's", SpecialCase{ .tokens = .{ "who", "'s", "" }, .len = 2 } }, 4342 + .{ "who've", SpecialCase{ .tokens = .{ "who", 
"'ve", "" }, .len = 2 } }, 4343 + .{ "whod", SpecialCase{ .tokens = .{ "who", "d", "" }, .len = 2 } }, 4344 + .{ "whodve", SpecialCase{ .tokens = .{ "who", "d", "ve" }, .len = 3 } }, 4345 + .{ "wholl", SpecialCase{ .tokens = .{ "who", "ll", "" }, .len = 2 } }, 4346 + .{ "whollve", SpecialCase{ .tokens = .{ "who", "ll", "ve" }, .len = 3 } }, 4347 + .{ "whos", SpecialCase{ .tokens = .{ "who", "s", "" }, .len = 2 } }, 4348 + .{ "whove", SpecialCase{ .tokens = .{ "who", "ve", "" }, .len = 2 } }, 4349 + .{ "who\xe2\x80\x99d", SpecialCase{ .tokens = .{ "who", "\xe2\x80\x99d", "" }, .len = 2 } }, 4350 + .{ "who\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "who", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4351 + .{ "who\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "who", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4352 + .{ "who\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "who", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4353 + .{ "who\xe2\x80\x99re", SpecialCase{ .tokens = .{ "who", "\xe2\x80\x99re", "" }, .len = 2 } }, 4354 + .{ "who\xe2\x80\x99s", SpecialCase{ .tokens = .{ "who", "\xe2\x80\x99s", "" }, .len = 2 } }, 4355 + .{ "who\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "who", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4356 + .{ "why'd", SpecialCase{ .tokens = .{ "why", "'d", "" }, .len = 2 } }, 4357 + .{ "why'd've", SpecialCase{ .tokens = .{ "why", "'d", "'ve" }, .len = 3 } }, 4358 + .{ "why'll", SpecialCase{ .tokens = .{ "why", "'ll", "" }, .len = 2 } }, 4359 + .{ "why'll've", SpecialCase{ .tokens = .{ "why", "'ll", "'ve" }, .len = 3 } }, 4360 + .{ "why're", SpecialCase{ .tokens = .{ "why", "'re", "" }, .len = 2 } }, 4361 + .{ "why's", SpecialCase{ .tokens = .{ "why", "'s", "" }, .len = 2 } }, 4362 + .{ "why've", SpecialCase{ .tokens = .{ "why", "'ve", "" }, .len = 2 } }, 4363 + .{ "whyd", SpecialCase{ .tokens = .{ "why", "d", "" }, .len = 2 } }, 4364 + .{ "whydve", SpecialCase{ .tokens = .{ "why", "d", "ve" }, .len = 3 } }, 4365 + .{ 
"whyll", SpecialCase{ .tokens = .{ "why", "ll", "" }, .len = 2 } }, 4366 + .{ "whyllve", SpecialCase{ .tokens = .{ "why", "ll", "ve" }, .len = 3 } }, 4367 + .{ "whyre", SpecialCase{ .tokens = .{ "why", "re", "" }, .len = 2 } }, 4368 + .{ "whys", SpecialCase{ .tokens = .{ "why", "s", "" }, .len = 2 } }, 4369 + .{ "whyve", SpecialCase{ .tokens = .{ "why", "ve", "" }, .len = 2 } }, 4370 + .{ "why\xe2\x80\x99d", SpecialCase{ .tokens = .{ "why", "\xe2\x80\x99d", "" }, .len = 2 } }, 4371 + .{ "why\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "why", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4372 + .{ "why\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "why", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4373 + .{ "why\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "why", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4374 + .{ "why\xe2\x80\x99re", SpecialCase{ .tokens = .{ "why", "\xe2\x80\x99re", "" }, .len = 2 } }, 4375 + .{ "why\xe2\x80\x99s", SpecialCase{ .tokens = .{ "why", "\xe2\x80\x99s", "" }, .len = 2 } }, 4376 + .{ "why\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "why", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4377 + .{ "won't", SpecialCase{ .tokens = .{ "wo", "n't", "" }, .len = 2 } }, 4378 + .{ "won't've", SpecialCase{ .tokens = .{ "wo", "n't", "'ve" }, .len = 3 } }, 4379 + .{ "wont", SpecialCase{ .tokens = .{ "wo", "nt", "" }, .len = 2 } }, 4380 + .{ "wontve", SpecialCase{ .tokens = .{ "wo", "nt", "ve" }, .len = 3 } }, 4381 + .{ "won\xe2\x80\x99t", SpecialCase{ .tokens = .{ "wo", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4382 + .{ "won\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "wo", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4383 + .{ "would've", SpecialCase{ .tokens = .{ "would", "'ve", "" }, .len = 2 } }, 4384 + .{ "wouldn't", SpecialCase{ .tokens = .{ "would", "n't", "" }, .len = 2 } }, 4385 + .{ "wouldn't've", SpecialCase{ .tokens = .{ "would", "n't", "'ve" }, .len = 3 } }, 4386 + .{ "wouldnt", SpecialCase{ .tokens = .{ 
"would", "nt", "" }, .len = 2 } }, 4387 + .{ "wouldntve", SpecialCase{ .tokens = .{ "would", "nt", "ve" }, .len = 3 } }, 4388 + .{ "wouldn\xe2\x80\x99t", SpecialCase{ .tokens = .{ "would", "n\xe2\x80\x99t", "" }, .len = 2 } }, 4389 + .{ "wouldn\xe2\x80\x99t\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "would", "n\xe2\x80\x99t", "\xe2\x80\x99ve" }, .len = 3 } }, 4390 + .{ "wouldve", SpecialCase{ .tokens = .{ "would", "ve", "" }, .len = 2 } }, 4391 + .{ "would\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "would", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4392 + .{ "x.", SpecialCase{ .tokens = .{ "x.", "", "" }, .len = 1 } }, 4393 + .{ "xD", SpecialCase{ .tokens = .{ "xD", "", "" }, .len = 1 } }, 4394 + .{ "xDD", SpecialCase{ .tokens = .{ "xDD", "", "" }, .len = 1 } }, 4395 + .{ "y'all", SpecialCase{ .tokens = .{ "y'", "all", "" }, .len = 2 } }, 4396 + .{ "y.", SpecialCase{ .tokens = .{ "y.", "", "" }, .len = 1 } }, 4397 + .{ "yall", SpecialCase{ .tokens = .{ "y", "all", "" }, .len = 2 } }, 4398 + .{ "you'd", SpecialCase{ .tokens = .{ "you", "'d", "" }, .len = 2 } }, 4399 + .{ "you'd've", SpecialCase{ .tokens = .{ "you", "'d", "'ve" }, .len = 3 } }, 4400 + .{ "you'll", SpecialCase{ .tokens = .{ "you", "'ll", "" }, .len = 2 } }, 4401 + .{ "you'll've", SpecialCase{ .tokens = .{ "you", "'ll", "'ve" }, .len = 3 } }, 4402 + .{ "you're", SpecialCase{ .tokens = .{ "you", "'re", "" }, .len = 2 } }, 4403 + .{ "you've", SpecialCase{ .tokens = .{ "you", "'ve", "" }, .len = 2 } }, 4404 + .{ "youd", SpecialCase{ .tokens = .{ "you", "d", "" }, .len = 2 } }, 4405 + .{ "youdve", SpecialCase{ .tokens = .{ "you", "d", "ve" }, .len = 3 } }, 4406 + .{ "youll", SpecialCase{ .tokens = .{ "you", "ll", "" }, .len = 2 } }, 4407 + .{ "youllve", SpecialCase{ .tokens = .{ "you", "ll", "ve" }, .len = 3 } }, 4408 + .{ "youre", SpecialCase{ .tokens = .{ "you", "re", "" }, .len = 2 } }, 4409 + .{ "youve", SpecialCase{ .tokens = .{ "you", "ve", "" }, .len = 2 } }, 4410 + .{ "you\xe2\x80\x99d", SpecialCase{ 
.tokens = .{ "you", "\xe2\x80\x99d", "" }, .len = 2 } }, 4411 + .{ "you\xe2\x80\x99d\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "you", "\xe2\x80\x99d", "\xe2\x80\x99ve" }, .len = 3 } }, 4412 + .{ "you\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "you", "\xe2\x80\x99ll", "" }, .len = 2 } }, 4413 + .{ "you\xe2\x80\x99ll\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "you", "\xe2\x80\x99ll", "\xe2\x80\x99ve" }, .len = 3 } }, 4414 + .{ "you\xe2\x80\x99re", SpecialCase{ .tokens = .{ "you", "\xe2\x80\x99re", "" }, .len = 2 } }, 4415 + .{ "you\xe2\x80\x99ve", SpecialCase{ .tokens = .{ "you", "\xe2\x80\x99ve", "" }, .len = 2 } }, 4416 + .{ "y\xe2\x80\x99all", SpecialCase{ .tokens = .{ "y\xe2\x80\x99", "all", "" }, .len = 2 } }, 4417 + .{ "z.", SpecialCase{ .tokens = .{ "z.", "", "" }, .len = 1 } }, 4418 + .{ "\xc2\xa0", SpecialCase{ .tokens = .{ "\xc2\xa0", "", "" }, .len = 1 } }, 4419 + .{ "\xc2\xaf\\(\xe3\x83\x84)/\xc2\xaf", SpecialCase{ .tokens = .{ "\xc2\xaf\\(\xe3\x83\x84)/\xc2\xaf", "", "" }, .len = 1 } }, 4420 + .{ "\xc2\xb0C.", SpecialCase{ .tokens = .{ "\xc2\xb0", "C", "." }, .len = 3 } }, 4421 + .{ "\xc2\xb0F.", SpecialCase{ .tokens = .{ "\xc2\xb0", "F", "." }, .len = 3 } }, 4422 + .{ "\xc2\xb0K.", SpecialCase{ .tokens = .{ "\xc2\xb0", "K", "." }, .len = 3 } }, 4423 + .{ "\xc2\xb0c.", SpecialCase{ .tokens = .{ "\xc2\xb0", "c", "." }, .len = 3 } }, 4424 + .{ "\xc2\xb0f.", SpecialCase{ .tokens = .{ "\xc2\xb0", "f", "." }, .len = 3 } }, 4425 + .{ "\xc2\xb0k.", SpecialCase{ .tokens = .{ "\xc2\xb0", "k", "." 
}, .len = 3 } }, 4426 + .{ "\xc3\xa4.", SpecialCase{ .tokens = .{ "\xc3\xa4.", "", "" }, .len = 1 } }, 4427 + .{ "\xc3\xb6.", SpecialCase{ .tokens = .{ "\xc3\xb6.", "", "" }, .len = 1 } }, 4428 + .{ "\xc3\xbc.", SpecialCase{ .tokens = .{ "\xc3\xbc.", "", "" }, .len = 1 } }, 4429 + .{ "\xe0\xb2\xa0_\xe0\xb2\xa0", SpecialCase{ .tokens = .{ "\xe0\xb2\xa0_\xe0\xb2\xa0", "", "" }, .len = 1 } }, 4430 + .{ "\xe0\xb2\xa0\xef\xb8\xb5\xe0\xb2\xa0", SpecialCase{ .tokens = .{ "\xe0\xb2\xa0\xef\xb8\xb5\xe0\xb2\xa0", "", "" }, .len = 1 } }, 4431 + .{ "\xe2\x80\x94", SpecialCase{ .tokens = .{ "\xe2\x80\x94", "", "" }, .len = 1 } }, 4432 + .{ "\xe2\x80\x98S", SpecialCase{ .tokens = .{ "\xe2\x80\x98S", "", "" }, .len = 1 } }, 4433 + .{ "\xe2\x80\x98s", SpecialCase{ .tokens = .{ "\xe2\x80\x98s", "", "" }, .len = 1 } }, 4434 + .{ "\xe2\x80\x99", SpecialCase{ .tokens = .{ "\xe2\x80\x99", "", "" }, .len = 1 } }, 4435 + .{ "\xe2\x80\x99Cause", SpecialCase{ .tokens = .{ "\xe2\x80\x99Cause", "", "" }, .len = 1 } }, 4436 + .{ "\xe2\x80\x99Cos", SpecialCase{ .tokens = .{ "\xe2\x80\x99Cos", "", "" }, .len = 1 } }, 4437 + .{ "\xe2\x80\x99Coz", SpecialCase{ .tokens = .{ "\xe2\x80\x99Coz", "", "" }, .len = 1 } }, 4438 + .{ "\xe2\x80\x99Cuz", SpecialCase{ .tokens = .{ "\xe2\x80\x99Cuz", "", "" }, .len = 1 } }, 4439 + .{ "\xe2\x80\x99S", SpecialCase{ .tokens = .{ "\xe2\x80\x99S", "", "" }, .len = 1 } }, 4440 + .{ "\xe2\x80\x99bout", SpecialCase{ .tokens = .{ "\xe2\x80\x99bout", "", "" }, .len = 1 } }, 4441 + .{ "\xe2\x80\x99cause", SpecialCase{ .tokens = .{ "\xe2\x80\x99cause", "", "" }, .len = 1 } }, 4442 + .{ "\xe2\x80\x99cos", SpecialCase{ .tokens = .{ "\xe2\x80\x99cos", "", "" }, .len = 1 } }, 4443 + .{ "\xe2\x80\x99coz", SpecialCase{ .tokens = .{ "\xe2\x80\x99coz", "", "" }, .len = 1 } }, 4444 + .{ "\xe2\x80\x99cuz", SpecialCase{ .tokens = .{ "\xe2\x80\x99cuz", "", "" }, .len = 1 } }, 4445 + .{ "\xe2\x80\x99d", SpecialCase{ .tokens = .{ "\xe2\x80\x99d", "", "" }, .len = 1 } }, 4446 + .{ 
"\xe2\x80\x99em", SpecialCase{ .tokens = .{ "\xe2\x80\x99em", "", "" }, .len = 1 } }, 4447 + .{ "\xe2\x80\x99ll", SpecialCase{ .tokens = .{ "\xe2\x80\x99ll", "", "" }, .len = 1 } }, 4448 + .{ "\xe2\x80\x99nuff", SpecialCase{ .tokens = .{ "\xe2\x80\x99nuff", "", "" }, .len = 1 } }, 4449 + .{ "\xe2\x80\x99re", SpecialCase{ .tokens = .{ "\xe2\x80\x99re", "", "" }, .len = 1 } }, 4450 + .{ "\xe2\x80\x99s", SpecialCase{ .tokens = .{ "\xe2\x80\x99s", "", "" }, .len = 1 } }, 4451 + .{ "\xe2\x80\x99\xe2\x80\x99", SpecialCase{ .tokens = .{ "\xe2\x80\x99\xe2\x80\x99", "", "" }, .len = 1 } }, 4452 + });
+271
tests/ner_expected.json
··· 1 + [ 2 + { 3 + "text": "Barack Obama visited Paris.", 4 + "tokens": [ 5 + "Barack", 6 + "Obama", 7 + "visited", 8 + "Paris", 9 + "." 10 + ], 11 + "entities": [ 12 + { 13 + "text": "Barack Obama", 14 + "start": 0, 15 + "end": 12, 16 + "label": "PERSON" 17 + }, 18 + { 19 + "text": "Paris", 20 + "start": 21, 21 + "end": 26, 22 + "label": "GPE" 23 + } 24 + ] 25 + }, 26 + { 27 + "text": "Apple Inc. is worth $2.5 trillion.", 28 + "tokens": [ 29 + "Apple", 30 + "Inc.", 31 + "is", 32 + "worth", 33 + "$", 34 + "2.5", 35 + "trillion", 36 + "." 37 + ], 38 + "entities": [ 39 + { 40 + "text": "Apple Inc.", 41 + "start": 0, 42 + "end": 10, 43 + "label": "ORG" 44 + }, 45 + { 46 + "text": "$2.5 trillion", 47 + "start": 20, 48 + "end": 33, 49 + "label": "MONEY" 50 + } 51 + ] 52 + }, 53 + { 54 + "text": "The United States and China are trading partners.", 55 + "tokens": [ 56 + "The", 57 + "United", 58 + "States", 59 + "and", 60 + "China", 61 + "are", 62 + "trading", 63 + "partners", 64 + "." 65 + ], 66 + "entities": [ 67 + { 68 + "text": "The United States", 69 + "start": 0, 70 + "end": 17, 71 + "label": "GPE" 72 + }, 73 + { 74 + "text": "China", 75 + "start": 22, 76 + "end": 27, 77 + "label": "GPE" 78 + } 79 + ] 80 + }, 81 + { 82 + "text": "Elon Musk founded SpaceX and Tesla.", 83 + "tokens": [ 84 + "Elon", 85 + "Musk", 86 + "founded", 87 + "SpaceX", 88 + "and", 89 + "Tesla", 90 + "." 91 + ], 92 + "entities": [ 93 + { 94 + "text": "Elon Musk", 95 + "start": 0, 96 + "end": 9, 97 + "label": "PERSON" 98 + }, 99 + { 100 + "text": "Tesla", 101 + "start": 29, 102 + "end": 34, 103 + "label": "NORP" 104 + } 105 + ] 106 + }, 107 + { 108 + "text": "The World Cup was held in Qatar.", 109 + "tokens": [ 110 + "The", 111 + "World", 112 + "Cup", 113 + "was", 114 + "held", 115 + "in", 116 + "Qatar", 117 + "." 
118 + ], 119 + "entities": [ 120 + { 121 + "text": "The World Cup", 122 + "start": 0, 123 + "end": 13, 124 + "label": "EVENT" 125 + }, 126 + { 127 + "text": "Qatar", 128 + "start": 26, 129 + "end": 31, 130 + "label": "GPE" 131 + } 132 + ] 133 + }, 134 + { 135 + "text": "Microsoft acquired Activision for $68.7 billion.", 136 + "tokens": [ 137 + "Microsoft", 138 + "acquired", 139 + "Activision", 140 + "for", 141 + "$", 142 + "68.7", 143 + "billion", 144 + "." 145 + ], 146 + "entities": [ 147 + { 148 + "text": "Microsoft", 149 + "start": 0, 150 + "end": 9, 151 + "label": "ORG" 152 + }, 153 + { 154 + "text": "Activision", 155 + "start": 19, 156 + "end": 29, 157 + "label": "ORG" 158 + }, 159 + { 160 + "text": "$68.7 billion", 161 + "start": 34, 162 + "end": 47, 163 + "label": "MONEY" 164 + } 165 + ] 166 + }, 167 + { 168 + "text": "Taylor Swift performed at Madison Square Garden.", 169 + "tokens": [ 170 + "Taylor", 171 + "Swift", 172 + "performed", 173 + "at", 174 + "Madison", 175 + "Square", 176 + "Garden", 177 + "." 178 + ], 179 + "entities": [ 180 + { 181 + "text": "Taylor Swift", 182 + "start": 0, 183 + "end": 12, 184 + "label": "PERSON" 185 + }, 186 + { 187 + "text": "Madison Square Garden", 188 + "start": 26, 189 + "end": 47, 190 + "label": "FAC" 191 + } 192 + ] 193 + }, 194 + { 195 + "text": "The European Union imposed sanctions on Russia.", 196 + "tokens": [ 197 + "The", 198 + "European", 199 + "Union", 200 + "imposed", 201 + "sanctions", 202 + "on", 203 + "Russia", 204 + "." 205 + ], 206 + "entities": [ 207 + { 208 + "text": "The European Union", 209 + "start": 0, 210 + "end": 18, 211 + "label": "ORG" 212 + }, 213 + { 214 + "text": "Russia", 215 + "start": 40, 216 + "end": 46, 217 + "label": "GPE" 218 + } 219 + ] 220 + }, 221 + { 222 + "text": "Goldman Sachs reported quarterly earnings.", 223 + "tokens": [ 224 + "Goldman", 225 + "Sachs", 226 + "reported", 227 + "quarterly", 228 + "earnings", 229 + "." 
230 + ], 231 + "entities": [ 232 + { 233 + "text": "Goldman Sachs", 234 + "start": 0, 235 + "end": 13, 236 + "label": "ORG" 237 + }, 238 + { 239 + "text": "quarterly", 240 + "start": 23, 241 + "end": 32, 242 + "label": "DATE" 243 + } 244 + ] 245 + }, 246 + { 247 + "text": "NASA launched the Artemis mission.", 248 + "tokens": [ 249 + "NASA", 250 + "launched", 251 + "the", 252 + "Artemis", 253 + "mission", 254 + "." 255 + ], 256 + "entities": [ 257 + { 258 + "text": "NASA", 259 + "start": 0, 260 + "end": 4, 261 + "label": "ORG" 262 + }, 263 + { 264 + "text": "Artemis", 265 + "start": 18, 266 + "end": 25, 267 + "label": "NORP" 268 + } 269 + ] 270 + } 271 + ]
+169
tests/tokenizer_expected.json
··· 1 + [ 2 + { 3 + "text": "Barack Obama visited Paris.", 4 + "tokens": [ 5 + "Barack", 6 + "Obama", 7 + "visited", 8 + "Paris", 9 + "." 10 + ] 11 + }, 12 + { 13 + "text": "Apple Inc. is worth $2.5 trillion.", 14 + "tokens": [ 15 + "Apple", 16 + "Inc.", 17 + "is", 18 + "worth", 19 + "$", 20 + "2.5", 21 + "trillion", 22 + "." 23 + ] 24 + }, 25 + { 26 + "text": "I can't believe it's not butter!", 27 + "tokens": [ 28 + "I", 29 + "ca", 30 + "n't", 31 + "believe", 32 + "it", 33 + "'s", 34 + "not", 35 + "butter", 36 + "!" 37 + ] 38 + }, 39 + { 40 + "text": "Dr. Smith's office (room 42) is closed.", 41 + "tokens": [ 42 + "Dr.", 43 + "Smith", 44 + "'s", 45 + "office", 46 + "(", 47 + "room", 48 + "42", 49 + ")", 50 + "is", 51 + "closed", 52 + "." 53 + ] 54 + }, 55 + { 56 + "text": "U.S.A. and U.K. are allies.", 57 + "tokens": [ 58 + "U.S.A.", 59 + "and", 60 + "U.K.", 61 + "are", 62 + "allies", 63 + "." 64 + ] 65 + }, 66 + { 67 + "text": "They're going to the store.", 68 + "tokens": [ 69 + "They", 70 + "'re", 71 + "going", 72 + "to", 73 + "the", 74 + "store", 75 + "." 76 + ] 77 + }, 78 + { 79 + "text": "He said \"hello\" and left.", 80 + "tokens": [ 81 + "He", 82 + "said", 83 + "\"", 84 + "hello", 85 + "\"", 86 + "and", 87 + "left", 88 + "." 89 + ] 90 + }, 91 + { 92 + "text": "The cost is $500.00/month.", 93 + "tokens": [ 94 + "The", 95 + "cost", 96 + "is", 97 + "$", 98 + "500.00", 99 + "/", 100 + "month", 101 + "." 
102 + ] 103 + }, 104 + { 105 + "text": "New York-based company", 106 + "tokens": [ 107 + "New", 108 + "York", 109 + "-", 110 + "based", 111 + "company" 112 + ] 113 + }, 114 + { 115 + "text": "e-mail: test@example.com", 116 + "tokens": [ 117 + "e", 118 + "-", 119 + "mail", 120 + ":", 121 + "test@example.com" 122 + ] 123 + }, 124 + { 125 + "text": "10,000 people", 126 + "tokens": [ 127 + "10,000", 128 + "people" 129 + ] 130 + }, 131 + { 132 + "text": "3.14159 is pi", 133 + "tokens": [ 134 + "3.14159", 135 + "is", 136 + "pi" 137 + ] 138 + }, 139 + { 140 + "text": "state-of-the-art technology", 141 + "tokens": [ 142 + "state", 143 + "-", 144 + "of", 145 + "-", 146 + "the", 147 + "-", 148 + "art", 149 + "technology" 150 + ] 151 + }, 152 + { 153 + "text": "Mr. and Mrs. Jones", 154 + "tokens": [ 155 + "Mr.", 156 + "and", 157 + "Mrs.", 158 + "Jones" 159 + ] 160 + }, 161 + { 162 + "text": "it's 5:30pm", 163 + "tokens": [ 164 + "it", 165 + "'s", 166 + "5:30pm" 167 + ] 168 + } 169 + ]