this repo has no description
at main 723 lines 25 kB view raw
//! spaCy-compatible tokenizer.
//!
//! port of spaCy's `tokenizer.pyx` algorithm: whitespace split → per-chunk
//! iterative prefix/suffix stripping → infix splitting → special case lookup.
//! uses generated data tables from `tokenizer_data.zig`.

const std = @import("std");
const data = @import("tokenizer_data.zig");

/// a token is a byte-offset slice into the original text; it owns no memory.
pub const Token = struct {
    /// byte offset of start in original text
    start: u32,
    /// byte offset of end (exclusive) in original text
    end: u32,

    /// resolve this span against the text it was produced from.
    pub fn text(self: Token, source: []const u8) []const u8 {
        return source[self.start..self.end];
    }
};

/// maximum tokens per document.
pub const MAX_TOKENS = 1024;

/// tokenize text into tokens. returns the number of tokens written.
/// tokens are byte-offset spans into the original text. stops early once
/// `out` is full; text past that point produces no tokens.
pub fn tokenize(text: []const u8, out: []Token) u32 {
    if (text.len == 0) return 0;

    var n_tokens: u32 = 0;
    var pos: usize = 0;

    // phase 1: carve the text into maximal runs of non-whitespace bytes;
    // phase 2 (affix/infix/special handling) runs once per such chunk.
    while (pos < text.len) {
        // advance over a single whitespace byte and rescan
        if (isWhitespace(text[pos])) {
            pos += 1;
            continue;
        }

        // pos sits on the first byte of a chunk; scan to its end
        const chunk_begin = pos;
        while (pos < text.len and !isWhitespace(text[pos])) pos += 1;

        n_tokens = tokenizeChunk(text, chunk_begin, pos, out, n_tokens);
        if (n_tokens >= out.len) return n_tokens;
    }

    return n_tokens;
}

/// tokenize a single whitespace-delimited chunk.
/// text[start..end] is the chunk. writes tokens to out[count..].
fn tokenizeChunk(
    text: []const u8,
    start: usize,
    end: usize,
    out: []Token,
    count_in: u32,
) u32 {
    var count = count_in;
    const chunk = text[start..end];

    // check special cases first: a special maps the whole chunk to a fixed
    // sequence of sub-token strings (e.g. a contraction split into two parts).
    if (data.specials.get(chunk)) |special| {
        var offset: u32 = @intCast(start);
        for (0..special.len) |ti| {
            const tok_text = special.tokens[ti];
            // find this token text in the source at the expected position
            const tok_start = findSubstr(text[offset..end], tok_text);
            if (tok_start) |ts| {
                if (count < out.len) {
                    out[count] = .{
                        .start = offset + @as(u32, @intCast(ts)),
                        .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
            } else {
                // special token not found at expected position — emit based on length
                // (keeps offsets advancing so later tokens stay roughly aligned)
                if (count < out.len) {
                    out[count] = .{
                        .start = offset,
                        .end = offset + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset += @as(u32, @intCast(tok_text.len));
            }
        }
        return count;
    }

    // split affixes iteratively: peel prefixes/suffixes off [lo, hi) until a
    // pass makes no progress, a special case is exposed, or a URL is exposed.
    var lo: u32 = @intCast(start);
    var hi: u32 = @intCast(end);

    // prefixes are emitted straight into `out` as found (left-to-right);
    // suffixes are buffered here and flushed in reverse order at the end,
    // so the innermost suffix comes out first after the middle token.
    var suffix_buf: [64]Token = undefined;
    var n_suffixes: u32 = 0;

    var last_len: u32 = 0;
    while (lo < hi and (hi - lo) != last_len) {
        const span = text[lo..hi];
        const span_len = hi - lo;

        // check if remaining span is a special case
        if (data.specials.get(span) != null) break;

        // check URL match — URLs are kept whole, never affix-stripped
        if (matchUrl(span) > 0) break;

        last_len = span_len;

        // try prefix
        const pre_len = matchPrefix(span);

        // try suffix on span[pre_len..] but strip from end of full span
        const suf_len = if (pre_len < span_len)
            matchSuffix(span[pre_len..])
        else
            @as(usize, 0);

        if (pre_len > 0 and suf_len > 0 and (pre_len + suf_len) <= span_len) {
            // both prefix and suffix, non-overlapping
            // check if stripping prefix reveals a special
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                // emit prefix, let middle handle the special
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            // check if stripping suffix reveals a special
            const minus_suf = text[lo..hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }

            // strip both
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            lo += @as(u32, @intCast(pre_len));
            hi -= @as(u32, @intCast(suf_len));
        } else if (pre_len > 0) {
            // prefix only
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            lo += @as(u32, @intCast(pre_len));
        } else if (suf_len > 0) {
            // suffix only
            const minus_suf = text[lo..hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            hi -= @as(u32, @intCast(suf_len));
        }
        // else: neither matched, last_len == span_len, loop exits
    }

    // emit middle portion
    if (lo < hi) {
        const middle = text[lo..hi];

        // try special cases for the remaining middle
        if (data.specials.get(middle)) |special| {
            var offset: u32 = lo;
            for (0..special.len) |ti| {
                const tok_text = special.tokens[ti];
                const tok_start = findSubstr(text[offset..hi], tok_text);
                if (tok_start) |ts| {
                    if (count < out.len) {
                        out[count] = .{
                            .start = offset + @as(u32, @intCast(ts)),
                            .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                        };
                        count += 1;
                    }
                    offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
                } else {
                    if (count < out.len) {
                        out[count] = .{ .start = offset, .end = offset + @as(u32, @intCast(tok_text.len)) };
                        count += 1;
                    }
                    offset += @as(u32, @intCast(tok_text.len));
                }
            }
        } else if (matchUrl(middle) > 0) {
            // URL — emit as single token
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = hi };
                count += 1;
            }
        } else {
            // try infix splitting
            var infixes: [64]Infix = undefined;
            const n_infixes = findInfixes(middle, &infixes);

            if (n_infixes == 0) {
                // no infixes — single token
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = hi };
                    count += 1;
                }
            } else {
                // split on infixes
                var pos: u32 = lo;
                for (infixes[0..n_infixes]) |inf| {
                    const inf_start = lo + @as(u32, @intCast(inf.start));
                    const inf_end = lo + @as(u32, @intCast(inf.end));

                    // skip infixes at position 0
                    if (inf.start == 0) continue;

                    // emit text before infix
                    if (inf_start > pos) {
                        if (count < out.len) {
                            out[count] = .{ .start = pos, .end = inf_start };
                            count += 1;
                        }
                    }

                    // emit infix (zero-width matches emit nothing)
                    if (inf_start != inf_end) {
                        if (count < out.len) {
                            out[count] = .{ .start = inf_start, .end = inf_end };
                            count += 1;
                        }
                    }

                    pos = inf_end;
                }

                // emit text after last infix
                if (pos < hi) {
                    if (count < out.len) {
                        out[count] = .{ .start = pos, .end = hi };
                        count += 1;
                    }
                }
            }
        }
    }

    // emit suffixes in reverse order (they were collected outermost-first)
    var si = n_suffixes;
    while (si > 0) {
        si -= 1;
        if (count < out.len) {
            out[count] = suffix_buf[si];
            count += 1;
        }
    }

    return count;
}

// ── pattern matching ──

/// match a prefix at position 0. returns byte length of match, or 0.
pub fn matchPrefix(text: []const u8) usize {
    if (text.len == 0) return 0;

    const cp = data.decodeUtf8(text) orelse return 0;

    // 1. single-character prefixes (switch on codepoint)
    if (data.isPrefixChar(cp.value)) return cp.len;

    // 2. multi-char literals (longest first)
    for (data.prefix_multi_literals) |lit| {
        if (std.mem.startsWith(u8, text, lit)) return lit.len;
    }

    // 3. symbol class (unicode So/Sc categories)
    if (data.isSymbol(cp.value)) return cp.len;

    // 4. 2+ dots
    if (text.len >= 2 and text[0] == '.' and text[1] == '.') {
        var i: usize = 2;
        while (i < text.len and text[i] == '.') : (i += 1) {}
        return i;
    }

    // 5. literal-unless-digit (e.g., + not followed by digit)
    if (data.isPrefixUnlessDigit(cp.value)) {
        if (cp.len >= text.len) return cp.len;
        const next = data.decodeUtf8(text[cp.len..]);
        if (next == null or !isAsciiDigit(next.?.value)) return cp.len;
    }

    return 0;
}

/// match a suffix at the end of text. returns byte length of suffix, or 0.
pub fn matchSuffix(text: []const u8) usize {
    if (text.len == 0) return 0;

    const last = data.lastCodepoint(text) orelse return 0;

    // 1. single-character suffixes
    if (data.isSuffixChar(last.value)) return last.len;

    // 2. symbol class
    if (data.isSymbol(last.value)) return last.len;

    // 3. multi-char literal suffixes (longest first)
    for (data.suffix_multi_literals) |lit| {
        if (std.mem.endsWith(u8, text, lit)) return lit.len;
    }

    // 4. 2+ dots at end
    if (text.len >= 2 and text[text.len - 1] == '.' and text[text.len - 2] == '.') {
        var i: usize = text.len - 2;
        while (i > 0 and text[i - 1] == '.') : (i -= 1) {}
        return text.len - i;
    }

    // 5. lookbehind rules (generated)
    const lb = data.matchSuffixLookbehind(text);
    if (lb > 0) return lb;

    return 0;
}

/// infix match result: byte offsets relative to the span handed to findInfixes.
const Infix = struct { start: usize, end: usize };

/// find all infix split points. returns count written.
fn findInfixes(text: []const u8, out: []Infix) usize {
    var count: usize = 0;
    if (text.len == 0) return 0;

    // scan left to right; at each position try the infix rules in spaCy's
    // pattern order and, on a match, resume scanning after the matched bytes.
    var i: usize = 0;
    while (i < text.len) {
        const cp = data.decodeUtf8(text[i..]) orelse {
            // invalid UTF-8 byte: skip it rather than abort the whole scan
            i += 1;
            continue;
        };
        var matched: usize = 0;

        // 1. 2+ dots (infix[0])
        if (text[i] == '.' and i + 1 < text.len and text[i + 1] == '.') {
            var end = i + 2;
            while (end < text.len and text[end] == '.') : (end += 1) {}
            matched = end - i;
        }
        // 2. ellipsis U+2026 (infix[1])
        else if (cp.value == 0x2026) {
            matched = cp.len;
        }
        // 3. symbol class (infix[2])
        else if (data.isSymbol(cp.value)) {
            matched = cp.len;
        }
        // contextual rules require lookbehind/lookahead
        else {
            const prev_cp = if (i > 0) data.lastCodepoint(text[0..i]) else null;
            const next_start = i + cp.len;
            const next_cp = if (next_start < text.len) data.decodeUtf8(text[next_start..]) else null;

            // 4. math ops between digits: (?<=[0-9])[+\-*^](?=[0-9\-]) (infix[3])
            if (prev_cp != null and isAsciiDigit(prev_cp.?.value)) {
                if (cp.value == '+' or cp.value == '-' or cp.value == '*' or cp.value == '^') {
                    if (next_cp != null and (isAsciiDigit(next_cp.?.value) or next_cp.?.value == '-')) {
                        matched = cp.len;
                    }
                }
            }

            // 5. period between lower/punct and upper (infix[4])
            if (matched == 0 and cp.value == '.') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_4_behind(prev_cp.?.value) and data.is_infix_4_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 6. comma between alpha chars (infix[5])
            if (matched == 0 and cp.value == ',') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_5_behind(prev_cp.?.value) and data.is_infix_5_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 7. hyphens/dashes between alnum (infix[6])
            // NOTE(review): the lookahead reuses is_infix_7_ahead rather than a
            // dedicated infix-6 ahead class — verify against tokenizer_data.zig
            // that infix[6] and infix[7] intentionally share the ahead set.
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_6_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    // try alternatives longest-first: ---, --, —— (U+2014 U+2014), —, –, -, ~
                    if (i + 3 <= text.len and std.mem.eql(u8, text[i..][0..3], "---")) {
                        matched = 3;
                    } else if (i + 2 <= text.len and std.mem.eql(u8, text[i..][0..2], "--")) {
                        matched = 2;
                    } else if (i + 6 <= text.len and std.mem.eql(u8, text[i..][0..6], "\xe2\x80\x94\xe2\x80\x94")) {
                        matched = 6; // —— (two em dashes, 3 UTF-8 bytes each)
                    } else if (cp.value == 0x2014) { // — em dash
                        matched = cp.len;
                    } else if (cp.value == 0x2013) { // – en dash
                        matched = cp.len;
                    } else if (cp.value == '-') {
                        matched = 1;
                    } else if (cp.value == '~') {
                        matched = 1;
                    }
                }
            }

            // 8. separators between alnum (infix[7])
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_7_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    if (cp.value == '/' or cp.value == ':' or cp.value == '<' or
                        cp.value == '>' or cp.value == '=')
                    {
                        matched = cp.len;
                    }
                }
            }
        }

        if (matched > 0) {
            if (count < out.len) {
                out[count] = .{ .start = i, .end = i + matched };
                count += 1;
            }
            // advance past the match even if `out` is full, so the scan position
            // stays consistent with what was (or would have been) recorded
            i += matched;
        } else {
            i += cp.len;
        }
    }

    return count;
}

/// simplified URL matcher. matches scheme://... or domain.tld patterns.
/// returns length of match from start, or 0.
fn matchUrl(text: []const u8) usize {
    if (text.len < 4) return 0;

    // check for the common schemes literally, then fall back to a generic scan
    var pos: usize = 0;
    if (std.mem.startsWith(u8, text, "http://")) {
        pos = 7;
    } else if (std.mem.startsWith(u8, text, "https://")) {
        pos = 8;
    } else if (std.mem.startsWith(u8, text, "ftp://")) {
        pos = 6;
    } else {
        // try generic scheme:// — scheme chars limited to alnum, '+', '-', '.'
        // and at most 20 bytes, mirroring typical URI scheme names
        var j: usize = 0;
        while (j < text.len and j < 20) : (j += 1) {
            const c = text[j];
            if (c == ':') {
                // require a non-empty scheme name (RFC 3986): a chunk that
                // starts with "://" is not a URL, so leave pos == 0
                if (j > 0 and j + 2 < text.len and text[j + 1] == '/' and text[j + 2] == '/') {
                    pos = j + 3;
                }
                break;
            }
            if (!isAsciiAlnum(c) and c != '+' and c != '-' and c != '.') break;
        }

        // no scheme — try bare domain: word.word or word@word.word
        if (pos == 0) {
            pos = matchBareDomain(text);
        }
    }

    // a scheme with nothing after it (e.g. exactly "http://") is not a URL
    if (pos == 0 or pos >= text.len) return 0;

    // consume until whitespace — everything up to the chunk end is the URL
    while (pos < text.len and !isWhitespace(text[pos])) : (pos += 1) {}

    return pos;
}

/// match a bare domain like example.com or user@example.com.
/// returns the matched byte length from the start of `text`, or 0.
fn matchBareDomain(text: []const u8) usize {
    // look for word.word or word@word.word pattern
    var i: usize = 0;
    var has_dot = false;
    var has_at = false;
    var last_was_alnum = false;

    while (i < text.len) {
        const c = text[i];
        if (isAsciiAlnum(c) or c == '-' or c == '_') {
            last_was_alnum = isAsciiAlnum(c);
            i += 1;
        } else if (c == '.' and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // dot must be surrounded by alnum — rejects "a..b" and trailing dots
            has_dot = true;
            last_was_alnum = false;
            i += 1;
        } else if (c == '@' and !has_at and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // at most one '@' (user@host); same surrounding-alnum rule as '.'
            has_at = true;
            last_was_alnum = false;
            i += 1;
        } else break;
    }

    // must have at least one dot to be a domain
    if (!has_dot) return 0;

    // check TLD is at least 2 chars and alphabetic (not numeric like 500.00)
    var last_dot: usize = 0;
    var j: usize = 0;
    while (j < i) : (j += 1) {
        if (text[j] == '.') last_dot = j;
    }
    const tld_start = last_dot + 1;
    const tld_len = i - tld_start;
    if (tld_len < 2) return 0;

    // TLD must contain at least one letter
    var has_alpha = false;
    j = tld_start;
    while (j < i) : (j += 1) {
        if ((text[j] >= 'a' and text[j] <= 'z') or (text[j] >= 'A' and text[j] <= 'Z')) {
            has_alpha = true;
            break;
        }
    }
    if (!has_alpha) return 0;

    return i;
}

// ── helpers ──

/// ASCII whitespace only — tab, LF, CR, space. NOTE(review): vertical tab,
/// form feed and Unicode spaces are not treated as separators here; confirm
/// that matches the intended spaCy `text.split()` semantics.
fn isWhitespace(c: u8) bool {
    return c == ' ' or c == '\t' or c == '\n' or c == '\r';
}

/// true for the ASCII digits '0'..'9' (takes a full codepoint).
fn isAsciiDigit(c: u21) bool {
    return c >= '0' and c <= '9';
}

/// true for ASCII letters and digits (byte-level).
fn isAsciiAlnum(c: u8) bool {
    return (c >= '0' and c <= '9') or (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
}

/// first index of `needle` in `haystack`, or null. an empty needle matches at 0.
/// thin wrapper over std.mem.indexOf (replaces a hand-rolled O(n·m) scan with
/// identical semantics: long needle → null, empty needle → 0).
fn findSubstr(haystack: []const u8, needle: []const u8) ?usize {
    return std.mem.indexOf(u8, haystack, needle);
}

// ── tests ──

const testing = std.testing;

test "tokenize basic sentence" {
    var tokens: [64]Token = undefined;
    const n = tokenize("Barack Obama visited Paris.", &tokens);

    const expected = [_][]const u8{ "Barack", "Obama", "visited", "Paris", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text("Barack Obama visited Paris."));
    }
}

test "tokenize contractions" {
    var tokens: [64]Token = undefined;
    const text = "I can't believe it's not butter!";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize currency and punctuation" {
    var tokens: [64]Token = undefined;
    const text = "Apple Inc. is worth $2.5 trillion.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize parentheses" {
    var tokens: [64]Token = undefined;
    const text = "Dr. Smith's office (room 42) is closed.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize hyphenated words" {
    var tokens: [64]Token = undefined;
    const text = "New York-based company";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "New", "York", "-", "based", "company" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize abbreviations" {
    var tokens: [64]Token = undefined;
    const text = "U.S.A. and U.K. are allies.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "U.S.A.", "and", "U.K.", "are", "allies", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize email" {
    var tokens: [64]Token = undefined;
    const text = "e-mail: test@example.com";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "e", "-", "mail", ":", "test@example.com" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "matchPrefix" {
    // single chars
    try testing.expectEqual(@as(usize, 1), matchPrefix("$100"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("(hello)"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("\"quote"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("!"));

    // multi-char
    try testing.expectEqual(@as(usize, 3), matchPrefix("US$100"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("C$100"));

    // dots
    try testing.expectEqual(@as(usize, 3), matchPrefix("...hello"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("..hello"));

    // no match
    try testing.expectEqual(@as(usize, 0), matchPrefix("hello"));
    try testing.expectEqual(@as(usize, 0), matchPrefix("123"));
}

test "matchSuffix" {
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello."));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello!"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello)"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello,"));
    try testing.expectEqual(@as(usize, 0), matchSuffix("hello"));
}

test "findInfixes" {
    var infixes: [64]Infix = undefined;

    // hyphen between words
    const n1 = findInfixes("York-based", &infixes);
    try testing.expect(n1 > 0);
    try testing.expectEqual(@as(usize, 4), infixes[0].start);
    try testing.expectEqual(@as(usize, 5), infixes[0].end);
}