this repo has no description
1//! spaCy-compatible tokenizer.
2//!
3//! port of spaCy's `tokenizer.pyx` algorithm: whitespace split → per-chunk
4//! iterative prefix/suffix stripping → infix splitting → special case lookup.
5//! uses generated data tables from `tokenizer_data.zig`.
6
7const std = @import("std");
8const data = @import("tokenizer_data.zig");
9
/// a token is a byte-offset slice into the original text.
/// tokens own no memory: `start`/`end` index into the buffer that was
/// passed to `tokenize`, so that buffer must outlive the tokens.
/// offsets are u32, so source texts are limited to ~4 GiB.
pub const Token = struct {
    /// byte offset of start in original text
    start: u32,
    /// byte offset of end (exclusive) in original text
    end: u32,

    /// return this token's text as a slice of `source`.
    /// `source` must be the same buffer the token was produced from.
    pub fn text(self: Token, source: []const u8) []const u8 {
        return source[self.start..self.end];
    }
};
21
/// maximum tokens per document.
/// NOTE(review): not referenced anywhere in this file — `tokenize` bounds
/// output by `out.len` instead. presumably a sizing hint for callers'
/// token buffers; confirm against call sites.
pub const MAX_TOKENS = 1024;
24
/// tokenize text into tokens. returns the number of tokens written.
/// tokens are byte-offset spans into the original text; `out` caps the
/// output (extra tokens are dropped once it fills up).
pub fn tokenize(text: []const u8, out: []Token) u32 {
    if (text.len == 0) return 0;

    var n_tokens: u32 = 0;
    var cursor: usize = 0;

    // phase 1: walk the text, one whitespace-delimited chunk at a time
    while (cursor < text.len) {
        // advance past any run of whitespace
        while (cursor < text.len and isWhitespace(text[cursor])) : (cursor += 1) {}
        if (cursor == text.len) break;

        // the chunk spans from here to the next whitespace byte
        const chunk_begin = cursor;
        while (cursor < text.len and !isWhitespace(text[cursor])) : (cursor += 1) {}

        // phase 2: split the chunk into tokens
        n_tokens = tokenizeChunk(text, chunk_begin, cursor, out, n_tokens);
        if (n_tokens >= out.len) break;
    }

    return n_tokens;
}
50
/// tokenize a single whitespace-delimited chunk.
/// text[start..end] is the chunk. writes tokens to out starting at index
/// count_in and returns the updated count. when out is full, further
/// tokens are silently dropped (the count stops growing).
fn tokenizeChunk(
    text: []const u8,
    start: usize,
    end: usize,
    out: []Token,
    count_in: u32,
) u32 {
    var count = count_in;
    const chunk = text[start..end];

    // phase 2a: exact special-case match for the whole chunk — emits the
    // pre-split sub-tokens from the table and returns.
    if (data.specials.get(chunk)) |special| {
        var offset: u32 = @intCast(start);
        for (0..special.len) |ti| {
            const tok_text = special.tokens[ti];
            // re-locate this sub-token in the source so spans stay anchored
            // to original byte offsets. note: findSubstr scans forward from
            // `offset`, it does not require a match at the exact start.
            const tok_start = findSubstr(text[offset..end], tok_text);
            if (tok_start) |ts| {
                if (count < out.len) {
                    out[count] = .{
                        .start = offset + @as(u32, @intCast(ts)),
                        .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
            } else {
                // sub-token text not found in the source (e.g. a table form
                // that differs from the surface text) — synthesize a span of
                // the same length at the current offset
                if (count < out.len) {
                    out[count] = .{
                        .start = offset,
                        .end = offset + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset += @as(u32, @intCast(tok_text.len));
            }
        }
        return count;
    }

    // phase 2b: iteratively strip prefixes and suffixes.
    // prefixes are emitted into `out` immediately (left-to-right order);
    // suffixes are buffered and flushed in reverse after the middle.
    var lo: u32 = @intCast(start);
    var hi: u32 = @intCast(end);

    // NOTE(review): if a chunk yields more than 64 suffixes the extras are
    // trimmed from the span but never emitted — confirm this is acceptable.
    var suffix_buf: [64]Token = undefined;
    var n_suffixes: u32 = 0;

    // loop until a full pass strips nothing (span length stops changing)
    var last_len: u32 = 0;
    while (lo < hi and (hi - lo) != last_len) {
        const span = text[lo..hi];
        const span_len = hi - lo;

        // stop stripping if the remaining span is itself a special case …
        if (data.specials.get(span) != null) break;

        // … or a URL (both are handled whole in the middle-emission phase)
        if (matchUrl(span) > 0) break;

        last_len = span_len;

        // try prefix
        const pre_len = matchPrefix(span);

        // try suffix on span[pre_len..] but strip from end of full span
        const suf_len = if (pre_len < span_len)
            matchSuffix(span[pre_len..])
        else
            @as(usize, 0);

        if (pre_len > 0 and suf_len > 0 and (pre_len + suf_len) <= span_len) {
            // both prefix and suffix, non-overlapping.
            // check if stripping the prefix alone reveals a special
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                // emit prefix, let the middle phase handle the special
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            // check if stripping the suffix alone reveals a special
            const minus_suf = text[lo .. hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }

            // strip both ends
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            lo += @as(u32, @intCast(pre_len));
            hi -= @as(u32, @intCast(suf_len));
        } else if (pre_len > 0) {
            // prefix only
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            lo += @as(u32, @intCast(pre_len));
        } else if (suf_len > 0) {
            // suffix only
            const minus_suf = text[lo .. hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            hi -= @as(u32, @intCast(suf_len));
        }
        // else: neither matched; last_len == span_len, so the loop exits
    }

    // phase 2c: emit the middle portion (special / URL / infix-split / plain)
    if (lo < hi) {
        const middle = text[lo..hi];

        // special case revealed by affix stripping
        if (data.specials.get(middle)) |special| {
            var offset: u32 = lo;
            for (0..special.len) |ti| {
                const tok_text = special.tokens[ti];
                const tok_start = findSubstr(text[offset..hi], tok_text);
                if (tok_start) |ts| {
                    if (count < out.len) {
                        out[count] = .{
                            .start = offset + @as(u32, @intCast(ts)),
                            .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                        };
                        count += 1;
                    }
                    offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
                } else {
                    if (count < out.len) {
                        out[count] = .{ .start = offset, .end = offset + @as(u32, @intCast(tok_text.len)) };
                        count += 1;
                    }
                    offset += @as(u32, @intCast(tok_text.len));
                }
            }
        } else if (matchUrl(middle) > 0) {
            // URL — keep as a single token
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = hi };
                count += 1;
            }
        } else {
            // infix splitting
            var infixes: [64]Infix = undefined;
            const n_infixes = findInfixes(middle, &infixes);

            if (n_infixes == 0) {
                // no infixes — the whole middle is a single token
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = hi };
                    count += 1;
                }
            } else {
                // alternate text / infix tokens
                var pos: u32 = lo;
                for (infixes[0..n_infixes]) |inf| {
                    const inf_start = lo + @as(u32, @intCast(inf.start));
                    const inf_end = lo + @as(u32, @intCast(inf.end));

                    // an infix at position 0 is folded into the first token
                    if (inf.start == 0) continue;

                    // text before the infix
                    if (inf_start > pos) {
                        if (count < out.len) {
                            out[count] = .{ .start = pos, .end = inf_start };
                            count += 1;
                        }
                    }

                    // the infix itself
                    if (inf_start != inf_end) {
                        if (count < out.len) {
                            out[count] = .{ .start = inf_start, .end = inf_end };
                            count += 1;
                        }
                    }

                    pos = inf_end;
                }

                // trailing text after the last infix
                if (pos < hi) {
                    if (count < out.len) {
                        out[count] = .{ .start = pos, .end = hi };
                        count += 1;
                    }
                }
            }
        }
    }

    // phase 2d: flush buffered suffixes in reverse (innermost first),
    // restoring left-to-right order in the output
    var si = n_suffixes;
    while (si > 0) {
        si -= 1;
        if (count < out.len) {
            out[count] = suffix_buf[si];
            count += 1;
        }
    }

    return count;
}
305
306// ── pattern matching ──
307
/// match a prefix at position 0. returns byte length of match, or 0.
/// rules are tried in a fixed order; the first match wins.
pub fn matchPrefix(text: []const u8) usize {
    if (text.len == 0) return 0;

    const first = data.decodeUtf8(text) orelse return 0;

    // rule 1: single-codepoint prefix characters
    if (data.isPrefixChar(first.value)) return first.len;

    // rule 2: multi-character literals (table ordered longest first)
    for (data.prefix_multi_literals) |lit| {
        if (std.mem.startsWith(u8, text, lit)) return lit.len;
    }

    // rule 3: unicode symbol classes (So/Sc)
    if (data.isSymbol(first.value)) return first.len;

    // rule 4: a run of two or more leading dots, matched as one prefix
    if (std.mem.startsWith(u8, text, "..")) {
        var run: usize = 2;
        while (run < text.len and text[run] == '.') : (run += 1) {}
        return run;
    }

    // rule 5: characters that are prefixes only when NOT followed by a digit
    if (data.isPrefixUnlessDigit(first.value)) {
        const rest = text[first.len..];
        if (rest.len == 0) return first.len;
        const following = data.decodeUtf8(rest);
        if (following == null or !isAsciiDigit(following.?.value)) return first.len;
    }

    return 0;
}
341
/// match a suffix at the end of text. returns byte length of suffix, or 0.
/// rules are tried in order; the first match wins.
pub fn matchSuffix(text: []const u8) usize {
    if (text.len == 0) return 0;

    const last = data.lastCodepoint(text) orelse return 0;

    // 1. single-character suffixes
    if (data.isSuffixChar(last.value)) return last.len;

    // 2. symbol class (unicode So/Sc categories)
    // NOTE(review): matchPrefix checks multi-char literals BEFORE the symbol
    // class; here the order is reversed — confirm the asymmetry is intended.
    if (data.isSymbol(last.value)) return last.len;

    // 3. multi-char literal suffixes (longest first)
    for (data.suffix_multi_literals) |lit| {
        if (std.mem.endsWith(u8, text, lit)) return lit.len;
    }

    // 4. a trailing run of 2+ dots, matched as one suffix
    // NOTE(review): only reachable if '.' is not already a single-character
    // suffix caught by rule 1 — verify against tokenizer_data.zig.
    if (text.len >= 2 and text[text.len - 1] == '.' and text[text.len - 2] == '.') {
        var i: usize = text.len - 2;
        while (i > 0 and text[i - 1] == '.') : (i -= 1) {}
        return text.len - i;
    }

    // 5. lookbehind rules (generated in tokenizer_data.zig)
    const lb = data.matchSuffixLookbehind(text);
    if (lb > 0) return lb;

    return 0;
}
372
/// infix match result: byte offsets of one infix span, relative to the
/// string passed to findInfixes (not to the full document).
const Infix = struct { start: usize, end: usize };
375
/// find all infix split points in `text`. returns count written to `out`
/// (extra matches beyond out.len are scanned past but not recorded).
/// offsets in the results are relative to `text`.
fn findInfixes(text: []const u8, out: []Infix) usize {
    var count: usize = 0;
    if (text.len == 0) return 0;

    var i: usize = 0;
    while (i < text.len) {
        // skip bytes that don't decode as UTF-8 one at a time
        const cp = data.decodeUtf8(text[i..]) orelse {
            i += 1;
            continue;
        };
        // byte length of the infix matched at position i (0 = no match)
        var matched: usize = 0;

        // 1. 2+ dots (infix[0])
        if (text[i] == '.' and i + 1 < text.len and text[i + 1] == '.') {
            var end = i + 2;
            while (end < text.len and text[end] == '.') : (end += 1) {}
            matched = end - i;
        }
        // 2. ellipsis U+2026 (infix[1])
        else if (cp.value == 0x2026) {
            matched = cp.len;
        }
        // 3. symbol class (infix[2])
        else if (data.isSymbol(cp.value)) {
            matched = cp.len;
        }
        // contextual rules require lookbehind/lookahead
        else {
            // codepoint immediately before i, and immediately after this one
            const prev_cp = if (i > 0) data.lastCodepoint(text[0..i]) else null;
            const next_start = i + cp.len;
            const next_cp = if (next_start < text.len) data.decodeUtf8(text[next_start..]) else null;

            // 4. math ops between digits: (?<=[0-9])[+\-*^](?=[0-9\-]) (infix[3])
            if (prev_cp != null and isAsciiDigit(prev_cp.?.value)) {
                if (cp.value == '+' or cp.value == '-' or cp.value == '*' or cp.value == '^') {
                    if (next_cp != null and (isAsciiDigit(next_cp.?.value) or next_cp.?.value == '-')) {
                        matched = cp.len;
                    }
                }
            }

            // 5. period between lower/punct and upper (infix[4])
            if (matched == 0 and cp.value == '.') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_4_behind(prev_cp.?.value) and data.is_infix_4_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 6. comma between alpha chars (infix[5])
            if (matched == 0 and cp.value == ',') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_5_behind(prev_cp.?.value) and data.is_infix_5_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 7. hyphens/dashes between alnum (infix[6])
            // NOTE(review): pairs is_infix_6_behind with is_infix_7_ahead —
            // confirm against tokenizer_data.zig that rule 6 intentionally
            // shares rule 7's lookahead class (no is_infix_6_ahead exists?).
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_6_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    // try alternatives longest-first: ---, --, —— (U+2014 U+2014), —, –, -, ~
                    if (i + 3 <= text.len and std.mem.eql(u8, text[i..][0..3], "---")) {
                        matched = 3;
                    } else if (i + 2 <= text.len and std.mem.eql(u8, text[i..][0..2], "--")) {
                        matched = 2;
                    } else if (i + 6 <= text.len and std.mem.eql(u8, text[i..][0..6], "\xe2\x80\x94\xe2\x80\x94")) {
                        matched = 6; // ——
                    } else if (cp.value == 0x2014) { // —
                        matched = cp.len;
                    } else if (cp.value == 0x2013) { // –
                        matched = cp.len;
                    } else if (cp.value == '-') {
                        matched = 1;
                    } else if (cp.value == '~') {
                        matched = 1;
                    }
                }
            }

            // 8. separators between alnum (infix[7])
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_7_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    if (cp.value == '/' or cp.value == ':' or cp.value == '<' or
                        cp.value == '>' or cp.value == '=')
                    {
                        matched = cp.len;
                    }
                }
            }
        }

        if (matched > 0) {
            // record the span (silently dropped if out is full) and skip it
            if (count < out.len) {
                out[count] = .{ .start = i, .end = i + matched };
                count += 1;
            }
            i += matched;
        } else {
            i += cp.len;
        }
    }

    return count;
}
483
/// simplified URL matcher. matches scheme://... or domain.tld patterns.
/// returns length of match from start, or 0.
fn matchUrl(text: []const u8) usize {
    if (text.len < 4) return 0;

    // check for a scheme followed by "://"
    var pos: usize = 0;
    if (std.mem.startsWith(u8, text, "http://")) {
        pos = 7;
    } else if (std.mem.startsWith(u8, text, "https://")) {
        pos = 8;
    } else if (std.mem.startsWith(u8, text, "ftp://")) {
        pos = 6;
    } else {
        // generic scheme://. per RFC 3986 a scheme is non-empty and starts
        // with a letter, so only scan when the first byte is alphabetic —
        // this rejects inputs like "://x" or "1://x" that the unguarded
        // scan would previously accept as URLs.
        const first_is_alpha = (text[0] >= 'a' and text[0] <= 'z') or
            (text[0] >= 'A' and text[0] <= 'Z');
        if (first_is_alpha) {
            // scan at most 20 bytes of scheme chars [A-Za-z0-9+\-.]
            var j: usize = 1;
            while (j < text.len and j < 20) : (j += 1) {
                const c = text[j];
                if (c == ':') {
                    if (j + 2 < text.len and text[j + 1] == '/' and text[j + 2] == '/') {
                        pos = j + 3;
                        break;
                    } else break;
                }
                if (!isAsciiAlnum(c) and c != '+' and c != '-' and c != '.') break;
            }
        }

        // no scheme — try bare domain: word.word or word@word.word
        if (pos == 0) {
            pos = matchBareDomain(text);
        }
    }

    // a scheme with nothing after it is not a URL
    if (pos == 0 or pos >= text.len) return 0;

    // the URL token extends to the next whitespace
    while (pos < text.len and !isWhitespace(text[pos])) : (pos += 1) {}

    return pos;
}
524
/// match a bare domain like example.com or user@example.com.
/// returns the matched length from the start of `text`, or 0.
fn matchBareDomain(text: []const u8) usize {
    var idx: usize = 0;
    var saw_dot = false;
    var saw_at = false;
    var prev_alnum = false;

    // scan the longest run shaped like label(.label)* with an optional
    // single '@' separating a local part from the host. dots and '@' must
    // sit between alphanumerics on both sides.
    scan: while (idx < text.len) {
        const c = text[idx];
        if (isAsciiAlnum(c) or c == '-' or c == '_') {
            prev_alnum = isAsciiAlnum(c);
        } else if (c == '.' and prev_alnum and idx + 1 < text.len and isAsciiAlnum(text[idx + 1])) {
            saw_dot = true;
            prev_alnum = false;
        } else if (c == '@' and !saw_at and prev_alnum and idx + 1 < text.len and isAsciiAlnum(text[idx + 1])) {
            saw_at = true;
            prev_alnum = false;
        } else {
            break :scan;
        }
        idx += 1;
    }

    // a domain needs at least one interior dot
    if (!saw_dot) return 0;

    // the TLD (everything after the final dot) must be at least 2 chars …
    const last_dot = std.mem.lastIndexOfScalar(u8, text[0..idx], '.') orelse return 0;
    const tld = text[last_dot + 1 .. idx];
    if (tld.len < 2) return 0;

    // … and contain at least one letter (rejects decimals like 500.00)
    for (tld) |ch| {
        if (std.ascii.isAlphabetic(ch)) return idx;
    }
    return 0;
}
575
576// ── helpers ──
577
/// true for the four ASCII whitespace bytes this tokenizer splits on.
fn isWhitespace(c: u8) bool {
    return switch (c) {
        ' ', '\t', '\n', '\r' => true,
        else => false,
    };
}
581
/// true if the codepoint is an ASCII digit '0'..'9'.
fn isAsciiDigit(c: u21) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}
585
/// true if the byte is an ASCII letter or digit.
fn isAsciiAlnum(c: u8) bool {
    return switch (c) {
        '0'...'9', 'a'...'z', 'A'...'Z' => true,
        else => false,
    };
}
589
/// find the first occurrence of `needle` in `haystack`.
/// returns the byte index, or null if absent. an empty needle matches at 0.
fn findSubstr(haystack: []const u8, needle: []const u8) ?usize {
    // std.mem.indexOf has exactly the original hand-rolled contract:
    // null when the needle is longer than the haystack, 0 for an empty
    // needle, first-match index otherwise — and avoids the naive O(n*m)
    // scan for larger inputs.
    return std.mem.indexOf(u8, haystack, needle);
}
599
// ── tests ──
// NOTE: the expected segmentations below depend on the generated tables in
// tokenizer_data.zig (specials, prefix/suffix/infix character classes).

const testing = std.testing;

test "tokenize basic sentence" {
    var tokens: [64]Token = undefined;
    const n = tokenize("Barack Obama visited Paris.", &tokens);

    const expected = [_][]const u8{ "Barack", "Obama", "visited", "Paris", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text("Barack Obama visited Paris."));
    }
}

test "tokenize contractions" {
    var tokens: [64]Token = undefined;
    const text = "I can't believe it's not butter!";
    const n = tokenize(text, &tokens);

    // "can't" and "it's" split via the special-case table
    const expected = [_][]const u8{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize currency and punctuation" {
    var tokens: [64]Token = undefined;
    const text = "Apple Inc. is worth $2.5 trillion.";
    const n = tokenize(text, &tokens);

    // "Inc." survives as a special; "$" is a prefix; "2.5" stays intact
    const expected = [_][]const u8{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize parentheses" {
    var tokens: [64]Token = undefined;
    const text = "Dr. Smith's office (room 42) is closed.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize hyphenated words" {
    var tokens: [64]Token = undefined;
    const text = "New York-based company";
    const n = tokenize(text, &tokens);

    // hyphen between alphanumerics is an infix (emitted as its own token)
    const expected = [_][]const u8{ "New", "York", "-", "based", "company" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize abbreviations" {
    var tokens: [64]Token = undefined;
    const text = "U.S.A. and U.K. are allies.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "U.S.A.", "and", "U.K.", "are", "allies", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize email" {
    var tokens: [64]Token = undefined;
    const text = "e-mail: test@example.com";
    const n = tokenize(text, &tokens);

    // bare-domain URL matching keeps the address in one token
    const expected = [_][]const u8{ "e", "-", "mail", ":", "test@example.com" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "matchPrefix" {
    // single chars
    try testing.expectEqual(@as(usize, 1), matchPrefix("$100"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("(hello)"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("\"quote"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("!"));

    // multi-char
    try testing.expectEqual(@as(usize, 3), matchPrefix("US$100"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("C$100"));

    // dots
    try testing.expectEqual(@as(usize, 3), matchPrefix("...hello"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("..hello"));

    // no match
    try testing.expectEqual(@as(usize, 0), matchPrefix("hello"));
    try testing.expectEqual(@as(usize, 0), matchPrefix("123"));
}

test "matchSuffix" {
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello."));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello!"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello)"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello,"));
    try testing.expectEqual(@as(usize, 0), matchSuffix("hello"));
}

test "findInfixes" {
    var infixes: [64]Infix = undefined;

    // hyphen between words
    const n1 = findInfixes("York-based", &infixes);
    try testing.expect(n1 > 0);
    try testing.expectEqual(@as(usize, 4), infixes[0].start);
    try testing.expectEqual(@as(usize, 5), infixes[0].end);
}
723}