this repo has no description
at main 723 lines 25 kB view raw
//! spaCy-compatible tokenizer.
//!
//! port of spaCy's `tokenizer.pyx` algorithm: whitespace split → per-chunk
//! iterative prefix/suffix stripping → infix splitting → special case lookup.
//! uses generated data tables from `tokenizer_data.zig`.

const std = @import("std");
const data = @import("tokenizer_data.zig");

/// a token is a byte-offset slice into the original text; it owns no memory.
pub const Token = struct {
    /// byte offset of start in original text
    start: u32,
    /// byte offset of end (exclusive) in original text
    end: u32,

    /// resolve this span against the text it was produced from.
    pub fn text(self: Token, source: []const u8) []const u8 {
        return source[self.start..self.end];
    }
};

/// maximum tokens per document.
pub const MAX_TOKENS = 1024;

/// tokenize text into tokens. returns the number of tokens written.
/// tokens are byte-offset spans into the original text. stops early once
/// `out` is full; text past that point produces no tokens.
pub fn tokenize(text: []const u8, out: []Token) u32 {
    if (text.len == 0) return 0;

    var n_tokens: u32 = 0;
    var pos: usize = 0;

    // phase 1: carve the text into maximal runs of non-whitespace bytes;
    // phase 2 (affix/infix/special handling) runs once per such chunk.
    while (pos < text.len) {
        // advance over a single whitespace byte and rescan
        if (isWhitespace(text[pos])) {
            pos += 1;
            continue;
        }

        // pos sits on the first byte of a chunk; scan to its end
        const chunk_begin = pos;
        while (pos < text.len and !isWhitespace(text[pos])) pos += 1;

        n_tokens = tokenizeChunk(text, chunk_begin, pos, out, n_tokens);
        if (n_tokens >= out.len) return n_tokens;
    }

    return n_tokens;
}

/// tokenize a single whitespace-delimited chunk.
/// text[start..end] is the chunk. writes tokens to out[count..].
fn tokenizeChunk(
    text: []const u8,
    start: usize,
    end: usize,
    out: []Token,
    count_in: u32,
) u32 {
    var count = count_in;
    const chunk = text[start..end];

    // check special cases first: a special maps the whole chunk to a fixed
    // sequence of sub-token strings (e.g. a contraction split into two parts).
    if (data.specials.get(chunk)) |special| {
        var offset: u32 = @intCast(start);
        for (0..special.len) |ti| {
            const tok_text = special.tokens[ti];
            // find this token text in the source at the expected position
            const tok_start = findSubstr(text[offset..end], tok_text);
            if (tok_start) |ts| {
                if (count < out.len) {
                    out[count] = .{
                        .start = offset + @as(u32, @intCast(ts)),
                        .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
            } else {
                // special token not found at expected position — emit based on length
                // (keeps offsets advancing so later tokens stay roughly aligned)
                if (count < out.len) {
                    out[count] = .{
                        .start = offset,
                        .end = offset + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset += @as(u32, @intCast(tok_text.len));
            }
        }
        return count;
    }

    // split affixes iteratively: peel prefixes/suffixes off [lo, hi) until a
    // pass makes no progress, a special case is exposed, or a URL is exposed.
    var lo: u32 = @intCast(start);
    var hi: u32 = @intCast(end);

    // prefixes are emitted straight into `out` as found (left-to-right);
    // suffixes are buffered here and flushed in reverse order at the end,
    // so the innermost suffix comes out first after the middle token.
    var suffix_buf: [64]Token = undefined;
    var n_suffixes: u32 = 0;

    var last_len: u32 = 0;
    while (lo < hi and (hi - lo) != last_len) {
        const span = text[lo..hi];
        const span_len = hi - lo;

        // check if remaining span is a special case
        if (data.specials.get(span) != null) break;

        // check URL match — URLs are kept whole, never affix-stripped
        if (matchUrl(span) > 0) break;

        last_len = span_len;

        // try prefix
        const pre_len = matchPrefix(span);

        // try suffix on span[pre_len..] but strip from end of full span
        const suf_len = if (pre_len < span_len)
            matchSuffix(span[pre_len..])
        else
            @as(usize, 0);

        if (pre_len > 0 and suf_len > 0 and (pre_len + suf_len) <= span_len) {
            // both prefix and suffix, non-overlapping
            // check if stripping prefix reveals a special
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                // emit prefix, let middle handle the special
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            // check if stripping suffix reveals a special
            const minus_suf = text[lo..hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }

            // strip both
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            lo += @as(u32, @intCast(pre_len));
            hi -= @as(u32, @intCast(suf_len));
        } else if (pre_len > 0) {
            // prefix only
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            lo += @as(u32, @intCast(pre_len));
        } else if (suf_len > 0) {
            // suffix only
            const minus_suf = text[lo..hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            hi -= @as(u32, @intCast(suf_len));
        }
        // else: neither matched, last_len == span_len, loop exits
    }

    // emit middle portion
    if (lo < hi) {
        const middle = text[lo..hi];

        // try special cases for the remaining middle
        if (data.specials.get(middle)) |special| {
            var offset: u32 = lo;
            for (0..special.len) |ti| {
                const tok_text = special.tokens[ti];
                const tok_start = findSubstr(text[offset..hi], tok_text);
                if (tok_start) |ts| {
                    if (count < out.len) {
                        out[count] = .{
                            .start = offset + @as(u32, @intCast(ts)),
                            .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                        };
                        count += 1;
                    }
                    offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
                } else {
                    if (count < out.len) {
                        out[count] = .{ .start = offset, .end = offset + @as(u32, @intCast(tok_text.len)) };
                        count += 1;
                    }
                    offset += @as(u32, @intCast(tok_text.len));
                }
            }
        } else if (matchUrl(middle) > 0) {
            // URL — emit as single token
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = hi };
                count += 1;
            }
        } else {
            // try infix splitting
            var infixes: [64]Infix = undefined;
            const n_infixes = findInfixes(middle, &infixes);

            if (n_infixes == 0) {
                // no infixes — single token
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = hi };
                    count += 1;
                }
            } else {
                // split on infixes
                var pos: u32 = lo;
                for (infixes[0..n_infixes]) |inf| {
                    const inf_start = lo + @as(u32, @intCast(inf.start));
                    const inf_end = lo + @as(u32, @intCast(inf.end));

                    // skip infixes at position 0
                    if (inf.start == 0) continue;

                    // emit text before infix
                    if (inf_start > pos) {
                        if (count < out.len) {
                            out[count] = .{ .start = pos, .end = inf_start };
                            count += 1;
                        }
                    }

                    // emit infix (zero-width matches emit nothing)
                    if (inf_start != inf_end) {
                        if (count < out.len) {
                            out[count] = .{ .start = inf_start, .end = inf_end };
                            count += 1;
                        }
                    }

                    pos = inf_end;
                }

                // emit text after last infix
                if (pos < hi) {
                    if (count < out.len) {
                        out[count] = .{ .start = pos, .end = hi };
                        count += 1;
                    }
                }
            }
        }
    }

    // emit suffixes in reverse order (they were collected outermost-first)
    var si = n_suffixes;
    while (si > 0) {
        si -= 1;
        if (count < out.len) {
            out[count] = suffix_buf[si];
            count += 1;
        }
    }

    return count;
}

// ── pattern matching ──

/// match a prefix at position 0. returns byte length of match, or 0.
pub fn matchPrefix(text: []const u8) usize {
    if (text.len == 0) return 0;

    const cp = data.decodeUtf8(text) orelse return 0;

    // 1. single-character prefixes (switch on codepoint)
    if (data.isPrefixChar(cp.value)) return cp.len;

    // 2. multi-char literals (longest first)
    for (data.prefix_multi_literals) |lit| {
        if (std.mem.startsWith(u8, text, lit)) return lit.len;
    }

    // 3. symbol class (unicode So/Sc categories)
    if (data.isSymbol(cp.value)) return cp.len;

    // 4. 2+ dots
    if (text.len >= 2 and text[0] == '.' and text[1] == '.') {
        var i: usize = 2;
        while (i < text.len and text[i] == '.') : (i += 1) {}
        return i;
    }

    // 5. literal-unless-digit (e.g., + not followed by digit)
    if (data.isPrefixUnlessDigit(cp.value)) {
        if (cp.len >= text.len) return cp.len;
        const next = data.decodeUtf8(text[cp.len..]);
        if (next == null or !isAsciiDigit(next.?.value)) return cp.len;
    }

    return 0;
}

/// match a suffix at the end of text. returns byte length of suffix, or 0.
pub fn matchSuffix(text: []const u8) usize {
    if (text.len == 0) return 0;

    const last = data.lastCodepoint(text) orelse return 0;

    // 1. single-character suffixes
    if (data.isSuffixChar(last.value)) return last.len;

    // 2. symbol class
    if (data.isSymbol(last.value)) return last.len;

    // 3. multi-char literal suffixes (longest first)
    for (data.suffix_multi_literals) |lit| {
        if (std.mem.endsWith(u8, text, lit)) return lit.len;
    }

    // 4. 2+ dots at end
    if (text.len >= 2 and text[text.len - 1] == '.' and text[text.len - 2] == '.') {
        var i: usize = text.len - 2;
        while (i > 0 and text[i - 1] == '.') : (i -= 1) {}
        return text.len - i;
    }

    // 5. lookbehind rules (generated)
    const lb = data.matchSuffixLookbehind(text);
    if (lb > 0) return lb;

    return 0;
}

/// infix match result: byte offsets relative to the span handed to findInfixes.
const Infix = struct { start: usize, end: usize };

/// find all infix split points. returns count written.
fn findInfixes(text: []const u8, out: []Infix) usize {
    var count: usize = 0;
    if (text.len == 0) return 0;

    // scan left to right; at each position try the infix rules in spaCy's
    // pattern order and, on a match, resume scanning after the matched bytes.
    var i: usize = 0;
    while (i < text.len) {
        const cp = data.decodeUtf8(text[i..]) orelse {
            // invalid UTF-8 byte: skip it rather than abort the whole scan
            i += 1;
            continue;
        };
        var matched: usize = 0;

        // 1. 2+ dots (infix[0])
        if (text[i] == '.' and i + 1 < text.len and text[i + 1] == '.') {
            var end = i + 2;
            while (end < text.len and text[end] == '.') : (end += 1) {}
            matched = end - i;
        }
        // 2. ellipsis U+2026 (infix[1])
        else if (cp.value == 0x2026) {
            matched = cp.len;
        }
        // 3. symbol class (infix[2])
        else if (data.isSymbol(cp.value)) {
            matched = cp.len;
        }
        // contextual rules require lookbehind/lookahead
        else {
            const prev_cp = if (i > 0) data.lastCodepoint(text[0..i]) else null;
            const next_start = i + cp.len;
            const next_cp = if (next_start < text.len) data.decodeUtf8(text[next_start..]) else null;

            // 4. math ops between digits: (?<=[0-9])[+\-*^](?=[0-9\-]) (infix[3])
            if (prev_cp != null and isAsciiDigit(prev_cp.?.value)) {
                if (cp.value == '+' or cp.value == '-' or cp.value == '*' or cp.value == '^') {
                    if (next_cp != null and (isAsciiDigit(next_cp.?.value) or next_cp.?.value == '-')) {
                        matched = cp.len;
                    }
                }
            }

            // 5. period between lower/punct and upper (infix[4])
            if (matched == 0 and cp.value == '.') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_4_behind(prev_cp.?.value) and data.is_infix_4_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 6. comma between alpha chars (infix[5])
            if (matched == 0 and cp.value == ',') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_5_behind(prev_cp.?.value) and data.is_infix_5_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }

            // 7. hyphens/dashes between alnum (infix[6])
            // NOTE(review): the lookahead reuses is_infix_7_ahead rather than a
            // dedicated infix-6 ahead class — verify against tokenizer_data.zig
            // that infix[6] and infix[7] intentionally share the ahead set.
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_6_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    // try alternatives longest-first: ---, --, —— (U+2014 U+2014), —, –, -, ~
                    if (i + 3 <= text.len and std.mem.eql(u8, text[i..][0..3], "---")) {
                        matched = 3;
                    } else if (i + 2 <= text.len and std.mem.eql(u8, text[i..][0..2], "--")) {
                        matched = 2;
                    } else if (i + 6 <= text.len and std.mem.eql(u8, text[i..][0..6], "\xe2\x80\x94\xe2\x80\x94")) {
                        matched = 6; // —— (two em dashes, 3 UTF-8 bytes each)
                    } else if (cp.value == 0x2014) { // — em dash
                        matched = cp.len;
                    } else if (cp.value == 0x2013) { // – en dash
                        matched = cp.len;
                    } else if (cp.value == '-') {
                        matched = 1;
                    } else if (cp.value == '~') {
                        matched = 1;
                    }
                }
            }

            // 8. separators between alnum (infix[7])
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_7_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    if (cp.value == '/' or cp.value == ':' or cp.value == '<' or
                        cp.value == '>' or cp.value == '=')
                    {
                        matched = cp.len;
                    }
                }
            }
        }

        if (matched > 0) {
            if (count < out.len) {
                out[count] = .{ .start = i, .end = i + matched };
                count += 1;
            }
            // advance past the match even if `out` is full, so the scan position
            // stays consistent with what was (or would have been) recorded
            i += matched;
        } else {
            i += cp.len;
        }
    }

    return count;
}

/// simplified URL matcher. matches scheme://... or domain.tld patterns.
/// returns length of match from start, or 0.
fn matchUrl(text: []const u8) usize {
    if (text.len < 4) return 0;

    // check for the common schemes literally, then fall back to a generic scan
    var pos: usize = 0;
    if (std.mem.startsWith(u8, text, "http://")) {
        pos = 7;
    } else if (std.mem.startsWith(u8, text, "https://")) {
        pos = 8;
    } else if (std.mem.startsWith(u8, text, "ftp://")) {
        pos = 6;
    } else {
        // try generic scheme:// — scheme chars limited to alnum, '+', '-', '.'
        // and at most 20 bytes, mirroring typical URI scheme names
        var j: usize = 0;
        while (j < text.len and j < 20) : (j += 1) {
            const c = text[j];
            if (c == ':') {
                // require a non-empty scheme name (RFC 3986): a chunk that
                // starts with "://" is not a URL, so leave pos == 0
                if (j > 0 and j + 2 < text.len and text[j + 1] == '/' and text[j + 2] == '/') {
                    pos = j + 3;
                }
                break;
            }
            if (!isAsciiAlnum(c) and c != '+' and c != '-' and c != '.') break;
        }

        // no scheme — try bare domain: word.word or word@word.word
        if (pos == 0) {
            pos = matchBareDomain(text);
        }
    }

    // a scheme with nothing after it (e.g. exactly "http://") is not a URL
    if (pos == 0 or pos >= text.len) return 0;

    // consume until whitespace — everything up to the chunk end is the URL
    while (pos < text.len and !isWhitespace(text[pos])) : (pos += 1) {}

    return pos;
}

/// match a bare domain like example.com or user@example.com.
/// returns the matched byte length from the start of `text`, or 0.
fn matchBareDomain(text: []const u8) usize {
    // look for word.word or word@word.word pattern
    var i: usize = 0;
    var has_dot = false;
    var has_at = false;
    var last_was_alnum = false;

    while (i < text.len) {
        const c = text[i];
        if (isAsciiAlnum(c) or c == '-' or c == '_') {
            last_was_alnum = isAsciiAlnum(c);
            i += 1;
        } else if (c == '.' and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // dot must be surrounded by alnum — rejects "a..b" and trailing dots
            has_dot = true;
            last_was_alnum = false;
            i += 1;
        } else if (c == '@' and !has_at and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // at most one '@' (user@host); same surrounding-alnum rule as '.'
            has_at = true;
            last_was_alnum = false;
            i += 1;
        } else break;
    }

    // must have at least one dot to be a domain
    if (!has_dot) return 0;

    // check TLD is at least 2 chars and alphabetic (not numeric like 500.00)
    var last_dot: usize = 0;
    var j: usize = 0;
    while (j < i) : (j += 1) {
        if (text[j] == '.') last_dot = j;
    }
    const tld_start = last_dot + 1;
    const tld_len = i - tld_start;
    if (tld_len < 2) return 0;

    // TLD must contain at least one letter
    var has_alpha = false;
    j = tld_start;
    while (j < i) : (j += 1) {
        if ((text[j] >= 'a' and text[j] <= 'z') or (text[j] >= 'A' and text[j] <= 'Z')) {
            has_alpha = true;
            break;
        }
    }
    if (!has_alpha) return 0;

    return i;
}

// ── helpers ──

/// ASCII whitespace only — tab, LF, CR, space. NOTE(review): vertical tab,
/// form feed and Unicode spaces are not treated as separators here; confirm
/// that matches the intended spaCy `text.split()` semantics.
fn isWhitespace(c: u8) bool {
    return c == ' ' or c == '\t' or c == '\n' or c == '\r';
}

/// true for the ASCII digits '0'..'9' (takes a full codepoint).
fn isAsciiDigit(c: u21) bool {
    return c >= '0' and c <= '9';
}

/// true for ASCII letters and digits (byte-level).
fn isAsciiAlnum(c: u8) bool {
    return (c >= '0' and c <= '9') or (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
}

/// first index of `needle` in `haystack`, or null. an empty needle matches at 0.
/// thin wrapper over std.mem.indexOf (replaces a hand-rolled O(n·m) scan with
/// identical semantics: long needle → null, empty needle → 0).
fn findSubstr(haystack: []const u8, needle: []const u8) ?usize {
    return std.mem.indexOf(u8, haystack, needle);
}

// ── tests ──

const testing = std.testing;

test "tokenize basic sentence" {
    var tokens: [64]Token = undefined;
    const n = tokenize("Barack Obama visited Paris.", &tokens);

    const expected = [_][]const u8{ "Barack", "Obama", "visited", "Paris", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text("Barack Obama visited Paris."));
    }
}

test "tokenize contractions" {
    var tokens: [64]Token = undefined;
    const text = "I can't believe it's not butter!";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize currency and punctuation" {
    var tokens: [64]Token = undefined;
    const text = "Apple Inc. is worth $2.5 trillion.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize parentheses" {
    var tokens: [64]Token = undefined;
    const text = "Dr. Smith's office (room 42) is closed.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize hyphenated words" {
    var tokens: [64]Token = undefined;
    const text = "New York-based company";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "New", "York", "-", "based", "company" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize abbreviations" {
    var tokens: [64]Token = undefined;
    const text = "U.S.A. and U.K. are allies.";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "U.S.A.", "and", "U.K.", "are", "allies", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize email" {
    var tokens: [64]Token = undefined;
    const text = "e-mail: test@example.com";
    const n = tokenize(text, &tokens);

    const expected = [_][]const u8{ "e", "-", "mail", ":", "test@example.com" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "matchPrefix" {
    // single chars
    try testing.expectEqual(@as(usize, 1), matchPrefix("$100"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("(hello)"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("\"quote"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("!"));

    // multi-char
    try testing.expectEqual(@as(usize, 3), matchPrefix("US$100"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("C$100"));

    // dots
    try testing.expectEqual(@as(usize, 3), matchPrefix("...hello"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("..hello"));

    // no match
    try testing.expectEqual(@as(usize, 0), matchPrefix("hello"));
    try testing.expectEqual(@as(usize, 0), matchPrefix("123"));
}

test "matchSuffix" {
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello."));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello!"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello)"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello,"));
    try testing.expectEqual(@as(usize, 0), matchSuffix("hello"));
}

test "findInfixes" {
    var infixes: [64]Infix = undefined;

    // hyphen between words
    const n1 = findInfixes("York-based", &infixes);
    try testing.expect(n1 > 0);
    try testing.expectEqual(@as(usize, 4), infixes[0].start);
    try testing.expectEqual(@as(usize, 5), infixes[0].end);
}