this repo has no description
1//! cross-validate spacez tokenizer against spaCy expected output.
2//! run with: zig build run-xval
3
4const std = @import("std");
5const spacez = @import("spacez");
6
/// One cross-validation fixture: an input string paired with the token texts
/// the tokenizer is expected to produce for it.
const TestCase = struct {
    /// Raw input handed to the tokenizer.
    text: []const u8,
    /// Expected token texts, in the order they should be emitted.
    expected: []const []const u8,
};
11
/// Fixture table consumed by `main`: each entry pairs an input string with the
/// exact token texts the tokenizer output is compared against.
const test_cases = [_]TestCase{
    .{
        .text = "Barack Obama visited Paris.",
        .expected = &.{ "Barack", "Obama", "visited", "Paris", "." },
    },
    .{
        .text = "Apple Inc. is worth $2.5 trillion.",
        .expected = &.{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." },
    },
    .{
        .text = "I can't believe it's not butter!",
        .expected = &.{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" },
    },
    .{
        .text = "Dr. Smith's office (room 42) is closed.",
        .expected = &.{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." },
    },
    .{
        .text = "U.S.A. and U.K. are allies.",
        .expected = &.{ "U.S.A.", "and", "U.K.", "are", "allies", "." },
    },
    .{
        .text = "They're going to the store.",
        .expected = &.{ "They", "'re", "going", "to", "the", "store", "." },
    },
    .{
        .text = "He said \"hello\" and left.",
        .expected = &.{ "He", "said", "\"", "hello", "\"", "and", "left", "." },
    },
    .{
        .text = "The cost is $500.00/month.",
        .expected = &.{ "The", "cost", "is", "$", "500.00", "/", "month", "." },
    },
    .{
        .text = "New York-based company",
        .expected = &.{ "New", "York", "-", "based", "company" },
    },
    .{
        .text = "e-mail: test@example.com",
        .expected = &.{ "e", "-", "mail", ":", "test@example.com" },
    },
    .{
        .text = "10,000 people",
        .expected = &.{ "10,000", "people" },
    },
    .{
        .text = "3.14159 is pi",
        .expected = &.{ "3.14159", "is", "pi" },
    },
    .{
        .text = "state-of-the-art technology",
        .expected = &.{ "state", "-", "of", "-", "the", "-", "art", "technology" },
    },
    .{
        .text = "Mr. and Mrs. Jones",
        .expected = &.{ "Mr.", "and", "Mrs.", "Jones" },
    },
    .{
        .text = "it's 5:30pm",
        .expected = &.{ "it", "'s", "5:30pm" },
    },
};
29
/// Runs every fixture in `test_cases` through `spacez.tokenizeText`, prints a
/// PASS/FAIL line per fixture (plus expected vs. actual tokens on failure),
/// then prints a summary and exits with status 1 if any fixture failed.
pub fn main() void {
    var passed: u32 = 0;
    var failed: u32 = 0;

    for (test_cases) |tc| {
        // Scratch buffer the tokenizer fills; only the first `n` entries are read.
        var tokens: [1024]spacez.Token = undefined;
        const n = spacez.tokenizeText(tc.text, &tokens);

        // A fixture matches only when the token count agrees and every token
        // text is byte-equal to its expected counterpart.
        const matched = blk: {
            if (n != tc.expected.len) break :blk false;
            for (tc.expected, 0..) |want, idx| {
                if (!std.mem.eql(u8, want, tokens[idx].text(tc.text))) break :blk false;
            }
            break :blk true;
        };

        if (matched) {
            passed += 1;
            std.debug.print("PASS: {s}\n", .{tc.text});
            continue;
        }

        // Mismatch: show both token sequences, each token delimited by pipes.
        failed += 1;
        std.debug.print("FAIL: {s}\n", .{tc.text});
        std.debug.print(" expected ({d}):", .{tc.expected.len});
        for (tc.expected) |want| {
            std.debug.print(" |{s}|", .{want});
        }
        std.debug.print("\n got ({d}):", .{n});
        var idx: u32 = 0;
        while (idx < n) : (idx += 1) {
            std.debug.print(" |{s}|", .{tokens[idx].text(tc.text)});
        }
        std.debug.print("\n", .{});
    }

    std.debug.print("\n{d}/{d} passed\n", .{ passed, passed + failed });
    if (failed != 0) std.process.exit(1);
}