this repo has no description
at main 72 lines 3.1 kB view raw
//! Cross-validation harness: feeds a fixed set of sentences to the spacez
//! tokenizer and compares the result against the tokens expected from spaCy.
//! Run with: zig build run-xval

const std = @import("std");
const spacez = @import("spacez");

/// A single cross-validation sample.
const TestCase = struct {
    /// Raw input handed to the tokenizer.
    text: []const u8,
    /// Token strings the tokenizer is expected to produce, in order.
    expected: []const []const u8,
};

/// Sentences and their expected tokenization.
const test_cases = [_]TestCase{
    .{ .text = "Barack Obama visited Paris.", .expected = &.{ "Barack", "Obama", "visited", "Paris", "." } },
    .{ .text = "Apple Inc. is worth $2.5 trillion.", .expected = &.{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." } },
    .{ .text = "I can't believe it's not butter!", .expected = &.{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" } },
    .{ .text = "Dr. Smith's office (room 42) is closed.", .expected = &.{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." } },
    .{ .text = "U.S.A. and U.K. are allies.", .expected = &.{ "U.S.A.", "and", "U.K.", "are", "allies", "." } },
    .{ .text = "They're going to the store.", .expected = &.{ "They", "'re", "going", "to", "the", "store", "." } },
    .{ .text = "He said \"hello\" and left.", .expected = &.{ "He", "said", "\"", "hello", "\"", "and", "left", "." } },
    .{ .text = "The cost is $500.00/month.", .expected = &.{ "The", "cost", "is", "$", "500.00", "/", "month", "." } },
    .{ .text = "New York-based company", .expected = &.{ "New", "York", "-", "based", "company" } },
    .{ .text = "e-mail: test@example.com", .expected = &.{ "e", "-", "mail", ":", "test@example.com" } },
    .{ .text = "10,000 people", .expected = &.{ "10,000", "people" } },
    .{ .text = "3.14159 is pi", .expected = &.{ "3.14159", "is", "pi" } },
    .{ .text = "state-of-the-art technology", .expected = &.{ "state", "-", "of", "-", "the", "-", "art", "technology" } },
    .{ .text = "Mr. and Mrs. Jones", .expected = &.{ "Mr.", "and", "Mrs.", "Jones" } },
    .{ .text = "it's 5:30pm", .expected = &.{ "it", "'s", "5:30pm" } },
};

/// Tokenizes every entry of `test_cases`, prints a PASS/FAIL line for each
/// (with expected and actual tokens on failure), then prints a summary.
/// Exits the process with status 1 if at least one case failed.
pub fn main() void {
    var passed: u32 = 0;
    var failed: u32 = 0;

    for (test_cases) |case| {
        var tokens: [1024]spacez.Token = undefined;
        const count = spacez.tokenizeText(case.text, &tokens);

        // A case matches only when the token count agrees and every token's
        // text equals the expected string at the same position.
        const matched = blk: {
            if (count != case.expected.len) break :blk false;
            for (case.expected, 0..) |want, idx| {
                if (!std.mem.eql(u8, want, tokens[idx].text(case.text))) break :blk false;
            }
            break :blk true;
        };

        if (matched) {
            passed += 1;
            std.debug.print("PASS: {s}\n", .{case.text});
            continue;
        }

        // Mismatch: show both token sequences so they can be compared by eye.
        failed += 1;
        std.debug.print("FAIL: {s}\n", .{case.text});

        std.debug.print(" expected ({d}):", .{case.expected.len});
        for (case.expected) |want| std.debug.print(" |{s}|", .{want});

        std.debug.print("\n got ({d}):", .{count});
        var idx: u32 = 0;
        while (idx < count) : (idx += 1) {
            std.debug.print(" |{s}|", .{tokens[idx].text(case.text)});
        }
        std.debug.print("\n", .{});
    }

    std.debug.print("\n{d}/{d} passed\n", .{ passed, passed + failed });
    if (failed > 0) std.process.exit(1);
}