//! Cross-validate the spacez tokenizer against spaCy's expected output.
//! Run with: zig build run-xval

const std = @import("std");
const spacez = @import("spacez");

/// One cross-validation case: an input string and the token texts the
/// tokenizer is expected to produce for it, in order.
const TestCase = struct {
    text: []const u8,
    expected: []const []const u8,
};

/// Inputs paired with their expected token sequences.
const test_cases = [_]TestCase{
    .{ .text = "Barack Obama visited Paris.", .expected = &.{ "Barack", "Obama", "visited", "Paris", "." } },
    .{ .text = "Apple Inc. is worth $2.5 trillion.", .expected = &.{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." } },
    .{ .text = "I can't believe it's not butter!", .expected = &.{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" } },
    .{ .text = "Dr. Smith's office (room 42) is closed.", .expected = &.{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." } },
    .{ .text = "U.S.A. and U.K. are allies.", .expected = &.{ "U.S.A.", "and", "U.K.", "are", "allies", "." } },
    .{ .text = "They're going to the store.", .expected = &.{ "They", "'re", "going", "to", "the", "store", "." } },
    .{ .text = "He said \"hello\" and left.", .expected = &.{ "He", "said", "\"", "hello", "\"", "and", "left", "." } },
    .{ .text = "The cost is $500.00/month.", .expected = &.{ "The", "cost", "is", "$", "500.00", "/", "month", "." } },
    .{ .text = "New York-based company", .expected = &.{ "New", "York", "-", "based", "company" } },
    .{ .text = "e-mail: test@example.com", .expected = &.{ "e", "-", "mail", ":", "test@example.com" } },
    .{ .text = "10,000 people", .expected = &.{ "10,000", "people" } },
    .{ .text = "3.14159 is pi", .expected = &.{ "3.14159", "is", "pi" } },
    .{ .text = "state-of-the-art technology", .expected = &.{ "state", "-", "of", "-", "the", "-", "art", "technology" } },
    .{ .text = "Mr. and Mrs. Jones", .expected = &.{ "Mr.", "and", "Mrs.", "Jones" } },
    .{ .text = "it's 5:30pm", .expected = &.{ "it", "'s", "5:30pm" } },
};

/// Returns true when `actual` holds exactly `expected.len` tokens and the text
/// of each token (obtained via `Token.text(source)`) equals the corresponding
/// expected string.
fn tokensMatch(
    source: []const u8,
    expected: []const []const u8,
    actual: []const spacez.Token,
) bool {
    // The multi-object `for` below requires equal lengths, so check first.
    if (actual.len != expected.len) return false;
    for (expected, actual) |want, tok| {
        if (!std.mem.eql(u8, want, tok.text(source))) return false;
    }
    return true;
}

/// Tokenizes every entry of `test_cases`, prints PASS/FAIL per case (with the
/// expected and actual token lists on failure) followed by a summary line, and
/// exits with status 1 if any case failed.
pub fn main() void {
    const print = std.debug.print;

    var pass: u32 = 0;
    var fail: u32 = 0;

    for (test_cases) |tc| {
        // Scratch buffer filled by the tokenizer; only the first `n` entries
        // are read afterwards.
        var tokens: [1024]spacez.Token = undefined;
        const n = spacez.tokenizeText(tc.text, &tokens);
        const found = tokens[0..n];

        if (tokensMatch(tc.text, tc.expected, found)) {
            pass += 1;
            print("PASS: {s}\n", .{tc.text});
            continue;
        }

        fail += 1;
        print("FAIL: {s}\n", .{tc.text});

        print(" expected ({d}):", .{tc.expected.len});
        for (tc.expected) |want| {
            print(" |{s}|", .{want});
        }

        print("\n got ({d}):", .{n});
        for (found) |tok| {
            print(" |{s}|", .{tok.text(tc.text)});
        }
        print("\n", .{});
    }

    print("\n{d}/{d} passed\n", .{ pass, pass + fail });

    // Non-zero exit status so a build step or CI job notices the failure.
    if (fail > 0) std.process.exit(1);
}