this repo has no description
at main 102 lines 3.1 kB view raw
//! NER CLI: reads lines from stdin, runs spacez NER, outputs JSON.

const std = @import("std");
const spacez = @import("spacez");

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var model = try spacez.Model.load(spacez.en_core_web_sm);
    // Suppress "never mutated"; `model` is only passed by pointer below.
    _ = &model;

    // read all of stdin (capped at 1 MiB)
    const input = try std.fs.File.stdin().readToEndAlloc(allocator, 1024 * 1024);
    defer allocator.free(input);

    // collect output into an ArrayList, then flush to stdout in one pass at the end
    var out: std.ArrayList(u8) = .empty;
    defer out.deinit(allocator);
    const w = out.writer(allocator);

    try w.writeAll("[\n");
    var first = true;

    var lines = std.mem.splitScalar(u8, input, '\n');
    while (lines.next()) |line| {
        // skip empty lines (including the one after a trailing newline)
        if (line.len == 0) continue;

        // tokenize
        var tok_buf: [spacez.tokenizer.MAX_TOKENS]spacez.Token = undefined;
        const n_toks = spacez.tokenizeText(line, &tok_buf);

        // NER
        var ent_buf: [64]spacez.SpanEntity = undefined;
        const n = spacez.recognize(&model, line, &ent_buf);

        if (!first) try w.writeAll(",\n");
        first = false;

        try w.writeAll(" {\"text\": ");
        try writeJsonString(w, line);

        // tokens
        try w.writeAll(", \"tokens\": [");
        for (tok_buf[0..n_toks], 0..) |tok, i| {
            if (i > 0) try w.writeAll(", ");
            try writeJsonString(w, tok.text(line));
        }
        try w.writeAll("]");

        try w.writeAll(", \"entities\": [");

        for (ent_buf[0..n], 0..) |ent, i| {
            if (i > 0) try w.writeAll(", ");
            try w.writeAll("{\"text\": ");
            // ent.start / ent.end are byte offsets into `line` (used for slicing);
            // the emitted start/end are codepoint offsets (spaCy convention).
            try writeJsonString(w, line[ent.start..ent.end]);
            try w.print(", \"start\": {d}, \"end\": {d}, \"label\": \"{s}\"}}", .{
                byteToCharOffset(line, ent.start),
                byteToCharOffset(line, ent.end),
                @tagName(ent.label),
            });
        }

        try w.writeAll("]}");
    }

    try w.writeAll("\n]\n");

    // write to stdout, looping on short writes
    const stdout_fd = std.posix.STDOUT_FILENO;
    var written: usize = 0;
    while (written < out.items.len) {
        written += try std.posix.write(stdout_fd, out.items[written..]);
    }
}

/// convert a byte offset in a UTF-8 string to a character (codepoint) offset.
/// this matches spaCy's start_char / end_char convention.
fn byteToCharOffset(text: []const u8, byte_offset: u32) u32 {
    var chars: u32 = 0;
    for (text[0..byte_offset]) |b| {
        // count bytes that are NOT continuation bytes (10xxxxxx)
        if (b & 0xC0 != 0x80) chars += 1;
    }
    return chars;
}

/// write `s` to `w` as a quoted, escaped JSON string literal.
/// RFC 8259 requires escaping ALL control characters U+0000..U+001F, not just
/// \n, \r, \t — the previous version emitted invalid JSON for input containing
/// e.g. 0x00 or 0x0B. Named escapes are used where JSON defines them; the rest
/// of the control range falls back to \u00XX.
fn writeJsonString(w: anytype, s: []const u8) !void {
    try w.writeByte('"');
    for (s) |c| {
        switch (c) {
            '"' => try w.writeAll("\\\""),
            '\\' => try w.writeAll("\\\\"),
            '\n' => try w.writeAll("\\n"),
            '\r' => try w.writeAll("\\r"),
            '\t' => try w.writeAll("\\t"),
            0x08 => try w.writeAll("\\b"),
            0x0C => try w.writeAll("\\f"),
            else => if (c < 0x20)
                // remaining control bytes: generic \u00XX escape
                try w.print("\\u{x:0>4}", .{c})
            else
                try w.writeByte(c),
        }
    }
    try w.writeByte('"');
}