//! NER CLI: reads lines from stdin, runs spacez NER, outputs JSON.

const std = @import("std");
const spacez = @import("spacez");

/// Entry point: reads all of stdin, runs tokenization + NER on each
/// non-empty line, and prints one JSON array of result objects to stdout.
pub fn main() !void {
    // Debug-mode allocator; deinit() reports leaks (status intentionally discarded).
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // `var` because recognize() takes a mutable pointer to the model below.
    var model = try spacez.Model.load(spacez.en_core_web_sm);

    // Read all of stdin up front (capped at 1 MiB).
    const input = try std.fs.File.stdin().readToEndAlloc(allocator, 1024 * 1024);
    defer allocator.free(input);

    // Buffer the whole JSON document in memory, then emit it in one write.
    var out: std.ArrayList(u8) = .empty;
    defer out.deinit(allocator);
    const w = out.writer(allocator);

    try w.writeAll("[\n");
    var first = true;

    var lines = std.mem.splitScalar(u8, input, '\n');
    while (lines.next()) |line| {
        // Skip blank lines (including the empty tail after a trailing newline).
        if (line.len == 0) continue;

        // tokenize
        var tok_buf: [spacez.tokenizer.MAX_TOKENS]spacez.Token = undefined;
        const n_toks = spacez.tokenizeText(line, &tok_buf);

        // NER
        var ent_buf: [64]spacez.SpanEntity = undefined;
        const n = spacez.recognize(&model, line, &ent_buf);

        // Comma-separate objects after the first one.
        if (!first) try w.writeAll(",\n");
        first = false;

        try w.writeAll(" {\"text\": ");
        try writeJsonString(w, line);

        // tokens
        try w.writeAll(", \"tokens\": [");
        for (tok_buf[0..n_toks], 0..) |tok, i| {
            if (i > 0) try w.writeAll(", ");
            try writeJsonString(w, tok.text(line));
        }
        try w.writeAll("]");

        try w.writeAll(", \"entities\": [");

        // ent.start/ent.end are byte offsets into `line`; convert to codepoint
        // offsets so the output matches spaCy's start_char/end_char convention.
        for (ent_buf[0..n], 0..) |ent, i| {
            if (i > 0) try w.writeAll(", ");
            try w.writeAll("{\"text\": ");
            try writeJsonString(w, line[ent.start..ent.end]);
            try w.print(", \"start\": {d}, \"end\": {d}, \"label\": \"{s}\"}}", .{
                byteToCharOffset(line, ent.start),
                byteToCharOffset(line, ent.end),
                @tagName(ent.label),
            });
        }

        try w.writeAll("]}");
    }

    try w.writeAll("\n]\n");

    // writeAll already loops over short writes, so no manual posix loop needed.
    try std.fs.File.stdout().writeAll(out.items);
}
/// Convert a byte offset within a UTF-8 string into a character
/// (codepoint) offset, matching spaCy's start_char / end_char convention.
fn byteToCharOffset(text: []const u8, byte_offset: u32) u32 {
    // A codepoint begins at every byte that is NOT a UTF-8 continuation
    // byte (0b10xxxxxx), so counting those yields the codepoint index.
    var count: u32 = 0;
    var i: usize = 0;
    while (i < byte_offset) : (i += 1) {
        if (text[i] & 0xC0 != 0x80) count += 1;
    }
    return count;
}
/// Write `s` to `w` as a JSON string literal, surrounding quotes included.
/// Escapes quotes, backslashes, and ALL control characters (U+0000..U+001F)
/// as required by RFC 8259; all other bytes are passed through verbatim.
fn writeJsonString(w: anytype, s: []const u8) !void {
    try w.writeByte('"');
    for (s) |c| {
        switch (c) {
            '"' => try w.writeAll("\\\""),
            '\\' => try w.writeAll("\\\\"),
            '\n' => try w.writeAll("\\n"),
            '\r' => try w.writeAll("\\r"),
            '\t' => try w.writeAll("\\t"),
            // JSON forbids raw control characters; previously these were
            // emitted verbatim, producing invalid JSON. Use \u00XX form.
            // (Ranges skip 0x09/0x0A/0x0D, which have short escapes above.)
            0x00...0x08, 0x0B, 0x0C, 0x0E...0x1F => try w.print("\\u{x:0>4}", .{c}),
            else => try w.writeByte(c),
        }
    }
    try w.writeByte('"');
}