this repo has no description
at main 102 lines 3.1 kB view raw
//! NER CLI: reads lines from stdin, runs spacez NER, outputs JSON.

const std = @import("std");
const spacez = @import("spacez");

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var model = try spacez.Model.load(spacez.en_core_web_sm);
    // Suppress "never mutated"; `model` is only passed by pointer below.
    _ = &model;

    // read all of stdin (capped at 1 MiB)
    const input = try std.fs.File.stdin().readToEndAlloc(allocator, 1024 * 1024);
    defer allocator.free(input);

    // collect output into an ArrayList, then flush to stdout in one pass at the end
    var out: std.ArrayList(u8) = .empty;
    defer out.deinit(allocator);
    const w = out.writer(allocator);

    try w.writeAll("[\n");
    var first = true;

    var lines = std.mem.splitScalar(u8, input, '\n');
    while (lines.next()) |line| {
        // skip empty lines (including the one after a trailing newline)
        if (line.len == 0) continue;

        // tokenize
        var tok_buf: [spacez.tokenizer.MAX_TOKENS]spacez.Token = undefined;
        const n_toks = spacez.tokenizeText(line, &tok_buf);

        // NER
        var ent_buf: [64]spacez.SpanEntity = undefined;
        const n = spacez.recognize(&model, line, &ent_buf);

        if (!first) try w.writeAll(",\n");
        first = false;

        try w.writeAll(" {\"text\": ");
        try writeJsonString(w, line);

        // tokens
        try w.writeAll(", \"tokens\": [");
        for (tok_buf[0..n_toks], 0..) |tok, i| {
            if (i > 0) try w.writeAll(", ");
            try writeJsonString(w, tok.text(line));
        }
        try w.writeAll("]");

        try w.writeAll(", \"entities\": [");

        for (ent_buf[0..n], 0..) |ent, i| {
            if (i > 0) try w.writeAll(", ");
            try w.writeAll("{\"text\": ");
            // ent.start / ent.end are byte offsets into `line` (used for slicing);
            // the emitted start/end are codepoint offsets (spaCy convention).
            try writeJsonString(w, line[ent.start..ent.end]);
            try w.print(", \"start\": {d}, \"end\": {d}, \"label\": \"{s}\"}}", .{
                byteToCharOffset(line, ent.start),
                byteToCharOffset(line, ent.end),
                @tagName(ent.label),
            });
        }

        try w.writeAll("]}");
    }

    try w.writeAll("\n]\n");

    // write to stdout, looping on short writes
    const stdout_fd = std.posix.STDOUT_FILENO;
    var written: usize = 0;
    while (written < out.items.len) {
        written += try std.posix.write(stdout_fd, out.items[written..]);
    }
}

/// convert a byte offset in a UTF-8 string to a character (codepoint) offset.
/// this matches spaCy's start_char / end_char convention.
fn byteToCharOffset(text: []const u8, byte_offset: u32) u32 {
    var chars: u32 = 0;
    for (text[0..byte_offset]) |b| {
        // count bytes that are NOT continuation bytes (10xxxxxx)
        if (b & 0xC0 != 0x80) chars += 1;
    }
    return chars;
}

/// write `s` to `w` as a quoted, escaped JSON string literal.
/// RFC 8259 requires escaping ALL control characters U+0000..U+001F, not just
/// \n, \r, \t — the previous version emitted invalid JSON for input containing
/// e.g. 0x00 or 0x0B. Named escapes are used where JSON defines them; the rest
/// of the control range falls back to \u00XX.
fn writeJsonString(w: anytype, s: []const u8) !void {
    try w.writeByte('"');
    for (s) |c| {
        switch (c) {
            '"' => try w.writeAll("\\\""),
            '\\' => try w.writeAll("\\\\"),
            '\n' => try w.writeAll("\\n"),
            '\r' => try w.writeAll("\\r"),
            '\t' => try w.writeAll("\\t"),
            0x08 => try w.writeAll("\\b"),
            0x0C => try w.writeAll("\\f"),
            else => if (c < 0x20)
                // remaining control bytes: generic \u00XX escape
                try w.print("\\u{x:0>4}", .{c})
            else
                try w.writeByte(c),
        }
    }
    try w.writeByte('"');
}