//! NER CLI: reads lines from stdin, runs spacez NER, outputs JSON.

const std = @import("std");
const spacez = @import("spacez");

/// Program entry point.
///
/// Reads all of stdin, splits it on '\n', and for every non-empty line emits
/// one JSON object with the line's text, its tokens, and its recognized
/// entities. Entity `start`/`end` values are converted from byte offsets to
/// codepoint offsets via `byteToCharOffset`. The objects are collected into a
/// single JSON array that is written to stdout once all lines are processed.
///
/// Any allocation, read, or write failure is propagated to the caller.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var model = try spacez.Model.load(spacez.en_core_web_sm);
    // Taking the address keeps `model` declared as `var` without the compiler
    // rejecting it as a never-mutated variable.
    _ = &model;

    // Read all of stdin up front; the read is capped at 1 MiB.
    const input = try std.fs.File.stdin().readToEndAlloc(allocator, 1024 * 1024);
    defer allocator.free(input);

    // Build the complete JSON document in memory before emitting anything.
    var out: std.ArrayList(u8) = .empty;
    defer out.deinit(allocator);
    const w = out.writer(allocator);

    try w.writeAll("[\n");

    // `first` tracks whether a separator is needed before the next object.
    var first = true;
    var lines = std.mem.splitScalar(u8, input, '\n');
    while (lines.next()) |line| {
        if (line.len == 0) continue;

        // Tokenize into a fixed-size stack buffer; `n_toks` entries are valid.
        var tok_buf: [spacez.tokenizer.MAX_TOKENS]spacez.Token = undefined;
        const n_toks = spacez.tokenizeText(line, &tok_buf);

        // Run NER; at most 64 entities per line fit in the buffer.
        var ent_buf: [64]spacez.SpanEntity = undefined;
        const n = spacez.recognize(&model, line, &ent_buf);

        if (!first) try w.writeAll(",\n");
        first = false;

        try w.writeAll(" {\"text\": ");
        try writeJsonString(w, line);

        // Tokens, as a JSON array of strings.
        try w.writeAll(", \"tokens\": [");
        for (tok_buf[0..n_toks], 0..) |tok, i| {
            if (i > 0) try w.writeAll(", ");
            try writeJsonString(w, tok.text(line));
        }
        try w.writeAll("]");

        // Entities, as a JSON array of objects.
        try w.writeAll(", \"entities\": [");
        for (ent_buf[0..n], 0..) |ent, i| {
            if (i > 0) try w.writeAll(", ");
            try w.writeAll("{\"text\": ");
            try writeJsonString(w, line[ent.start..ent.end]);
            try w.print(", \"start\": {d}, \"end\": {d}, \"label\": \"{s}\"}}", .{
                byteToCharOffset(line, ent.start),
                byteToCharOffset(line, ent.end),
                @tagName(ent.label),
            });
        }
        try w.writeAll("]}");
    }

    try w.writeAll("\n]\n");

    // Emit the buffered document through the same std.fs.File API used for
    // stdin; writeAll keeps writing until every byte has been accepted.
    try std.fs.File.stdout().writeAll(out.items);
}

/// Convert a byte offset in a UTF-8 string to a character (codepoint) offset.
/// This matches spaCy's start_char / end_char convention: the result is the
/// number of codepoints that begin before `byte_offset` in `text`.
///
/// `byte_offset` must not exceed `text.len`, because `text[0..byte_offset]`
/// is sliced directly.
fn byteToCharOffset(text: []const u8, byte_offset: u32) u32 {
    var chars: u32 = 0;
    for (text[0..byte_offset]) |b| {
        // In valid UTF-8 each codepoint has exactly one byte that is not a
        // continuation byte (10xxxxxx), so counting those counts codepoints.
        if (b & 0xC0 != 0x80) chars += 1;
    }
    return chars;
}

/// Write `s` to `w` as a double-quoted JSON string literal.
///
/// Escapes `"`, `\`, and every control byte below 0x20, because JSON does not
/// allow raw control characters inside a string. The short forms `\b`, `\f`,
/// `\n`, `\r`, and `\t` are used where JSON defines one; the remaining control
/// bytes are written as `\u00XX`. All other bytes are copied through
/// unchanged, so `s` is expected to already be valid UTF-8.
///
/// `w` may be any writer that provides `writeByte` and `writeAll`; errors from
/// those calls are propagated.
fn writeJsonString(w: anytype, s: []const u8) !void {
    const hex_digits = "0123456789abcdef";

    try w.writeByte('"');
    for (s) |c| {
        switch (c) {
            '"' => try w.writeAll("\\\""),
            '\\' => try w.writeAll("\\\\"),
            '\n' => try w.writeAll("\\n"),
            '\r' => try w.writeAll("\\r"),
            '\t' => try w.writeAll("\\t"),
            0x08 => try w.writeAll("\\b"),
            0x0C => try w.writeAll("\\f"),
            // Control bytes without a short escape: emit \u00XX.
            0x00...0x07, 0x0B, 0x0E...0x1F => {
                try w.writeAll("\\u00");
                try w.writeByte(hex_digits[c >> 4]);
                try w.writeByte(hex_digits[c & 0x0F]);
            },
            else => try w.writeByte(c),
        }
    }
    try w.writeByte('"');
}