//! spacez — named entity recognition in zig.
//!
//! a from-scratch NER inference engine, compatible with spaCy's
//! en_core_web_sm model weights. hash embeddings → CNN → transition
//! parser, all in pure zig with zero dependencies.

pub const hash = @import("hash.zig");
pub const ops = @import("ops.zig");
pub const embed = @import("embed.zig");
pub const parser = @import("parser.zig");
pub const model = @import("model.zig");
pub const tokenizer = @import("tokenizer.zig");
pub const tokenizer_data = @import("tokenizer_data.zig");

// re-export key types at the top level
pub const Model = model.Model;
pub const Entity = parser.Entity;
pub const Label = parser.Label;
pub const Token = tokenizer.Token;
pub const TokenAttrs = embed.TokenAttrs;
pub const hashString = hash.hashString;
pub const extractAttrs = embed.extractAttrs;
pub const computeShape = embed.computeShape;
pub const tokenizeText = tokenizer.tokenize;

/// bundled en_core_web_sm weights (~6MB, embedded at compile time).
pub const en_core_web_sm = @embedFile("weights/en_core_web_sm.bin");

/// a recognized entity with byte offsets into the source text.
pub const SpanEntity = struct {
    start: u32, // byte offset of entity start in source text
    end: u32, // byte offset of entity end (exclusive)
    label: Label,
};
/// run the full NER pipeline: tokenize → embed → CNN encode → parse.
/// returns the number of entities written to entities_out. entities are
/// reported as byte offsets into `text`; results beyond the capacity of
/// `entities_out` are silently dropped.
pub fn recognize(
    m: *const Model,
    text: []const u8,
    entities_out: []SpanEntity,
) u32 {
    // tokenize into a fixed stack buffer (tokenizer caps at MAX_TOKENS)
    var tok_buf: [tokenizer.MAX_TOKENS]Token = undefined;
    const n_toks = tokenizeText(text, &tok_buf);
    if (n_toks == 0) return 0;

    // collect token text slices for the model (capped at model's MAX_TOKENS,
    // which may be smaller than the tokenizer's cap)
    const n: u32 = @min(n_toks, model.MAX_TOKENS);
    var tok_slices: [model.MAX_TOKENS][]const u8 = undefined;
    for (0..n) |i| {
        tok_slices[i] = tok_buf[i].text(text);
    }

    // run model prediction; entities come back as token-index spans
    const state = m.predict(tok_slices[0..n]);
    const ents = state.entities();

    // map token-index entities back to byte offsets in the source text
    var count: u32 = 0;
    for (ents) |e| {
        if (count >= entities_out.len) break;
        // skip spans referencing tokens we never fed the model, and
        // degenerate spans (e.end <= e.start) — besides being invalid,
        // e.end == 0 would make `e.end - 1` underflow u32 below.
        if (e.start >= n or e.end > n or e.end <= e.start) continue;
        entities_out[count] = .{
            .start = tok_buf[e.start].start,
            .end = tok_buf[e.end - 1].end,
            .label = e.label,
        };
        count += 1;
    }
    return count;
}

test {
    // reference all submodules so their tests run under `zig build test`
    _ = hash;
    _ = ops;
    _ = embed;
    _ = parser;
    _ = model;
    _ = tokenizer;
    _ = tokenizer_data;
}