(*---------------------------------------------------------------------------
  Copyright (c) 2007-2016 Mozilla Foundation
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
  SPDX-License-Identifier: MIT
 ---------------------------------------------------------------------------*)

(** Language detection library based on n-gram frequency analysis.

    This is an OCaml port of the Cybozu langdetect algorithm. *)

module StringMap = Map.Make (String)

type result = {
  lang : string;
  prob : float;
}

type config = {
  alpha : float;
  n_trial : int;
  max_text_length : int;
  conv_threshold : float;
  prob_threshold : float;
}

let default_config =
  {
    alpha = 0.5;
    n_trial = 7;
    max_text_length = 10000;
    conv_threshold = 0.99999;
    prob_threshold = 0.1;
  }

let n_gram_max = 3
let base_freq = 10000
let iteration_limit = 1000
let alpha_width = 0.05

type t = {
  config : config;
  word_lang_prob : float array StringMap.t;
  lang_list : string array;
  mutable seed : int option;
}

(* Character normalization matching the original Java implementation.
   This is critical for matching the trained profiles. *)
let normalize_uchar uchar =
  let code = Uchar.to_int uchar in
  (* Basic Latin: only letters pass through *)
  if code < 128 then
    let c = Char.chr code in
    match c with
    | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c)
    | _ -> None
  (* Hangul Syllables (U+AC00-U+D7A3): normalize to '가' (U+AC00) *)
  else if code >= 0xAC00 && code <= 0xD7A3 then
    Some "\xEA\xB0\x80"  (* UTF-8 for U+AC00 '가' *)
  (* Hiragana (U+3040-U+309F): normalize to 'あ' (U+3042) *)
  else if code >= 0x3040 && code <= 0x309F then
    Some "\xE3\x81\x82"  (* UTF-8 for U+3042 'あ' *)
  (* Katakana (U+30A0-U+30FF): normalize to 'ア' (U+30A2) *)
  else if code >= 0x30A0 && code <= 0x30FF then
    Some "\xE3\x82\xA2"  (* UTF-8 for U+30A2 'ア' *)
  (* Bopomofo (U+3100-U+312F) and Extended (U+31A0-U+31BF): normalize to 'ㄅ' (U+3105) *)
  else if (code >= 0x3100 && code <= 0x312F) || (code >= 0x31A0 && code <= 0x31BF) then
    Some "\xE3\x84\x85"  (* UTF-8 for U+3105 'ㄅ' *)
  (* General Punctuation (U+2000-U+206F): treat as space/separator *)
  else if code >= 0x2000 && code <= 0x206F then
    None
  (* CJK Unified Ideographs and other scripts: pass through *)
  else
    let buf = Buffer.create 4 in
    Buffer.add_utf_8_uchar buf uchar;
    Some (Buffer.contents buf)

let extract_ngrams ?(max_len = 10000) text word_lang_prob =
  let ngrams = ref [] in
  let char_buffer = Array.make n_gram_max "" in
  let char_count = ref 0 in
  let processed = ref 0 in
  let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
  let rec process () =
    if !processed >= max_len then ()
    else
      match Uutf.decode decoder with
      | `Await | `End -> ()
      | `Malformed _ -> process ()
      | `Uchar uchar -> (
          incr processed;
          match normalize_uchar uchar with
          | None ->
              char_buffer.(0) <- "";
              char_buffer.(1) <- "";
              char_buffer.(2) <- "";
              char_count := 0;
              process ()
          | Some char_str ->
              char_buffer.(0) <- char_buffer.(1);
              char_buffer.(1) <- char_buffer.(2);
              char_buffer.(2) <- char_str;
              incr char_count;
              let available = min !char_count n_gram_max in
              for n = 1 to available do
                let start_idx = n_gram_max - n in
                let parts = ref [] in
                for i = start_idx to n_gram_max - 1 do
                  parts := char_buffer.(i) :: !parts
                done;
                let ngram = String.concat "" (List.rev !parts) in
                if StringMap.mem ngram word_lang_prob then
                  ngrams := ngram :: !ngrams
              done;
              process ())
  in
  process ();
  Array.of_list (List.rev !ngrams)

let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs)

let update_lang_prob prob ngram word_lang_prob alpha =
  match StringMap.find_opt ngram word_lang_prob with
  | None -> false
  | Some lang_prob_map ->
      let weight = alpha /. float_of_int base_freq in
      for i = 0 to Array.length prob - 1 do
        prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i))
      done;
      true

let normalize_prob prob =
  let sum = Array.fold_left ( +. ) 0.0 prob in
  if sum <= 0.0 then 0.0
  else
    let max_p = ref 0.0 in
    for i = 0 to Array.length prob - 1 do
      prob.(i) <- prob.(i) /. sum;
      if prob.(i) > !max_p then max_p := prob.(i)
    done;
    !max_p

(* LCG random number generator using Int32 for WASM compatibility.
   The constants (1103515245, 12345) are from the C standard library's rand().
   We mask with 0x3FFFFFFF (30 bits) to ensure the result fits in OCaml's
   31-bit int on 32-bit platforms like WASM. *)
let random_state = ref 12345l
let set_seed seed = random_state := Int32.of_int seed

let next_random () =
  (* Use Int32 to handle overflow correctly on 32-bit platforms (WASM) *)
  let open Int32 in
  random_state := logand (add (mul !random_state 1103515245l) 12345l) 0x7FFFFFFFl;
  (* Mask to 30 bits to fit in OCaml's 31-bit int on 32-bit platforms *)
  to_int (logand !random_state 0x3FFFFFFFl)

let random_int bound =
  let r = next_random () in
  (* Ensure positive result even if bound is negative *)
  abs (r mod bound)

let max_random_float = Int32.to_float 0x3FFFFFFFl

let random_gaussian () =
  let u1 = float_of_int (next_random ()) /. max_random_float in
  let u2 = float_of_int (next_random ()) /. max_random_float in
  let u1 = max 0.0001 u1 in
  sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)

let detect_block t ngrams =
  let n_langs = Array.length t.lang_list in
  if n_langs = 0 || Array.length ngrams = 0 then [||]
  else
    let lang_prob = Array.make n_langs 0.0 in
    set_seed (Option.value t.seed ~default:12345);
    for _ = 0 to t.config.n_trial - 1 do
      let prob = init_prob n_langs in
      let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in
      let converged = ref false in
      let iter_count = ref 0 in
      while (not !converged) && !iter_count < iteration_limit do
        let r = random_int (Array.length ngrams) in
        let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
        if !iter_count mod 5 = 0 then begin
          let max_p = normalize_prob prob in
          if max_p > t.config.conv_threshold then converged := true
        end;
        incr iter_count
      done;
      for j = 0 to n_langs - 1 do
        lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. float_of_int t.config.n_trial)
      done
    done;
    lang_prob

(* Create detector from packed profiles with flat data array.
   ngram_table: global string table mapping indices to n-gram strings
   profile_data: flat int array of (ngram_index, frequency) pairs
   profile_offsets: array of (lang_code, start_index, num_pairs) *)
let create_packed ?(config = default_config) ~ngram_table ~profile_data profile_offsets =
  let n_langs = Array.length profile_offsets in
  let lang_list = Array.map (fun (lang, _, _) -> lang) profile_offsets in
  let all_ngrams = Hashtbl.create 65536 in
  let lang_totals = Array.make n_langs 0 in
  Array.iteri
    (fun lang_idx (_, start_idx, num_pairs) ->
      for pair_idx = 0 to num_pairs - 1 do
        let data_idx = start_idx + (pair_idx * 2) in
        let ngram_idx = profile_data.(data_idx) in
        let count = profile_data.(data_idx + 1) in
        let ngram = ngram_table.(ngram_idx) in
        let current =
          match Hashtbl.find_opt all_ngrams ngram with
          | Some arr -> arr
          | None ->
              let arr = Array.make n_langs 0 in
              Hashtbl.add all_ngrams ngram arr;
              arr
        in
        current.(lang_idx) <- count;
        lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
      done)
    profile_offsets;
  let word_lang_prob =
    Hashtbl.fold
      (fun ngram counts acc ->
        let probs = Array.make n_langs 0.0 in
        for i = 0 to n_langs - 1 do
          if lang_totals.(i) > 0 then
            probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
        done;
        StringMap.add ngram probs acc)
      all_ngrams StringMap.empty
  in
  { config; word_lang_prob; lang_list; seed = None }

(* Create detector from legacy list-based profiles.
   profiles: list of (lang_code, (ngram, frequency) list) *)
let create ?(config = default_config) profiles =
  let lang_list = Array.of_list (List.map fst profiles) in
  let n_langs = Array.length lang_list in
  let all_ngrams = Hashtbl.create 65536 in
  let lang_totals = Array.make n_langs 0 in
  List.iteri
    (fun lang_idx (_, freq_list) ->
      List.iter
        (fun (ngram, count) ->
          let current =
            match Hashtbl.find_opt all_ngrams ngram with
            | Some arr -> arr
            | None ->
                let arr = Array.make n_langs 0 in
                Hashtbl.add all_ngrams ngram arr;
                arr
          in
          current.(lang_idx) <- count;
          lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count)
        freq_list)
    profiles;
  let word_lang_prob =
    Hashtbl.fold
      (fun ngram counts acc ->
        let probs = Array.make n_langs 0.0 in
        for i = 0 to n_langs - 1 do
          if lang_totals.(i) > 0 then
            probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
        done;
        StringMap.add ngram probs acc)
      all_ngrams StringMap.empty
  in
  { config; word_lang_prob; lang_list; seed = None }

let set_random_seed t seed = t.seed <- Some seed

let detect t text =
  let ngrams =
    extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob
  in
  if Array.length ngrams = 0 then []
  else
    let probs = detect_block t ngrams in
    let results = ref [] in
    for i = 0 to Array.length probs - 1 do
      if probs.(i) > t.config.prob_threshold then
        results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
    done;
    List.sort (fun a b -> compare b.prob a.prob) !results

let detect_best t text =
  match detect t text with
  | [] -> None
  | best :: _ -> Some best.lang

let detect_with_prob t text =
  match detect t text with
  | [] -> None
  | best :: _ -> Some (best.lang, best.prob)

let supported_languages t = t.lang_list

let create_default ?config () =
  create_packed ?config
    ~ngram_table:Profiles_packed.ngram_table
    ~profile_data:Profiles_packed.profile_data
    Profiles_packed.profile_offsets