(*--------------------------------------------------------------------------- Copyright (c) 2007-2016 Mozilla Foundation Copyright (c) 2025 Anil Madhavapeddy SPDX-License-Identifier: MIT ---------------------------------------------------------------------------*) (** Language detection library based on n-gram frequency analysis. This is an OCaml port of the Cybozu langdetect algorithm. *) module StringMap = Map.Make (String) type result = { lang : string; prob : float; } type config = { alpha : float; n_trial : int; max_text_length : int; conv_threshold : float; prob_threshold : float; } let default_config = { alpha = 0.5; n_trial = 7; max_text_length = 10000; conv_threshold = 0.99999; prob_threshold = 0.1; } let n_gram_max = 3 let base_freq = 10000 let iteration_limit = 1000 let alpha_width = 0.05 type t = { config : config; word_lang_prob : float array StringMap.t; lang_list : string array; mutable seed : int option; } (* Character normalization matching the original Java implementation. This is critical for matching the trained profiles. *) let normalize_uchar uchar = let code = Uchar.to_int uchar in (* Basic Latin: only letters pass through *) if code < 128 then let c = Char.chr code in match c with | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c) | _ -> None (* Hangul Syllables (U+AC00-U+D7A3): normalize to '가' (U+AC00) *) else if code >= 0xAC00 && code <= 0xD7A3 then Some "\xEA\xB0\x80" (* UTF-8 for U+AC00 '가' *) (* Hiragana (U+3040-U+309F): normalize to 'あ' (U+3042) *) else if code >= 0x3040 && code <= 0x309F then Some "\xE3\x81\x82" (* UTF-8 for U+3042 'あ' *) (* Katakana (U+30A0-U+30FF): normalize to 'ア' (U+30A2) *) else if code >= 0x30A0 && code <= 0x30FF then Some "\xE3\x82\xA2" (* UTF-8 for U+30A2 'ア' *) (* Bopomofo (U+3100-U+312F) and Extended (U+31A0-U+31BF): normalize to 'ㄅ' (U+3105) *) else if (code >= 0x3100 && code <= 0x312F) || (code >= 0x31A0 && code <= 0x31BF) then Some "\xE3\x84\x85" (* UTF-8 for U+3105 'ㄅ' *) (* General Punctuation (U+2000-U+206F): treat as space/separator *) else if code >= 0x2000 && code <= 0x206F then None (* CJK Unified Ideographs and other scripts: pass through *) else let buf = Buffer.create 4 in Buffer.add_utf_8_uchar buf uchar; Some (Buffer.contents buf) let extract_ngrams ?(max_len = 10000) text word_lang_prob = let ngrams = ref [] in let char_buffer = Array.make n_gram_max "" in let char_count = ref 0 in let processed = ref 0 in let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in let rec process () = if !processed >= max_len then () else match Uutf.decode decoder with | `Await | `End -> () | `Malformed _ -> process () | `Uchar uchar -> ( incr processed; match normalize_uchar uchar with | None -> char_buffer.(0) <- ""; char_buffer.(1) <- ""; char_buffer.(2) <- ""; char_count := 0; process () | Some char_str -> char_buffer.(0) <- char_buffer.(1); char_buffer.(1) <- char_buffer.(2); char_buffer.(2) <- char_str; incr char_count; let available = min !char_count n_gram_max in for n = 1 to available do let start_idx = n_gram_max - n in let parts = ref [] in for i = start_idx to n_gram_max - 1 do parts := char_buffer.(i) :: !parts done; let ngram = String.concat "" (List.rev !parts) in if StringMap.mem ngram word_lang_prob then ngrams := ngram :: !ngrams done; process ()) in process (); Array.of_list (List.rev !ngrams) let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs) let update_lang_prob prob ngram word_lang_prob alpha = match StringMap.find_opt ngram word_lang_prob with | None -> false | Some lang_prob_map -> let weight = alpha /. float_of_int base_freq in for i = 0 to Array.length prob - 1 do prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i)) done; true let normalize_prob prob = let sum = Array.fold_left ( +. ) 0.0 prob in if sum <= 0.0 then 0.0 else let max_p = ref 0.0 in for i = 0 to Array.length prob - 1 do prob.(i) <- prob.(i) /. sum; if prob.(i) > !max_p then max_p := prob.(i) done; !max_p (* LCG random number generator using Int32 for WASM compatibility. The constants (1103515245, 12345) are from the C standard library's rand(). We mask with 0x3FFFFFFF (30 bits) to ensure the result fits in OCaml's 31-bit int on 32-bit platforms like WASM. *) let random_state = ref 12345l let set_seed seed = random_state := Int32.of_int seed let next_random () = (* Use Int32 to handle overflow correctly on 32-bit platforms (WASM) *) let open Int32 in random_state := logand (add (mul !random_state 1103515245l) 12345l) 0x7FFFFFFFl; (* Mask to 30 bits to fit in OCaml's 31-bit int on 32-bit platforms *) to_int (logand !random_state 0x3FFFFFFFl) let random_int bound = let r = next_random () in (* Ensure positive result even if bound is negative *) abs (r mod bound) let max_random_float = Int32.to_float 0x3FFFFFFFl let random_gaussian () = let u1 = float_of_int (next_random ()) /. max_random_float in let u2 = float_of_int (next_random ()) /. max_random_float in let u1 = max 0.0001 u1 in sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2) let detect_block t ngrams = let n_langs = Array.length t.lang_list in if n_langs = 0 || Array.length ngrams = 0 then [||] else let lang_prob = Array.make n_langs 0.0 in set_seed (Option.value t.seed ~default:12345); for _ = 0 to t.config.n_trial - 1 do let prob = init_prob n_langs in let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in let converged = ref false in let iter_count = ref 0 in while (not !converged) && !iter_count < iteration_limit do let r = random_int (Array.length ngrams) in let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in if !iter_count mod 5 = 0 then begin let max_p = normalize_prob prob in if max_p > t.config.conv_threshold then converged := true end; incr iter_count done; for j = 0 to n_langs - 1 do lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. float_of_int t.config.n_trial) done done; lang_prob (* Create detector from packed profiles with flat data array. ngram_table: global string table mapping indices to n-gram strings profile_data: flat int array of (ngram_index, frequency) pairs profile_offsets: array of (lang_code, start_index, num_pairs) *) let create_packed ?(config = default_config) ~ngram_table ~profile_data profile_offsets = let n_langs = Array.length profile_offsets in let lang_list = Array.map (fun (lang, _, _) -> lang) profile_offsets in let all_ngrams = Hashtbl.create 65536 in let lang_totals = Array.make n_langs 0 in Array.iteri (fun lang_idx (_, start_idx, num_pairs) -> for pair_idx = 0 to num_pairs - 1 do let data_idx = start_idx + (pair_idx * 2) in let ngram_idx = profile_data.(data_idx) in let count = profile_data.(data_idx + 1) in let ngram = ngram_table.(ngram_idx) in let current = match Hashtbl.find_opt all_ngrams ngram with | Some arr -> arr | None -> let arr = Array.make n_langs 0 in Hashtbl.add all_ngrams ngram arr; arr in current.(lang_idx) <- count; lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count done) profile_offsets; let word_lang_prob = Hashtbl.fold (fun ngram counts acc -> let probs = Array.make n_langs 0.0 in for i = 0 to n_langs - 1 do if lang_totals.(i) > 0 then probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i) done; StringMap.add ngram probs acc) all_ngrams StringMap.empty in { config; word_lang_prob; lang_list; seed = None } (* Create detector from legacy list-based profiles. profiles: list of (lang_code, (ngram, frequency) list) *) let create ?(config = default_config) profiles = let lang_list = Array.of_list (List.map fst profiles) in let n_langs = Array.length lang_list in let all_ngrams = Hashtbl.create 65536 in let lang_totals = Array.make n_langs 0 in List.iteri (fun lang_idx (_, freq_list) -> List.iter (fun (ngram, count) -> let current = match Hashtbl.find_opt all_ngrams ngram with | Some arr -> arr | None -> let arr = Array.make n_langs 0 in Hashtbl.add all_ngrams ngram arr; arr in current.(lang_idx) <- count; lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count) freq_list) profiles; let word_lang_prob = Hashtbl.fold (fun ngram counts acc -> let probs = Array.make n_langs 0.0 in for i = 0 to n_langs - 1 do if lang_totals.(i) > 0 then probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i) done; StringMap.add ngram probs acc) all_ngrams StringMap.empty in { config; word_lang_prob; lang_list; seed = None } let set_random_seed t seed = t.seed <- Some seed let detect t text = let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in if Array.length ngrams = 0 then [] else let probs = detect_block t ngrams in let results = ref [] in for i = 0 to Array.length probs - 1 do if probs.(i) > t.config.prob_threshold then results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results done; List.sort (fun a b -> compare b.prob a.prob) !results let detect_best t text = match detect t text with | [] -> None | best :: _ -> Some best.lang let detect_with_prob t text = match detect t text with | [] -> None | best :: _ -> Some (best.lang, best.prob) let supported_languages t = t.lang_list let create_default ?config () = create_packed ?config ~ngram_table:Profiles_packed.ngram_table ~profile_data:Profiles_packed.profile_data Profiles_packed.profile_offsets