Detect which human language a document uses from OCaml, from the Nu Html validator
languages unicode ocaml
at main 305 lines 11 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2007-2016 Mozilla Foundation 3 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 SPDX-License-Identifier: MIT 5 ---------------------------------------------------------------------------*) 6 7(** Language detection library based on n-gram frequency analysis. 8 9 This is an OCaml port of the Cybozu langdetect algorithm. *) 10 11module StringMap = Map.Make (String) 12 13type result = { 14 lang : string; 15 prob : float; 16} 17 18type config = { 19 alpha : float; 20 n_trial : int; 21 max_text_length : int; 22 conv_threshold : float; 23 prob_threshold : float; 24} 25 26let default_config = 27 { 28 alpha = 0.5; 29 n_trial = 7; 30 max_text_length = 10000; 31 conv_threshold = 0.99999; 32 prob_threshold = 0.1; 33 } 34 35let n_gram_max = 3 36let base_freq = 10000 37let iteration_limit = 1000 38let alpha_width = 0.05 39 40type t = { 41 config : config; 42 word_lang_prob : float array StringMap.t; 43 lang_list : string array; 44 mutable seed : int option; 45} 46 47(* Character normalization matching the original Java implementation. 48 This is critical for matching the trained profiles. *) 49let normalize_uchar uchar = 50 let code = Uchar.to_int uchar in 51 (* Basic Latin: only letters pass through *) 52 if code < 128 then 53 let c = Char.chr code in 54 match c with 55 | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c) 56 | _ -> None 57 (* Hangul Syllables (U+AC00-U+D7A3): normalize to '가' (U+AC00) *) 58 else if code >= 0xAC00 && code <= 0xD7A3 then 59 Some "\xEA\xB0\x80" (* UTF-8 for U+AC00 '가' *) 60 (* Hiragana (U+3040-U+309F): normalize to 'あ' (U+3042) *) 61 else if code >= 0x3040 && code <= 0x309F then 62 Some "\xE3\x81\x82" (* UTF-8 for U+3042 'あ' *) 63 (* Katakana (U+30A0-U+30FF): normalize to 'ア' (U+30A2) *) 64 else if code >= 0x30A0 && code <= 0x30FF then 65 Some "\xE3\x82\xA2" (* UTF-8 for U+30A2 'ア' *) 66 (* Bopomofo (U+3100-U+312F) and Extended (U+31A0-U+31BF): normalize to 'ㄅ' (U+3105) *) 67 else if (code >= 0x3100 && code <= 0x312F) || (code >= 0x31A0 && code <= 0x31BF) then 68 Some "\xE3\x84\x85" (* UTF-8 for U+3105 'ㄅ' *) 69 (* General Punctuation (U+2000-U+206F): treat as space/separator *) 70 else if code >= 0x2000 && code <= 0x206F then 71 None 72 (* CJK Unified Ideographs and other scripts: pass through *) 73 else 74 let buf = Buffer.create 4 in 75 Buffer.add_utf_8_uchar buf uchar; 76 Some (Buffer.contents buf) 77 78let extract_ngrams ?(max_len = 10000) text word_lang_prob = 79 let ngrams = ref [] in 80 let char_buffer = Array.make n_gram_max "" in 81 let char_count = ref 0 in 82 let processed = ref 0 in 83 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 84 let rec process () = 85 if !processed >= max_len then () 86 else 87 match Uutf.decode decoder with 88 | `Await | `End -> () 89 | `Malformed _ -> process () 90 | `Uchar uchar -> ( 91 incr processed; 92 match normalize_uchar uchar with 93 | None -> 94 char_buffer.(0) <- ""; 95 char_buffer.(1) <- ""; 96 char_buffer.(2) <- ""; 97 char_count := 0; 98 process () 99 | Some char_str -> 100 char_buffer.(0) <- char_buffer.(1); 101 char_buffer.(1) <- char_buffer.(2); 102 char_buffer.(2) <- char_str; 103 incr char_count; 104 let available = min !char_count n_gram_max in 105 for n = 1 to available do 106 let start_idx = n_gram_max - n in 107 let parts = ref [] in 108 for i = start_idx to n_gram_max - 1 do 109 parts := char_buffer.(i) :: !parts 110 done; 111 let ngram = String.concat "" (List.rev !parts) in 112 if StringMap.mem ngram word_lang_prob then 113 ngrams := ngram :: !ngrams 114 done; 115 process ()) 116 in 117 process (); 118 Array.of_list (List.rev !ngrams) 119 120let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs) 121 122let update_lang_prob prob ngram word_lang_prob alpha = 123 match StringMap.find_opt ngram word_lang_prob with 124 | None -> false 125 | Some lang_prob_map -> 126 let weight = alpha /. float_of_int base_freq in 127 for i = 0 to Array.length prob - 1 do 128 prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i)) 129 done; 130 true 131 132let normalize_prob prob = 133 let sum = Array.fold_left ( +. ) 0.0 prob in 134 if sum <= 0.0 then 0.0 135 else 136 let max_p = ref 0.0 in 137 for i = 0 to Array.length prob - 1 do 138 prob.(i) <- prob.(i) /. sum; 139 if prob.(i) > !max_p then max_p := prob.(i) 140 done; 141 !max_p 142 143(* LCG random number generator using Int32 for WASM compatibility. 144 The constants (1103515245, 12345) are from the C standard library's rand(). 145 We mask with 0x3FFFFFFF (30 bits) to ensure the result fits in OCaml's 146 31-bit int on 32-bit platforms like WASM. *) 147let random_state = ref 12345l 148let set_seed seed = random_state := Int32.of_int seed 149 150let next_random () = 151 (* Use Int32 to handle overflow correctly on 32-bit platforms (WASM) *) 152 let open Int32 in 153 random_state := logand (add (mul !random_state 1103515245l) 12345l) 0x7FFFFFFFl; 154 (* Mask to 30 bits to fit in OCaml's 31-bit int on 32-bit platforms *) 155 to_int (logand !random_state 0x3FFFFFFFl) 156 157let random_int bound = 158 let r = next_random () in 159 (* Ensure positive result even if bound is negative *) 160 abs (r mod bound) 161 162let max_random_float = Int32.to_float 0x3FFFFFFFl 163 164let random_gaussian () = 165 let u1 = float_of_int (next_random ()) /. max_random_float in 166 let u2 = float_of_int (next_random ()) /. max_random_float in 167 let u1 = max 0.0001 u1 in 168 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2) 169 170let detect_block t ngrams = 171 let n_langs = Array.length t.lang_list in 172 if n_langs = 0 || Array.length ngrams = 0 then [||] 173 else 174 let lang_prob = Array.make n_langs 0.0 in 175 set_seed (Option.value t.seed ~default:12345); 176 for _ = 0 to t.config.n_trial - 1 do 177 let prob = init_prob n_langs in 178 let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in 179 let converged = ref false in 180 let iter_count = ref 0 in 181 while (not !converged) && !iter_count < iteration_limit do 182 let r = random_int (Array.length ngrams) in 183 let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in 184 if !iter_count mod 5 = 0 then begin 185 let max_p = normalize_prob prob in 186 if max_p > t.config.conv_threshold then converged := true 187 end; 188 incr iter_count 189 done; 190 for j = 0 to n_langs - 1 do 191 lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. float_of_int t.config.n_trial) 192 done 193 done; 194 lang_prob 195 196(* Create detector from packed profiles with flat data array. 197 ngram_table: global string table mapping indices to n-gram strings 198 profile_data: flat int array of (ngram_index, frequency) pairs 199 profile_offsets: array of (lang_code, start_index, num_pairs) *) 200let create_packed ?(config = default_config) ~ngram_table ~profile_data profile_offsets = 201 let n_langs = Array.length profile_offsets in 202 let lang_list = Array.map (fun (lang, _, _) -> lang) profile_offsets in 203 let all_ngrams = Hashtbl.create 65536 in 204 let lang_totals = Array.make n_langs 0 in 205 Array.iteri 206 (fun lang_idx (_, start_idx, num_pairs) -> 207 for pair_idx = 0 to num_pairs - 1 do 208 let data_idx = start_idx + (pair_idx * 2) in 209 let ngram_idx = profile_data.(data_idx) in 210 let count = profile_data.(data_idx + 1) in 211 let ngram = ngram_table.(ngram_idx) in 212 let current = 213 match Hashtbl.find_opt all_ngrams ngram with 214 | Some arr -> arr 215 | None -> 216 let arr = Array.make n_langs 0 in 217 Hashtbl.add all_ngrams ngram arr; 218 arr 219 in 220 current.(lang_idx) <- count; 221 lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count 222 done) 223 profile_offsets; 224 let word_lang_prob = 225 Hashtbl.fold 226 (fun ngram counts acc -> 227 let probs = Array.make n_langs 0.0 in 228 for i = 0 to n_langs - 1 do 229 if lang_totals.(i) > 0 then 230 probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i) 231 done; 232 StringMap.add ngram probs acc) 233 all_ngrams StringMap.empty 234 in 235 { config; word_lang_prob; lang_list; seed = None } 236 237(* Create detector from legacy list-based profiles. 238 profiles: list of (lang_code, (ngram, frequency) list) *) 239let create ?(config = default_config) profiles = 240 let lang_list = Array.of_list (List.map fst profiles) in 241 let n_langs = Array.length lang_list in 242 let all_ngrams = Hashtbl.create 65536 in 243 let lang_totals = Array.make n_langs 0 in 244 List.iteri 245 (fun lang_idx (_, freq_list) -> 246 List.iter 247 (fun (ngram, count) -> 248 let current = 249 match Hashtbl.find_opt all_ngrams ngram with 250 | Some arr -> arr 251 | None -> 252 let arr = Array.make n_langs 0 in 253 Hashtbl.add all_ngrams ngram arr; 254 arr 255 in 256 current.(lang_idx) <- count; 257 lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count) 258 freq_list) 259 profiles; 260 let word_lang_prob = 261 Hashtbl.fold 262 (fun ngram counts acc -> 263 let probs = Array.make n_langs 0.0 in 264 for i = 0 to n_langs - 1 do 265 if lang_totals.(i) > 0 then 266 probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i) 267 done; 268 StringMap.add ngram probs acc) 269 all_ngrams StringMap.empty 270 in 271 { config; word_lang_prob; lang_list; seed = None } 272 273let set_random_seed t seed = t.seed <- Some seed 274 275let detect t text = 276 let ngrams = 277 extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob 278 in 279 if Array.length ngrams = 0 then [] 280 else 281 let probs = detect_block t ngrams in 282 let results = ref [] in 283 for i = 0 to Array.length probs - 1 do 284 if probs.(i) > t.config.prob_threshold then 285 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results 286 done; 287 List.sort (fun a b -> compare b.prob a.prob) !results 288 289let detect_best t text = 290 match detect t text with 291 | [] -> None 292 | best :: _ -> Some best.lang 293 294let detect_with_prob t text = 295 match detect t text with 296 | [] -> None 297 | best :: _ -> Some (best.lang, best.prob) 298 299let supported_languages t = t.lang_list 300 301let create_default ?config () = 302 create_packed ?config 303 ~ngram_table:Profiles_packed.ngram_table 304 ~profile_data:Profiles_packed.profile_data 305 Profiles_packed.profile_offsets