Detect which human language a document uses from OCaml, from the Nu Html validator
languages
unicode
ocaml
1(*---------------------------------------------------------------------------
2 Copyright (c) 2007-2016 Mozilla Foundation
3 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4 SPDX-License-Identifier: MIT
5 ---------------------------------------------------------------------------*)
6
7(** Language detection library based on n-gram frequency analysis.
8
9 This is an OCaml port of the Cybozu langdetect algorithm. *)
10
11module StringMap = Map.Make (String)
12
13type result = {
14 lang : string;
15 prob : float;
16}
17
18type config = {
19 alpha : float;
20 n_trial : int;
21 max_text_length : int;
22 conv_threshold : float;
23 prob_threshold : float;
24}
25
26let default_config =
27 {
28 alpha = 0.5;
29 n_trial = 7;
30 max_text_length = 10000;
31 conv_threshold = 0.99999;
32 prob_threshold = 0.1;
33 }
34
35let n_gram_max = 3
36let base_freq = 10000
37let iteration_limit = 1000
38let alpha_width = 0.05
39
40type t = {
41 config : config;
42 word_lang_prob : float array StringMap.t;
43 lang_list : string array;
44 mutable seed : int option;
45}
46
47(* Character normalization matching the original Java implementation.
48 This is critical for matching the trained profiles. *)
49let normalize_uchar uchar =
50 let code = Uchar.to_int uchar in
51 (* Basic Latin: only letters pass through *)
52 if code < 128 then
53 let c = Char.chr code in
54 match c with
55 | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c)
56 | _ -> None
57 (* Hangul Syllables (U+AC00-U+D7A3): normalize to '가' (U+AC00) *)
58 else if code >= 0xAC00 && code <= 0xD7A3 then
59 Some "\xEA\xB0\x80" (* UTF-8 for U+AC00 '가' *)
60 (* Hiragana (U+3040-U+309F): normalize to 'あ' (U+3042) *)
61 else if code >= 0x3040 && code <= 0x309F then
62 Some "\xE3\x81\x82" (* UTF-8 for U+3042 'あ' *)
63 (* Katakana (U+30A0-U+30FF): normalize to 'ア' (U+30A2) *)
64 else if code >= 0x30A0 && code <= 0x30FF then
65 Some "\xE3\x82\xA2" (* UTF-8 for U+30A2 'ア' *)
66 (* Bopomofo (U+3100-U+312F) and Extended (U+31A0-U+31BF): normalize to 'ㄅ' (U+3105) *)
67 else if (code >= 0x3100 && code <= 0x312F) || (code >= 0x31A0 && code <= 0x31BF) then
68 Some "\xE3\x84\x85" (* UTF-8 for U+3105 'ㄅ' *)
69 (* General Punctuation (U+2000-U+206F): treat as space/separator *)
70 else if code >= 0x2000 && code <= 0x206F then
71 None
72 (* CJK Unified Ideographs and other scripts: pass through *)
73 else
74 let buf = Buffer.create 4 in
75 Buffer.add_utf_8_uchar buf uchar;
76 Some (Buffer.contents buf)
77
78let extract_ngrams ?(max_len = 10000) text word_lang_prob =
79 let ngrams = ref [] in
80 let char_buffer = Array.make n_gram_max "" in
81 let char_count = ref 0 in
82 let processed = ref 0 in
83 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
84 let rec process () =
85 if !processed >= max_len then ()
86 else
87 match Uutf.decode decoder with
88 | `Await | `End -> ()
89 | `Malformed _ -> process ()
90 | `Uchar uchar -> (
91 incr processed;
92 match normalize_uchar uchar with
93 | None ->
94 char_buffer.(0) <- "";
95 char_buffer.(1) <- "";
96 char_buffer.(2) <- "";
97 char_count := 0;
98 process ()
99 | Some char_str ->
100 char_buffer.(0) <- char_buffer.(1);
101 char_buffer.(1) <- char_buffer.(2);
102 char_buffer.(2) <- char_str;
103 incr char_count;
104 let available = min !char_count n_gram_max in
105 for n = 1 to available do
106 let start_idx = n_gram_max - n in
107 let parts = ref [] in
108 for i = start_idx to n_gram_max - 1 do
109 parts := char_buffer.(i) :: !parts
110 done;
111 let ngram = String.concat "" (List.rev !parts) in
112 if StringMap.mem ngram word_lang_prob then
113 ngrams := ngram :: !ngrams
114 done;
115 process ())
116 in
117 process ();
118 Array.of_list (List.rev !ngrams)
119
120let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs)
121
122let update_lang_prob prob ngram word_lang_prob alpha =
123 match StringMap.find_opt ngram word_lang_prob with
124 | None -> false
125 | Some lang_prob_map ->
126 let weight = alpha /. float_of_int base_freq in
127 for i = 0 to Array.length prob - 1 do
128 prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i))
129 done;
130 true
131
132let normalize_prob prob =
133 let sum = Array.fold_left ( +. ) 0.0 prob in
134 if sum <= 0.0 then 0.0
135 else
136 let max_p = ref 0.0 in
137 for i = 0 to Array.length prob - 1 do
138 prob.(i) <- prob.(i) /. sum;
139 if prob.(i) > !max_p then max_p := prob.(i)
140 done;
141 !max_p
142
143(* LCG random number generator using Int32 for WASM compatibility.
144 The constants (1103515245, 12345) are from the C standard library's rand().
145 We mask with 0x3FFFFFFF (30 bits) to ensure the result fits in OCaml's
146 31-bit int on 32-bit platforms like WASM. *)
147let random_state = ref 12345l
148let set_seed seed = random_state := Int32.of_int seed
149
150let next_random () =
151 (* Use Int32 to handle overflow correctly on 32-bit platforms (WASM) *)
152 let open Int32 in
153 random_state := logand (add (mul !random_state 1103515245l) 12345l) 0x7FFFFFFFl;
154 (* Mask to 30 bits to fit in OCaml's 31-bit int on 32-bit platforms *)
155 to_int (logand !random_state 0x3FFFFFFFl)
156
157let random_int bound =
158 let r = next_random () in
159 (* Ensure positive result even if bound is negative *)
160 abs (r mod bound)
161
162let max_random_float = Int32.to_float 0x3FFFFFFFl
163
164let random_gaussian () =
165 let u1 = float_of_int (next_random ()) /. max_random_float in
166 let u2 = float_of_int (next_random ()) /. max_random_float in
167 let u1 = max 0.0001 u1 in
168 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
169
170let detect_block t ngrams =
171 let n_langs = Array.length t.lang_list in
172 if n_langs = 0 || Array.length ngrams = 0 then [||]
173 else
174 let lang_prob = Array.make n_langs 0.0 in
175 set_seed (Option.value t.seed ~default:12345);
176 for _ = 0 to t.config.n_trial - 1 do
177 let prob = init_prob n_langs in
178 let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in
179 let converged = ref false in
180 let iter_count = ref 0 in
181 while (not !converged) && !iter_count < iteration_limit do
182 let r = random_int (Array.length ngrams) in
183 let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
184 if !iter_count mod 5 = 0 then begin
185 let max_p = normalize_prob prob in
186 if max_p > t.config.conv_threshold then converged := true
187 end;
188 incr iter_count
189 done;
190 for j = 0 to n_langs - 1 do
191 lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. float_of_int t.config.n_trial)
192 done
193 done;
194 lang_prob
195
196(* Create detector from packed profiles with flat data array.
197 ngram_table: global string table mapping indices to n-gram strings
198 profile_data: flat int array of (ngram_index, frequency) pairs
199 profile_offsets: array of (lang_code, start_index, num_pairs) *)
200let create_packed ?(config = default_config) ~ngram_table ~profile_data profile_offsets =
201 let n_langs = Array.length profile_offsets in
202 let lang_list = Array.map (fun (lang, _, _) -> lang) profile_offsets in
203 let all_ngrams = Hashtbl.create 65536 in
204 let lang_totals = Array.make n_langs 0 in
205 Array.iteri
206 (fun lang_idx (_, start_idx, num_pairs) ->
207 for pair_idx = 0 to num_pairs - 1 do
208 let data_idx = start_idx + (pair_idx * 2) in
209 let ngram_idx = profile_data.(data_idx) in
210 let count = profile_data.(data_idx + 1) in
211 let ngram = ngram_table.(ngram_idx) in
212 let current =
213 match Hashtbl.find_opt all_ngrams ngram with
214 | Some arr -> arr
215 | None ->
216 let arr = Array.make n_langs 0 in
217 Hashtbl.add all_ngrams ngram arr;
218 arr
219 in
220 current.(lang_idx) <- count;
221 lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
222 done)
223 profile_offsets;
224 let word_lang_prob =
225 Hashtbl.fold
226 (fun ngram counts acc ->
227 let probs = Array.make n_langs 0.0 in
228 for i = 0 to n_langs - 1 do
229 if lang_totals.(i) > 0 then
230 probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
231 done;
232 StringMap.add ngram probs acc)
233 all_ngrams StringMap.empty
234 in
235 { config; word_lang_prob; lang_list; seed = None }
236
237(* Create detector from legacy list-based profiles.
238 profiles: list of (lang_code, (ngram, frequency) list) *)
239let create ?(config = default_config) profiles =
240 let lang_list = Array.of_list (List.map fst profiles) in
241 let n_langs = Array.length lang_list in
242 let all_ngrams = Hashtbl.create 65536 in
243 let lang_totals = Array.make n_langs 0 in
244 List.iteri
245 (fun lang_idx (_, freq_list) ->
246 List.iter
247 (fun (ngram, count) ->
248 let current =
249 match Hashtbl.find_opt all_ngrams ngram with
250 | Some arr -> arr
251 | None ->
252 let arr = Array.make n_langs 0 in
253 Hashtbl.add all_ngrams ngram arr;
254 arr
255 in
256 current.(lang_idx) <- count;
257 lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count)
258 freq_list)
259 profiles;
260 let word_lang_prob =
261 Hashtbl.fold
262 (fun ngram counts acc ->
263 let probs = Array.make n_langs 0.0 in
264 for i = 0 to n_langs - 1 do
265 if lang_totals.(i) > 0 then
266 probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
267 done;
268 StringMap.add ngram probs acc)
269 all_ngrams StringMap.empty
270 in
271 { config; word_lang_prob; lang_list; seed = None }
272
273let set_random_seed t seed = t.seed <- Some seed
274
275let detect t text =
276 let ngrams =
277 extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob
278 in
279 if Array.length ngrams = 0 then []
280 else
281 let probs = detect_block t ngrams in
282 let results = ref [] in
283 for i = 0 to Array.length probs - 1 do
284 if probs.(i) > t.config.prob_threshold then
285 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
286 done;
287 List.sort (fun a b -> compare b.prob a.prob) !results
288
289let detect_best t text =
290 match detect t text with
291 | [] -> None
292 | best :: _ -> Some best.lang
293
294let detect_with_prob t text =
295 match detect t text with
296 | [] -> None
297 | best :: _ -> Some (best.lang, best.prob)
298
299let supported_languages t = t.lang_list
300
301let create_default ?config () =
302 create_packed ?config
303 ~ngram_table:Profiles_packed.ngram_table
304 ~profile_data:Profiles_packed.profile_data
305 Profiles_packed.profile_offsets