Detect which human language a document uses from OCaml, from the Nu Html validator
languages
unicode
ocaml
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Browser-based test runner for langdetect.
7
8 This module runs regression tests in the browser and displays results
9 in the DOM. It demonstrates language detection across multiple languages. *)
10
11open Brr
12
13(** Test case definition *)
14type test_case = {
15 name : string;
16 text : string;
17 expected : string;
18}
19
20(** Test results *)
21type test_result = {
22 test : test_case;
23 detected : string option;
24 prob : float option;
25 passed : bool;
26 time_ms : float;
27}
28
29(** Sample texts from the native test corpus *)
30let test_cases = [|
31 (* Same corpus as test/test_langdetect.ml *)
32 { name = "English"; text = "The quick brown fox jumps over the lazy dog. This is a sample of English text that should be detected correctly by the language detection algorithm. Language detection uses n-gram frequency analysis to determine the most likely language of a given text sample."; expected = "en" };
33 { name = "Chinese"; text = "看官,現今我們中國四萬萬同胞欲內免專制、外杜瓜分的一個絕大轉機、絕大遭際,不是那預備立憲一事麼?但那立憲上加了這麼預備兩個字的活動考語,我就深恐將來這瘟憲立不成,必定嫁禍到我們同胞程度不齊上,以為卸罪地步。"; expected = "zh" };
34 { name = "Hebrew"; text = "זוהי דוגמה לטקסט בעברית שנועד לבדיקת זיהוי שפה. עברית היא שפה שמית שנכתבת מימין לשמאל. המערכת צריכה לזהות אותה כראוי על סמך התדירות של אותיות ותבניות אופייניות."; expected = "he" };
35 { name = "German"; text = "Dies ist ein Beispieltext auf Deutsch, der zur Spracherkennung verwendet wird. Die deutsche Sprache hat viele charakteristische Merkmale wie Umlaute und zusammengesetzte Wörter, die die Erkennung erleichtern."; expected = "de" };
36 { name = "French"; text = "Ceci est un exemple de texte en français pour tester la détection de langue. Le français est une langue romane avec des caractéristiques distinctives comme les accents et les conjugaisons verbales."; expected = "fr" };
37 { name = "Japanese"; text = "これは日本語のテキストです。日本語の言語検出をテストするためのサンプルです。日本語には漢字、ひらがな、カタカナの三種類の文字が使われています。"; expected = "ja" };
38 { name = "Russian"; text = "Это пример текста на русском языке для тестирования определения языка. Русский язык использует кириллический алфавит и имеет сложную грамматику с падежами и склонениями."; expected = "ru" };
39 { name = "Spanish"; text = "Este es un ejemplo de texto en español para probar la detección de idiomas. El español es una lengua romance hablada por millones de personas en todo el mundo."; expected = "es" };
40 { name = "Arabic"; text = "هذا مثال على نص باللغة العربية لاختبار اكتشاف اللغة. اللغة العربية هي لغة سامية تكتب من اليمين إلى اليسار."; expected = "ar" };
41 { name = "Korean"; text = "이것은 언어 감지를 테스트하기 위한 한국어 텍스트 예시입니다. 한국어는 한글이라는 독특한 문자 체계를 사용합니다."; expected = "ko" };
42 { name = "Portuguese"; text = "Este é um exemplo de texto em português para testar a detecção de idiomas. O português é uma língua românica falada em Portugal, Brasil e outros países."; expected = "pt" };
43 { name = "Italian"; text = "Questo è un esempio di testo in italiano per testare il rilevamento della lingua. L'italiano è una lingua romanza con una ricca storia letteraria."; expected = "it" };
44 { name = "Dutch"; text = "Dit is een voorbeeld van Nederlandse tekst voor het testen van taaldetectie. Nederlands wordt gesproken in Nederland en België en heeft veel overeenkomsten met Duits en Engels."; expected = "nl" };
45 { name = "Polish"; text = "To jest przykładowy tekst w języku polskim do testowania wykrywania języka. Polski jest językiem słowiańskim z bogatą historią literacką i skomplikowaną gramatyką."; expected = "pl" };
46 { name = "Turkish"; text = "Bu, dil algılama testleri için Türkçe örnek bir metindir. Türkçe, agglutinative bir dil yapısına sahip ve Latin alfabesi kullanmaktadır. Özel karakterler içerir."; expected = "tr" };
47 { name = "Swedish"; text = "Detta är en exempeltext på svenska för att testa språkdetektering. Svenska är ett nordiskt språk som talas i Sverige och Finland med karakteristiska vokaler."; expected = "sv" };
48 { name = "Vietnamese"; text = "Đây là một văn bản mẫu bằng tiếng Việt để kiểm tra phát hiện ngôn ngữ. Tiếng Việt sử dụng bảng chữ cái Latin với nhiều dấu thanh điệu đặc biệt."; expected = "vi" };
49 { name = "Thai"; text = "นี่คือข้อความตัวอย่างภาษาไทยสำหรับทดสอบการตรวจจับภาษา ภาษาไทยใช้อักษรไทย และมีระบบวรรณยุกต์ที่ซับซ้อน"; expected = "th" };
50 { name = "Hindi"; text = "यह भाषा पहचान परीक्षण के लिए हिंदी में एक नमूना पाठ है। हिंदी देवनागरी लिपि का उपयोग करती है और भारत की आधिकारिक भाषाओं में से एक है।"; expected = "hi" };
51 { name = "Finnish"; text = "Tämä on suomenkielinen esimerkkiteksti kielentunnistuksen testaamiseksi. Suomi on suomalais-ugrilainen kieli, jolla on monimutkainen taivutusjärjestelmä."; expected = "fi" };
52|]
53
54(** Get current time in milliseconds *)
55let now_ms () =
56 Jv.to_float (Jv.call (Jv.get Jv.global "performance") "now" [||])
57
58(** Run a single test *)
59let run_test detector test =
60 (* Set deterministic seed before EACH test, like native tests do *)
61 Langdetect.set_random_seed detector 42;
62 let start = now_ms () in
63 let result = Langdetect.detect_with_prob detector test.text in
64 let time_ms = now_ms () -. start in
65 let detected, prob = match result with
66 | Some (lang, p) -> Some lang, Some p
67 | None -> None, None
68 in
69 (* Handle special case: zh matching zh-cn/zh-tw *)
70 let lang_matches expected detected =
71 if expected = "zh" then
72 String.length detected >= 2 && String.sub detected 0 2 = "zh"
73 else
74 expected = detected
75 in
76 let passed = match detected with
77 | Some lang -> lang_matches test.expected lang
78 | None -> false
79 in
80 { test; detected; prob; passed; time_ms }
81
82(** Shared detector instance - created lazily on first use *)
83let shared_detector = lazy (Langdetect.create_default ())
84
85(** Run all tests and return results *)
86let run_all_tests () =
87 let detector = Lazy.force shared_detector in
88 Array.map (run_test detector) test_cases
89
90(** Create a result row element *)
91let create_result_row result =
92 let status_class = if result.passed then "pass" else "fail" in
93 let status_text = if result.passed then "✓" else "✗" in
94 let detected_text = match result.detected with
95 | Some lang -> lang
96 | None -> "(none)"
97 in
98 let prob_text = match result.prob with
99 | Some p -> Printf.sprintf "%.1f%%" (p *. 100.0)
100 | None -> "-"
101 in
102 let time_text = Printf.sprintf "%.1fms" result.time_ms in
103 (* Truncate long text for display *)
104 let display_text =
105 let t = result.test.text in
106 if String.length t > 60 then String.sub t 0 57 ^ "..." else t
107 in
108
109 let tr = El.tr [] in
110 El.set_children tr [
111 El.td [El.txt' result.test.name];
112 El.td ~at:[At.class' (Jstr.v "corpus-text")] [El.txt' display_text];
113 El.td ~at:[At.class' (Jstr.v "code")] [El.txt' result.test.expected];
114 El.td ~at:[At.class' (Jstr.v "code")] [El.txt' detected_text];
115 El.td [El.txt' prob_text];
116 El.td [El.txt' time_text];
117 El.td ~at:[At.class' (Jstr.v status_class)] [El.txt' status_text];
118 ];
119 tr
120
121(** Create summary stats *)
122let create_summary results =
123 let total = Array.length results in
124 let passed = Array.fold_left (fun acc r -> if r.passed then acc + 1 else acc) 0 results in
125 let failed = total - passed in
126 let total_time = Array.fold_left (fun acc r -> acc +. r.time_ms) 0.0 results in
127 let avg_time = total_time /. float_of_int total in
128
129 El.div ~at:[At.class' (Jstr.v "summary")] [
130 El.h2 [El.txt' "Test Results"];
131 El.p [
132 El.strong [El.txt' (Printf.sprintf "%d/%d tests passed" passed total)];
133 El.txt' (Printf.sprintf " (%d failed)" failed);
134 ];
135 El.p [
136 El.txt' (Printf.sprintf "Total time: %.1fms (avg %.2fms per test)" total_time avg_time);
137 ];
138 ]
139
140(** Console error logging *)
141let console_error msg =
142 ignore (Jv.call (Jv.get Jv.global "console") "error" [| Jv.of_string msg |])
143
144let console_log msg =
145 ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |])
146
147(** Main test runner *)
148let run_tests_ui () =
149 console_log "[langdetect-tests] Starting test UI...";
150 try
151 (* Find or create output container *)
152 let container = match El.find_first_by_selector (Jstr.v "#test-results") ~root:(Document.body G.document) with
153 | Some el ->
154 console_log "[langdetect-tests] Found #test-results container";
155 el
156 | None ->
157 console_log "[langdetect-tests] Creating #test-results container";
158 let el = El.div ~at:[At.id (Jstr.v "test-results")] [] in
159 El.append_children (Document.body G.document) [el];
160 el
161 in
162
163 (* Show loading message *)
164 El.set_children container [
165 El.p [El.txt' "Running tests..."]
166 ];
167 console_log "[langdetect-tests] Set loading message, scheduling test run...";
168
169 (* Run tests using JavaScript setTimeout *)
170 let run_tests_callback () =
171 console_log "[langdetect-tests] Callback executing...";
172 try
173 console_log "[langdetect-tests] Running tests...";
174 let results = run_all_tests () in
175 console_log (Printf.sprintf "[langdetect-tests] Tests complete: %d results" (Array.length results));
176
177 (* Build results table *)
178 let thead = El.thead [
179 El.tr [
180 El.th [El.txt' "Language"];
181 El.th [El.txt' "Sample Text"];
182 El.th [El.txt' "Expected"];
183 El.th [El.txt' "Detected"];
184 El.th [El.txt' "Confidence"];
185 El.th [El.txt' "Time"];
186 El.th [El.txt' "Status"];
187 ]
188 ] in
189
190 let tbody = El.tbody [] in
191 Array.iter (fun result ->
192 El.append_children tbody [create_result_row result]
193 ) results;
194
195 let table = El.table ~at:[At.class' (Jstr.v "results-table")] [thead; tbody] in
196
197 (* Update container *)
198 El.set_children container [
199 create_summary results;
200 table;
201 ];
202 console_log "[langdetect-tests] UI updated with results"
203 with exn ->
204 console_error (Printf.sprintf "[langdetect-tests] Error running tests: %s" (Printexc.to_string exn));
205 El.set_children container [
206 El.p ~at:[At.style (Jstr.v "color: red")] [
207 El.txt' (Printf.sprintf "Error: %s" (Printexc.to_string exn))
208 ]
209 ]
210 in
211
212 (* Use Brr's timer function *)
213 console_log "[langdetect-tests] Scheduling tests with G.set_timeout...";
214 let _timer = G.set_timeout ~ms:200 run_tests_callback in
215 console_log "[langdetect-tests] Timer scheduled";
216 ()
217 with exn ->
218 console_error (Printf.sprintf "[langdetect-tests] Error in run_tests_ui: %s" (Printexc.to_string exn))
219
220
221(** Interactive demo section *)
222let setup_demo () =
223 console_log "[langdetect-tests] Setting up demo...";
224 try
225 let demo_container = match El.find_first_by_selector (Jstr.v "#demo") ~root:(Document.body G.document) with
226 | Some el ->
227 console_log "[langdetect-tests] Found #demo container";
228 el
229 | None ->
230 console_log "[langdetect-tests] No #demo container, using body";
231 Document.body G.document
232 in
233 console_log "[langdetect-tests] Creating demo elements...";
234
235 let textarea = El.textarea ~at:[
236 At.id (Jstr.v "demo-input");
237 At.v (Jstr.v "rows") (Jstr.v "4");
238 At.v (Jstr.v "placeholder") (Jstr.v "Enter text to detect language...");
239 ] [] in
240
241 let result_div = El.div ~at:[At.id (Jstr.v "demo-result")] [
242 El.txt' "Enter text above and click Detect"
243 ] in
244
245 let detect_button = El.button ~at:[At.id (Jstr.v "demo-button")] [El.txt' "Detect Language"] in
246 console_log "[langdetect-tests] Created demo elements, setting up click handler...";
247
248 (* Set up click handler - detector is created lazily on first click *)
249 ignore (Ev.listen Ev.click (fun _ ->
250 let text = Jstr.to_string (El.prop El.Prop.value textarea) in
251 if String.length text > 0 then begin
252 let detector = Lazy.force shared_detector in
253 let start = now_ms () in
254 let results = Langdetect.detect detector text in
255 let time_ms = now_ms () -. start in
256
257 let result_html = match results with
258 | [] ->
259 [El.txt' "No language detected (text may be too short)"]
260 | _ ->
261 let items = List.map (fun (r : Langdetect.result) ->
262 El.li [
263 El.strong [El.txt' r.lang];
264 El.txt' (Printf.sprintf " — %.1f%% confidence" (r.prob *. 100.0))
265 ]
266 ) results in
267 [
268 El.p [El.txt' (Printf.sprintf "Detected in %.1fms:" time_ms)];
269 El.ul items
270 ]
271 in
272 El.set_children result_div result_html
273 end
274 ) (El.as_target detect_button));
275 console_log "[langdetect-tests] Click handler registered";
276
277 (* Add demo section to container *)
278 let tag = Jstr.to_string (El.tag_name demo_container) in
279 console_log (Printf.sprintf "[langdetect-tests] Container tag: %s" tag);
280 El.set_children demo_container [
281 El.h2 [El.txt' "Try It"];
282 El.div ~at:[At.class' (Jstr.v "demo-area")] [
283 textarea;
284 detect_button;
285 result_div;
286 ]
287 ];
288 console_log "[langdetect-tests] Demo UI created"
289 with exn ->
290 console_error (Printf.sprintf "[langdetect-tests] Error in setup_demo: %s" (Printexc.to_string exn))
291
292(** Entry point *)
293let () =
294 (* Register global API for the interactive demo in test.html *)
295 Langdetect_js.register_global_api ();
296
297 (* Wait for DOM to be ready *)
298 let ready_state = Jv.get (Jv.get Jv.global "document") "readyState" |> Jv.to_string in
299 if ready_state = "loading" then
300 ignore (Jv.call Jv.global "addEventListener" [|
301 Jv.of_string "DOMContentLoaded";
302 Jv.callback ~arity:1 (fun _ ->
303 run_tests_ui ();
304 setup_demo ()
305 )
306 |])
307 else begin
308 run_tests_ui ();
309 setup_demo ()
310 end