Add WASM support and fix CJK character normalization · anil.recoil.org/ocaml-langdetect@6f25190

+72 -19

gen/gen_profiles.ml

@@ -5,10 +5,15 @@
   ---------------------------------------------------------------------------*)

 (* Profile generator - converts JSON language profiles to a packed OCaml module
-   with shared string table and a single flat int array for maximum efficiency
-   and WASM/js_of_ocaml compatibility (31-bit ints) *)
+   with shared string table and chunked arrays for WASM compatibility.
+
+   WASM has a limit of 10,000 operands for array_new_fixed, so we split
+   large arrays into chunks and concatenate at runtime. *)

 module StringSet = Set.Make(String)
+
+(* Maximum elements per array chunk to stay under WASM limits *)
+let chunk_size = 9000

 let read_file path =
   let ic = open_in path in
@@ -81,6 +86,20 @@
       | c -> Buffer.add_char buf c
     ) s;
   Buffer.contents buf
+
+(* Split a list into chunks of at most n elements *)
+let chunk_list n lst =
+  let rec aux acc current current_len = function
+    | [] ->
+      if current_len > 0 then List.rev (List.rev current :: acc)
+      else List.rev acc
+    | x :: xs ->
+      if current_len >= n then
+        aux (List.rev current :: acc) [x] 1 xs
+      else
+        aux acc (x :: current) (current_len + 1) xs
+  in
+  aux [] [] 0 lst

 let () =
   if Array.length Sys.argv < 3 then begin
@@ -139,30 +158,64 @@

   Printf.eprintf "Total int array size: %d elements\n%!" total_ints;

-  (* Generate single packed module *)
+  (* Chunk the data *)
+  let ngram_chunks = chunk_list chunk_size ngram_list in
+  Printf.eprintf "N-gram table: %d chunks of max %d elements\n%!"
+    (List.length ngram_chunks) chunk_size;
+
+  (* Build flat list of all profile data *)
+  let all_data = ref [] in
+  List.iter (fun (_, pairs) ->
+    List.iter (fun (ngram, freq) ->
+      let idx = Hashtbl.find ngram_to_idx ngram in
+      all_data := freq :: idx :: !all_data
+    ) (List.rev pairs)
+  ) sorted_profiles;
+  let all_data = List.rev !all_data in
+
+  let data_chunks = chunk_list chunk_size all_data in
+  Printf.eprintf "Profile data: %d chunks of max %d elements\n%!"
+    (List.length data_chunks) chunk_size;
+
+  (* Generate single packed module with chunked arrays *)
   let out_path = Filename.concat output_dir "profiles_packed.ml" in
   let oc = open_out out_path in

   Printf.fprintf oc "(* Auto-generated packed profiles - do not edit *)\n";
-  Printf.fprintf oc "(* Single flat data array for maximum efficiency with WASM/js_of_ocaml *)\n\n";
+  Printf.fprintf oc "(* Chunked arrays for WASM compatibility (max %d elements per chunk) *)\n\n" chunk_size;
+
+  (* Output ngram table chunks *)
+  List.iteri (fun i chunk ->
+    Printf.fprintf oc "let ngram_chunk_%d = [|\n" i;
+    List.iter (fun ngram ->
+      Printf.fprintf oc " \"%s\";\n" (escape_ocaml_string ngram)
+    ) chunk;
+    Printf.fprintf oc "|]\n\n"
+  ) ngram_chunks;
+
+  (* Concatenate ngram chunks *)
+  Printf.fprintf oc "let ngram_table = Array.concat [\n";
+  List.iteri (fun i _ ->
+    Printf.fprintf oc " ngram_chunk_%d;\n" i
+  ) ngram_chunks;
+  Printf.fprintf oc "]\n\n";

-  (* Output string table *)
-  Printf.fprintf oc "let ngram_table = [|\n";
-  List.iter (fun ngram ->
-    Printf.fprintf oc " \"%s\";\n" (escape_ocaml_string ngram)
-  ) ngram_list;
-  Printf.fprintf oc "|]\n\n";
+  (* Output profile data chunks *)
+  List.iteri (fun i chunk ->
+    Printf.fprintf oc "let data_chunk_%d = [|\n" i;
+    List.iter (fun v ->
+      Printf.fprintf oc " %d;\n" v
+    ) chunk;
+    Printf.fprintf oc "|]\n\n"
+  ) data_chunks;

-  (* Output single flat data array with all profiles concatenated *)
+  (* Concatenate data chunks *)
   Printf.fprintf oc "(* Flat array of (ngram_index, frequency) pairs for all languages *)\n";
-  Printf.fprintf oc "let profile_data = [|\n";
-  List.iter (fun (_, pairs) ->
-    List.iter (fun (ngram, freq) ->
-      let idx = Hashtbl.find ngram_to_idx ngram in
-      Printf.fprintf oc " %d; %d;\n" idx freq
-    ) (List.rev pairs)
-  ) sorted_profiles;
-  Printf.fprintf oc "|]\n\n";
+  Printf.fprintf oc "let profile_data = Array.concat [\n";
+  List.iteri (fun i _ ->
+    Printf.fprintf oc " data_chunk_%d;\n" i
+  ) data_chunks;
+  Printf.fprintf oc "]\n\n";

   (* Output offsets table: (lang_code, start_index, num_pairs) *)
   Printf.fprintf oc "(* Profile offsets: (lang_code, start_index_in_data, num_ngram_pairs) *)\n";

+14 -2

lib/js/dune

@@ -14,7 +14,7 @@
  (libraries langdetect_js)
  (js_of_ocaml
   (javascript_files))
- (modes js)
+ (modes js wasm)
  (modules langdetect_js_main))

 ; Browser-based test runner
@@ -23,7 +23,7 @@
  (libraries langdetect_js)
  (js_of_ocaml
   (javascript_files))
- (modes js)
+ (modes js wasm)
  (modules langdetect_js_tests))

 ; Copy to nice filenames (JS)
@@ -36,3 +36,15 @@
  (targets langdetect-tests.js)
  (deps langdetect_js_tests.bc.js)
  (action (copy %{deps} %{targets})))
+
+; Copy to nice filenames (WASM)
+; Note: requires wasm_of_ocaml-compiler to be installed
+(rule
+ (targets langdetect.wasm.js)
+ (deps langdetect_js_main.bc.wasm.js)
+ (action (copy %{deps} %{targets})))
+
+(rule
+ (targets langdetect-tests.wasm.js)
+ (deps langdetect_js_tests.bc.wasm.js)
+ (action (copy %{deps} %{targets})))

+143 -100

lib/js/langdetect_js_tests.ml

··· 26 26 time_ms : float; 27 27 } 28 28 29 - (** Sample texts for testing various languages *) 29 + (** Sample texts from the native test corpus *) 30 30 let test_cases = [| 31 - (* European languages *) 32 - { name = "English"; text = "The quick brown fox jumps over the lazy dog. This is a sample of English text for language detection testing."; expected = "en" }; 33 - { name = "French"; text = "Bonjour le monde. La langue française est belle et mélodieuse. J'aime beaucoup la culture française."; expected = "fr" }; 34 - { name = "German"; text = "Guten Tag! Die deutsche Sprache hat viele interessante Eigenschaften. Ich lerne gerne Deutsch."; expected = "de" }; 35 - { name = "Spanish"; text = "Hola mundo. El español es un idioma muy hablado en todo el mundo. Me gusta mucho la cultura española."; expected = "es" }; 36 - { name = "Italian"; text = "Ciao mondo! L'italiano è una lingua bellissima. Mi piace molto la cultura italiana e il cibo."; expected = "it" }; 37 - { name = "Portuguese"; text = "Olá mundo! O português é uma língua muito bonita. Eu gosto muito da cultura portuguesa."; expected = "pt" }; 38 - { name = "Dutch"; text = "Hallo wereld! De Nederlandse taal is interessant. Ik hou van de Nederlandse cultuur."; expected = "nl" }; 39 - { name = "Swedish"; text = "Hej världen! Svenska är ett vackert språk. Jag tycker om svensk kultur och mat."; expected = "sv" }; 40 - { name = "Norwegian"; text = "Hei verden! Norsk er et fint språk. Jeg liker norsk kultur og natur veldig godt."; expected = "no" }; 41 - { name = "Danish"; text = "Hej verden! Dansk er et interessant sprog. Jeg kan godt lide dansk kultur."; expected = "da" }; 42 - { name = "Finnish"; text = "Hei maailma! Suomen kieli on erittäin mielenkiintoinen. Pidän suomalaisesta kulttuurista."; expected = "fi" }; 43 - { name = "Polish"; text = "Witaj świecie! Język polski jest bardzo piękny. Lubię polską kulturę i historię."; expected = "pl" }; 44 - { name = "Russian"; text = "Привет мир! 
Русский язык очень красивый и богатый. Я люблю русскую литературу."; expected = "ru" }; 45 - { name = "Ukrainian"; text = "Привіт світ! Українська мова дуже гарна. Я люблю українську культуру."; expected = "uk" }; 46 - { name = "Czech"; text = "Ahoj světe! Čeština je zajímavý jazyk. Mám rád českou kulturu a historii."; expected = "cs" }; 47 - { name = "Romanian"; text = "Salut lume! Limba română este foarte frumoasă. Îmi place cultura românească."; expected = "ro" }; 48 - { name = "Hungarian"; text = "Szia világ! A magyar nyelv nagyon érdekes. Szeretem a magyar kultúrát."; expected = "hu" }; 49 - { name = "Greek"; text = "Γειά σου κόσμε! Η ελληνική γλώσσα είναι πολύ όμορφη. Μου αρέσει ο ελληνικός πολιτισμός."; expected = "el" }; 50 - { name = "Bulgarian"; text = "Здравей свят! Българският език е много красив. Обичам българската култура."; expected = "bg" }; 51 - 52 - (* Asian languages *) 53 - { name = "Chinese (Simplified)"; text = "你好世界！中文是一种非常古老而美丽的语言。我喜欢学习中国文化和历史。"; expected = "zh-cn" }; 54 - { name = "Chinese (Traditional)"; text = "你好世界！中文是一種非常古老而美麗的語言。我喜歡學習中國文化和歷史。"; expected = "zh-tw" }; 55 - { name = "Japanese"; text = "こんにちは世界！日本語はとても美しい言語です。日本の文化が大好きです。"; expected = "ja" }; 56 - { name = "Korean"; text = "안녕하세요 세계! 한국어는 매우 아름다운 언어입니다. 저는 한국 문화를 좋아합니다."; expected = "ko" }; 57 - { name = "Vietnamese"; text = "Xin chào thế giới! Tiếng Việt là một ngôn ngữ rất đẹp. Tôi yêu văn hóa Việt Nam."; expected = "vi" }; 58 - { name = "Thai"; text = "สวัสดีโลก! ภาษาไทยเป็นภาษาที่สวยงาม ฉันชอบวัฒนธรรมไทยมาก"; expected = "th" }; 59 - { name = "Indonesian"; text = "Halo dunia! Bahasa Indonesia adalah bahasa yang indah. Saya suka budaya Indonesia."; expected = "id" }; 60 - 61 - (* Middle Eastern languages *) 62 - { name = "Arabic"; text = "مرحبا بالعالم! اللغة العربية جميلة جدا. أنا أحب الثقافة العربية والتاريخ."; expected = "ar" }; 63 - { name = "Hebrew"; text = "שלום עולם! השפה העברית היא שפה יפה מאוד. 
אני אוהב את התרבות העברית."; expected = "he" }; 64 - { name = "Persian"; text = "سلام دنیا! زبان فارسی بسیار زیباست. من فرهنگ ایرانی را دوست دارم."; expected = "fa" }; 65 - { name = "Turkish"; text = "Merhaba dünya! Türkçe çok güzel bir dil. Türk kültürünü ve tarihini seviyorum."; expected = "tr" }; 66 - 67 - (* South Asian languages *) 68 - { name = "Hindi"; text = "नमस्ते दुनिया! हिंदी एक बहुत सुंदर भाषा है। मुझे भारतीय संस्कृति बहुत पसंद है।"; expected = "hi" }; 69 - { name = "Bengali"; text = "হ্যালো বিশ্ব! বাংলা একটি অত্যন্ত সুন্দর ভাষা। আমি বাঙালি সংস্কৃতি পছন্দ করি।"; expected = "bn" }; 70 - { name = "Tamil"; text = "வணக்கம் உலகம்! தமிழ் ஒரு மிக அழகான மொழி. நான் தமிழ் கலாச்சாரத்தை விரும்புகிறேன்."; expected = "ta" }; 71 - { name = "Telugu"; text = "హలో ప్రపంచం! తెలుగు చాలా అందమైన భాష. నాకు తెలుగు సంస్కృతి చాలా ఇష్టం."; expected = "te" }; 72 - { name = "Gujarati"; text = "હેલો વિશ્વ! ગુજરાતી એક સુંદર ભાષા છે. મને ગુજરાતી સંસ્કૃતિ ગમે છે."; expected = "gu" }; 73 - { name = "Urdu"; text = "ہیلو دنیا! اردو ایک بہت خوبصورت زبان ہے۔ مجھے اردو ادب پسند ہے۔"; expected = "ur" }; 31 + (* Same corpus as test/test_langdetect.ml *) 32 + { name = "English"; text = "The quick brown fox jumps over the lazy dog. This is a sample of English text that should be detected correctly by the language detection algorithm. Language detection uses n-gram frequency analysis to determine the most likely language of a given text sample."; expected = "en" }; 33 + { name = "Chinese"; text = "看官，現今我們中國四萬萬同胞欲內免專制、外杜瓜分的一個絕大轉機、絕大遭際，不是那預備立憲一事麼？但那立憲上加了這麼預備兩個字的活動考語，我就深恐將來這瘟憲立不成，必定嫁禍到我們同胞程度不齊上，以為卸罪地步。"; expected = "zh" }; 34 + { name = "Hebrew"; text = "זוהי דוגמה לטקסט בעברית שנועד לבדיקת זיהוי שפה. עברית היא שפה שמית שנכתבת מימין לשמאל. המערכת צריכה לזהות אותה כראוי על סמך התדירות של אותיות ותבניות אופייניות."; expected = "he" }; 35 + { name = "German"; text = "Dies ist ein Beispieltext auf Deutsch, der zur Spracherkennung verwendet wird. 
Die deutsche Sprache hat viele charakteristische Merkmale wie Umlaute und zusammengesetzte Wörter, die die Erkennung erleichtern."; expected = "de" }; 36 + { name = "French"; text = "Ceci est un exemple de texte en français pour tester la détection de langue. Le français est une langue romane avec des caractéristiques distinctives comme les accents et les conjugaisons verbales."; expected = "fr" }; 37 + { name = "Japanese"; text = "これは日本語のテキストです。日本語の言語検出をテストするためのサンプルです。日本語には漢字、ひらがな、カタカナの三種類の文字が使われています。"; expected = "ja" }; 38 + { name = "Russian"; text = "Это пример текста на русском языке для тестирования определения языка. Русский язык использует кириллический алфавит и имеет сложную грамматику с падежами и склонениями."; expected = "ru" }; 39 + { name = "Spanish"; text = "Este es un ejemplo de texto en español para probar la detección de idiomas. El español es una lengua romance hablada por millones de personas en todo el mundo."; expected = "es" }; 40 + { name = "Arabic"; text = "هذا مثال على نص باللغة العربية لاختبار اكتشاف اللغة. اللغة العربية هي لغة سامية تكتب من اليمين إلى اليسار."; expected = "ar" }; 41 + { name = "Korean"; text = "이것은 언어 감지를 테스트하기 위한 한국어 텍스트 예시입니다. 한국어는 한글이라는 독특한 문자 체계를 사용합니다."; expected = "ko" }; 42 + { name = "Portuguese"; text = "Este é um exemplo de texto em português para testar a detecção de idiomas. O português é uma língua românica falada em Portugal, Brasil e outros países."; expected = "pt" }; 43 + { name = "Italian"; text = "Questo è un esempio di testo in italiano per testare il rilevamento della lingua. L'italiano è una lingua romanza con una ricca storia letteraria."; expected = "it" }; 44 + { name = "Dutch"; text = "Dit is een voorbeeld van Nederlandse tekst voor het testen van taaldetectie. Nederlands wordt gesproken in Nederland en België en heeft veel overeenkomsten met Duits en Engels."; expected = "nl" }; 45 + { name = "Polish"; text = "To jest przykładowy tekst w języku polskim do testowania wykrywania języka. 
Polski jest językiem słowiańskim z bogatą historią literacką i skomplikowaną gramatyką."; expected = "pl" }; 46 + { name = "Turkish"; text = "Bu, dil algılama testleri için Türkçe örnek bir metindir. Türkçe, agglutinative bir dil yapısına sahip ve Latin alfabesi kullanmaktadır. Özel karakterler içerir."; expected = "tr" }; 47 + { name = "Swedish"; text = "Detta är en exempeltext på svenska för att testa språkdetektering. Svenska är ett nordiskt språk som talas i Sverige och Finland med karakteristiska vokaler."; expected = "sv" }; 48 + { name = "Vietnamese"; text = "Đây là một văn bản mẫu bằng tiếng Việt để kiểm tra phát hiện ngôn ngữ. Tiếng Việt sử dụng bảng chữ cái Latin với nhiều dấu thanh điệu đặc biệt."; expected = "vi" }; 49 + { name = "Thai"; text = "นี่คือข้อความตัวอย่างภาษาไทยสำหรับทดสอบการตรวจจับภาษา ภาษาไทยใช้อักษรไทย และมีระบบวรรณยุกต์ที่ซับซ้อน"; expected = "th" }; 50 + { name = "Hindi"; text = "यह भाषा पहचान परीक्षण के लिए हिंदी में एक नमूना पाठ है। हिंदी देवनागरी लिपि का उपयोग करती है और भारत की आधिकारिक भाषाओं में से एक है।"; expected = "hi" }; 51 + { name = "Finnish"; text = "Tämä on suomenkielinen esimerkkiteksti kielentunnistuksen testaamiseksi. Suomi on suomalais-ugrilainen kieli, jolla on monimutkainen taivutusjärjestelmä."; expected = "fi" }; 74 52 |] 75 53 76 54 (** Get current time in milliseconds *) ··· 79 57 80 58 (** Run a single test *) 81 59 let run_test detector test = 60 + (* Set deterministic seed before EACH test, like native tests do *) 61 + Langdetect.set_random_seed detector 42; 82 62 let start = now_ms () in 83 63 let result = Langdetect.detect_with_prob detector test.text in 84 64 let time_ms = now_ms () -. 
start in ··· 86 66 | Some (lang, p) -> Some lang, Some p 87 67 | None -> None, None 88 68 in 69 + (* Handle special case: zh matching zh-cn/zh-tw *) 70 + let lang_matches expected detected = 71 + if expected = "zh" then 72 + String.length detected >= 2 && String.sub detected 0 2 = "zh" 73 + else 74 + expected = detected 75 + in 89 76 let passed = match detected with 90 - | Some lang -> lang = test.expected 77 + | Some lang -> lang_matches test.expected lang 91 78 | None -> false 92 79 in 93 80 { test; detected; prob; passed; time_ms } 94 81 82 + (** Shared detector instance - created lazily on first use *) 83 + let shared_detector = lazy (Langdetect.create_default ()) 84 + 95 85 (** Run all tests and return results *) 96 86 let run_all_tests () = 97 - let detector = Langdetect.create_default () in 87 + let detector = Lazy.force shared_detector in 98 88 Array.map (run_test detector) test_cases 99 89 100 90 (** Create a result row element *) ··· 110 100 | None -> "-" 111 101 in 112 102 let time_text = Printf.sprintf "%.1fms" result.time_ms in 103 + (* Truncate long text for display *) 104 + let display_text = 105 + let t = result.test.text in 106 + if String.length t > 60 then String.sub t 0 57 ^ "..." 
else t 107 + in 113 108 114 109 let tr = El.tr [] in 115 110 El.set_children tr [ 116 111 El.td [El.txt' result.test.name]; 112 + El.td ~at:[At.class' (Jstr.v "corpus-text")] [El.txt' display_text]; 117 113 El.td ~at:[At.class' (Jstr.v "code")] [El.txt' result.test.expected]; 118 114 El.td ~at:[At.class' (Jstr.v "code")] [El.txt' detected_text]; 119 115 El.td [El.txt' prob_text]; ··· 141 137 ]; 142 138 ] 143 139 140 + (** Console error logging *) 141 + let console_error msg = 142 + ignore (Jv.call (Jv.get Jv.global "console") "error" [| Jv.of_string msg |]) 143 + 144 + let console_log msg = 145 + ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |]) 146 + 144 147 (** Main test runner *) 145 148 let run_tests_ui () = 146 - (* Find or create output container *) 147 - let container = match El.find_first_by_selector (Jstr.v "#test-results") ~root:(Document.body G.document) with 148 - | Some el -> el 149 - | None -> 150 - let el = El.div ~at:[At.id (Jstr.v "test-results")] [] in 151 - El.append_children (Document.body G.document) [el]; 152 - el 153 - in 149 + console_log "[langdetect-tests] Starting test UI..."; 150 + try 151 + (* Find or create output container *) 152 + let container = match El.find_first_by_selector (Jstr.v "#test-results") ~root:(Document.body G.document) with 153 + | Some el -> 154 + console_log "[langdetect-tests] Found #test-results container"; 155 + el 156 + | None -> 157 + console_log "[langdetect-tests] Creating #test-results container"; 158 + let el = El.div ~at:[At.id (Jstr.v "test-results")] [] in 159 + El.append_children (Document.body G.document) [el]; 160 + el 161 + in 162 + 163 + (* Show loading message *) 164 + El.set_children container [ 165 + El.p [El.txt' "Running tests..."] 166 + ]; 167 + console_log "[langdetect-tests] Set loading message, scheduling test run..."; 168 + 169 + (* Run tests using JavaScript setTimeout *) 170 + let run_tests_callback () = 171 + console_log "[langdetect-tests] Callback 
executing..."; 172 + try 173 + console_log "[langdetect-tests] Running tests..."; 174 + let results = run_all_tests () in 175 + console_log (Printf.sprintf "[langdetect-tests] Tests complete: %d results" (Array.length results)); 176 + 177 + (* Build results table *) 178 + let thead = El.thead [ 179 + El.tr [ 180 + El.th [El.txt' "Language"]; 181 + El.th [El.txt' "Sample Text"]; 182 + El.th [El.txt' "Expected"]; 183 + El.th [El.txt' "Detected"]; 184 + El.th [El.txt' "Confidence"]; 185 + El.th [El.txt' "Time"]; 186 + El.th [El.txt' "Status"]; 187 + ] 188 + ] in 154 189 155 - (* Show loading message *) 156 - El.set_children container [ 157 - El.p [El.txt' "Running tests..."] 158 - ]; 190 + let tbody = El.tbody [] in 191 + Array.iter (fun result -> 192 + El.append_children tbody [create_result_row result] 193 + ) results; 159 194 160 - (* Run tests (async to allow UI update) *) 161 - let _ = Jv.call Jv.global "setTimeout" [| 162 - Jv.callback ~arity:0 (fun () -> 163 - let results = run_all_tests () in 195 + let table = El.table ~at:[At.class' (Jstr.v "results-table")] [thead; tbody] in 164 196 165 - (* Build results table *) 166 - let thead = El.thead [ 167 - El.tr [ 168 - El.th [El.txt' "Language"]; 169 - El.th [El.txt' "Expected"]; 170 - El.th [El.txt' "Detected"]; 171 - El.th [El.txt' "Confidence"]; 172 - El.th [El.txt' "Time"]; 173 - El.th [El.txt' "Status"]; 197 + (* Update container *) 198 + El.set_children container [ 199 + create_summary results; 200 + table; 201 + ]; 202 + console_log "[langdetect-tests] UI updated with results" 203 + with exn -> 204 + console_error (Printf.sprintf "[langdetect-tests] Error running tests: %s" (Printexc.to_string exn)); 205 + El.set_children container [ 206 + El.p ~at:[At.style (Jstr.v "color: red")] [ 207 + El.txt' (Printf.sprintf "Error: %s" (Printexc.to_string exn)) 208 + ] 174 209 ] 175 - ] in 210 + in 176 211 177 - let tbody = El.tbody [] in 178 - Array.iter (fun result -> 179 - El.append_children tbody [create_result_row 
result] 180 - ) results; 212 + (* Use Brr's timer function *) 213 + console_log "[langdetect-tests] Scheduling tests with G.set_timeout..."; 214 + let _timer = G.set_timeout ~ms:200 run_tests_callback in 215 + console_log "[langdetect-tests] Timer scheduled"; 216 + () 217 + with exn -> 218 + console_error (Printf.sprintf "[langdetect-tests] Error in run_tests_ui: %s" (Printexc.to_string exn)) 181 219 182 - let table = El.table ~at:[At.class' (Jstr.v "results-table")] [thead; tbody] in 183 - 184 - (* Update container *) 185 - El.set_children container [ 186 - create_summary results; 187 - table; 188 - ] 189 - ); 190 - Jv.of_int 10 191 - |] in 192 - () 193 220 194 221 (** Interactive demo section *) 195 222 let setup_demo () = 223 + console_log "[langdetect-tests] Setting up demo..."; 224 + try 196 225 let demo_container = match El.find_first_by_selector (Jstr.v "#demo") ~root:(Document.body G.document) with 197 - | Some el -> el 198 - | None -> Document.body G.document 226 + | Some el -> 227 + console_log "[langdetect-tests] Found #demo container"; 228 + el 229 + | None -> 230 + console_log "[langdetect-tests] No #demo container, using body"; 231 + Document.body G.document 199 232 in 233 + console_log "[langdetect-tests] Creating demo elements..."; 200 234 201 235 let textarea = El.textarea ~at:[ 202 236 At.id (Jstr.v "demo-input"); ··· 209 243 ] in 210 244 211 245 let detect_button = El.button ~at:[At.id (Jstr.v "demo-button")] [El.txt' "Detect Language"] in 246 + console_log "[langdetect-tests] Created demo elements, setting up click handler..."; 212 247 213 - (* Set up click handler *) 214 - let detector = Langdetect.create_default () in 248 + (* Set up click handler - detector is created lazily on first click *) 215 249 ignore (Ev.listen Ev.click (fun _ -> 216 250 let text = Jstr.to_string (El.prop El.Prop.value textarea) in 217 251 if String.length text > 0 then begin 252 + let detector = Lazy.force shared_detector in 218 253 let start = now_ms () in 219 254 
let results = Langdetect.detect detector text in 220 255 let time_ms = now_ms () -. start in ··· 237 272 El.set_children result_div result_html 238 273 end 239 274 ) (El.as_target detect_button)); 275 + console_log "[langdetect-tests] Click handler registered"; 240 276 241 - (* Only add demo section if container exists *) 242 - if El.has_tag_name (Jstr.v "DIV") demo_container then 243 - El.set_children demo_container [ 244 - El.h2 [El.txt' "Try It"]; 245 - El.div ~at:[At.class' (Jstr.v "demo-area")] [ 246 - textarea; 247 - detect_button; 248 - result_div; 249 - ] 277 + (* Add demo section to container *) 278 + let tag = Jstr.to_string (El.tag_name demo_container) in 279 + console_log (Printf.sprintf "[langdetect-tests] Container tag: %s" tag); 280 + El.set_children demo_container [ 281 + El.h2 [El.txt' "Try It"]; 282 + El.div ~at:[At.class' (Jstr.v "demo-area")] [ 283 + textarea; 284 + detect_button; 285 + result_div; 250 286 ] 287 + ]; 288 + console_log "[langdetect-tests] Demo UI created" 289 + with exn -> 290 + console_error (Printf.sprintf "[langdetect-tests] Error in setup_demo: %s" (Printexc.to_string exn)) 251 291 252 292 (** Entry point *) 253 293 let () = 294 + (* Register global API for the interactive demo in test.html *) 295 + Langdetect_js.register_global_api (); 296 + 254 297 (* Wait for DOM to be ready *) 255 298 let ready_state = Jv.get (Jv.get Jv.global "document") "readyState" |> Jv.to_string in 256 299 if ready_state = "loading" then

+19

lib/langdetect.ml

@@ -44,13 +44,32 @@
   mutable seed : int option;
 }

+(* Character normalization matching the original Java implementation.
+   This is critical for matching the trained profiles. *)
 let normalize_uchar uchar =
   let code = Uchar.to_int uchar in
+  (* Basic Latin: only letters pass through *)
   if code < 128 then
     let c = Char.chr code in
     match c with
     | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c)
     | _ -> None
+  (* Hangul Syllables (U+AC00-U+D7A3): normalize to '가' (U+AC00) *)
+  else if code >= 0xAC00 && code <= 0xD7A3 then
+    Some "\xEA\xB0\x80" (* UTF-8 for U+AC00 '가' *)
+  (* Hiragana (U+3040-U+309F): normalize to 'あ' (U+3042) *)
+  else if code >= 0x3040 && code <= 0x309F then
+    Some "\xE3\x81\x82" (* UTF-8 for U+3042 'あ' *)
+  (* Katakana (U+30A0-U+30FF): normalize to 'ア' (U+30A2) *)
+  else if code >= 0x30A0 && code <= 0x30FF then
+    Some "\xE3\x82\xA2" (* UTF-8 for U+30A2 'ア' *)
+  (* Bopomofo (U+3100-U+312F) and Extended (U+31A0-U+31BF): normalize to 'ㄅ' (U+3105) *)
+  else if (code >= 0x3100 && code <= 0x312F) || (code >= 0x31A0 && code <= 0x31BF) then
+    Some "\xE3\x84\x85" (* UTF-8 for U+3105 'ㄅ' *)
+  (* General Punctuation (U+2000-U+206F): treat as space/separator *)
+  else if code >= 0x2000 && code <= 0x206F then
+    None
+  (* CJK Unified Ideographs and other scripts: pass through *)
   else
     let buf = Buffer.create 4 in
     Buffer.add_utf_8_uchar buf uchar;

+99 -27

test.html

··· 129 129 padding: 0.2rem 0.4rem; 130 130 border-radius: 4px; 131 131 } 132 + .results-table .corpus-text { 133 + font-size: 0.8rem; 134 + max-width: 300px; 135 + overflow: hidden; 136 + text-overflow: ellipsis; 137 + white-space: nowrap; 138 + color: #666; 139 + } 132 140 .results-table .pass { 133 141 color: #16a34a; 134 142 font-weight: bold; ··· 187 195 <h1>🌍 Langdetect</h1> 188 196 <p class="subtitle">Language detection for the browser using n-gram frequency analysis</p> 189 197 190 -  191 - <div class="section" id="demo"> 192 - <h2>Try It</h2> 193 - <div class="demo-area"> 194 - <textarea id="demo-input" rows="4" placeholder="Enter or paste text to detect its language..."></textarea> 195 - <button id="demo-button">Detect Language</button> 196 - <div id="demo-result">Enter text above and click Detect</div> 198 +  199 + <div class="section" style="padding: 1rem 1.5rem; display: flex; align-items: center; gap: 1rem; flex-wrap: wrap;"> 200 + <label for="mode-select" style="font-weight: 600;">Runtime:</label> 201 + <select id="mode-select" style="padding: 0.5rem 1rem; border-radius: 6px; border: 2px solid #e0e0e0; font-size: 1rem;"> 202 + <option value="js">JavaScript (js_of_ocaml)</option> 203 + <option value="wasm">WebAssembly (wasm_of_ocaml)</option> 204 + </select> 205 + <button onclick="reloadWithMode()" style="padding: 0.5rem 1.5rem;">Reload</button> 206 + <span id="mode-status" style="color: #666; font-size: 0.9rem;"></span> 207 + </div> 208 + 209 +  210 + <div class="section"> 211 + <div id="demo"> 212 +  213 + <p class="loading">Loading demo...</p> 197 214 </div> 198 215 199 216 <p style="margin-top: 1rem; margin-bottom: 0.5rem; color: #666; font-size: 0.9rem;"> 200 217 Click a sample to try: 201 218 </p> 202 - <div class="sample-texts"> 219 + <div class="sample-texts" id="sample-texts"> 203 220 <div class="sample-text" data-text="The quick brown fox jumps over the lazy dog."> 204 221 <span class="lang">🇬🇧 English</span> 205 222 </div> ··· 269 286 </p> 
270 287 </div> 271 288 272 -  273 - <script src="_build/default/lib/js/langdetect-tests.js"></script> 289 + <script> 290 + // Get mode from URL param or localStorage 291 + function getMode() { 292 + const params = new URLSearchParams(window.location.search); 293 + return params.get('mode') || localStorage.getItem('langdetect-mode') || 'js'; 294 + } 274 295 275 - <script> 276 - // Handle sample text clicks 277 - document.querySelectorAll('.sample-text').forEach(el => { 278 - el.addEventListener('click', () => { 279 - const text = el.getAttribute('data-text'); 280 - document.getElementById('demo-input').value = text; 281 - document.getElementById('demo-button').click(); 296 + // Set mode and reload 297 + function reloadWithMode() { 298 + const mode = document.getElementById('mode-select').value; 299 + localStorage.setItem('langdetect-mode', mode); 300 + const url = new URL(window.location); 301 + url.searchParams.set('mode', mode); 302 + window.location.href = url.toString(); 303 + } 304 + 305 + // Initialize mode selector 306 + const currentMode = getMode(); 307 + document.getElementById('mode-select').value = currentMode; 308 + 309 + // Load the appropriate script 310 + // Note: WASM uses original filename because the loader references the original assets directory 311 + const scriptName = currentMode === 'wasm' ? 
'langdetect_js_tests.bc.wasm.js' : 'langdetect-tests.js'; 312 + document.getElementById('mode-status').textContent = `Loading ${currentMode.toUpperCase()}...`; 313 + 314 + const script = document.createElement('script'); 315 + script.src = `_build/default/lib/js/${scriptName}`; 316 + script.onload = function() { 317 + document.getElementById('mode-status').textContent = `Loaded: ${currentMode.toUpperCase()}`; 318 + document.getElementById('mode-status').style.color = '#16a34a'; 319 + setTimeout(setupSampleTexts, 100); 320 + }; 321 + script.onerror = function() { 322 + document.getElementById('mode-status').textContent = `Failed to load ${scriptName}`; 323 + document.getElementById('mode-status').style.color = '#dc2626'; 324 + showLoadError(); 325 + }; 326 + document.head.appendChild(script); 327 + 328 + // Wait for langdetect to be ready, then set up sample text handlers 329 + function setupSampleTexts() { 330 + document.querySelectorAll('.sample-text').forEach(el => { 331 + el.addEventListener('click', () => { 332 + const text = el.getAttribute('data-text'); 333 + // Find the OCaml-created input and button 334 + const input = document.getElementById('demo-input'); 335 + const button = document.getElementById('demo-button'); 336 + if (input && button) { 337 + input.value = text; 338 + button.click(); 339 + } 340 + }); 282 341 }); 283 - }); 342 + } 343 + 344 + function showLoadError() { 345 + const mode = getMode(); 346 + const scriptName = mode === 'wasm' ? 
'langdetect_js_tests.bc.wasm.js' : 'langdetect-tests.js'; 347 + const buildCmd = 'opam exec -- dune build lib/js/'; 348 + 349 + const demo = document.getElementById('demo'); 350 + if (demo) { 351 + demo.innerHTML = 352 + `<p style="color: #dc2626;"><strong>Library not loaded</strong><br>` + 353 + `Run <code>${buildCmd}</code> first.</p>`; 354 + } 355 + document.getElementById('test-results').innerHTML = 356 + `<p style="color: #dc2626;"><strong>Tests cannot run:</strong> ${scriptName} not found.<br>` + 357 + `Build with: <code>opam exec -- dune build lib/js/</code></p>`; 358 + } 284 359 285 - // Check if library loaded 360 + // Check if library loaded after timeout 286 361 window.addEventListener('load', () => { 287 - if (typeof langdetect === 'undefined') { 288 - document.getElementById('demo-result').innerHTML = 289 - '<strong style="color: #dc2626;">Library not loaded</strong><br>' + 290 - 'Run <code>opam exec -- dune build lib/js/langdetect-tests.js</code> first.'; 291 - document.getElementById('test-results').innerHTML = 292 - '<p style="color: #dc2626;"><strong>Tests cannot run:</strong> langdetect.js not found.<br>' + 293 - 'Build with: <code>opam exec -- dune build lib/js/</code></p>'; 294 - } 362 + setTimeout(() => { 363 + if (typeof langdetect === 'undefined') { 364 + showLoadError(); 365 + } 366 + }, 500); 295 367 }); 296 368 </script> 297 369