Detect which human language a document uses from OCaml, from the Nu Html validator
languages unicode ocaml

wasm+cmd

+461 -4
+5
bin/dune
; Command-line front end for the langdetect library.
; Built from langdetect_cli.ml and exposed under the public name `langdetect`
; as part of the langdetect package.
(executable
 (public_name langdetect)
 (name langdetect_cli)
 (package langdetect)
 (libraries cmdliner langdetect))
+65
bin/langdetect_cli.ml
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Command-line front end: reads text from a file or from stdin and prints
    one ["<lang> <prob>"] line per result returned by the detector. *)

(** [detect_language input_text] runs the default detector over [input_text]
    and prints each result on stdout as the language code followed by its
    probability formatted with four decimals. Results are printed in the order
    [Langdetect.detect] returns them. *)
let detect_language input_text =
  let detector = Langdetect.create_default () in
  Langdetect.detect detector input_text
  |> List.iter (fun (r : Langdetect.result) ->
         Printf.printf "%s %.4f\n" r.lang r.prob)

(** [read_all_stdin ()] returns everything available on standard input.
    The channel is switched to binary mode so the bytes reach the detector
    untranslated. *)
let read_all_stdin () =
  In_channel.set_binary_mode stdin true;
  In_channel.input_all stdin

(** [read_file path] returns the full contents of [path].

    The file is opened in binary mode and read to end-of-file rather than
    sized with [in_channel_length], so it also works for inputs whose length
    cannot be queried (FIFOs, character devices). The channel is closed even
    if reading raises.

    @raise Sys_error if the file cannot be opened or read. *)
let read_file path = In_channel.with_open_bin path In_channel.input_all

(** [run file_opt] is the command body. It reads from [Some path] or, when
    [file_opt] is [None], from stdin, then prints the detection results.

    Returns [`Ok ()] on success, or [`Error (false, msg)] when the input
    cannot be read or contains only whitespace, so that Cmdliner reports the
    problem as a regular command-line error instead of an uncaught
    exception. *)
let run file_opt =
  let input =
    try
      Ok
        (match file_opt with
        | Some path -> read_file path
        | None -> read_all_stdin ())
    with Sys_error msg -> Error msg
  in
  match input with
  | Error msg -> `Error (false, msg)
  | Ok text when String.length (String.trim text) = 0 ->
      `Error (false, "No input text provided")
  | Ok text ->
      detect_language text;
      `Ok ()

open Cmdliner

(** Optional positional [FILE] argument; absent means "read stdin". *)
let file_arg =
  let doc = "Input file to detect language from. If not provided, reads from stdin." in
  Arg.(value & pos 0 (some file) None & info [] ~docv:"FILE" ~doc)

(** The [langdetect] command: manual page plus the term that invokes [run]. *)
let cmd =
  let doc = "Detect the language of text" in
  let man =
    [
      `S Manpage.s_description;
      `P "Detects the natural language of input text using n-gram frequency analysis.";
      `P "Outputs detected language codes and their probabilities as space-separated values, one per line, sorted by probability (highest first).";
      `S Manpage.s_examples;
      `P "Detect language from a file:";
      `Pre " langdetect document.txt";
      `P "Detect language from stdin:";
      `Pre " echo 'Hello world' | langdetect";
    ]
  in
  let info = Cmd.info "langdetect" ~version:"%%VERSION%%" ~doc ~man in
  Cmd.v info Term.(ret (const run $ file_arg))

let () = exit (Cmd.eval cmd)
+3 -2
dune-project
··· 21 21 (depends 22 22 (ocaml (>= 5.1.0)) 23 23 (uutf (>= 1.0.0)) 24 + (cmdliner (>= 1.2.0)) 24 25 (odoc :with-doc) 25 26 (alcotest (and :with-test (>= 1.7.0))))) 26 27 ··· 35 36 (ocaml (>= 5.1.0)) 36 37 (langdetect (= :version)) 37 38 (brr (>= 0.0.6)) 38 - (js_of_ocaml (>= 5.0.0)) 39 - (js_of_ocaml-compiler (>= 5.0.0)))) 39 + (js_of_ocaml (>= 6.0.0)) 40 + (js_of_ocaml-compiler (>= 6.0.0))))
+2 -2
langdetect-js.opam
··· 13 13 "ocaml" {>= "5.1.0"} 14 14 "langdetect" {= version} 15 15 "brr" {>= "0.0.6"} 16 - "js_of_ocaml" {>= "5.0.0"} 17 - "js_of_ocaml-compiler" {>= "5.0.0"} 16 + "js_of_ocaml" {>= "6.0.0"} 17 + "js_of_ocaml-compiler" {>= "6.0.0"} 18 18 "odoc" {with-doc} 19 19 ] 20 20 build: [
+1
langdetect.opam
··· 12 12 "dune" {>= "3.20"} 13 13 "ocaml" {>= "5.1.0"} 14 14 "uutf" {>= "1.0.0"} 15 + "cmdliner" {>= "1.2.0"} 15 16 "odoc" {with-doc} 16 17 "alcotest" {with-test & >= "1.7.0"} 17 18 ]
+17
lib/js/dune
··· 48 48 (targets langdetect-tests.wasm.js) 49 49 (deps langdetect_js_tests.bc.wasm.js) 50 50 (action (copy %{deps} %{targets}))) 51 + 52 + ; Install web assets to share/langdetect-js/ 53 + ; Includes HTML demo, JS files, WASM loaders, and WASM assets with source maps 54 + (install 55 + (package langdetect-js) 56 + (section share) 57 + (files 58 + (langdetect.html as langdetect-js/langdetect.html) 59 + ; JS files (work standalone in browsers) 60 + (langdetect.js as langdetect-js/langdetect.js) 61 + (langdetect-tests.js as langdetect-js/langdetect-tests.js) 62 + ; WASM loaders (in same dir so relative asset paths work) 63 + (langdetect_js_main.bc.wasm.js as langdetect-js/langdetect_js_main.bc.wasm.js) 64 + (langdetect_js_tests.bc.wasm.js as langdetect-js/langdetect_js_tests.bc.wasm.js) 65 + ; WASM assets - must be in langdetect-js/ so relative paths from loaders work 66 + (glob_files_rec (langdetect_js_main.bc.wasm.assets/* with_prefix langdetect-js/langdetect_js_main.bc.wasm.assets)) 67 + (glob_files_rec (langdetect_js_tests.bc.wasm.assets/* with_prefix langdetect-js/langdetect_js_tests.bc.wasm.assets))))
+368
lib/js/langdetect.html
<!DOCTYPE html>
<!--
  Langdetect browser demo.

  Loads either the js_of_ocaml or the wasm_of_ocaml build of the test/demo
  bundle (chosen via the ?mode= URL parameter or a remembered preference),
  which is expected to populate #demo and #test-results. This page supplies
  the styling, the runtime selector, the clickable sample texts and the
  load-failure reporting.
-->
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Langdetect - Language Detection Demo</title>
  <style>
    * {
      box-sizing: border-box;
    }
    body {
      font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
      line-height: 1.6;
      max-width: 1000px;
      margin: 0 auto;
      padding: 2rem;
      background: #f5f5f5;
      color: #333;
    }
    h1 {
      color: #2563eb;
      border-bottom: 3px solid #3b82f6;
      padding-bottom: 0.5rem;
      margin-bottom: 0.5rem;
    }
    .subtitle {
      color: #666;
      margin-top: 0;
      margin-bottom: 2rem;
    }
    .section {
      background: white;
      border: 1px solid #e0e0e0;
      border-radius: 12px;
      padding: 1.5rem;
      margin: 1.5rem 0;
      box-shadow: 0 2px 4px rgba(0,0,0,0.05);
    }
    .section h2 {
      margin-top: 0;
      color: #1e40af;
    }
    .demo-area {
      display: flex;
      flex-direction: column;
      gap: 1rem;
    }
    textarea {
      width: 100%;
      padding: 1rem;
      font-size: 1rem;
      font-family: inherit;
      border: 2px solid #e0e0e0;
      border-radius: 8px;
      resize: vertical;
      transition: border-color 0.2s;
    }
    textarea:focus {
      outline: none;
      border-color: #3b82f6;
    }
    button {
      padding: 0.75rem 2rem;
      font-size: 1rem;
      font-weight: 600;
      cursor: pointer;
      border: none;
      border-radius: 8px;
      background: #2563eb;
      color: white;
      transition: all 0.2s;
      align-self: flex-start;
    }
    button:hover {
      background: #1d4ed8;
      transform: translateY(-1px);
    }
    button:active {
      transform: translateY(0);
    }
    #demo-result {
      padding: 1rem;
      background: #f8fafc;
      border-radius: 8px;
      border: 1px solid #e2e8f0;
    }
    #demo-result ul {
      margin: 0.5rem 0;
      padding-left: 1.5rem;
    }
    #demo-result li {
      margin: 0.25rem 0;
    }
    .summary {
      background: #eff6ff;
      padding: 1rem 1.5rem;
      border-radius: 8px;
      margin-bottom: 1rem;
    }
    .summary h2 {
      margin: 0 0 0.5rem 0;
    }
    .summary p {
      margin: 0.25rem 0;
    }
    .results-table {
      width: 100%;
      border-collapse: collapse;
      font-size: 0.9rem;
    }
    .results-table th,
    .results-table td {
      padding: 0.75rem 1rem;
      text-align: left;
      border-bottom: 1px solid #e0e0e0;
    }
    .results-table th {
      background: #f8fafc;
      font-weight: 600;
      color: #475569;
    }
    .results-table tr:hover {
      background: #f8fafc;
    }
    .results-table .code {
      font-family: 'SF Mono', Monaco, 'Cascadia Code', monospace;
      font-size: 0.85rem;
      background: #f1f5f9;
      padding: 0.2rem 0.4rem;
      border-radius: 4px;
    }
    .results-table .corpus-text {
      font-size: 0.8rem;
      max-width: 300px;
      overflow: hidden;
      text-overflow: ellipsis;
      white-space: nowrap;
      color: #666;
    }
    .results-table .pass {
      color: #16a34a;
      font-weight: bold;
      font-size: 1.1rem;
    }
    .results-table .fail {
      color: #dc2626;
      font-weight: bold;
      font-size: 1.1rem;
    }
    .loading {
      text-align: center;
      padding: 2rem;
      color: #666;
    }
    .api-docs {
      background: #1e293b;
      color: #e2e8f0;
      padding: 1rem;
      border-radius: 8px;
      overflow-x: auto;
    }
    .api-docs code {
      color: #7dd3fc;
    }
    .api-docs .comment {
      color: #94a3b8;
    }
    .sample-texts {
      display: grid;
      grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
      gap: 0.5rem;
      margin-top: 1rem;
    }
    .sample-text {
      padding: 0.5rem 1rem;
      font-size: 0.85rem;
      background: #f1f5f9;
      border: 1px solid #e2e8f0;
      border-radius: 6px;
      cursor: pointer;
      transition: all 0.2s;
    }
    .sample-text:hover {
      background: #e2e8f0;
      border-color: #cbd5e1;
    }
    .sample-text .lang {
      font-weight: 600;
      color: #1e40af;
    }
  </style>
</head>
<body>

  <h1>🌍 Langdetect</h1>
  <p class="subtitle">Language detection for the browser using n-gram frequency analysis</p>

  <!-- Mode Selector -->
  <div class="section" style="padding: 1rem 1.5rem; display: flex; align-items: center; gap: 1rem; flex-wrap: wrap;">
    <label for="mode-select" style="font-weight: 600;">Runtime:</label>
    <select id="mode-select" style="padding: 0.5rem 1rem; border-radius: 6px; border: 2px solid #e0e0e0; font-size: 1rem;">
      <option value="js">JavaScript (js_of_ocaml)</option>
      <option value="wasm">WebAssembly (wasm_of_ocaml)</option>
    </select>
    <button onclick="reloadWithMode()" style="padding: 0.5rem 1.5rem;">Reload</button>
    <span id="mode-status" style="color: #666; font-size: 0.9rem;"></span>
  </div>

  <!-- Interactive Demo (created by OCaml code, but we add sample texts) -->
  <div class="section">
    <div id="demo">
      <!-- Demo UI created by langdetect_js_tests.ml -->
      <p class="loading">Loading demo...</p>
    </div>

    <p style="margin-top: 1rem; margin-bottom: 0.5rem; color: #666; font-size: 0.9rem;">
      Click a sample to try:
    </p>
    <div class="sample-texts" id="sample-texts">
      <div class="sample-text" data-text="The quick brown fox jumps over the lazy dog.">
        <span class="lang">🇬🇧 English</span>
      </div>
      <div class="sample-text" data-text="Bonjour le monde! Comment allez-vous aujourd'hui?">
        <span class="lang">🇫🇷 French</span>
      </div>
      <div class="sample-text" data-text="Guten Tag! Wie geht es Ihnen heute?">
        <span class="lang">🇩🇪 German</span>
      </div>
      <div class="sample-text" data-text="¡Hola mundo! ¿Cómo estás hoy?">
        <span class="lang">🇪🇸 Spanish</span>
      </div>
      <div class="sample-text" data-text="你好世界!今天你好吗?">
        <span class="lang">🇨🇳 Chinese</span>
      </div>
      <div class="sample-text" data-text="こんにちは世界!今日はお元気ですか?">
        <span class="lang">🇯🇵 Japanese</span>
      </div>
      <div class="sample-text" data-text="مرحبا بالعالم! كيف حالك اليوم؟">
        <span class="lang">🇸🇦 Arabic</span>
      </div>
      <div class="sample-text" data-text="Привет мир! Как дела сегодня?">
        <span class="lang">🇷🇺 Russian</span>
      </div>
    </div>
  </div>

  <!-- Test Results -->
  <div class="section">
    <div id="test-results">
      <p class="loading">Loading tests...</p>
    </div>
  </div>

  <!-- API Documentation -->
  <div class="section">
    <h2>JavaScript API</h2>
    <div class="api-docs">
<pre><span class="comment">// Detect the most likely language</span>
<code>langdetect.detect</code>("Hello, world!")
<span class="comment">// → "en"</span>

<span class="comment">// Get detection with confidence score</span>
<code>langdetect.detectWithProb</code>("Bonjour le monde!")
<span class="comment">// → { lang: "fr", prob: 0.9999 }</span>

<span class="comment">// Get all matching languages</span>
<code>langdetect.detectAll</code>("Hello world")
<span class="comment">// → [{ lang: "en", prob: 0.85 }, { lang: "de", prob: 0.10 }, ...]</span>

<span class="comment">// List supported languages</span>
<code>langdetect.languages</code>()
<span class="comment">// → ["ar", "bg", "bn", "ca", "cs", "da", "de", "el", ...]</span></pre>
    </div>
  </div>

  <!-- Supported Languages -->
  <div class="section">
    <h2>Supported Languages (47)</h2>
    <p>
      Arabic, Bengali, Bulgarian, Catalan, Chinese (Simplified), Chinese (Traditional),
      Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek,
      Gujarati, Hebrew, Hindi, Hungarian, Indonesian, Italian, Japanese, Korean, Latvian,
      Lithuanian, Macedonian, Malayalam, Norwegian, Panjabi, Persian, Polish, Portuguese,
      Romanian, Russian, Sinhala, Slovak, Spanish, Swedish, Tagalog, Tamil, Telugu, Thai,
      Turkish, Ukrainian, Urdu, Vietnamese
    </p>
  </div>

  <script>
    // Runtime name -> bundle to load. Paths are relative so the page works
    // from wherever the assets are installed.
    const MODE_SCRIPTS = {
      js: 'langdetect-tests.js',
      wasm: 'langdetect_js_tests.bc.wasm.js'
    };
    const MODE_STORAGE_KEY = 'langdetect-mode';

    // How long to wait after window load for the demo to appear before
    // reporting a failure, and how often to check. The bundle may finish
    // initializing some time after its <script> element has loaded
    // (NOTE(review): presumably most relevant for the WASM build; tune as needed).
    const READY_TIMEOUT_MS = 10000;
    const READY_POLL_MS = 250;

    function isKnownMode(mode) {
      return Object.prototype.hasOwnProperty.call(MODE_SCRIPTS, mode);
    }

    // Storage access can throw (e.g. when storage is disabled); treat that
    // as "no remembered preference" rather than aborting the page script.
    function readStoredMode() {
      try {
        return localStorage.getItem(MODE_STORAGE_KEY);
      } catch (e) {
        return null;
      }
    }

    // Get mode from URL param or localStorage. Unknown values are ignored so
    // the selector, the status text and the loaded bundle always agree;
    // falls back to 'js'.
    function getMode() {
      const params = new URLSearchParams(window.location.search);
      const candidates = [params.get('mode'), readStoredMode()];
      return candidates.find(isKnownMode) || 'js';
    }

    // Set mode and reload (invoked from the Reload button's onclick).
    function reloadWithMode() {
      const mode = document.getElementById('mode-select').value;
      try {
        localStorage.setItem(MODE_STORAGE_KEY, mode);
      } catch (e) {
        // Preference is still carried by the URL parameter below.
      }
      const url = new URL(window.location);
      url.searchParams.set('mode', mode);
      window.location.href = url.toString();
    }

    // Initialize mode selector
    const currentMode = getMode();
    document.getElementById('mode-select').value = currentMode;

    // Load the appropriate script
    const scriptName = MODE_SCRIPTS[currentMode];
    const modeStatus = document.getElementById('mode-status');
    modeStatus.textContent = `Loading ${currentMode.toUpperCase()}...`;

    // Set once the bundle itself could not be fetched, so the readiness
    // watchdog below does not report the same failure a second time.
    let loadFailed = false;

    const script = document.createElement('script');
    script.src = scriptName;
    script.onload = function() {
      modeStatus.textContent = `Loaded: ${currentMode.toUpperCase()}`;
      modeStatus.style.color = '#16a34a';
      // Handlers look up the demo controls at activation time, so they can
      // be attached immediately even if the demo UI is not built yet.
      setupSampleTexts();
    };
    script.onerror = function() {
      loadFailed = true;
      modeStatus.textContent = `Failed to load ${scriptName}`;
      modeStatus.style.color = '#dc2626';
      showLoadError();
    };
    document.head.appendChild(script);

    // Make each sample chip fill the demo input and trigger detection,
    // by mouse click or by keyboard (Enter / Space).
    function setupSampleTexts() {
      document.querySelectorAll('.sample-text').forEach(el => {
        const activate = () => {
          const text = el.getAttribute('data-text');
          // Find the OCaml-created input and button
          const input = document.getElementById('demo-input');
          const button = document.getElementById('demo-button');
          if (input && button) {
            input.value = text;
            button.click();
          }
        };
        el.setAttribute('role', 'button');
        el.tabIndex = 0;
        el.addEventListener('click', activate);
        el.addEventListener('keydown', event => {
          if (event.key === 'Enter' || event.key === ' ') {
            event.preventDefault();
            activate();
          }
        });
      });
    }

    // Replace the demo and test placeholders with an error message.
    function showLoadError() {
      const demo = document.getElementById('demo');
      if (demo) {
        demo.innerHTML =
          `<p style="color: #dc2626;"><strong>Library not loaded</strong><br>` +
          `Could not load <code>${scriptName}</code>.</p>`;
      }
      document.getElementById('test-results').innerHTML =
        `<p style="color: #dc2626;"><strong>Tests cannot run:</strong> ${scriptName} not found.</p>`;
    }

    // The demo counts as ready once the library global exists or the
    // OCaml-created input is present; in either case the page content must
    // not be overwritten with an error.
    function demoIsReady() {
      return typeof langdetect !== 'undefined' ||
        document.getElementById('demo-input') !== null;
    }

    // Watchdog: after the page has loaded, poll until the demo is ready and
    // report a failure only if it never shows up within the timeout.
    window.addEventListener('load', () => {
      const deadline = Date.now() + READY_TIMEOUT_MS;
      const timer = setInterval(() => {
        if (loadFailed || demoIsReady()) {
          clearInterval(timer);
        } else if (Date.now() >= deadline) {
          clearInterval(timer);
          showLoadError();
        }
      }, READY_POLL_MS);
    });
  </script>

</body>
</html>