Detect from OCaml which human language a document is written in; derived from the Nu Html validator
languages unicode ocaml

odoc

+120 -18
+24 -11
lib/js/dune
··· 5 5 (name langdetect_js) 6 6 (public_name langdetect-js) 7 7 (libraries langdetect brr) 8 - (modes byte) ; js_of_ocaml requires bytecode 8 + (modes byte) ; js_of_ocaml requires bytecode 9 9 (modules langdetect_js)) 10 10 11 11 ; Standalone JavaScript file for direct browser use 12 + 12 13 (executable 13 14 (name langdetect_js_main) 14 15 (libraries langdetect_js) 15 - (js_of_ocaml 16 - (javascript_files)) 16 + (js_of_ocaml (javascript_files)) 17 17 (modes js wasm) 18 18 (modules langdetect_js_main)) 19 19 20 20 ; Browser-based test runner 21 + 21 22 (executable 22 23 (name langdetect_js_tests) 23 24 (libraries langdetect_js) 24 - (js_of_ocaml 25 - (javascript_files)) 25 + (js_of_ocaml (javascript_files)) 26 26 (modes js wasm) 27 27 (modules langdetect_js_tests)) 28 28 29 29 ; Copy to nice filenames (JS) 30 + 30 31 (rule 31 32 (targets langdetect.js) 32 33 (deps langdetect_js_main.bc.js) 33 - (action (copy %{deps} %{targets}))) 34 + (action 35 + (copy %{deps} %{targets}))) 34 36 35 37 (rule 36 38 (targets langdetect-tests.js) 37 39 (deps langdetect_js_tests.bc.js) 38 - (action (copy %{deps} %{targets}))) 40 + (action 41 + (copy %{deps} %{targets}))) 39 42 40 43 ; Copy to nice filenames (WASM) 41 44 ; Note: requires wasm_of_ocaml-compiler to be installed 45 + 42 46 (rule 43 47 (targets langdetect.wasm.js) 44 48 (deps langdetect_js_main.bc.wasm.js) 45 - (action (copy %{deps} %{targets}))) 49 + (action 50 + (copy %{deps} %{targets}))) 46 51 47 52 (rule 48 53 (targets langdetect-tests.wasm.js) 49 54 (deps langdetect_js_tests.bc.wasm.js) 50 - (action (copy %{deps} %{targets}))) 55 + (action 56 + (copy %{deps} %{targets}))) 51 57 52 58 ; Install web assets to share/langdetect-js/ 53 59 ; Includes HTML demo, JS files, WASM loaders, and WASM assets with source maps 60 + 54 61 (install 55 62 (package langdetect-js) 56 63 (section share) ··· 63 70 langdetect_js_main.bc.wasm.js 64 71 langdetect_js_tests.bc.wasm.js 65 72 ; WASM assets - must be in langdetect-js/ so relative 
paths from loaders work 66 - (glob_files_rec (langdetect_js_main.bc.wasm.assets/* with_prefix langdetect_js_main.bc.wasm.assets)) 67 - (glob_files_rec (langdetect_js_tests.bc.wasm.assets/* with_prefix langdetect_js_tests.bc.wasm.assets)))) 73 + (glob_files_rec 74 + (langdetect_js_main.bc.wasm.assets/* 75 + with_prefix 76 + langdetect_js_main.bc.wasm.assets)) 77 + (glob_files_rec 78 + (langdetect_js_tests.bc.wasm.assets/* 79 + with_prefix 80 + langdetect_js_tests.bc.wasm.assets))))
+96 -7
lib/langdetect.mli
··· 9 9 This is an OCaml port of the Cybozu langdetect algorithm. Detects the 10 10 natural language of text using n-gram frequency profiles. Supports 49 11 11 languages including English, Chinese, Japanese, Arabic, and many European 12 - languages. *) 12 + languages. 13 + 14 + {1 Quick Start} 15 + 16 + {[ 17 + (* Create a detector with built-in language profiles *) 18 + let detector = Langdetect.create_default () in 19 + 20 + (* Detect the language of some text *) 21 + let results = Langdetect.detect detector "Hello, how are you today?" in 22 + List.iter (fun r -> 23 + Printf.printf "%s: %.2f%%\n" r.lang (r.prob *. 100.0) 24 + ) results; 25 + (* Output: en: 99.99% *) 26 + 27 + (* Get just the best match *) 28 + match Langdetect.detect_best detector "Bonjour, comment allez-vous?" with 29 + | Some lang -> Printf.printf "Detected: %s\n" lang (* fr *) 30 + | None -> Printf.printf "Could not detect language\n" 31 + ]} 32 + 33 + {1 Algorithm Overview} 34 + 35 + The detection algorithm uses n-gram frequency analysis: 36 + 37 + {ol 38 + {- Extract character n-grams (1 to 3 characters) from the input text} 39 + {- Compare n-gram frequencies against pre-computed language profiles} 40 + {- Use a randomized trial approach to handle ambiguous text} 41 + {- Return probabilities for each candidate language}} 42 + 43 + The algorithm is based on the Cybozu langdetect library, originally 44 + developed by Shuyo Nakatani. The n-gram profiles were trained on 45 + Wikipedia text corpora. 
46 + 47 + {1 Supported Languages} 48 + 49 + The built-in profiles support 49 languages with ISO 639-1 codes: 50 + 51 + {ul 52 + {- {b European}: af, bg, cs, da, de, el, en, es, et, fi, fr, hr, hu, it, lt, 53 + lv, nl, no, pl, pt, ro, ru, sk, sl, sq, sv, tr, uk} 54 + {- {b Asian}: ar, bn, fa, gu, he, hi, id, ja, kn, ko, ml, mr, ne, pa, ta, 55 + te, th, vi, zh-cn, zh-tw} 56 + {- {b Other}: sw, tl}} 57 + 58 + {1 Performance Considerations} 59 + 60 + {ul 61 + {- Text length: Longer text (100+ characters) yields more accurate results} 62 + {- Short text: May produce ambiguous or incorrect results} 63 + {- Mixed language: Returns the dominant language} 64 + {- Similar languages: May confuse closely related languages (e.g., no/da, es/pt)}} 65 + 66 + The detector processes up to [max_text_length] characters (default: 10000) 67 + for performance. Increase this for more accuracy on long documents. 68 + 69 + {1 Reproducibility} 70 + 71 + Detection uses random sampling internally. For reproducible results: 72 + {[ 73 + let detector = Langdetect.create_default () in 74 + Langdetect.set_random_seed detector 42; 75 + (* Now results are deterministic *) 76 + ]} 77 + 78 + {1 References} 79 + 80 + {ul 81 + {- {{:https://github.com/shuyo/language-detection}Cybozu langdetect} - Original Java implementation} 82 + {- {{:https://www.aclweb.org/anthology/C10-1096/}N-gram Language Detection} - Background on n-gram approach}} *) 13 83 14 84 (** {1 Types} *) 15 85 ··· 20 90 (** Language detection result. *) 21 91 22 92 type config = { 23 - alpha : float; (** Smoothing parameter (default 0.5) *) 24 - n_trial : int; (** Number of random trials (default 7) *) 25 - max_text_length : int; (** Maximum text length to process *) 26 - conv_threshold : float; (** Convergence threshold *) 27 - prob_threshold : float; (** Minimum probability to report *) 93 + alpha : float; 94 + (** Smoothing parameter for probability estimation (default: 0.5). 
95 + Higher values make the algorithm less sensitive to rare n-grams. *) 96 + n_trial : int; 97 + (** Number of random trials to run (default: 7). 98 + More trials improve accuracy but increase processing time. *) 99 + max_text_length : int; 100 + (** Maximum text length to process (default: 10000). 101 + Text beyond this limit is ignored. Increase for long documents. *) 102 + conv_threshold : float; 103 + (** Convergence threshold for early termination (default: 0.99999). 104 + Trials stop early when confidence exceeds this value. *) 105 + prob_threshold : float; 106 + (** Minimum probability to include in results (default: 0.1). 107 + Languages below this threshold are filtered from {!detect} output. *) 28 108 } 29 - (** Detection parameters. *) 109 + (** Detection parameters for tuning accuracy and performance. 110 + 111 + Use {!default_config} for standard settings, or customize for specific needs: 112 + {[ 113 + let config = { Langdetect.default_config with 114 + n_trial = 10; (* More trials for better accuracy *) 115 + prob_threshold = 0.2 (* Only report high-confidence results *) 116 + } in 117 + let detector = Langdetect.create_default ~config () 118 + ]} *) 30 119 31 120 val default_config : config 32 121 (** Default configuration values. *)