Detect which human language a document uses, in OCaml — ported from the Nu Html validator
languages unicode ocaml

metadata

+413 -227
+17 -1
.gitignore
··· 1 - _build
··· 1 + # OCaml build artifacts 2 + _build/ 3 + *.install 4 + *.merlin 5 + 6 + # Third-party sources (fetch locally with opam source) 7 + third_party/ 8 + 9 + # Editor and OS files 10 + .DS_Store 11 + *.swp 12 + *~ 13 + .vscode/ 14 + .idea/ 15 + 16 + # Opam local switch 17 + _opam/
+1
.ocamlformat
···
··· 1 + version=0.28.1
+53
.tangled/workflows/build.yml
···
··· 1 + when: 2 + - event: ["push", "pull_request"] 3 + branch: ["main"] 4 + 5 + engine: nixery 6 + 7 + dependencies: 8 + nixpkgs: 9 + - shell 10 + - stdenv 11 + - findutils 12 + - binutils 13 + - libunwind 14 + - ncurses 15 + - opam 16 + - git 17 + - gawk 18 + - gnupatch 19 + - gnum4 20 + - gnumake 21 + - gnutar 22 + - gnused 23 + - gnugrep 24 + - diffutils 25 + - gzip 26 + - bzip2 27 + - gcc 28 + - ocaml 29 + - pkg-config 30 + 31 + steps: 32 + - name: opam 33 + command: | 34 + opam init --disable-sandboxing -a -y 35 + - name: repo 36 + command: | 37 + opam repo add aoah https://tangled.org/anil.recoil.org/aoah-opam-repo.git 38 + - name: switch 39 + command: | 40 + opam install . --confirm-level=unsafe-yes --deps-only 41 + - name: build 42 + command: | 43 + opam exec -- dune build 44 + - name: switch-test 45 + command: | 46 + opam install . --confirm-level=unsafe-yes --deps-only --with-test 47 + - name: test 48 + command: | 49 + opam exec -- dune runtest --verbose 50 + - name: doc 51 + command: | 52 + opam install -y odoc 53 + opam exec -- dune build @doc
+62
README.md
···
··· 1 + # langdetect 2 + 3 + Language detection library for OCaml using n-gram frequency analysis. 4 + 5 + This is an OCaml port of the [Cybozu 6 + langdetect](https://github.com/shuyo/language-detection) algorithm. It detects 7 + the natural language of text using n-gram frequency profiles. It was ported 8 + from <https://github.com/validator/validator>. 9 + 10 + ## Features 11 + 12 + - Detects 49 languages including English, Chinese, Japanese, Arabic, and many European languages 13 + - Fast probabilistic detection using n-gram frequency analysis 14 + - Configurable detection parameters (smoothing, convergence thresholds) 15 + - Reproducible results with optional random seed control 16 + - Pure OCaml implementation with minimal dependencies 17 + 18 + ## Installation 19 + 20 + ```bash 21 + opam install langdetect 22 + ``` 23 + 24 + ## Usage 25 + 26 + ```ocaml 27 + (* Create a detector with all built-in profiles *) 28 + let detector = Langdetect.create_default () 29 + 30 + (* Detect the best matching language *) 31 + let () = 32 + match Langdetect.detect_best detector "Hello, world!" 
with 33 + | Some lang -> Printf.printf "Detected: %s\n" lang 34 + | None -> print_endline "Could not detect language" 35 + 36 + (* Get all possible languages with probabilities *) 37 + let () = 38 + let results = Langdetect.detect detector "Bonjour le monde" in 39 + List.iter (fun r -> 40 + Printf.printf "%s: %.2f\n" r.Langdetect.lang r.Langdetect.prob 41 + ) results 42 + 43 + (* Use custom configuration *) 44 + let config = { Langdetect.default_config with prob_threshold = 0.3 } 45 + let detector = Langdetect.create_default ~config () 46 + ``` 47 + 48 + ## Supported Languages 49 + 50 + Arabic, Bengali, Bulgarian, Catalan, Croatian, Czech, Danish, Dutch, English, 51 + Estonian, Farsi, Finnish, French, German, Greek, Gujarati, Hebrew, Hindi, 52 + Hungarian, Indonesian, Italian, Japanese, Korean, Latvian, Lithuanian, 53 + Macedonian, Malayalam, Norwegian, Panjabi, Polish, Portuguese, Romanian, 54 + Russian, Sinhalese, Albanian, Spanish, Swedish, Tamil, Telugu, Thai, Tagalog, 55 + Turkish, Ukrainian, Urdu, Vietnamese, Chinese (Simplified), Chinese 56 + (Traditional). 57 + 58 + ## License 59 + 60 + MIT License - see LICENSE file for details. 61 + 62 + Based on the Cybozu langdetect algorithm. Copyright (c) 2007-2016 Mozilla Foundation and 2025 Anil Madhavapeddy.
+5 -4
dune-project
··· 5 (generate_opam_files true) 6 7 (license MIT) 8 - (authors "Anil Madhavapeddy <anil@recoil.org>") 9 - (homepage "https://github.com/avsm/ocaml-langdetect") 10 (maintainers "Anil Madhavapeddy <anil@recoil.org>") 11 - (bug_reports "https://github.com/avsm/ocaml-langdetect/issues") 12 (maintenance_intent "(latest)") 13 14 (package ··· 21 (depends 22 (ocaml (>= 5.1.0)) 23 (uutf (>= 1.0.0)) 24 - (alcotest :with-test)))
··· 5 (generate_opam_files true) 6 7 (license MIT) 8 + (authors "Anil Madhavapeddy") 9 + (homepage "https://tangled.org/@anil.recoil.org/ocaml-langdetect") 10 (maintainers "Anil Madhavapeddy <anil@recoil.org>") 11 + (bug_reports "https://tangled.org/@anil.recoil.org/ocaml-langdetect/issues") 12 (maintenance_intent "(latest)") 13 14 (package ··· 21 (depends 22 (ocaml (>= 5.1.0)) 23 (uutf (>= 1.0.0)) 24 + (odoc :with-doc) 25 + (alcotest (and :with-test (>= 1.7.0)))))
+1
gen/dune
··· 1 ; Profile generator executable - only used during build 2 (executable 3 (name gen_profiles) 4 (modules gen_profiles)
··· 1 ; Profile generator executable - only used during build 2 + 3 (executable 4 (name gen_profiles) 5 (modules gen_profiles)
+12 -5
gen/gen_profiles.ml
··· 1 - (* Profile generator - converts JSON language profiles to OCaml module *) 2 3 let read_file path = 4 let ic = open_in path in 5 - let n = in_channel_length ic in 6 - let s = really_input_string ic n in 7 - close_in ic; 8 - s 9 10 (* Simple JSON parser for profile format {"freq": {...}} *) 11 let parse_freq_json content =
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2007-2016 Mozilla Foundation 3 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 + SPDX-License-Identifier: MIT 5 + ---------------------------------------------------------------------------*) 6 + 7 + (* Profile generator - converts JSON language profiles to OCaml modules *) 8 9 let read_file path = 10 let ic = open_in path in 11 + Fun.protect 12 + ~finally:(fun () -> close_in ic) 13 + (fun () -> 14 + let n = in_channel_length ic in 15 + really_input_string ic n) 16 17 (* Simple JSON parser for profile format {"freq": {...}} *) 18 let parse_freq_json content =
+4 -4
langdetect.opam
··· 4 description: 5 "An OCaml port of the Cybozu langdetect algorithm. Detects the natural language of text using n-gram frequency profiles. Supports 49 languages including English, Chinese, Japanese, Arabic, and many European languages." 6 maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 7 - authors: ["Anil Madhavapeddy <anil@recoil.org>"] 8 license: "MIT" 9 - homepage: "https://github.com/avsm/ocaml-langdetect" 10 - bug-reports: "https://github.com/avsm/ocaml-langdetect/issues" 11 depends: [ 12 "dune" {>= "3.20"} 13 "ocaml" {>= "5.1.0"} 14 "uutf" {>= "1.0.0"} 15 - "alcotest" {with-test} 16 "odoc" {with-doc} 17 ] 18 build: [ 19 ["dune" "subst"] {dev}
··· 4 description: 5 "An OCaml port of the Cybozu langdetect algorithm. Detects the natural language of text using n-gram frequency profiles. Supports 49 languages including English, Chinese, Japanese, Arabic, and many European languages." 6 maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 7 + authors: ["Anil Madhavapeddy"] 8 license: "MIT" 9 + homepage: "https://tangled.org/@anil.recoil.org/ocaml-langdetect" 10 + bug-reports: "https://tangled.org/@anil.recoil.org/ocaml-langdetect/issues" 11 depends: [ 12 "dune" {>= "3.20"} 13 "ocaml" {>= "5.1.0"} 14 "uutf" {>= "1.0.0"} 15 "odoc" {with-doc} 16 + "alcotest" {with-test & >= "1.7.0"} 17 ] 18 build: [ 19 ["dune" "subst"] {dev}
+100 -25
lib/dune
··· 1 ; Rule to generate all profile modules from JSON data files 2 (rule 3 (targets 4 - profile_ar.ml profile_bg.ml profile_bn.ml profile_ca.ml profile_cs.ml 5 - profile_da.ml profile_de.ml profile_el.ml profile_en.ml profile_es.ml 6 - profile_et.ml profile_fa.ml profile_fi.ml profile_fr.ml profile_gu.ml 7 - profile_he.ml profile_hi.ml profile_hr.ml profile_hu.ml profile_id.ml 8 - profile_it.ml profile_ja.ml profile_ko.ml profile_lt.ml profile_lv.ml 9 - profile_mk.ml profile_ml.ml profile_nl.ml profile_no.ml profile_pa.ml 10 - profile_pl.ml profile_pt.ml profile_ro.ml profile_ru.ml profile_si.ml 11 - profile_sq.ml profile_sv.ml profile_ta.ml profile_te.ml profile_th.ml 12 - profile_tl.ml profile_tr.ml profile_uk.ml profile_ur.ml profile_vi.ml 13 - profile_zh_cn.ml profile_zh_tw.ml 14 - profiles.ml) 15 (deps 16 - (glob_files profiles.sm/*)) 17 (action 18 - (run %{exe:../gen/gen_profiles.exe} profiles.sm .))) 19 20 (library 21 (name langdetect) 22 (public_name langdetect) 23 (libraries uutf) 24 (modules 25 - langdetect 26 - profiles 27 - profile_ar profile_bg profile_bn profile_ca profile_cs 28 - profile_da profile_de profile_el profile_en profile_es 29 - profile_et profile_fa profile_fi profile_fr profile_gu 30 - profile_he profile_hi profile_hr profile_hu profile_id 31 - profile_it profile_ja profile_ko profile_lt profile_lv 32 - profile_mk profile_ml profile_nl profile_no profile_pa 33 - profile_pl profile_pt profile_ro profile_ru profile_si 34 - profile_sq profile_sv profile_ta profile_te profile_th 35 - profile_tl profile_tr profile_uk profile_ur profile_vi 36 - profile_zh_cn profile_zh_tw))
··· 1 ; Rule to generate all profile modules from JSON data files 2 + 3 (rule 4 (targets 5 + profile_ar.ml 6 + profile_bg.ml 7 + profile_bn.ml 8 + profile_ca.ml 9 + profile_cs.ml 10 + profile_da.ml 11 + profile_de.ml 12 + profile_el.ml 13 + profile_en.ml 14 + profile_es.ml 15 + profile_et.ml 16 + profile_fa.ml 17 + profile_fi.ml 18 + profile_fr.ml 19 + profile_gu.ml 20 + profile_he.ml 21 + profile_hi.ml 22 + profile_hr.ml 23 + profile_hu.ml 24 + profile_id.ml 25 + profile_it.ml 26 + profile_ja.ml 27 + profile_ko.ml 28 + profile_lt.ml 29 + profile_lv.ml 30 + profile_mk.ml 31 + profile_ml.ml 32 + profile_nl.ml 33 + profile_no.ml 34 + profile_pa.ml 35 + profile_pl.ml 36 + profile_pt.ml 37 + profile_ro.ml 38 + profile_ru.ml 39 + profile_si.ml 40 + profile_sq.ml 41 + profile_sv.ml 42 + profile_ta.ml 43 + profile_te.ml 44 + profile_th.ml 45 + profile_tl.ml 46 + profile_tr.ml 47 + profile_uk.ml 48 + profile_ur.ml 49 + profile_vi.ml 50 + profile_zh_cn.ml 51 + profile_zh_tw.ml 52 + profiles.ml) 53 (deps 54 + (glob_files profiles.sm/*)) 55 (action 56 + (run %{exe:../gen/gen_profiles.exe} profiles.sm .))) 57 58 (library 59 (name langdetect) 60 (public_name langdetect) 61 (libraries uutf) 62 (modules 63 + langdetect 64 + profiles 65 + profile_ar 66 + profile_bg 67 + profile_bn 68 + profile_ca 69 + profile_cs 70 + profile_da 71 + profile_de 72 + profile_el 73 + profile_en 74 + profile_es 75 + profile_et 76 + profile_fa 77 + profile_fi 78 + profile_fr 79 + profile_gu 80 + profile_he 81 + profile_hi 82 + profile_hr 83 + profile_hu 84 + profile_id 85 + profile_it 86 + profile_ja 87 + profile_ko 88 + profile_lt 89 + profile_lv 90 + profile_mk 91 + profile_ml 92 + profile_nl 93 + profile_no 94 + profile_pa 95 + profile_pl 96 + profile_pt 97 + profile_ro 98 + profile_ru 99 + profile_si 100 + profile_sq 101 + profile_sv 102 + profile_ta 103 + profile_te 104 + profile_th 105 + profile_tl 106 + profile_tr 107 + profile_uk 108 + profile_ur 109 + profile_vi 110 + profile_zh_cn 111 + 
profile_zh_tw))
+106 -159
lib/langdetect.ml
··· 1 (** Language detection library based on n-gram frequency analysis. 2 3 This is an OCaml port of the Cybozu langdetect algorithm. *) 4 5 - module StringMap = Map.Make(String) 6 7 - (** Language detection result *) 8 type result = { 9 - lang: string; 10 - prob: float; 11 } 12 13 - (** Detection parameters *) 14 type config = { 15 - alpha: float; (** Smoothing parameter (default 0.5) *) 16 - n_trial: int; (** Number of random trials (default 7) *) 17 - max_text_length: int; (** Maximum text length to process *) 18 - conv_threshold: float; (** Convergence threshold *) 19 - prob_threshold: float; (** Minimum probability to report *) 20 } 21 22 - let default_config = { 23 - alpha = 0.5; 24 - n_trial = 7; 25 - max_text_length = 10000; 26 - conv_threshold = 0.99999; 27 - prob_threshold = 0.1; 28 - } 29 30 - (** N-gram extraction parameters *) 31 let n_gram_max = 3 32 let base_freq = 10000 33 let iteration_limit = 1000 34 let alpha_width = 0.05 35 36 - (** Detector state *) 37 type t = { 38 - config: config; 39 - (* Map from n-gram -> array of probabilities per language *) 40 - word_lang_prob: float array StringMap.t; 41 - (* List of language codes *) 42 - lang_list: string array; 43 - (* Random seed for reproducibility *) 44 - mutable seed: int option; 45 } 46 47 - (** Normalize a Unicode code point for n-gram extraction *) 48 let normalize_uchar uchar = 49 let code = Uchar.to_int uchar in 50 - (* Basic Latin: keep only letters *) 51 - if code < 128 then begin 52 let c = Char.chr code in 53 - if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') then 54 - Some (String.make 1 c) 55 - else 56 - None (* Treat as space/separator *) 57 - end 58 - else begin 59 - (* Keep non-ASCII characters as-is *) 60 let buf = Buffer.create 4 in 61 Buffer.add_utf_8_uchar buf uchar; 62 Some (Buffer.contents buf) 63 - end 64 65 - (** Extract n-grams from UTF-8 text. 66 - N-grams are sequences of 1-3 Unicode characters. 
*) 67 - let extract_ngrams ?(max_len=10000) text word_lang_prob = 68 let ngrams = ref [] in 69 - (* Buffer stores up to 3 most recent character strings *) 70 let char_buffer = Array.make n_gram_max "" in 71 let char_count = ref 0 in 72 let processed = ref 0 in 73 - 74 - (* Process each UTF-8 character *) 75 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 76 let rec process () = 77 if !processed >= max_len then () 78 - else match Uutf.decode decoder with 79 - | `Await -> () (* String source never awaits *) 80 - | `End -> () 81 - | `Malformed _ -> process () (* Skip malformed sequences *) 82 - | `Uchar uchar -> 83 - incr processed; 84 - match normalize_uchar uchar with 85 - | None -> 86 - (* Separator - reset buffer *) 87 - char_buffer.(0) <- ""; 88 - char_buffer.(1) <- ""; 89 - char_buffer.(2) <- ""; 90 - char_count := 0; 91 - process () 92 - | Some char_str -> 93 - (* Shift buffer left and add new char *) 94 - char_buffer.(0) <- char_buffer.(1); 95 - char_buffer.(1) <- char_buffer.(2); 96 - char_buffer.(2) <- char_str; 97 - incr char_count; 98 - 99 - (* Extract 1, 2, 3 grams based on how many chars we have *) 100 - let available = min !char_count n_gram_max in 101 - for n = 1 to available do 102 - let ngram = 103 let start_idx = n_gram_max - n in 104 let parts = ref [] in 105 for i = start_idx to n_gram_max - 1 do 106 parts := char_buffer.(i) :: !parts 107 done; 108 - String.concat "" (List.rev !parts) 109 - in 110 - if StringMap.mem ngram word_lang_prob then 111 - ngrams := ngram :: !ngrams 112 - done; 113 - process () 114 in 115 process (); 116 Array.of_list (List.rev !ngrams) 117 118 - (** Initialize uniform probability distribution *) 119 - let init_prob n_langs = 120 - let prob = Array.make n_langs (1.0 /. 
float_of_int n_langs) in 121 - prob 122 123 - (** Update language probabilities with an n-gram *) 124 let update_lang_prob prob ngram word_lang_prob alpha = 125 match StringMap.find_opt ngram word_lang_prob with 126 | None -> false ··· 131 done; 132 true 133 134 - (** Normalize probabilities and return max *) 135 let normalize_prob prob = 136 - let sum = Array.fold_left (+.) 0.0 prob in 137 if sum <= 0.0 then 0.0 138 - else begin 139 let max_p = ref 0.0 in 140 for i = 0 to Array.length prob - 1 do 141 prob.(i) <- prob.(i) /. sum; 142 if prob.(i) > !max_p then max_p := prob.(i) 143 done; 144 !max_p 145 - end 146 147 - (** Simple pseudo-random number generator *) 148 let random_state = ref 12345 149 - 150 - let set_seed seed = 151 - random_state := seed 152 153 let next_random () = 154 - random_state := (!random_state * 1103515245 + 12345) land 0x7FFFFFFF; 155 !random_state 156 157 - let random_int bound = 158 - (next_random ()) mod bound 159 160 let random_gaussian () = 161 - (* Box-Muller transform approximation *) 162 - let u1 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in 163 - let u2 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in 164 - let u1 = max 0.0001 u1 in (* Avoid log(0) *) 165 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2) 166 167 - (** Run detection on extracted n-grams *) 168 let detect_block t ngrams = 169 let n_langs = Array.length t.lang_list in 170 if n_langs = 0 || Array.length ngrams = 0 then [||] 171 - else begin 172 let lang_prob = Array.make n_langs 0.0 in 173 - 174 - (* Set seed if specified, otherwise use a deterministic default *) 175 - (match t.seed with 176 - | Some s -> set_seed s 177 - | None -> set_seed 12345); 178 - 179 for _ = 0 to t.config.n_trial - 1 do 180 let prob = init_prob n_langs in 181 - let alpha = t.config.alpha +. random_gaussian () *. 
alpha_width in 182 - 183 let converged = ref false in 184 let i = ref 0 in 185 - while not !converged && !i < iteration_limit do 186 let r = random_int (Array.length ngrams) in 187 - let _ = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in 188 - if !i mod 5 = 0 then begin 189 let max_p = normalize_prob prob in 190 - if max_p > t.config.conv_threshold then converged := true 191 - end; 192 incr i 193 done; 194 - 195 - (* Accumulate probabilities *) 196 for j = 0 to n_langs - 1 do 197 - lang_prob.(j) <- lang_prob.(j) +. prob.(j) /. float_of_int t.config.n_trial 198 done 199 done; 200 - 201 lang_prob 202 - end 203 204 - (** Create detector from profiles *) 205 - let create ?(config=default_config) profiles = 206 let lang_list = Array.of_list (List.map fst profiles) in 207 let n_langs = Array.length lang_list in 208 - 209 - (* Build word -> lang prob map *) 210 - (* First, collect all unique n-grams and their frequencies per language *) 211 let all_ngrams = Hashtbl.create 65536 in 212 let lang_totals = Array.make n_langs 0 in 213 - 214 - List.iteri (fun lang_idx (_, freq_list) -> 215 - List.iter (fun (ngram, count) -> 216 - let current = 217 - match Hashtbl.find_opt all_ngrams ngram with 218 - | Some arr -> arr 219 - | None -> 220 - let arr = Array.make n_langs 0 in 221 - Hashtbl.add all_ngrams ngram arr; 222 - arr 223 - in 224 - current.(lang_idx) <- count; 225 - lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count 226 - ) freq_list 227 - ) profiles; 228 - 229 - (* Convert to probability map *) 230 let word_lang_prob = 231 - Hashtbl.fold (fun ngram counts acc -> 232 - (* Compute probability for each language *) 233 - let probs = Array.make n_langs 0.0 in 234 - for i = 0 to n_langs - 1 do 235 - if lang_totals.(i) > 0 then 236 - probs.(i) <- float_of_int counts.(i) /. 
float_of_int lang_totals.(i) 237 - done; 238 - StringMap.add ngram probs acc 239 - ) all_ngrams StringMap.empty 240 in 241 - 242 { config; word_lang_prob; lang_list; seed = None } 243 244 - (** Set random seed for reproducibility *) 245 - let set_random_seed t seed = 246 - t.seed <- Some seed 247 248 - (** Detect language of text *) 249 let detect t text = 250 - let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in 251 if Array.length ngrams = 0 then [] 252 - else begin 253 let probs = detect_block t ngrams in 254 - (* Sort by probability descending *) 255 let results = ref [] in 256 for i = 0 to Array.length probs - 1 do 257 if probs.(i) > t.config.prob_threshold then 258 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results 259 done; 260 List.sort (fun a b -> compare b.prob a.prob) !results 261 - end 262 263 - (** Get best language or None *) 264 let detect_best t text = 265 match detect t text with 266 | [] -> None 267 | best :: _ -> Some best.lang 268 269 - (** Get best language with probability *) 270 let detect_with_prob t text = 271 match detect t text with 272 | [] -> None 273 | best :: _ -> Some (best.lang, best.prob) 274 275 - (** Create a detector with all built-in profiles *) 276 - let create_default ?config () = 277 - create ?config Profiles.all_profiles
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2007-2016 Mozilla Foundation 3 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 + SPDX-License-Identifier: MIT 5 + ---------------------------------------------------------------------------*) 6 + 7 (** Language detection library based on n-gram frequency analysis. 8 9 This is an OCaml port of the Cybozu langdetect algorithm. *) 10 11 + module StringMap = Map.Make (String) 12 13 type result = { 14 + lang : string; 15 + prob : float; 16 } 17 18 type config = { 19 + alpha : float; 20 + n_trial : int; 21 + max_text_length : int; 22 + conv_threshold : float; 23 + prob_threshold : float; 24 } 25 26 + let default_config = 27 + { 28 + alpha = 0.5; 29 + n_trial = 7; 30 + max_text_length = 10000; 31 + conv_threshold = 0.99999; 32 + prob_threshold = 0.1; 33 + } 34 35 let n_gram_max = 3 36 let base_freq = 10000 37 let iteration_limit = 1000 38 let alpha_width = 0.05 39 40 type t = { 41 + config : config; 42 + word_lang_prob : float array StringMap.t; 43 + lang_list : string array; 44 + mutable seed : int option; 45 } 46 47 let normalize_uchar uchar = 48 let code = Uchar.to_int uchar in 49 + if code < 128 then ( 50 let c = Char.chr code in 51 + match c with 52 + | 'A' .. 'Z' | 'a' .. 
'z' -> Some (String.make 1 c) 53 + | _ -> None) 54 + else 55 + let buf = Buffer.create 4 in 56 + Buffer.add_utf_8_uchar buf uchar; 57 + Some (Buffer.contents buf) 58 59 + let extract_ngrams ?(max_len = 10000) text word_lang_prob = 60 let ngrams = ref [] in 61 let char_buffer = Array.make n_gram_max "" in 62 let char_count = ref 0 in 63 let processed = ref 0 in 64 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 65 let rec process () = 66 if !processed >= max_len then () 67 + else 68 + match Uutf.decode decoder with 69 + | `Await | `End -> () 70 + | `Malformed _ -> process () 71 + | `Uchar uchar -> ( 72 + incr processed; 73 + match normalize_uchar uchar with 74 + | None -> 75 + char_buffer.(0) <- ""; 76 + char_buffer.(1) <- ""; 77 + char_buffer.(2) <- ""; 78 + char_count := 0; 79 + process () 80 + | Some char_str -> 81 + char_buffer.(0) <- char_buffer.(1); 82 + char_buffer.(1) <- char_buffer.(2); 83 + char_buffer.(2) <- char_str; 84 + incr char_count; 85 + let available = min !char_count n_gram_max in 86 + for n = 1 to available do 87 let start_idx = n_gram_max - n in 88 let parts = ref [] in 89 for i = start_idx to n_gram_max - 1 do 90 parts := char_buffer.(i) :: !parts 91 done; 92 + let ngram = String.concat "" (List.rev !parts) in 93 + if StringMap.mem ngram word_lang_prob then 94 + ngrams := ngram :: !ngrams 95 + done; 96 + process ()) 97 in 98 process (); 99 Array.of_list (List.rev !ngrams) 100 101 + let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs) 102 103 let update_lang_prob prob ngram word_lang_prob alpha = 104 match StringMap.find_opt ngram word_lang_prob with 105 | None -> false ··· 110 done; 111 true 112 113 let normalize_prob prob = 114 + let sum = Array.fold_left ( +. ) 0.0 prob in 115 if sum <= 0.0 then 0.0 116 + else 117 let max_p = ref 0.0 in 118 for i = 0 to Array.length prob - 1 do 119 prob.(i) <- prob.(i) /. 
sum; 120 if prob.(i) > !max_p then max_p := prob.(i) 121 done; 122 !max_p 123 124 let random_state = ref 12345 125 + let set_seed seed = random_state := seed 126 127 let next_random () = 128 + random_state := ((!random_state * 1103515245) + 12345) land 0x7FFFFFFF; 129 !random_state 130 131 + let random_int bound = next_random () mod bound 132 133 let random_gaussian () = 134 + let u1 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in 135 + let u2 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in 136 + let u1 = max 0.0001 u1 in 137 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2) 138 139 let detect_block t ngrams = 140 let n_langs = Array.length t.lang_list in 141 if n_langs = 0 || Array.length ngrams = 0 then [||] 142 + else 143 let lang_prob = Array.make n_langs 0.0 in 144 + set_seed (Option.value t.seed ~default:12345); 145 for _ = 0 to t.config.n_trial - 1 do 146 let prob = init_prob n_langs in 147 + let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in 148 let converged = ref false in 149 let i = ref 0 in 150 + while (not !converged) && !i < iteration_limit do 151 let r = random_int (Array.length ngrams) in 152 + let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in 153 + if !i mod 5 = 0 then ( 154 let max_p = normalize_prob prob in 155 + if max_p > t.config.conv_threshold then converged := true); 156 incr i 157 done; 158 for j = 0 to n_langs - 1 do 159 + lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. 
float_of_int t.config.n_trial) 160 done 161 done; 162 lang_prob 163 164 + let create ?(config = default_config) profiles = 165 let lang_list = Array.of_list (List.map fst profiles) in 166 let n_langs = Array.length lang_list in 167 let all_ngrams = Hashtbl.create 65536 in 168 let lang_totals = Array.make n_langs 0 in 169 + List.iteri 170 + (fun lang_idx (_, freq_list) -> 171 + List.iter 172 + (fun (ngram, count) -> 173 + let current = 174 + match Hashtbl.find_opt all_ngrams ngram with 175 + | Some arr -> arr 176 + | None -> 177 + let arr = Array.make n_langs 0 in 178 + Hashtbl.add all_ngrams ngram arr; 179 + arr 180 + in 181 + current.(lang_idx) <- count; 182 + lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count) 183 + freq_list) 184 + profiles; 185 let word_lang_prob = 186 + Hashtbl.fold 187 + (fun ngram counts acc -> 188 + let probs = Array.make n_langs 0.0 in 189 + for i = 0 to n_langs - 1 do 190 + if lang_totals.(i) > 0 then 191 + probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i) 192 + done; 193 + StringMap.add ngram probs acc) 194 + all_ngrams StringMap.empty 195 in 196 { config; word_lang_prob; lang_list; seed = None } 197 198 + let set_random_seed t seed = t.seed <- Some seed 199 200 let detect t text = 201 + let ngrams = 202 + extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob 203 + in 204 if Array.length ngrams = 0 then [] 205 + else 206 let probs = detect_block t ngrams in 207 let results = ref [] in 208 for i = 0 to Array.length probs - 1 do 209 if probs.(i) > t.config.prob_threshold then 210 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results 211 done; 212 List.sort (fun a b -> compare b.prob a.prob) !results 213 214 let detect_best t text = 215 match detect t text with 216 | [] -> None 217 | best :: _ -> Some best.lang 218 219 let detect_with_prob t text = 220 match detect t text with 221 | [] -> None 222 | best :: _ -> Some (best.lang, best.prob) 223 224 + let create_default ?config () = 
create ?config Profiles.all_profiles
+45 -27
lib/langdetect.mli
··· 1 - (** Language detection library based on n-gram frequency analysis. *) 2 3 - (** Language detection result *) 4 type result = { 5 - lang: string; 6 - prob: float; 7 } 8 9 - (** Detection parameters *) 10 type config = { 11 - alpha: float; (** Smoothing parameter (default 0.5) *) 12 - n_trial: int; (** Number of random trials (default 7) *) 13 - max_text_length: int; (** Maximum text length to process *) 14 - conv_threshold: float; (** Convergence threshold *) 15 - prob_threshold: float; (** Minimum probability to report *) 16 } 17 18 - (** Default configuration *) 19 val default_config : config 20 21 - (** Detector state *) 22 type t 23 24 - (** Create detector from language profiles. 25 - Each profile is (lang_code, frequency_list) where frequency_list is 26 - a list of (ngram, count) pairs. *) 27 val create : ?config:config -> (string * (string * int) list) list -> t 28 29 - (** Set random seed for reproducible results *) 30 val set_random_seed : t -> int -> unit 31 32 - (** Detect language of text. 33 - Returns list of possible languages with probabilities, sorted by 34 - probability descending. Only languages above prob_threshold are included. *) 35 val detect : t -> string -> result list 36 37 - (** Detect best matching language. 38 - Returns None if no language could be detected. *) 39 val detect_best : t -> string -> string option 40 41 - (** Detect best matching language with its probability. 42 - Returns None if no language could be detected. *) 43 val detect_with_prob : t -> string -> (string * float) option 44 - 45 - (** Create a detector with all built-in language profiles. 46 - This is a convenience function that calls create with all supported profiles. *) 47 - val create_default : ?config:config -> unit -> t
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2007-2016 Mozilla Foundation 3 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 + SPDX-License-Identifier: MIT 5 + ---------------------------------------------------------------------------*) 6 7 + (** Language detection library based on n-gram frequency analysis. 8 + 9 + This is an OCaml port of the Cybozu langdetect algorithm. Detects the 10 + natural language of text using n-gram frequency profiles. Supports 49 11 + languages including English, Chinese, Japanese, Arabic, and many European 12 + languages. *) 13 + 14 + (** {1 Types} *) 15 + 16 type result = { 17 + lang : string; (** ISO 639-1 language code *) 18 + prob : float; (** Detection probability (0.0 to 1.0) *) 19 } 20 + (** Language detection result. *) 21 22 type config = { 23 + alpha : float; (** Smoothing parameter (default 0.5) *) 24 + n_trial : int; (** Number of random trials (default 7) *) 25 + max_text_length : int; (** Maximum text length to process *) 26 + conv_threshold : float; (** Convergence threshold *) 27 + prob_threshold : float; (** Minimum probability to report *) 28 } 29 + (** Detection parameters. *) 30 31 val default_config : config 32 + (** Default configuration values. *) 33 34 type t 35 + (** Detector state. *) 36 + 37 + (** {1 Creating detectors} *) 38 39 val create : ?config:config -> (string * (string * int) list) list -> t 40 + (** [create ?config profiles] creates a detector from language profiles. 41 + Each profile is [(lang_code, frequency_list)] where [frequency_list] is 42 + a list of [(ngram, count)] pairs. *) 43 44 + val create_default : ?config:config -> unit -> t 45 + (** [create_default ?config ()] creates a detector with all built-in language 46 + profiles. This is a convenience function that calls {!create} with all 47 + supported profiles. 
*) 48 + 49 val set_random_seed : t -> int -> unit 50 + (** [set_random_seed t seed] sets the random seed for reproducible results. *) 51 52 + (** {1 Detecting languages} *) 53 + 54 val detect : t -> string -> result list 55 + (** [detect t text] detects the language of [text]. Returns a list of possible 56 + languages with probabilities, sorted by probability descending. Only 57 + languages above [prob_threshold] are included. *) 58 59 val detect_best : t -> string -> string option 60 + (** [detect_best t text] returns the best matching language code, or [None] 61 + if no language could be detected. *) 62 63 val detect_with_prob : t -> string -> (string * float) option 64 + (** [detect_with_prob t text] returns the best matching language code with its 65 + probability, or [None] if no language could be detected. *)
+2 -1
test/dune
··· 4 5 (rule 6 (alias runtest) 7 - (action (run %{exe:test_langdetect.exe})))
··· 4 5 (rule 6 (alias runtest) 7 + (action 8 + (run %{exe:test_langdetect.exe})))
+5 -1
test/test_langdetect.ml
··· 1 - (** Tests for the langdetect library *) 2 3 (* Sample texts in various languages for testing *) 4 let english_text =
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2007-2016 Mozilla Foundation 3 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 + SPDX-License-Identifier: MIT 5 + ---------------------------------------------------------------------------*) 6 7 (* Sample texts in various languages for testing *) 8 let english_text =