···11+# langdetect
22+33+Language detection library for OCaml using n-gram frequency analysis.
44+55+This is an OCaml port of the [Cybozu
66+langdetect](https://github.com/shuyo/language-detection) algorithm. It detects
77+the natural language of text using n-gram frequency profiles. It was ported
88+from <https://github.com/validator/validator>.
99+1010+## Features
1111+1212+- Detects 49 languages including English, Chinese, Japanese, Arabic, and many European languages
1313+- Fast probabilistic detection using n-gram frequency analysis
1414+- Configurable detection parameters (smoothing, convergence thresholds)
1515+- Reproducible results with optional random seed control
1616+- Pure OCaml implementation with minimal dependencies
1717+1818+## Installation
1919+2020+```bash
2121+opam install langdetect
2222+```
2323+2424+## Usage
2525+2626+```ocaml
2727+(* Create a detector with all built-in profiles *)
2828+let detector = Langdetect.create_default ()
2929+3030+(* Detect the best matching language *)
3131+let () =
3232+ match Langdetect.detect_best detector "Hello, world!" with
3333+ | Some lang -> Printf.printf "Detected: %s\n" lang
3434+ | None -> print_endline "Could not detect language"
3535+3636+(* Get all possible languages with probabilities *)
3737+let () =
3838+ let results = Langdetect.detect detector "Bonjour le monde" in
3939+ List.iter (fun r ->
4040+ Printf.printf "%s: %.2f\n" r.Langdetect.lang r.Langdetect.prob
4141+ ) results
4242+4343+(* Use custom configuration *)
4444+let config = { Langdetect.default_config with prob_threshold = 0.3 }
4545+let detector = Langdetect.create_default ~config ()
4646+```
4747+4848+## Supported Languages
4949+5050+Arabic, Bengali, Bulgarian, Catalan, Croatian, Czech, Danish, Dutch, English,
5151+Estonian, Farsi, Finnish, French, German, Greek, Gujarati, Hebrew, Hindi,
5252+Hungarian, Indonesian, Italian, Japanese, Korean, Latvian, Lithuanian,
5353+Macedonian, Malayalam, Norwegian, Panjabi, Polish, Portuguese, Romanian,
5454+Russian, Sinhalese, Albanian, Spanish, Swedish, Tamil, Telugu, Thai, Tagalog,
5555+Turkish, Ukrainian, Urdu, Vietnamese, Chinese (Simplified), Chinese
5656+(Traditional).
5757+5858+## License
5959+6060+MIT License - see LICENSE file for details.
6161+6262+Based on the Cybozu langdetect algorithm. Copyright (c) 2007-2016 Mozilla Foundation. Copyright (c) 2025 Anil Madhavapeddy.
···11; Profile generator executable - only used during build
22+23(executable
34 (name gen_profiles)
45 (modules gen_profiles)
+12-5
gen/gen_profiles.ml
···11-(* Profile generator - converts JSON language profiles to OCaml module *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2007-2016 Mozilla Foundation
33+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
44+ SPDX-License-Identifier: MIT
55+ ---------------------------------------------------------------------------*)
66+77+(* Profile generator - converts JSON language profiles to OCaml modules *)
2839let read_file path =
410 let ic = open_in path in
55- let n = in_channel_length ic in
66- let s = really_input_string ic n in
77- close_in ic;
88- s
1111+ Fun.protect
1212+ ~finally:(fun () -> close_in ic)
1313+ (fun () ->
1414+ let n = in_channel_length ic in
1515+ really_input_string ic n)
9161017(* Simple JSON parser for profile format {"freq": {...}} *)
1118let parse_freq_json content =
+4-4
langdetect.opam
···44description:
55 "An OCaml port of the Cybozu langdetect algorithm. Detects the natural language of text using n-gram frequency profiles. Supports 49 languages including English, Chinese, Japanese, Arabic, and many European languages."
66maintainer: ["Anil Madhavapeddy <anil@recoil.org>"]
77-authors: ["Anil Madhavapeddy <anil@recoil.org>"]
77+authors: ["Anil Madhavapeddy"]
88license: "MIT"
99-homepage: "https://github.com/avsm/ocaml-langdetect"
1010-bug-reports: "https://github.com/avsm/ocaml-langdetect/issues"
99+homepage: "https://tangled.org/@anil.recoil.org/ocaml-langdetect"
1010+bug-reports: "https://tangled.org/@anil.recoil.org/ocaml-langdetect/issues"
1111depends: [
1212 "dune" {>= "3.20"}
1313 "ocaml" {>= "5.1.0"}
1414 "uutf" {>= "1.0.0"}
1515- "alcotest" {with-test}
1615 "odoc" {with-doc}
1616+ "alcotest" {with-test & >= "1.7.0"}
1717]
1818build: [
1919 ["dune" "subst"] {dev}
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2007-2016 Mozilla Foundation
33+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
44+ SPDX-License-Identifier: MIT
55+ ---------------------------------------------------------------------------*)
66+17(** Language detection library based on n-gram frequency analysis.
2839 This is an OCaml port of the Cybozu langdetect algorithm. *)
41055-module StringMap = Map.Make(String)
1111+module StringMap = Map.Make (String)
61277-(** Language detection result *)
813type result = {
99- lang: string;
1010- prob: float;
1414+ lang : string;
1515+ prob : float;
1116}
12171313-(** Detection parameters *)
1418type config = {
1515- alpha: float; (** Smoothing parameter (default 0.5) *)
1616- n_trial: int; (** Number of random trials (default 7) *)
1717- max_text_length: int; (** Maximum text length to process *)
1818- conv_threshold: float; (** Convergence threshold *)
1919- prob_threshold: float; (** Minimum probability to report *)
1919+ alpha : float;
2020+ n_trial : int;
2121+ max_text_length : int;
2222+ conv_threshold : float;
2323+ prob_threshold : float;
2024}
21252222-let default_config = {
2323- alpha = 0.5;
2424- n_trial = 7;
2525- max_text_length = 10000;
2626- conv_threshold = 0.99999;
2727- prob_threshold = 0.1;
2828-}
2626+let default_config =
2727+ {
2828+ alpha = 0.5;
2929+ n_trial = 7;
3030+ max_text_length = 10000;
3131+ conv_threshold = 0.99999;
3232+ prob_threshold = 0.1;
3333+ }
29343030-(** N-gram extraction parameters *)
3135let n_gram_max = 3
3236let base_freq = 10000
3337let iteration_limit = 1000
3438let alpha_width = 0.05
35393636-(** Detector state *)
3740type t = {
3838- config: config;
3939- (* Map from n-gram -> array of probabilities per language *)
4040- word_lang_prob: float array StringMap.t;
4141- (* List of language codes *)
4242- lang_list: string array;
4343- (* Random seed for reproducibility *)
4444- mutable seed: int option;
4141+ config : config;
4242+ word_lang_prob : float array StringMap.t;
4343+ lang_list : string array;
4444+ mutable seed : int option;
4545}
46464747-(** Normalize a Unicode code point for n-gram extraction *)
4847let normalize_uchar uchar =
4948 let code = Uchar.to_int uchar in
5050- (* Basic Latin: keep only letters *)
5151- if code < 128 then begin
4949+ if code < 128 then
5250 let c = Char.chr code in
5353- if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') then
5454- Some (String.make 1 c)
5555- else
5656- None (* Treat as space/separator *)
5757- end
5858- else begin
5959- (* Keep non-ASCII characters as-is *)
5151+ match c with
5252+ | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c)
5353+ | _ -> None
5454+ else
6055 let buf = Buffer.create 4 in
6156 Buffer.add_utf_8_uchar buf uchar;
6257 Some (Buffer.contents buf)
6363- end
64586565-(** Extract n-grams from UTF-8 text.
6666- N-grams are sequences of 1-3 Unicode characters. *)
6767-let extract_ngrams ?(max_len=10000) text word_lang_prob =
5959+let extract_ngrams ?(max_len = 10000) text word_lang_prob =
6860 let ngrams = ref [] in
6969- (* Buffer stores up to 3 most recent character strings *)
7061 let char_buffer = Array.make n_gram_max "" in
7162 let char_count = ref 0 in
7263 let processed = ref 0 in
7373-7474- (* Process each UTF-8 character *)
7564 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
7665 let rec process () =
7766 if !processed >= max_len then ()
7878- else match Uutf.decode decoder with
7979- | `Await -> () (* String source never awaits *)
8080- | `End -> ()
8181- | `Malformed _ -> process () (* Skip malformed sequences *)
8282- | `Uchar uchar ->
8383- incr processed;
8484- match normalize_uchar uchar with
8585- | None ->
8686- (* Separator - reset buffer *)
8787- char_buffer.(0) <- "";
8888- char_buffer.(1) <- "";
8989- char_buffer.(2) <- "";
9090- char_count := 0;
9191- process ()
9292- | Some char_str ->
9393- (* Shift buffer left and add new char *)
9494- char_buffer.(0) <- char_buffer.(1);
9595- char_buffer.(1) <- char_buffer.(2);
9696- char_buffer.(2) <- char_str;
9797- incr char_count;
9898-9999- (* Extract 1, 2, 3 grams based on how many chars we have *)
100100- let available = min !char_count n_gram_max in
101101- for n = 1 to available do
102102- let ngram =
6767+ else
6868+ match Uutf.decode decoder with
6969+ | `Await | `End -> ()
7070+ | `Malformed _ -> process ()
7171+ | `Uchar uchar -> (
7272+ incr processed;
7373+ match normalize_uchar uchar with
7474+ | None ->
7575+ char_buffer.(0) <- "";
7676+ char_buffer.(1) <- "";
7777+ char_buffer.(2) <- "";
7878+ char_count := 0;
7979+ process ()
8080+ | Some char_str ->
8181+ char_buffer.(0) <- char_buffer.(1);
8282+ char_buffer.(1) <- char_buffer.(2);
8383+ char_buffer.(2) <- char_str;
8484+ incr char_count;
8585+ let available = min !char_count n_gram_max in
8686+ for n = 1 to available do
10387 let start_idx = n_gram_max - n in
10488 let parts = ref [] in
10589 for i = start_idx to n_gram_max - 1 do
10690 parts := char_buffer.(i) :: !parts
10791 done;
108108- String.concat "" (List.rev !parts)
109109- in
110110- if StringMap.mem ngram word_lang_prob then
111111- ngrams := ngram :: !ngrams
112112- done;
113113- process ()
9292+ let ngram = String.concat "" (List.rev !parts) in
9393+ if StringMap.mem ngram word_lang_prob then
9494+ ngrams := ngram :: !ngrams
9595+ done;
9696+ process ())
11497 in
11598 process ();
11699 Array.of_list (List.rev !ngrams)
117100118118-(** Initialize uniform probability distribution *)
119119-let init_prob n_langs =
120120- let prob = Array.make n_langs (1.0 /. float_of_int n_langs) in
121121- prob
101101+let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs)
122102123123-(** Update language probabilities with an n-gram *)
124103let update_lang_prob prob ngram word_lang_prob alpha =
125104 match StringMap.find_opt ngram word_lang_prob with
126105 | None -> false
···131110 done;
132111 true
133112134134-(** Normalize probabilities and return max *)
135113let normalize_prob prob =
136136- let sum = Array.fold_left (+.) 0.0 prob in
114114+ let sum = Array.fold_left ( +. ) 0.0 prob in
137115 if sum <= 0.0 then 0.0
138138- else begin
116116+ else
139117 let max_p = ref 0.0 in
140118 for i = 0 to Array.length prob - 1 do
141119 prob.(i) <- prob.(i) /. sum;
142120 if prob.(i) > !max_p then max_p := prob.(i)
143121 done;
144122 !max_p
145145- end
146123147147-(** Simple pseudo-random number generator *)
148124let random_state = ref 12345
149149-150150-let set_seed seed =
151151- random_state := seed
125125+let set_seed seed = random_state := seed
152126153127let next_random () =
154154- random_state := (!random_state * 1103515245 + 12345) land 0x7FFFFFFF;
128128+ random_state := ((!random_state * 1103515245) + 12345) land 0x7FFFFFFF;
155129 !random_state
156130157157-let random_int bound =
158158- (next_random ()) mod bound
131131+let random_int bound = next_random () mod bound
159132160133let random_gaussian () =
161161- (* Box-Muller transform approximation *)
162162- let u1 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
163163- let u2 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
164164- let u1 = max 0.0001 u1 in (* Avoid log(0) *)
134134+ let u1 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in
135135+ let u2 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in
136136+ let u1 = max 0.0001 u1 in
165137 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
166138167167-(** Run detection on extracted n-grams *)
168139let detect_block t ngrams =
169140 let n_langs = Array.length t.lang_list in
170141 if n_langs = 0 || Array.length ngrams = 0 then [||]
171171- else begin
142142+ else
172143 let lang_prob = Array.make n_langs 0.0 in
173173-174174- (* Set seed if specified, otherwise use a deterministic default *)
175175- (match t.seed with
176176- | Some s -> set_seed s
177177- | None -> set_seed 12345);
178178-144144+ set_seed (Option.value t.seed ~default:12345);
179145 for _ = 0 to t.config.n_trial - 1 do
180146 let prob = init_prob n_langs in
181181- let alpha = t.config.alpha +. random_gaussian () *. alpha_width in
182182-147147+ let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in
183148 let converged = ref false in
184149 let i = ref 0 in
185185- while not !converged && !i < iteration_limit do
150150+ while (not !converged) && !i < iteration_limit do
186151 let r = random_int (Array.length ngrams) in
187187- let _ = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
188188- if !i mod 5 = 0 then begin
152152+ let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
153153+ if !i mod 5 = 0 then
189154 let max_p = normalize_prob prob in
190190- if max_p > t.config.conv_threshold then converged := true
191191- end;
155155+ if max_p > t.config.conv_threshold then converged := true;
192156 incr i
193157 done;
194194-195195- (* Accumulate probabilities *)
196158 for j = 0 to n_langs - 1 do
197197- lang_prob.(j) <- lang_prob.(j) +. prob.(j) /. float_of_int t.config.n_trial
159159+ lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. float_of_int t.config.n_trial)
198160 done
199161 done;
200200-201162 lang_prob
202202- end
203163204204-(** Create detector from profiles *)
205205-let create ?(config=default_config) profiles =
164164+let create ?(config = default_config) profiles =
206165 let lang_list = Array.of_list (List.map fst profiles) in
207166 let n_langs = Array.length lang_list in
208208-209209- (* Build word -> lang prob map *)
210210- (* First, collect all unique n-grams and their frequencies per language *)
211167 let all_ngrams = Hashtbl.create 65536 in
212168 let lang_totals = Array.make n_langs 0 in
213213-214214- List.iteri (fun lang_idx (_, freq_list) ->
215215- List.iter (fun (ngram, count) ->
216216- let current =
217217- match Hashtbl.find_opt all_ngrams ngram with
218218- | Some arr -> arr
219219- | None ->
220220- let arr = Array.make n_langs 0 in
221221- Hashtbl.add all_ngrams ngram arr;
222222- arr
223223- in
224224- current.(lang_idx) <- count;
225225- lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
226226- ) freq_list
227227- ) profiles;
228228-229229- (* Convert to probability map *)
169169+ List.iteri
170170+ (fun lang_idx (_, freq_list) ->
171171+ List.iter
172172+ (fun (ngram, count) ->
173173+ let current =
174174+ match Hashtbl.find_opt all_ngrams ngram with
175175+ | Some arr -> arr
176176+ | None ->
177177+ let arr = Array.make n_langs 0 in
178178+ Hashtbl.add all_ngrams ngram arr;
179179+ arr
180180+ in
181181+ current.(lang_idx) <- count;
182182+ lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count)
183183+ freq_list)
184184+ profiles;
230185 let word_lang_prob =
231231- Hashtbl.fold (fun ngram counts acc ->
232232- (* Compute probability for each language *)
233233- let probs = Array.make n_langs 0.0 in
234234- for i = 0 to n_langs - 1 do
235235- if lang_totals.(i) > 0 then
236236- probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
237237- done;
238238- StringMap.add ngram probs acc
239239- ) all_ngrams StringMap.empty
186186+ Hashtbl.fold
187187+ (fun ngram counts acc ->
188188+ let probs = Array.make n_langs 0.0 in
189189+ for i = 0 to n_langs - 1 do
190190+ if lang_totals.(i) > 0 then
191191+ probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
192192+ done;
193193+ StringMap.add ngram probs acc)
194194+ all_ngrams StringMap.empty
240195 in
241241-242196 { config; word_lang_prob; lang_list; seed = None }
243197244244-(** Set random seed for reproducibility *)
245245-let set_random_seed t seed =
246246- t.seed <- Some seed
198198+let set_random_seed t seed = t.seed <- Some seed
247199248248-(** Detect language of text *)
249200let detect t text =
250250- let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in
201201+ let ngrams =
202202+ extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob
203203+ in
251204 if Array.length ngrams = 0 then []
252252- else begin
205205+ else
253206 let probs = detect_block t ngrams in
254254- (* Sort by probability descending *)
255207 let results = ref [] in
256208 for i = 0 to Array.length probs - 1 do
257209 if probs.(i) > t.config.prob_threshold then
258210 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
259211 done;
260212 List.sort (fun a b -> compare b.prob a.prob) !results
261261- end
262213263263-(** Get best language or None *)
264214let detect_best t text =
265215 match detect t text with
266216 | [] -> None
267217 | best :: _ -> Some best.lang
268218269269-(** Get best language with probability *)
270219let detect_with_prob t text =
271220 match detect t text with
272221 | [] -> None
273222 | best :: _ -> Some (best.lang, best.prob)
274223275275-(** Create a detector with all built-in profiles *)
276276-let create_default ?config () =
277277- create ?config Profiles.all_profiles
224224+let create_default ?config () = create ?config Profiles.all_profiles
+45-27
lib/langdetect.mli
···11-(** Language detection library based on n-gram frequency analysis. *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2007-2016 Mozilla Foundation
33+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
44+ SPDX-License-Identifier: MIT
55+ ---------------------------------------------------------------------------*)
2633-(** Language detection result *)
77+(** Language detection library based on n-gram frequency analysis.
88+99+ This is an OCaml port of the Cybozu langdetect algorithm. Detects the
1010+ natural language of text using n-gram frequency profiles. Supports 49
1111+ languages including English, Chinese, Japanese, Arabic, and many European
1212+ languages. *)
1313+1414+(** {1 Types} *)
1515+416type result = {
55- lang: string;
66- prob: float;
1717+ lang : string; (** ISO 639-1 language code *)
1818+ prob : float; (** Detection probability (0.0 to 1.0) *)
719}
2020+(** Language detection result. *)
82199-(** Detection parameters *)
1022type config = {
1111- alpha: float; (** Smoothing parameter (default 0.5) *)
1212- n_trial: int; (** Number of random trials (default 7) *)
1313- max_text_length: int; (** Maximum text length to process *)
1414- conv_threshold: float; (** Convergence threshold *)
1515- prob_threshold: float; (** Minimum probability to report *)
2323+ alpha : float; (** Smoothing parameter (default 0.5) *)
2424+ n_trial : int; (** Number of random trials (default 7) *)
2525+ max_text_length : int; (** Maximum text length to process *)
2626+ conv_threshold : float; (** Convergence threshold *)
2727+ prob_threshold : float; (** Minimum probability to report *)
1628}
2929+(** Detection parameters. *)
17301818-(** Default configuration *)
1931val default_config : config
3232+(** Default configuration values. *)
20332121-(** Detector state *)
2234type t
3535+(** Detector state. *)
3636+3737+(** {1 Creating detectors} *)
23382424-(** Create detector from language profiles.
2525- Each profile is (lang_code, frequency_list) where frequency_list is
2626- a list of (ngram, count) pairs. *)
2739val create : ?config:config -> (string * (string * int) list) list -> t
4040+(** [create ?config profiles] creates a detector from language profiles.
4141+ Each profile is [(lang_code, frequency_list)] where [frequency_list] is
4242+ a list of [(ngram, count)] pairs. *)
28432929-(** Set random seed for reproducible results *)
4444+val create_default : ?config:config -> unit -> t
4545+(** [create_default ?config ()] creates a detector with all built-in language
4646+ profiles. This is a convenience function that calls {!create} with all
4747+ supported profiles. *)
4848+3049val set_random_seed : t -> int -> unit
5050+(** [set_random_seed t seed] sets the random seed for reproducible results. *)
31513232-(** Detect language of text.
3333- Returns list of possible languages with probabilities, sorted by
3434- probability descending. Only languages above prob_threshold are included. *)
5252+(** {1 Detecting languages} *)
5353+3554val detect : t -> string -> result list
5555+(** [detect t text] detects the language of [text]. Returns a list of possible
5656+ languages with probabilities, sorted by probability descending. Only
5757+ languages above [prob_threshold] are included. *)
36583737-(** Detect best matching language.
3838- Returns None if no language could be detected. *)
3959val detect_best : t -> string -> string option
6060+(** [detect_best t text] returns the best matching language code, or [None]
6161+ if no language could be detected. *)
40624141-(** Detect best matching language with its probability.
4242- Returns None if no language could be detected. *)
4363val detect_with_prob : t -> string -> (string * float) option
4444-4545-(** Create a detector with all built-in language profiles.
4646- This is a convenience function that calls create with all supported profiles. *)
4747-val create_default : ?config:config -> unit -> t
6464+(** [detect_with_prob t text] returns the best matching language code with its
6565+ probability, or [None] if no language could be detected. *)
···11-(** Tests for the langdetect library *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2007-2016 Mozilla Foundation
33+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
44+ SPDX-License-Identifier: MIT
55+ ---------------------------------------------------------------------------*)
2637(* Sample texts in various languages for testing *)
48let english_text =