···1+# langdetect
2+3+Language detection library for OCaml using n-gram frequency analysis.
4+5+This is an OCaml port of the [Cybozu
6+langdetect](https://github.com/shuyo/language-detection) algorithm. It detects
7+the natural language of text using n-gram frequency profiles. It was ported
8+from <https://github.com/validator/validator>.
9+10+## Features
11+12+- Detects 49 languages including English, Chinese, Japanese, Arabic, and many European languages
13+- Fast probabilistic detection using n-gram frequency analysis
14+- Configurable detection parameters (smoothing, convergence thresholds)
15+- Reproducible results with optional random seed control
16+- Pure OCaml implementation with minimal dependencies
17+18+## Installation
19+20+```bash
21+opam install langdetect
22+```
23+24+## Usage
25+26+```ocaml
27+(* Create a detector with all built-in profiles *)
28+let detector = Langdetect.create_default ()
29+30+(* Detect the best matching language *)
31+let () =
32+ match Langdetect.detect_best detector "Hello, world!" with
33+ | Some lang -> Printf.printf "Detected: %s\n" lang
34+ | None -> print_endline "Could not detect language"
35+36+(* Get all possible languages with probabilities *)
37+let () =
38+ let results = Langdetect.detect detector "Bonjour le monde" in
39+ List.iter (fun r ->
40+ Printf.printf "%s: %.2f\n" r.Langdetect.lang r.Langdetect.prob
41+ ) results
42+43+(* Use custom configuration *)
44+let config = { Langdetect.default_config with prob_threshold = 0.3 }
45+let detector = Langdetect.create_default ~config ()
46+```
47+48+## Supported Languages
49+50+Arabic, Bengali, Bulgarian, Catalan, Croatian, Czech, Danish, Dutch, English,
51+Estonian, Farsi, Finnish, French, German, Greek, Gujarati, Hebrew, Hindi,
52+Hungarian, Indonesian, Italian, Japanese, Korean, Latvian, Lithuanian,
53+Macedonian, Malayalam, Norwegian, Panjabi, Polish, Portuguese, Romanian,
54+Russian, Sinhalese, Albanian, Spanish, Swedish, Tamil, Telugu, Thai, Tagalog,
55+Turkish, Ukrainian, Urdu, Vietnamese, Chinese (Simplified), Chinese
56+(Traditional).
57+58+## License
59+60+MIT License - see LICENSE file for details.
61+62+Based on the Cybozu langdetect algorithm. Copyright (c) 2007-2016 Mozilla Foundation. Copyright (c) 2025 Anil Madhavapeddy.
···1; Profile generator executable - only used during build
02(executable
3 (name gen_profiles)
4 (modules gen_profiles)
···1; Profile generator executable - only used during build
2+3(executable
4 (name gen_profiles)
5 (modules gen_profiles)
+12-5
gen/gen_profiles.ml
···1-(* Profile generator - converts JSON language profiles to OCaml module *)
00000023let read_file path =
4 let ic = open_in path in
5- let n = in_channel_length ic in
6- let s = really_input_string ic n in
7- close_in ic;
8- s
0910(* Simple JSON parser for profile format {"freq": {...}} *)
11let parse_freq_json content =
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2007-2016 Mozilla Foundation
3+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4+ SPDX-License-Identifier: MIT
5+ ---------------------------------------------------------------------------*)
6+7+(* Profile generator - converts JSON language profiles to OCaml modules *)
(* [read_file path] returns the entire contents of the file at [path].

   The file is opened in binary mode so the bytes are returned exactly as
   stored. Measuring a text-mode channel with [in_channel_length] and then
   reading that many characters can raise [End_of_file] on platforms that
   translate line endings, because the translated content is shorter than the
   on-disk size. [In_channel.with_open_bin] closes the channel whether or not
   reading succeeds.

   @raise Sys_error if the file cannot be opened or read. *)
let read_file path = In_channel.with_open_bin path In_channel.input_all
1617(* Simple JSON parser for profile format {"freq": {...}} *)
18let parse_freq_json content =
+4-4
langdetect.opam
···4description:
5 "An OCaml port of the Cybozu langdetect algorithm. Detects the natural language of text using n-gram frequency profiles. Supports 49 languages including English, Chinese, Japanese, Arabic, and many European languages."
6maintainer: ["Anil Madhavapeddy <anil@recoil.org>"]
7-authors: ["Anil Madhavapeddy <anil@recoil.org>"]
8license: "MIT"
9-homepage: "https://github.com/avsm/ocaml-langdetect"
10-bug-reports: "https://github.com/avsm/ocaml-langdetect/issues"
11depends: [
12 "dune" {>= "3.20"}
13 "ocaml" {>= "5.1.0"}
14 "uutf" {>= "1.0.0"}
15- "alcotest" {with-test}
16 "odoc" {with-doc}
017]
18build: [
19 ["dune" "subst"] {dev}
···4description:
5 "An OCaml port of the Cybozu langdetect algorithm. Detects the natural language of text using n-gram frequency profiles. Supports 49 languages including English, Chinese, Japanese, Arabic, and many European languages."
6maintainer: ["Anil Madhavapeddy <anil@recoil.org>"]
7+authors: ["Anil Madhavapeddy"]
8license: "MIT"
9+homepage: "https://tangled.org/@anil.recoil.org/ocaml-langdetect"
10+bug-reports: "https://tangled.org/@anil.recoil.org/ocaml-langdetect/issues"
11depends: [
12 "dune" {>= "3.20"}
13 "ocaml" {>= "5.1.0"}
14 "uutf" {>= "1.0.0"}
015 "odoc" {with-doc}
16+ "alcotest" {with-test & >= "1.7.0"}
17]
18build: [
19 ["dune" "subst"] {dev}
···0000001(** Language detection library based on n-gram frequency analysis.
23 This is an OCaml port of the Cybozu langdetect algorithm. *)
45-module StringMap = Map.Make(String)
67-(** Language detection result *)
8type result = {
9- lang: string;
10- prob: float;
11}
1213-(** Detection parameters *)
14type config = {
15- alpha: float; (** Smoothing parameter (default 0.5) *)
16- n_trial: int; (** Number of random trials (default 7) *)
17- max_text_length: int; (** Maximum text length to process *)
18- conv_threshold: float; (** Convergence threshold *)
19- prob_threshold: float; (** Minimum probability to report *)
20}
2122-let default_config = {
23- alpha = 0.5;
24- n_trial = 7;
25- max_text_length = 10000;
26- conv_threshold = 0.99999;
27- prob_threshold = 0.1;
28-}
02930-(** N-gram extraction parameters *)
31let n_gram_max = 3
32let base_freq = 10000
33let iteration_limit = 1000
34let alpha_width = 0.05
3536-(** Detector state *)
37type t = {
38- config: config;
39- (* Map from n-gram -> array of probabilities per language *)
40- word_lang_prob: float array StringMap.t;
41- (* List of language codes *)
42- lang_list: string array;
43- (* Random seed for reproducibility *)
44- mutable seed: int option;
45}
4647-(** Normalize a Unicode code point for n-gram extraction *)
48let normalize_uchar uchar =
49 let code = Uchar.to_int uchar in
50- (* Basic Latin: keep only letters *)
51- if code < 128 then begin
52 let c = Char.chr code in
53- if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') then
54- Some (String.make 1 c)
55- else
56- None (* Treat as space/separator *)
57- end
58- else begin
59- (* Keep non-ASCII characters as-is *)
60 let buf = Buffer.create 4 in
61 Buffer.add_utf_8_uchar buf uchar;
62 Some (Buffer.contents buf)
63- end
6465-(** Extract n-grams from UTF-8 text.
66- N-grams are sequences of 1-3 Unicode characters. *)
67-let extract_ngrams ?(max_len=10000) text word_lang_prob =
68 let ngrams = ref [] in
69- (* Buffer stores up to 3 most recent character strings *)
70 let char_buffer = Array.make n_gram_max "" in
71 let char_count = ref 0 in
72 let processed = ref 0 in
73-74- (* Process each UTF-8 character *)
75 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
76 let rec process () =
77 if !processed >= max_len then ()
78- else match Uutf.decode decoder with
79- | `Await -> () (* String source never awaits *)
80- | `End -> ()
81- | `Malformed _ -> process () (* Skip malformed sequences *)
82- | `Uchar uchar ->
83- incr processed;
84- match normalize_uchar uchar with
85- | None ->
86- (* Separator - reset buffer *)
87- char_buffer.(0) <- "";
88- char_buffer.(1) <- "";
89- char_buffer.(2) <- "";
90- char_count := 0;
91- process ()
92- | Some char_str ->
93- (* Shift buffer left and add new char *)
94- char_buffer.(0) <- char_buffer.(1);
95- char_buffer.(1) <- char_buffer.(2);
96- char_buffer.(2) <- char_str;
97- incr char_count;
98-99- (* Extract 1, 2, 3 grams based on how many chars we have *)
100- let available = min !char_count n_gram_max in
101- for n = 1 to available do
102- let ngram =
103 let start_idx = n_gram_max - n in
104 let parts = ref [] in
105 for i = start_idx to n_gram_max - 1 do
106 parts := char_buffer.(i) :: !parts
107 done;
108- String.concat "" (List.rev !parts)
109- in
110- if StringMap.mem ngram word_lang_prob then
111- ngrams := ngram :: !ngrams
112- done;
113- process ()
114 in
115 process ();
116 Array.of_list (List.rev !ngrams)
117118-(** Initialize uniform probability distribution *)
119-let init_prob n_langs =
120- let prob = Array.make n_langs (1.0 /. float_of_int n_langs) in
121- prob
122123-(** Update language probabilities with an n-gram *)
124let update_lang_prob prob ngram word_lang_prob alpha =
125 match StringMap.find_opt ngram word_lang_prob with
126 | None -> false
···131 done;
132 true
133134-(** Normalize probabilities and return max *)
135let normalize_prob prob =
136- let sum = Array.fold_left (+.) 0.0 prob in
137 if sum <= 0.0 then 0.0
138- else begin
139 let max_p = ref 0.0 in
140 for i = 0 to Array.length prob - 1 do
141 prob.(i) <- prob.(i) /. sum;
142 if prob.(i) > !max_p then max_p := prob.(i)
143 done;
144 !max_p
145- end
146147-(** Simple pseudo-random number generator *)
148let random_state = ref 12345
149-150-let set_seed seed =
151- random_state := seed
152153let next_random () =
154- random_state := (!random_state * 1103515245 + 12345) land 0x7FFFFFFF;
155 !random_state
156157-let random_int bound =
158- (next_random ()) mod bound
159160let random_gaussian () =
161- (* Box-Muller transform approximation *)
162- let u1 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
163- let u2 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
164- let u1 = max 0.0001 u1 in (* Avoid log(0) *)
165 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
166167-(** Run detection on extracted n-grams *)
168let detect_block t ngrams =
169 let n_langs = Array.length t.lang_list in
170 if n_langs = 0 || Array.length ngrams = 0 then [||]
171- else begin
172 let lang_prob = Array.make n_langs 0.0 in
173-174- (* Set seed if specified, otherwise use a deterministic default *)
175- (match t.seed with
176- | Some s -> set_seed s
177- | None -> set_seed 12345);
178-179 for _ = 0 to t.config.n_trial - 1 do
180 let prob = init_prob n_langs in
181- let alpha = t.config.alpha +. random_gaussian () *. alpha_width in
182-183 let converged = ref false in
184 let i = ref 0 in
185- while not !converged && !i < iteration_limit do
186 let r = random_int (Array.length ngrams) in
187- let _ = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
188- if !i mod 5 = 0 then begin
189 let max_p = normalize_prob prob in
190- if max_p > t.config.conv_threshold then converged := true
191- end;
192 incr i
193 done;
194-195- (* Accumulate probabilities *)
196 for j = 0 to n_langs - 1 do
197- lang_prob.(j) <- lang_prob.(j) +. prob.(j) /. float_of_int t.config.n_trial
198 done
199 done;
200-201 lang_prob
202- end
203204-(** Create detector from profiles *)
205-let create ?(config=default_config) profiles =
206 let lang_list = Array.of_list (List.map fst profiles) in
207 let n_langs = Array.length lang_list in
208-209- (* Build word -> lang prob map *)
210- (* First, collect all unique n-grams and their frequencies per language *)
211 let all_ngrams = Hashtbl.create 65536 in
212 let lang_totals = Array.make n_langs 0 in
213-214- List.iteri (fun lang_idx (_, freq_list) ->
215- List.iter (fun (ngram, count) ->
216- let current =
217- match Hashtbl.find_opt all_ngrams ngram with
218- | Some arr -> arr
219- | None ->
220- let arr = Array.make n_langs 0 in
221- Hashtbl.add all_ngrams ngram arr;
222- arr
223- in
224- current.(lang_idx) <- count;
225- lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
226- ) freq_list
227- ) profiles;
228-229- (* Convert to probability map *)
230 let word_lang_prob =
231- Hashtbl.fold (fun ngram counts acc ->
232- (* Compute probability for each language *)
233- let probs = Array.make n_langs 0.0 in
234- for i = 0 to n_langs - 1 do
235- if lang_totals.(i) > 0 then
236- probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
237- done;
238- StringMap.add ngram probs acc
239- ) all_ngrams StringMap.empty
240 in
241-242 { config; word_lang_prob; lang_list; seed = None }
243244-(** Set random seed for reproducibility *)
245-let set_random_seed t seed =
246- t.seed <- Some seed
247248-(** Detect language of text *)
249let detect t text =
250- let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in
00251 if Array.length ngrams = 0 then []
252- else begin
253 let probs = detect_block t ngrams in
254- (* Sort by probability descending *)
255 let results = ref [] in
256 for i = 0 to Array.length probs - 1 do
257 if probs.(i) > t.config.prob_threshold then
258 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
259 done;
260 List.sort (fun a b -> compare b.prob a.prob) !results
261- end
262263-(** Get best language or None *)
264let detect_best t text =
265 match detect t text with
266 | [] -> None
267 | best :: _ -> Some best.lang
268269-(** Get best language with probability *)
270let detect_with_prob t text =
271 match detect t text with
272 | [] -> None
273 | best :: _ -> Some (best.lang, best.prob)
274275-(** Create a detector with all built-in profiles *)
276-let create_default ?config () =
277- create ?config Profiles.all_profiles
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2007-2016 Mozilla Foundation
3+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4+ SPDX-License-Identifier: MIT
5+ ---------------------------------------------------------------------------*)
6+7(** Language detection library based on n-gram frequency analysis.
89 This is an OCaml port of the Cybozu langdetect algorithm. *)
1011+module StringMap = Map.Make (String)
12013type result = {
14+ lang : string;
15+ prob : float;
16}
17018type config = {
19+ alpha : float;
20+ n_trial : int;
21+ max_text_length : int;
22+ conv_threshold : float;
23+ prob_threshold : float;
24}
2526+let default_config =
27+ {
28+ alpha = 0.5;
29+ n_trial = 7;
30+ max_text_length = 10000;
31+ conv_threshold = 0.99999;
32+ prob_threshold = 0.1;
33+ }
34035let n_gram_max = 3
36let base_freq = 10000
37let iteration_limit = 1000
38let alpha_width = 0.05
39040type t = {
41+ config : config;
42+ word_lang_prob : float array StringMap.t;
43+ lang_list : string array;
44+ mutable seed : int option;
00045}
46047let normalize_uchar uchar =
48 let code = Uchar.to_int uchar in
49+ if code < 128 then
050 let c = Char.chr code in
51+ match c with
52+ | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c)
53+ | _ -> None
54+ else
00055 let buf = Buffer.create 4 in
56 Buffer.add_utf_8_uchar buf uchar;
57 Some (Buffer.contents buf)
05859+let extract_ngrams ?(max_len = 10000) text word_lang_prob =
0060 let ngrams = ref [] in
061 let char_buffer = Array.make n_gram_max "" in
62 let char_count = ref 0 in
63 let processed = ref 0 in
0064 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
65 let rec process () =
66 if !processed >= max_len then ()
67+ else
68+ match Uutf.decode decoder with
69+ | `Await | `End -> ()
70+ | `Malformed _ -> process ()
71+ | `Uchar uchar -> (
72+ incr processed;
73+ match normalize_uchar uchar with
74+ | None ->
75+ char_buffer.(0) <- "";
76+ char_buffer.(1) <- "";
77+ char_buffer.(2) <- "";
78+ char_count := 0;
79+ process ()
80+ | Some char_str ->
81+ char_buffer.(0) <- char_buffer.(1);
82+ char_buffer.(1) <- char_buffer.(2);
83+ char_buffer.(2) <- char_str;
84+ incr char_count;
85+ let available = min !char_count n_gram_max in
86+ for n = 1 to available do
0000087 let start_idx = n_gram_max - n in
88 let parts = ref [] in
89 for i = start_idx to n_gram_max - 1 do
90 parts := char_buffer.(i) :: !parts
91 done;
92+ let ngram = String.concat "" (List.rev !parts) in
93+ if StringMap.mem ngram word_lang_prob then
94+ ngrams := ngram :: !ngrams
95+ done;
96+ process ())
097 in
98 process ();
99 Array.of_list (List.rev !ngrams)
100101+let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs)
0001020103let update_lang_prob prob ngram word_lang_prob alpha =
104 match StringMap.find_opt ngram word_lang_prob with
105 | None -> false
···110 done;
111 true
1120113let normalize_prob prob =
114+ let sum = Array.fold_left ( +. ) 0.0 prob in
115 if sum <= 0.0 then 0.0
116+ else
117 let max_p = ref 0.0 in
118 for i = 0 to Array.length prob - 1 do
119 prob.(i) <- prob.(i) /. sum;
120 if prob.(i) > !max_p then max_p := prob.(i)
121 done;
122 !max_p
01230124let random_state = ref 12345
125+let set_seed seed = random_state := seed
00126127let next_random () =
128+ random_state := ((!random_state * 1103515245) + 12345) land 0x7FFFFFFF;
129 !random_state
130131+let random_int bound = next_random () mod bound
0132133let random_gaussian () =
134+ let u1 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in
135+ let u2 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in
136+ let u1 = max 0.0001 u1 in
0137 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
(* [detect_block t ngrams] runs the randomized detection trials over [ngrams]
   and returns one accumulated probability per entry of [t.lang_list] (same
   indexing). Returns [[||]] when there are no languages or no n-grams.

   The pseudo-random generator is re-seeded on every call, from [t.seed] when
   set and from 12345 otherwise, so repeated calls give identical results.

   Each of the [t.config.n_trial] trials:
   - starts from the distribution produced by [init_prob];
   - perturbs [t.config.alpha] with Gaussian noise scaled by [alpha_width];
   - repeatedly picks a random n-gram and applies [update_lang_prob];
   - every fifth iteration normalizes and stops once the largest probability
     exceeds [t.config.conv_threshold], or after [iteration_limit] iterations.

   Every trial contributes [prob / n_trial] to the returned array. *)
let detect_block t ngrams =
  let n_langs = Array.length t.lang_list in
  let n_ngrams = Array.length ngrams in
  if n_langs = 0 || n_ngrams = 0 then [||]
  else begin
    let lang_prob = Array.make n_langs 0.0 in
    set_seed (Option.value t.seed ~default:12345);
    let trials = float_of_int t.config.n_trial in
    for _ = 0 to t.config.n_trial - 1 do
      let prob = init_prob n_langs in
      let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in
      let converged = ref false in
      let i = ref 0 in
      while (not !converged) && !i < iteration_limit do
        let r = random_int n_ngrams in
        let (_ : bool) =
          update_lang_prob prob ngrams.(r) t.word_lang_prob alpha
        in
        (* The parentheses are essential. Without them the [let ... in] body
           extends over [incr i], so the counter only advances when
           [!i mod 5 = 0]; it then sticks at 1 and the loop never ends. *)
        if !i mod 5 = 0 then (
          let max_p = normalize_prob prob in
          if max_p > t.config.conv_threshold then converged := true);
        incr i
      done;
      for j = 0 to n_langs - 1 do
        lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. trials)
      done
    done;
    lang_prob
  end
0163164+let create ?(config = default_config) profiles =
0165 let lang_list = Array.of_list (List.map fst profiles) in
166 let n_langs = Array.length lang_list in
000167 let all_ngrams = Hashtbl.create 65536 in
168 let lang_totals = Array.make n_langs 0 in
169+ List.iteri
170+ (fun lang_idx (_, freq_list) ->
171+ List.iter
172+ (fun (ngram, count) ->
173+ let current =
174+ match Hashtbl.find_opt all_ngrams ngram with
175+ | Some arr -> arr
176+ | None ->
177+ let arr = Array.make n_langs 0 in
178+ Hashtbl.add all_ngrams ngram arr;
179+ arr
180+ in
181+ current.(lang_idx) <- count;
182+ lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count)
183+ freq_list)
184+ profiles;
0185 let word_lang_prob =
186+ Hashtbl.fold
187+ (fun ngram counts acc ->
188+ let probs = Array.make n_langs 0.0 in
189+ for i = 0 to n_langs - 1 do
190+ if lang_totals.(i) > 0 then
191+ probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
192+ done;
193+ StringMap.add ngram probs acc)
194+ all_ngrams StringMap.empty
195 in
0196 { config; word_lang_prob; lang_list; seed = None }
197198+let set_random_seed t seed = t.seed <- Some seed
001990200let detect t text =
201+ let ngrams =
202+ extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob
203+ in
204 if Array.length ngrams = 0 then []
205+ else
206 let probs = detect_block t ngrams in
0207 let results = ref [] in
208 for i = 0 to Array.length probs - 1 do
209 if probs.(i) > t.config.prob_threshold then
210 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
211 done;
212 List.sort (fun a b -> compare b.prob a.prob) !results
02130214let detect_best t text =
215 match detect t text with
216 | [] -> None
217 | best :: _ -> Some best.lang
2180219let detect_with_prob t text =
220 match detect t text with
221 | [] -> None
222 | best :: _ -> Some (best.lang, best.prob)
223224+let create_default ?config () = create ?config Profiles.all_profiles
00
+45-27
lib/langdetect.mli
···1-(** Language detection library based on n-gram frequency analysis. *)
000023-(** Language detection result *)
000000004type result = {
5- lang: string;
6- prob: float;
7}
089-(** Detection parameters *)
10type config = {
11- alpha: float; (** Smoothing parameter (default 0.5) *)
12- n_trial: int; (** Number of random trials (default 7) *)
13- max_text_length: int; (** Maximum text length to process *)
14- conv_threshold: float; (** Convergence threshold *)
15- prob_threshold: float; (** Minimum probability to report *)
16}
01718-(** Default configuration *)
19val default_config : config
02021-(** Detector state *)
22type t
0002324-(** Create detector from language profiles.
25- Each profile is (lang_code, frequency_list) where frequency_list is
26- a list of (ngram, count) pairs. *)
27val create : ?config:config -> (string * (string * int) list) list -> t
0002829-(** Set random seed for reproducible results *)
000030val set_random_seed : t -> int -> unit
03132-(** Detect language of text.
33- Returns list of possible languages with probabilities, sorted by
34- probability descending. Only languages above prob_threshold are included. *)
35val detect : t -> string -> result list
0003637-(** Detect best matching language.
38- Returns None if no language could be detected. *)
39val detect_best : t -> string -> string option
004041-(** Detect best matching language with its probability.
42- Returns None if no language could be detected. *)
43val detect_with_prob : t -> string -> (string * float) option
44-45-(** Create a detector with all built-in language profiles.
46- This is a convenience function that calls create with all supported profiles. *)
47-val create_default : ?config:config -> unit -> t
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2007-2016 Mozilla Foundation
3+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4+ SPDX-License-Identifier: MIT
5+ ---------------------------------------------------------------------------*)
67+(** Language detection library based on n-gram frequency analysis.
8+9+ This is an OCaml port of the Cybozu langdetect algorithm. Detects the
10+ natural language of text using n-gram frequency profiles. Supports 49
11+ languages including English, Chinese, Japanese, Arabic, and many European
12+ languages. *)
13+14+(** {1 Types} *)
15+16type result = {
17+ lang : string; (** ISO 639-1 language code *)
18+ prob : float; (** Detection probability (0.0 to 1.0) *)
19}
20+(** Language detection result. *)
21022type config = {
23+ alpha : float; (** Base smoothing parameter (default 0.5); each trial adds a small Gaussian perturbation to it *)
24+ n_trial : int; (** Number of randomized trials whose results are averaged (default 7) *)
25+ max_text_length : int; (** Maximum number of decoded Unicode characters to process (default 10000) *)
26+ conv_threshold : float; (** A trial stops early once its largest normalized probability exceeds this (default 0.99999) *)
27+ prob_threshold : float; (** Only languages with probability strictly above this are reported (default 0.1) *)
28}
29+(** Detection parameters. See {!default_config} for the default values. *)
30031val default_config : config
32+(** Default configuration values. *)
33034type t
35+(** Detector state. *)
36+37+(** {1 Creating detectors} *)
3800039val create : ?config:config -> (string * (string * int) list) list -> t
40+(** [create ?config profiles] creates a detector from language profiles.
41+ Each profile is [(lang_code, frequency_list)] where [frequency_list] is
42+ a list of [(ngram, count)] pairs. *)
4344+val create_default : ?config:config -> unit -> t
45+(** [create_default ?config ()] creates a detector with all built-in language
46+ profiles. This is a convenience function that calls {!create} with all
47+ supported profiles. *)
48+49val set_random_seed : t -> int -> unit
50+(** [set_random_seed t seed] sets the random seed for reproducible results. *)
5152+(** {1 Detecting languages} *)
53+054val detect : t -> string -> result list
55+(** [detect t text] detects the language of [text]. Returns a list of possible
56+ languages with probabilities, sorted by probability descending. Only
57+ languages above [prob_threshold] are included. *)
580059val detect_best : t -> string -> string option
60+(** [detect_best t text] returns the best matching language code, or [None]
61+ if no language could be detected. *)
620063val detect_with_prob : t -> string -> (string * float) option
64+(** [detect_with_prob t text] returns the best matching language code with its
65+ probability, or [None] if no language could be detected. *)
00
···1-(** Tests for the langdetect library *)
000023(* Sample texts in various languages for testing *)
4let english_text =
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2007-2016 Mozilla Foundation
3+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4+ SPDX-License-Identifier: MIT
5+ ---------------------------------------------------------------------------*)
67(* Sample texts in various languages for testing *)
8let english_text =