···99 This is an OCaml port of the Cybozu langdetect algorithm. Detects the
1010 natural language of text using n-gram frequency profiles. Supports 49
1111 languages including English, Chinese, Japanese, Arabic, and many European
1212- languages. *)
1212+ languages.
1313+1414+ {1 Quick Start}
1515+1616+ {[
1717+ (* Create a detector with built-in language profiles *)
1818+ let detector = Langdetect.create_default () in
1919+2020+ (* Detect the language of some text *)
2121+ let results = Langdetect.detect detector "Hello, how are you today?" in
2222+ List.iter (fun r ->
2323+ Printf.printf "%s: %.2f%%\n" r.lang (r.prob *. 100.0)
2424+ ) results;
2525+ (* Output: en: 99.99% *)
2626+2727+ (* Get just the best match *)
2828+ match Langdetect.detect_best detector "Bonjour, comment allez-vous?" with
2929+ | Some lang -> Printf.printf "Detected: %s\n" lang (* fr *)
3030+ | None -> Printf.printf "Could not detect language\n"
3131+ ]}
3232+3333+ {1 Algorithm Overview}
3434+3535+ The detection algorithm uses n-gram frequency analysis:
3636+3737+ {ol
3838+ {- Extract character n-grams (1 to 3 characters) from the input text}
3939+ {- Compare n-gram frequencies against pre-computed language profiles}
4040+ {- Use a randomized trial approach to handle ambiguous text}
4141+ {- Return probabilities for each candidate language}}
4242+4343+ The algorithm is based on the Cybozu langdetect library, originally
4444+ developed by Shuyo Nakatani. The n-gram profiles were trained on
4545+ Wikipedia text corpora.
4646+4747+ {1 Supported Languages}
4848+4949+ The built-in profiles support 49 languages, identified by ISO 639-1 codes (Chinese uses the region-qualified tags [zh-cn] and [zh-tw]):
5050+5151+ {ul
5252+ {- {b European}: af, bg, cs, da, de, el, en, es, et, fi, fr, hr, hu, it, lt,
5353+ lv, nl, no, pl, pt, ro, ru, sk, sl, sq, sv, tr, uk}
5454+ {- {b Asian}: ar, bn, fa, gu, he, hi, id, ja, kn, ko, ml, mr, ne, pa, ta,
5555+ te, th, vi, zh-cn, zh-tw}
5656+ {- {b Other}: sw, tl}}
5757+5858+ {1 Performance Considerations}
5959+6060+ {ul
6161+ {- Text length: Longer text (100+ characters) yields more accurate results}
6262+ {- Short text: May produce ambiguous or incorrect results}
6363+ {- Mixed language: Returns the dominant language}
6464+ {- Similar languages: May confuse closely related languages (e.g., no/da, es/pt)}}
6565+6666+ The detector processes up to [max_text_length] characters (default: 10000)
6767+ for performance. Increase this for more accuracy on long documents.
6868+6969+ {1 Reproducibility}
7070+7171+ Detection uses random sampling internally. For reproducible results, fix the seed with [set_random_seed]:
7272+ {[
7373+ let detector = Langdetect.create_default () in
7474+ Langdetect.set_random_seed detector 42;
7575+ (* Now results are deterministic *)
7676+ ]}
7777+7878+ {1 References}
7979+8080+ {ul
8181+ {- {{:https://github.com/shuyo/language-detection}Cybozu langdetect} - Original Java implementation}
8282+ {- {{:https://www.aclweb.org/anthology/C10-1096/}N-gram Language Detection} - Background on n-gram approach}} *)
13831484(** {1 Types} *)
1585···2090(** Language detection result. *)
21912292type config = {
2323- alpha : float; (** Smoothing parameter (default 0.5) *)
2424- n_trial : int; (** Number of random trials (default 7) *)
2525- max_text_length : int; (** Maximum text length to process *)
2626- conv_threshold : float; (** Convergence threshold *)
2727- prob_threshold : float; (** Minimum probability to report *)
9393+ alpha : float;
9494+ (** Smoothing parameter for probability estimation (default: 0.5).
9595+ Higher values make the algorithm less sensitive to rare n-grams. *)
9696+ n_trial : int;
9797+ (** Number of random trials to run (default: 7).
9898+ More trials improve accuracy but increase processing time. *)
9999+ max_text_length : int;
100100+ (** Maximum text length to process (default: 10000).
101101+ Text beyond this limit is ignored. Increase for long documents. *)
102102+ conv_threshold : float;
103103+ (** Convergence threshold for early termination (default: 0.99999).
104104+ Trials stop early when confidence exceeds this value. *)
105105+ prob_threshold : float;
106106+ (** Minimum probability to include in results (default: 0.1).
107107+ Languages below this threshold are filtered from {!detect} output. *)
28108}
2929-(** Detection parameters. *)
109109+(** Detection parameters for tuning accuracy and performance.
110110+111111+ Use {!default_config} for standard settings, or customize for specific needs:
112112+ {[
113113+ let config = { Langdetect.default_config with
114114+ n_trial = 10; (* More trials for better accuracy *)
115115+ prob_threshold = 0.2 (* Only report high-confidence results *)
116116+ }
117117+ let detector = Langdetect.create_default ~config ()
118118+ ]} *)
3011931120val default_config : config
32121(** Default configuration values. *)