OCaml library that detects which human language a document is written in, derived from the Nu Html validator
languages unicode ocaml

Pack language profiles into a shared string table and a flat int array

Reduces binary size by ~75% (115MB → 28MB static library) by:
- Using a shared string table for all 172K unique n-grams
- Storing profile data as a single flat int array (662K elements)
- Using offset-based access via per-language (lang_code, start_idx, num_pairs) entries

This format is compatible with WASM/js_of_ocaml (all ints stay within 31 bits)
and eliminates duplicated n-gram strings across the 47 language profiles.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+144 -156
+74 -48
gen/gen_profiles.ml
··· 4 4 SPDX-License-Identifier: MIT 5 5 ---------------------------------------------------------------------------*) 6 6 7 - (* Profile generator - converts JSON language profiles to OCaml modules *) 7 + (* Profile generator - converts JSON language profiles to a packed OCaml module 8 + with shared string table and a single flat int array for maximum efficiency 9 + and WASM/js_of_ocaml compatibility (31-bit ints) *) 10 + 11 + module StringSet = Set.Make(String) 8 12 9 13 let read_file path = 10 14 let ic = open_in path in ··· 16 20 17 21 (* Simple JSON parser for profile format {"freq": {...}} *) 18 22 let parse_freq_json content = 19 - (* Find the freq object *) 20 23 let freq_start = 21 24 try String.index content '{' + 1 22 25 with Not_found -> failwith "No opening brace" 23 26 in 24 27 let content = String.sub content freq_start (String.length content - freq_start) in 25 - (* Skip to inner object *) 26 28 let inner_start = 27 29 try String.index content '{' + 1 28 30 with Not_found -> failwith "No freq object" ··· 33 35 in 34 36 let inner = String.sub content inner_start (inner_end - inner_start) in 35 37 36 - (* Parse key:value pairs *) 37 38 let pairs = ref [] in 38 39 let i = ref 0 in 39 40 let len = String.length inner in 40 41 while !i < len do 41 - (* Skip whitespace *) 42 42 while !i < len && (inner.[!i] = ' ' || inner.[!i] = '\n' || inner.[!i] = '\r' || inner.[!i] = '\t' || inner.[!i] = ',') do 43 43 incr i 44 44 done; 45 45 if !i >= len then () 46 46 else begin 47 - (* Expect quote for key *) 48 47 if inner.[!i] <> '"' then incr i 49 48 else begin 50 49 incr i; 51 50 let key_start = !i in 52 - (* Find end of key *) 53 51 while !i < len && inner.[!i] <> '"' do 54 52 if inner.[!i] = '\\' then i := !i + 2 55 53 else incr i 56 54 done; 57 55 let key = String.sub inner key_start (!i - key_start) in 58 - incr i; (* skip closing quote *) 59 - (* Skip colon *) 56 + incr i; 60 57 while !i < len && (inner.[!i] = ':' || inner.[!i] = ' ') do incr i done; 61 - (* 
Parse number *) 62 58 let num_start = !i in 63 59 while !i < len && inner.[!i] >= '0' && inner.[!i] <= '9' do incr i done; 64 60 let num_str = String.sub inner num_start (!i - num_start) in ··· 71 67 done; 72 68 !pairs 73 69 74 - (* Escape string for OCaml, preserving UTF-8 characters *) 75 70 let escape_ocaml_string s = 76 71 let buf = Buffer.create (String.length s * 2) in 77 72 String.iter (fun c -> ··· 83 78 | '\t' -> Buffer.add_string buf "\\t" 84 79 | c when Char.code c < 32 -> 85 80 Buffer.add_string buf (Printf.sprintf "\\x%02x" (Char.code c)) 86 - (* Keep all other characters including UTF-8 bytes as-is *) 87 81 | c -> Buffer.add_char buf c 88 82 ) s; 89 83 Buffer.contents buf 90 84 91 - let generate_profile_module lang_code pairs = 92 - let buf = Buffer.create 65536 in 93 - Buffer.add_string buf "(* Auto-generated language profile - do not edit *)\n\n"; 94 - Buffer.add_string buf (Printf.sprintf "let lang = %S\n\n" lang_code); 95 - Buffer.add_string buf "let freq = [\n"; 96 - List.iter (fun (ngram, count) -> 97 - (* Use custom escaping to preserve UTF-8 *) 98 - Buffer.add_string buf (Printf.sprintf " (\"%s\", %d);\n" (escape_ocaml_string ngram) count) 99 - ) (List.rev pairs); 100 - Buffer.add_string buf "]\n"; 101 - Buffer.contents buf 102 - 103 85 let () = 104 86 if Array.length Sys.argv < 3 then begin 105 87 Printf.eprintf "Usage: %s <profiles_dir> <output_dir>\n" Sys.argv.(0); ··· 109 91 let profiles_dir = Sys.argv.(1) in 110 92 let output_dir = Sys.argv.(2) in 111 93 112 - (* Process each profile *) 94 + (* First pass: collect all profiles and build global string table *) 113 95 let entries = Sys.readdir profiles_dir in 114 - let lang_codes = ref [] in 96 + let all_profiles = ref [] in 97 + let all_ngrams = ref StringSet.empty in 115 98 116 99 Array.iter (fun entry -> 117 100 let path = Filename.concat profiles_dir entry in ··· 120 103 try 121 104 let content = read_file path in 122 105 let pairs = parse_freq_json content in 123 - let lang_code = 124 
- (* Normalize lang code: zh-cn -> zh_cn *) 125 - String.map (fun c -> if c = '-' then '_' else c) entry 126 - in 127 - let ml_content = generate_profile_module entry pairs in 128 - let out_path = Filename.concat output_dir (Printf.sprintf "profile_%s.ml" lang_code) in 129 - let oc = open_out out_path in 130 - output_string oc ml_content; 131 - close_out oc; 132 - lang_codes := (entry, lang_code) :: !lang_codes 106 + List.iter (fun (ngram, _) -> 107 + all_ngrams := StringSet.add ngram !all_ngrams 108 + ) pairs; 109 + all_profiles := (entry, pairs) :: !all_profiles 133 110 with e -> 134 111 Printf.eprintf "Error processing %s: %s\n%!" entry (Printexc.to_string e); 135 112 exit 1 136 113 end 137 114 ) entries; 138 115 139 - (* Sort language codes for deterministic output *) 140 - let sorted_codes = List.sort (fun (a, _) (b, _) -> String.compare a b) !lang_codes in 116 + let sorted_profiles = List.sort (fun (a, _) (b, _) -> String.compare a b) !all_profiles in 141 117 142 - (* Generate profiles index module *) 143 - let index_path = Filename.concat output_dir "profiles.ml" in 144 - let oc = open_out index_path in 145 - Printf.fprintf oc "(* Auto-generated profiles index - do not edit *)\n\n"; 146 - Printf.fprintf oc "let all_profiles = [\n"; 147 - List.iter (fun (orig_code, ml_code) -> 148 - Printf.fprintf oc " (%S, Profile_%s.freq);\n" orig_code ml_code 149 - ) sorted_codes; 150 - Printf.fprintf oc "]\n"; 151 - close_out oc 118 + (* Build ngram -> index mapping *) 119 + let ngram_list = StringSet.elements !all_ngrams in 120 + let ngram_to_idx = 121 + let tbl = Hashtbl.create (List.length ngram_list) in 122 + List.iteri (fun idx ngram -> Hashtbl.add tbl ngram idx) ngram_list; 123 + tbl 124 + in 125 + 126 + Printf.eprintf "Total unique n-grams: %d\n%!" (List.length ngram_list); 127 + Printf.eprintf "Total languages: %d\n%!" 
(List.length sorted_profiles); 128 + 129 + (* Calculate total data size and offsets *) 130 + let offsets = ref [] in 131 + let current_offset = ref 0 in 132 + List.iter (fun (lang_code, pairs) -> 133 + let num_pairs = List.length pairs in 134 + offsets := (lang_code, !current_offset, num_pairs) :: !offsets; 135 + current_offset := !current_offset + (num_pairs * 2) 136 + ) sorted_profiles; 137 + let offsets = List.rev !offsets in 138 + let total_ints = !current_offset in 139 + 140 + Printf.eprintf "Total int array size: %d elements\n%!" total_ints; 141 + 142 + (* Generate single packed module *) 143 + let out_path = Filename.concat output_dir "profiles_packed.ml" in 144 + let oc = open_out out_path in 145 + 146 + Printf.fprintf oc "(* Auto-generated packed profiles - do not edit *)\n"; 147 + Printf.fprintf oc "(* Single flat data array for maximum efficiency with WASM/js_of_ocaml *)\n\n"; 148 + 149 + (* Output string table *) 150 + Printf.fprintf oc "let ngram_table = [|\n"; 151 + List.iter (fun ngram -> 152 + Printf.fprintf oc " \"%s\";\n" (escape_ocaml_string ngram) 153 + ) ngram_list; 154 + Printf.fprintf oc "|]\n\n"; 155 + 156 + (* Output single flat data array with all profiles concatenated *) 157 + Printf.fprintf oc "(* Flat array of (ngram_index, frequency) pairs for all languages *)\n"; 158 + Printf.fprintf oc "let profile_data = [|\n"; 159 + List.iter (fun (_, pairs) -> 160 + List.iter (fun (ngram, freq) -> 161 + let idx = Hashtbl.find ngram_to_idx ngram in 162 + Printf.fprintf oc " %d; %d;\n" idx freq 163 + ) (List.rev pairs) 164 + ) sorted_profiles; 165 + Printf.fprintf oc "|]\n\n"; 166 + 167 + (* Output offsets table: (lang_code, start_index, num_pairs) *) 168 + Printf.fprintf oc "(* Profile offsets: (lang_code, start_index_in_data, num_ngram_pairs) *)\n"; 169 + Printf.fprintf oc "let profile_offsets = [|\n"; 170 + List.iter (fun (lang_code, offset, num_pairs) -> 171 + Printf.fprintf oc " (%S, %d, %d);\n" lang_code offset num_pairs 172 + ) offsets; 173 + 
Printf.fprintf oc "|]\n"; 174 + 175 + close_out oc; 176 + 177 + Printf.eprintf "Generated %s\n%!" out_path
+3 -100
lib/dune
··· 1 - ; Rule to generate all profile modules from JSON data files 1 + ; Rule to generate packed profiles module from JSON data files 2 2 3 3 (rule 4 - (targets 5 - profile_ar.ml 6 - profile_bg.ml 7 - profile_bn.ml 8 - profile_ca.ml 9 - profile_cs.ml 10 - profile_da.ml 11 - profile_de.ml 12 - profile_el.ml 13 - profile_en.ml 14 - profile_es.ml 15 - profile_et.ml 16 - profile_fa.ml 17 - profile_fi.ml 18 - profile_fr.ml 19 - profile_gu.ml 20 - profile_he.ml 21 - profile_hi.ml 22 - profile_hr.ml 23 - profile_hu.ml 24 - profile_id.ml 25 - profile_it.ml 26 - profile_ja.ml 27 - profile_ko.ml 28 - profile_lt.ml 29 - profile_lv.ml 30 - profile_mk.ml 31 - profile_ml.ml 32 - profile_nl.ml 33 - profile_no.ml 34 - profile_pa.ml 35 - profile_pl.ml 36 - profile_pt.ml 37 - profile_ro.ml 38 - profile_ru.ml 39 - profile_si.ml 40 - profile_sq.ml 41 - profile_sv.ml 42 - profile_ta.ml 43 - profile_te.ml 44 - profile_th.ml 45 - profile_tl.ml 46 - profile_tr.ml 47 - profile_uk.ml 48 - profile_ur.ml 49 - profile_vi.ml 50 - profile_zh_cn.ml 51 - profile_zh_tw.ml 52 - profiles.ml) 4 + (targets profiles_packed.ml) 53 5 (deps 54 6 (glob_files profiles.sm/*)) 55 7 (action ··· 59 11 (name langdetect) 60 12 (public_name langdetect) 61 13 (libraries uutf) 62 - (modules 63 - langdetect 64 - profiles 65 - profile_ar 66 - profile_bg 67 - profile_bn 68 - profile_ca 69 - profile_cs 70 - profile_da 71 - profile_de 72 - profile_el 73 - profile_en 74 - profile_es 75 - profile_et 76 - profile_fa 77 - profile_fi 78 - profile_fr 79 - profile_gu 80 - profile_he 81 - profile_hi 82 - profile_hr 83 - profile_hu 84 - profile_id 85 - profile_it 86 - profile_ja 87 - profile_ko 88 - profile_lt 89 - profile_lv 90 - profile_mk 91 - profile_ml 92 - profile_nl 93 - profile_no 94 - profile_pa 95 - profile_pl 96 - profile_pt 97 - profile_ro 98 - profile_ru 99 - profile_si 100 - profile_sq 101 - profile_sv 102 - profile_ta 103 - profile_te 104 - profile_th 105 - profile_tl 106 - profile_tr 107 - profile_uk 108 - 
profile_ur 109 - profile_vi 110 - profile_zh_cn 111 - profile_zh_tw)) 14 + (modules langdetect profiles_packed))
+67 -8
lib/langdetect.ml
··· 121 121 done; 122 122 !max_p 123 123 124 - let random_state = ref 12345 125 - let set_seed seed = random_state := seed 124 + (* LCG random number generator using Int32 for WASM compatibility. 125 + The constants (1103515245, 12345) are from the C standard library's rand(). 126 + We mask with 0x3FFFFFFF (30 bits) to ensure the result fits in OCaml's 127 + 31-bit int on 32-bit platforms like WASM. *) 128 + let random_state = ref 12345l 129 + let set_seed seed = random_state := Int32.of_int seed 126 130 127 131 let next_random () = 128 - random_state := ((!random_state * 1103515245) + 12345) land 0x7FFFFFFF; 129 - !random_state 132 + (* Use Int32 to handle overflow correctly on 32-bit platforms (WASM) *) 133 + let open Int32 in 134 + random_state := logand (add (mul !random_state 1103515245l) 12345l) 0x7FFFFFFFl; 135 + (* Mask to 30 bits to fit in OCaml's 31-bit int on 32-bit platforms *) 136 + to_int (logand !random_state 0x3FFFFFFFl) 130 137 131 - let random_int bound = next_random () mod bound 138 + let random_int bound = 139 + let r = next_random () in 140 + (* Ensure positive result even if bound is negative *) 141 + abs (r mod bound) 142 + 143 + let max_random_float = Int32.to_float 0x3FFFFFFFl 132 144 133 145 let random_gaussian () = 134 - let u1 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in 135 - let u2 = float_of_int (next_random ()) /. float_of_int 0x7FFFFFFF in 146 + let u1 = float_of_int (next_random ()) /. max_random_float in 147 + let u2 = float_of_int (next_random ()) /. max_random_float in 136 148 let u1 = max 0.0001 u1 in 137 149 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2) 138 150 ··· 162 174 done; 163 175 lang_prob 164 176 177 + (* Create detector from packed profiles with flat data array. 
178 + ngram_table: global string table mapping indices to n-gram strings 179 + profile_data: flat int array of (ngram_index, frequency) pairs 180 + profile_offsets: array of (lang_code, start_index, num_pairs) *) 181 + let create_packed ?(config = default_config) ~ngram_table ~profile_data profile_offsets = 182 + let n_langs = Array.length profile_offsets in 183 + let lang_list = Array.map (fun (lang, _, _) -> lang) profile_offsets in 184 + let all_ngrams = Hashtbl.create 65536 in 185 + let lang_totals = Array.make n_langs 0 in 186 + Array.iteri 187 + (fun lang_idx (_, start_idx, num_pairs) -> 188 + for pair_idx = 0 to num_pairs - 1 do 189 + let data_idx = start_idx + (pair_idx * 2) in 190 + let ngram_idx = profile_data.(data_idx) in 191 + let count = profile_data.(data_idx + 1) in 192 + let ngram = ngram_table.(ngram_idx) in 193 + let current = 194 + match Hashtbl.find_opt all_ngrams ngram with 195 + | Some arr -> arr 196 + | None -> 197 + let arr = Array.make n_langs 0 in 198 + Hashtbl.add all_ngrams ngram arr; 199 + arr 200 + in 201 + current.(lang_idx) <- count; 202 + lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count 203 + done) 204 + profile_offsets; 205 + let word_lang_prob = 206 + Hashtbl.fold 207 + (fun ngram counts acc -> 208 + let probs = Array.make n_langs 0.0 in 209 + for i = 0 to n_langs - 1 do 210 + if lang_totals.(i) > 0 then 211 + probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i) 212 + done; 213 + StringMap.add ngram probs acc) 214 + all_ngrams StringMap.empty 215 + in 216 + { config; word_lang_prob; lang_list; seed = None } 217 + 218 + (* Create detector from legacy list-based profiles. 
219 + profiles: list of (lang_code, (ngram, frequency) list) *) 165 220 let create ?(config = default_config) profiles = 166 221 let lang_list = Array.of_list (List.map fst profiles) in 167 222 let n_langs = Array.length lang_list in ··· 222 277 | [] -> None 223 278 | best :: _ -> Some (best.lang, best.prob) 224 279 225 - let create_default ?config () = create ?config Profiles.all_profiles 280 + let create_default ?config () = 281 + create_packed ?config 282 + ~ngram_table:Profiles_packed.ngram_table 283 + ~profile_data:Profiles_packed.profile_data 284 + Profiles_packed.profile_offsets