Detect which human language a document uses from OCaml, from the Nu Html validator
languages
unicode
ocaml
1(*---------------------------------------------------------------------------
2 Copyright (c) 2007-2016 Mozilla Foundation
3 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4 SPDX-License-Identifier: MIT
5 ---------------------------------------------------------------------------*)
6
7(* Profile generator - converts JSON language profiles to a packed OCaml module
8 with shared string table and chunked arrays for WASM compatibility.
9
10 WASM has a limit of 10,000 operands for array_new_fixed, so we split
11 large arrays into chunks and concatenate at runtime. *)
12
13module StringSet = Set.Make(String)
14
15(* Maximum elements per array chunk to stay under WASM limits *)
16let chunk_size = 9000
17
18let read_file path =
19 let ic = open_in path in
20 Fun.protect
21 ~finally:(fun () -> close_in ic)
22 (fun () ->
23 let n = in_channel_length ic in
24 really_input_string ic n)
25
26(* Simple JSON parser for profile format {"freq": {...}} *)
27let parse_freq_json content =
28 let freq_start =
29 try String.index content '{' + 1
30 with Not_found -> failwith "No opening brace"
31 in
32 let content = String.sub content freq_start (String.length content - freq_start) in
33 let inner_start =
34 try String.index content '{' + 1
35 with Not_found -> failwith "No freq object"
36 in
37 let inner_end =
38 try String.rindex content '}'
39 with Not_found -> failwith "No closing brace"
40 in
41 let inner = String.sub content inner_start (inner_end - inner_start) in
42
43 let pairs = ref [] in
44 let i = ref 0 in
45 let len = String.length inner in
46 while !i < len do
47 while !i < len && (inner.[!i] = ' ' || inner.[!i] = '\n' || inner.[!i] = '\r' || inner.[!i] = '\t' || inner.[!i] = ',') do
48 incr i
49 done;
50 if !i >= len then ()
51 else begin
52 if inner.[!i] <> '"' then incr i
53 else begin
54 incr i;
55 let key_start = !i in
56 while !i < len && inner.[!i] <> '"' do
57 if inner.[!i] = '\\' then i := !i + 2
58 else incr i
59 done;
60 let key = String.sub inner key_start (!i - key_start) in
61 incr i;
62 while !i < len && (inner.[!i] = ':' || inner.[!i] = ' ') do incr i done;
63 let num_start = !i in
64 while !i < len && inner.[!i] >= '0' && inner.[!i] <= '9' do incr i done;
65 let num_str = String.sub inner num_start (!i - num_start) in
66 if num_str <> "" then begin
67 let num = int_of_string num_str in
68 pairs := (key, num) :: !pairs
69 end
70 end
71 end
72 done;
73 !pairs
74
75let escape_ocaml_string s =
76 let buf = Buffer.create (String.length s * 2) in
77 String.iter (fun c ->
78 match c with
79 | '"' -> Buffer.add_string buf "\\\""
80 | '\\' -> Buffer.add_string buf "\\\\"
81 | '\n' -> Buffer.add_string buf "\\n"
82 | '\r' -> Buffer.add_string buf "\\r"
83 | '\t' -> Buffer.add_string buf "\\t"
84 | c when Char.code c < 32 ->
85 Buffer.add_string buf (Printf.sprintf "\\x%02x" (Char.code c))
86 | c -> Buffer.add_char buf c
87 ) s;
88 Buffer.contents buf
89
90(* Split a list into chunks of at most n elements *)
91let chunk_list n lst =
92 let rec aux acc current current_len = function
93 | [] ->
94 if current_len > 0 then List.rev (List.rev current :: acc)
95 else List.rev acc
96 | x :: xs ->
97 if current_len >= n then
98 aux (List.rev current :: acc) [x] 1 xs
99 else
100 aux acc (x :: current) (current_len + 1) xs
101 in
102 aux [] [] 0 lst
103
104let () =
105 if Array.length Sys.argv < 3 then begin
106 Printf.eprintf "Usage: %s <profiles_dir> <output_dir>\n" Sys.argv.(0);
107 exit 1
108 end;
109
110 let profiles_dir = Sys.argv.(1) in
111 let output_dir = Sys.argv.(2) in
112
113 (* First pass: collect all profiles and build global string table *)
114 let entries = Sys.readdir profiles_dir in
115 let all_profiles = ref [] in
116 let all_ngrams = ref StringSet.empty in
117
118 Array.iter (fun entry ->
119 let path = Filename.concat profiles_dir entry in
120 if Sys.is_directory path then ()
121 else begin
122 try
123 let content = read_file path in
124 let pairs = parse_freq_json content in
125 List.iter (fun (ngram, _) ->
126 all_ngrams := StringSet.add ngram !all_ngrams
127 ) pairs;
128 all_profiles := (entry, pairs) :: !all_profiles
129 with e ->
130 Printf.eprintf "Error processing %s: %s\n%!" entry (Printexc.to_string e);
131 exit 1
132 end
133 ) entries;
134
135 let sorted_profiles = List.sort (fun (a, _) (b, _) -> String.compare a b) !all_profiles in
136
137 (* Build ngram -> index mapping *)
138 let ngram_list = StringSet.elements !all_ngrams in
139 let ngram_to_idx =
140 let tbl = Hashtbl.create (List.length ngram_list) in
141 List.iteri (fun idx ngram -> Hashtbl.add tbl ngram idx) ngram_list;
142 tbl
143 in
144
145 Printf.eprintf "Total unique n-grams: %d\n%!" (List.length ngram_list);
146 Printf.eprintf "Total languages: %d\n%!" (List.length sorted_profiles);
147
148 (* Calculate total data size and offsets *)
149 let offsets = ref [] in
150 let current_offset = ref 0 in
151 List.iter (fun (lang_code, pairs) ->
152 let num_pairs = List.length pairs in
153 offsets := (lang_code, !current_offset, num_pairs) :: !offsets;
154 current_offset := !current_offset + (num_pairs * 2)
155 ) sorted_profiles;
156 let offsets = List.rev !offsets in
157 let total_ints = !current_offset in
158
159 Printf.eprintf "Total int array size: %d elements\n%!" total_ints;
160
161 (* Chunk the data *)
162 let ngram_chunks = chunk_list chunk_size ngram_list in
163 Printf.eprintf "N-gram table: %d chunks of max %d elements\n%!"
164 (List.length ngram_chunks) chunk_size;
165
166 (* Build flat list of all profile data *)
167 let all_data = ref [] in
168 List.iter (fun (_, pairs) ->
169 List.iter (fun (ngram, freq) ->
170 let idx = Hashtbl.find ngram_to_idx ngram in
171 all_data := freq :: idx :: !all_data
172 ) (List.rev pairs)
173 ) sorted_profiles;
174 let all_data = List.rev !all_data in
175
176 let data_chunks = chunk_list chunk_size all_data in
177 Printf.eprintf "Profile data: %d chunks of max %d elements\n%!"
178 (List.length data_chunks) chunk_size;
179
180 (* Generate single packed module with chunked arrays *)
181 let out_path = Filename.concat output_dir "profiles_packed.ml" in
182 let oc = open_out out_path in
183
184 Printf.fprintf oc "(* Auto-generated packed profiles - do not edit *)\n";
185 Printf.fprintf oc "(* Chunked arrays for WASM compatibility (max %d elements per chunk) *)\n\n" chunk_size;
186
187 (* Output ngram table chunks *)
188 List.iteri (fun i chunk ->
189 Printf.fprintf oc "let ngram_chunk_%d = [|\n" i;
190 List.iter (fun ngram ->
191 Printf.fprintf oc " \"%s\";\n" (escape_ocaml_string ngram)
192 ) chunk;
193 Printf.fprintf oc "|]\n\n"
194 ) ngram_chunks;
195
196 (* Concatenate ngram chunks *)
197 Printf.fprintf oc "let ngram_table = Array.concat [\n";
198 List.iteri (fun i _ ->
199 Printf.fprintf oc " ngram_chunk_%d;\n" i
200 ) ngram_chunks;
201 Printf.fprintf oc "]\n\n";
202
203 (* Output profile data chunks *)
204 List.iteri (fun i chunk ->
205 Printf.fprintf oc "let data_chunk_%d = [|\n" i;
206 List.iter (fun v ->
207 Printf.fprintf oc " %d;\n" v
208 ) chunk;
209 Printf.fprintf oc "|]\n\n"
210 ) data_chunks;
211
212 (* Concatenate data chunks *)
213 Printf.fprintf oc "(* Flat array of (ngram_index, frequency) pairs for all languages *)\n";
214 Printf.fprintf oc "let profile_data = Array.concat [\n";
215 List.iteri (fun i _ ->
216 Printf.fprintf oc " data_chunk_%d;\n" i
217 ) data_chunks;
218 Printf.fprintf oc "]\n\n";
219
220 (* Output offsets table: (lang_code, start_index, num_pairs) *)
221 Printf.fprintf oc "(* Profile offsets: (lang_code, start_index_in_data, num_ngram_pairs) *)\n";
222 Printf.fprintf oc "let profile_offsets = [|\n";
223 List.iter (fun (lang_code, offset, num_pairs) ->
224 Printf.fprintf oc " (%S, %d, %d);\n" lang_code offset num_pairs
225 ) offsets;
226 Printf.fprintf oc "|]\n";
227
228 close_out oc;
229
230 Printf.eprintf "Generated %s\n%!" out_path