Detect which human language a document uses from OCaml, from the Nu Html validator
languages unicode ocaml
at main 230 lines 7.8 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2007-2016 Mozilla Foundation 3 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 SPDX-License-Identifier: MIT 5 ---------------------------------------------------------------------------*) 6 7(* Profile generator - converts JSON language profiles to a packed OCaml module 8 with shared string table and chunked arrays for WASM compatibility. 9 10 WASM has a limit of 10,000 operands for array_new_fixed, so we split 11 large arrays into chunks and concatenate at runtime. *) 12 13module StringSet = Set.Make(String) 14 15(* Maximum elements per array chunk to stay under WASM limits *) 16let chunk_size = 9000 17 18let read_file path = 19 let ic = open_in path in 20 Fun.protect 21 ~finally:(fun () -> close_in ic) 22 (fun () -> 23 let n = in_channel_length ic in 24 really_input_string ic n) 25 26(* Simple JSON parser for profile format {"freq": {...}} *) 27let parse_freq_json content = 28 let freq_start = 29 try String.index content '{' + 1 30 with Not_found -> failwith "No opening brace" 31 in 32 let content = String.sub content freq_start (String.length content - freq_start) in 33 let inner_start = 34 try String.index content '{' + 1 35 with Not_found -> failwith "No freq object" 36 in 37 let inner_end = 38 try String.rindex content '}' 39 with Not_found -> failwith "No closing brace" 40 in 41 let inner = String.sub content inner_start (inner_end - inner_start) in 42 43 let pairs = ref [] in 44 let i = ref 0 in 45 let len = String.length inner in 46 while !i < len do 47 while !i < len && (inner.[!i] = ' ' || inner.[!i] = '\n' || inner.[!i] = '\r' || inner.[!i] = '\t' || inner.[!i] = ',') do 48 incr i 49 done; 50 if !i >= len then () 51 else begin 52 if inner.[!i] <> '"' then incr i 53 else begin 54 incr i; 55 let key_start = !i in 56 while !i < len && inner.[!i] <> '"' do 57 if inner.[!i] = '\\' then i := !i + 2 58 else incr i 59 done; 60 let key = String.sub inner key_start (!i - key_start) in 61 incr i; 62 while !i < len && (inner.[!i] = ':' || inner.[!i] = ' ') do incr i done; 63 let num_start = !i in 64 while !i < len && inner.[!i] >= '0' && inner.[!i] <= '9' do incr i done; 65 let num_str = String.sub inner num_start (!i - num_start) in 66 if num_str <> "" then begin 67 let num = int_of_string num_str in 68 pairs := (key, num) :: !pairs 69 end 70 end 71 end 72 done; 73 !pairs 74 75let escape_ocaml_string s = 76 let buf = Buffer.create (String.length s * 2) in 77 String.iter (fun c -> 78 match c with 79 | '"' -> Buffer.add_string buf "\\\"" 80 | '\\' -> Buffer.add_string buf "\\\\" 81 | '\n' -> Buffer.add_string buf "\\n" 82 | '\r' -> Buffer.add_string buf "\\r" 83 | '\t' -> Buffer.add_string buf "\\t" 84 | c when Char.code c < 32 -> 85 Buffer.add_string buf (Printf.sprintf "\\x%02x" (Char.code c)) 86 | c -> Buffer.add_char buf c 87 ) s; 88 Buffer.contents buf 89 90(* Split a list into chunks of at most n elements *) 91let chunk_list n lst = 92 let rec aux acc current current_len = function 93 | [] -> 94 if current_len > 0 then List.rev (List.rev current :: acc) 95 else List.rev acc 96 | x :: xs -> 97 if current_len >= n then 98 aux (List.rev current :: acc) [x] 1 xs 99 else 100 aux acc (x :: current) (current_len + 1) xs 101 in 102 aux [] [] 0 lst 103 104let () = 105 if Array.length Sys.argv < 3 then begin 106 Printf.eprintf "Usage: %s <profiles_dir> <output_dir>\n" Sys.argv.(0); 107 exit 1 108 end; 109 110 let profiles_dir = Sys.argv.(1) in 111 let output_dir = Sys.argv.(2) in 112 113 (* First pass: collect all profiles and build global string table *) 114 let entries = Sys.readdir profiles_dir in 115 let all_profiles = ref [] in 116 let all_ngrams = ref StringSet.empty in 117 118 Array.iter (fun entry -> 119 let path = Filename.concat profiles_dir entry in 120 if Sys.is_directory path then () 121 else begin 122 try 123 let content = read_file path in 124 let pairs = parse_freq_json content in 125 List.iter (fun (ngram, _) -> 126 all_ngrams := StringSet.add ngram !all_ngrams 127 ) pairs; 128 all_profiles := (entry, pairs) :: !all_profiles 129 with e -> 130 Printf.eprintf "Error processing %s: %s\n%!" entry (Printexc.to_string e); 131 exit 1 132 end 133 ) entries; 134 135 let sorted_profiles = List.sort (fun (a, _) (b, _) -> String.compare a b) !all_profiles in 136 137 (* Build ngram -> index mapping *) 138 let ngram_list = StringSet.elements !all_ngrams in 139 let ngram_to_idx = 140 let tbl = Hashtbl.create (List.length ngram_list) in 141 List.iteri (fun idx ngram -> Hashtbl.add tbl ngram idx) ngram_list; 142 tbl 143 in 144 145 Printf.eprintf "Total unique n-grams: %d\n%!" (List.length ngram_list); 146 Printf.eprintf "Total languages: %d\n%!" (List.length sorted_profiles); 147 148 (* Calculate total data size and offsets *) 149 let offsets = ref [] in 150 let current_offset = ref 0 in 151 List.iter (fun (lang_code, pairs) -> 152 let num_pairs = List.length pairs in 153 offsets := (lang_code, !current_offset, num_pairs) :: !offsets; 154 current_offset := !current_offset + (num_pairs * 2) 155 ) sorted_profiles; 156 let offsets = List.rev !offsets in 157 let total_ints = !current_offset in 158 159 Printf.eprintf "Total int array size: %d elements\n%!" total_ints; 160 161 (* Chunk the data *) 162 let ngram_chunks = chunk_list chunk_size ngram_list in 163 Printf.eprintf "N-gram table: %d chunks of max %d elements\n%!" 164 (List.length ngram_chunks) chunk_size; 165 166 (* Build flat list of all profile data *) 167 let all_data = ref [] in 168 List.iter (fun (_, pairs) -> 169 List.iter (fun (ngram, freq) -> 170 let idx = Hashtbl.find ngram_to_idx ngram in 171 all_data := freq :: idx :: !all_data 172 ) (List.rev pairs) 173 ) sorted_profiles; 174 let all_data = List.rev !all_data in 175 176 let data_chunks = chunk_list chunk_size all_data in 177 Printf.eprintf "Profile data: %d chunks of max %d elements\n%!" 178 (List.length data_chunks) chunk_size; 179 180 (* Generate single packed module with chunked arrays *) 181 let out_path = Filename.concat output_dir "profiles_packed.ml" in 182 let oc = open_out out_path in 183 184 Printf.fprintf oc "(* Auto-generated packed profiles - do not edit *)\n"; 185 Printf.fprintf oc "(* Chunked arrays for WASM compatibility (max %d elements per chunk) *)\n\n" chunk_size; 186 187 (* Output ngram table chunks *) 188 List.iteri (fun i chunk -> 189 Printf.fprintf oc "let ngram_chunk_%d = [|\n" i; 190 List.iter (fun ngram -> 191 Printf.fprintf oc " \"%s\";\n" (escape_ocaml_string ngram) 192 ) chunk; 193 Printf.fprintf oc "|]\n\n" 194 ) ngram_chunks; 195 196 (* Concatenate ngram chunks *) 197 Printf.fprintf oc "let ngram_table = Array.concat [\n"; 198 List.iteri (fun i _ -> 199 Printf.fprintf oc " ngram_chunk_%d;\n" i 200 ) ngram_chunks; 201 Printf.fprintf oc "]\n\n"; 202 203 (* Output profile data chunks *) 204 List.iteri (fun i chunk -> 205 Printf.fprintf oc "let data_chunk_%d = [|\n" i; 206 List.iter (fun v -> 207 Printf.fprintf oc " %d;\n" v 208 ) chunk; 209 Printf.fprintf oc "|]\n\n" 210 ) data_chunks; 211 212 (* Concatenate data chunks *) 213 Printf.fprintf oc "(* Flat array of (ngram_index, frequency) pairs for all languages *)\n"; 214 Printf.fprintf oc "let profile_data = Array.concat [\n"; 215 List.iteri (fun i _ -> 216 Printf.fprintf oc " data_chunk_%d;\n" i 217 ) data_chunks; 218 Printf.fprintf oc "]\n\n"; 219 220 (* Output offsets table: (lang_code, start_index, num_pairs) *) 221 Printf.fprintf oc "(* Profile offsets: (lang_code, start_index_in_data, num_ngram_pairs) *)\n"; 222 Printf.fprintf oc "let profile_offsets = [|\n"; 223 List.iter (fun (lang_code, offset, num_pairs) -> 224 Printf.fprintf oc " (%S, %d, %d);\n" lang_code offset num_pairs 225 ) offsets; 226 Printf.fprintf oc "|]\n"; 227 228 close_out oc; 229 230 Printf.eprintf "Generated %s\n%!" out_path