(*--------------------------------------------------------------------------- Copyright (c) 2007-2016 Mozilla Foundation Copyright (c) 2025 Anil Madhavapeddy SPDX-License-Identifier: MIT ---------------------------------------------------------------------------*) (* Profile generator - converts JSON language profiles to a packed OCaml module with shared string table and chunked arrays for WASM compatibility. WASM has a limit of 10,000 operands for array_new_fixed, so we split large arrays into chunks and concatenate at runtime. *) module StringSet = Set.Make(String) (* Maximum elements per array chunk to stay under WASM limits *) let chunk_size = 9000 let read_file path = let ic = open_in path in Fun.protect ~finally:(fun () -> close_in ic) (fun () -> let n = in_channel_length ic in really_input_string ic n) (* Simple JSON parser for profile format {"freq": {...}} *) let parse_freq_json content = let freq_start = try String.index content '{' + 1 with Not_found -> failwith "No opening brace" in let content = String.sub content freq_start (String.length content - freq_start) in let inner_start = try String.index content '{' + 1 with Not_found -> failwith "No freq object" in let inner_end = try String.rindex content '}' with Not_found -> failwith "No closing brace" in let inner = String.sub content inner_start (inner_end - inner_start) in let pairs = ref [] in let i = ref 0 in let len = String.length inner in while !i < len do while !i < len && (inner.[!i] = ' ' || inner.[!i] = '\n' || inner.[!i] = '\r' || inner.[!i] = '\t' || inner.[!i] = ',') do incr i done; if !i >= len then () else begin if inner.[!i] <> '"' then incr i else begin incr i; let key_start = !i in while !i < len && inner.[!i] <> '"' do if inner.[!i] = '\\' then i := !i + 2 else incr i done; let key = String.sub inner key_start (!i - key_start) in incr i; while !i < len && (inner.[!i] = ':' || inner.[!i] = ' ') do incr i done; let num_start = !i in while !i < len && inner.[!i] >= '0' && inner.[!i] <= '9' do incr i done; let num_str = String.sub inner num_start (!i - num_start) in if num_str <> "" then begin let num = int_of_string num_str in pairs := (key, num) :: !pairs end end end done; !pairs let escape_ocaml_string s = let buf = Buffer.create (String.length s * 2) in String.iter (fun c -> match c with | '"' -> Buffer.add_string buf "\\\"" | '\\' -> Buffer.add_string buf "\\\\" | '\n' -> Buffer.add_string buf "\\n" | '\r' -> Buffer.add_string buf "\\r" | '\t' -> Buffer.add_string buf "\\t" | c when Char.code c < 32 -> Buffer.add_string buf (Printf.sprintf "\\x%02x" (Char.code c)) | c -> Buffer.add_char buf c ) s; Buffer.contents buf (* Split a list into chunks of at most n elements *) let chunk_list n lst = let rec aux acc current current_len = function | [] -> if current_len > 0 then List.rev (List.rev current :: acc) else List.rev acc | x :: xs -> if current_len >= n then aux (List.rev current :: acc) [x] 1 xs else aux acc (x :: current) (current_len + 1) xs in aux [] [] 0 lst let () = if Array.length Sys.argv < 3 then begin Printf.eprintf "Usage: %s \n" Sys.argv.(0); exit 1 end; let profiles_dir = Sys.argv.(1) in let output_dir = Sys.argv.(2) in (* First pass: collect all profiles and build global string table *) let entries = Sys.readdir profiles_dir in let all_profiles = ref [] in let all_ngrams = ref StringSet.empty in Array.iter (fun entry -> let path = Filename.concat profiles_dir entry in if Sys.is_directory path then () else begin try let content = read_file path in let pairs = parse_freq_json content in List.iter (fun (ngram, _) -> all_ngrams := StringSet.add ngram !all_ngrams ) pairs; all_profiles := (entry, pairs) :: !all_profiles with e -> Printf.eprintf "Error processing %s: %s\n%!" entry (Printexc.to_string e); exit 1 end ) entries; let sorted_profiles = List.sort (fun (a, _) (b, _) -> String.compare a b) !all_profiles in (* Build ngram -> index mapping *) let ngram_list = StringSet.elements !all_ngrams in let ngram_to_idx = let tbl = Hashtbl.create (List.length ngram_list) in List.iteri (fun idx ngram -> Hashtbl.add tbl ngram idx) ngram_list; tbl in Printf.eprintf "Total unique n-grams: %d\n%!" (List.length ngram_list); Printf.eprintf "Total languages: %d\n%!" (List.length sorted_profiles); (* Calculate total data size and offsets *) let offsets = ref [] in let current_offset = ref 0 in List.iter (fun (lang_code, pairs) -> let num_pairs = List.length pairs in offsets := (lang_code, !current_offset, num_pairs) :: !offsets; current_offset := !current_offset + (num_pairs * 2) ) sorted_profiles; let offsets = List.rev !offsets in let total_ints = !current_offset in Printf.eprintf "Total int array size: %d elements\n%!" total_ints; (* Chunk the data *) let ngram_chunks = chunk_list chunk_size ngram_list in Printf.eprintf "N-gram table: %d chunks of max %d elements\n%!" (List.length ngram_chunks) chunk_size; (* Build flat list of all profile data *) let all_data = ref [] in List.iter (fun (_, pairs) -> List.iter (fun (ngram, freq) -> let idx = Hashtbl.find ngram_to_idx ngram in all_data := freq :: idx :: !all_data ) (List.rev pairs) ) sorted_profiles; let all_data = List.rev !all_data in let data_chunks = chunk_list chunk_size all_data in Printf.eprintf "Profile data: %d chunks of max %d elements\n%!" (List.length data_chunks) chunk_size; (* Generate single packed module with chunked arrays *) let out_path = Filename.concat output_dir "profiles_packed.ml" in let oc = open_out out_path in Printf.fprintf oc "(* Auto-generated packed profiles - do not edit *)\n"; Printf.fprintf oc "(* Chunked arrays for WASM compatibility (max %d elements per chunk) *)\n\n" chunk_size; (* Output ngram table chunks *) List.iteri (fun i chunk -> Printf.fprintf oc "let ngram_chunk_%d = [|\n" i; List.iter (fun ngram -> Printf.fprintf oc " \"%s\";\n" (escape_ocaml_string ngram) ) chunk; Printf.fprintf oc "|]\n\n" ) ngram_chunks; (* Concatenate ngram chunks *) Printf.fprintf oc "let ngram_table = Array.concat [\n"; List.iteri (fun i _ -> Printf.fprintf oc " ngram_chunk_%d;\n" i ) ngram_chunks; Printf.fprintf oc "]\n\n"; (* Output profile data chunks *) List.iteri (fun i chunk -> Printf.fprintf oc "let data_chunk_%d = [|\n" i; List.iter (fun v -> Printf.fprintf oc " %d;\n" v ) chunk; Printf.fprintf oc "|]\n\n" ) data_chunks; (* Concatenate data chunks *) Printf.fprintf oc "(* Flat array of (ngram_index, frequency) pairs for all languages *)\n"; Printf.fprintf oc "let profile_data = Array.concat [\n"; List.iteri (fun i _ -> Printf.fprintf oc " data_chunk_%d;\n" i ) data_chunks; Printf.fprintf oc "]\n\n"; (* Output offsets table: (lang_code, start_index, num_pairs) *) Printf.fprintf oc "(* Profile offsets: (lang_code, start_index_in_data, num_ngram_pairs) *)\n"; Printf.fprintf oc "let profile_offsets = [|\n"; List.iter (fun (lang_code, offset, num_pairs) -> Printf.fprintf oc " (%S, %d, %d);\n" lang_code offset num_pairs ) offsets; Printf.fprintf oc "|]\n"; close_out oc; Printf.eprintf "Generated %s\n%!" out_path