Pure OCaml implementation of the Brotli compression algorithm
at main 227 lines 7.3 kB view raw
1(* Canonical Huffman coding with 2-level lookup tables for Brotli *) 2 3let max_length = 15 4 5(* A Huffman code entry in the lookup table *) 6type code = { 7 bits : int; (* Number of bits used for this symbol, or bits in subtable *) 8 value : int; (* Symbol value, or offset to subtable *) 9} 10 11(* A Huffman lookup table - flat array with 2-level structure *) 12type table = code array 13 14exception Invalid_huffman_tree 15 16(* Returns reverse(reverse(key, len) + 1, len) for canonical code generation *) 17let get_next_key key length = 18 let rec loop step = 19 if key land step = 0 then 20 (key land (step - 1)) + step 21 else 22 loop (step lsr 1) 23 in 24 loop (1 lsl (length - 1)) 25 26(* Store code in table[i], table[i+step], table[i+2*step], ... *) 27let replicate_value table base step table_end code = 28 let rec loop index = 29 if index >= base then begin 30 table.(index) <- code; 31 loop (index - step) 32 end 33 in 34 loop (base + table_end - step) 35 36(* Calculate the width of the next 2nd level table *) 37let next_table_bit_size count length root_bits = 38 let left = ref (1 lsl (length - root_bits)) in 39 let len = ref length in 40 while !len < max_length do 41 left := !left - count.(!len); 42 if !left <= 0 then 43 len := max_length (* Break *) 44 else begin 45 incr len; 46 left := !left lsl 1 47 end 48 done; 49 !len - root_bits 50 51(* Build a Huffman lookup table from code lengths *) 52let build_table ~code_lengths ~alphabet_size ~root_bits = 53 let count = Array.make (max_length + 1) 0 in 54 let offset = Array.make (max_length + 1) 0 in 55 let sorted_symbols = Array.make alphabet_size 0 in 56 57 (* Build histogram of code lengths *) 58 for symbol = 0 to alphabet_size - 1 do 59 let len = code_lengths.(symbol) in 60 count.(len) <- count.(len) + 1 61 done; 62 63 (* Generate offsets into sorted symbol table by code length *) 64 offset.(1) <- 0; 65 for length = 1 to max_length - 1 do 66 offset.(length + 1) <- offset.(length) + count.(length) 67 done; 68 69 (* Sort symbols by length, by symbol order within each length *) 70 for symbol = 0 to alphabet_size - 1 do 71 let length = code_lengths.(symbol) in 72 if length <> 0 then begin 73 sorted_symbols.(offset.(length)) <- symbol; 74 offset.(length) <- offset.(length) + 1 75 end 76 done; 77 78 let table_bits = ref root_bits in 79 let table_size = ref (1 lsl !table_bits) in 80 let total_size = ref !table_size in 81 82 (* Pre-allocate table with maximum possible size *) 83 let max_table_size = !table_size * 4 in (* Conservative estimate *) 84 let root_table = Array.make max_table_size { bits = 0; value = 0 } in 85 86 (* Special case: code with only one value *) 87 if offset.(max_length) = 1 then begin 88 for key = 0 to !total_size - 1 do 89 root_table.(key) <- { bits = 0; value = sorted_symbols.(0) land 0xFFFF } 90 done; 91 Array.sub root_table 0 !total_size 92 end 93 else begin 94 let table = ref 0 in 95 let key = ref 0 in 96 let symbol = ref 0 in 97 let step = ref 2 in 98 99 (* Fill in root table *) 100 for length = 1 to root_bits do 101 while count.(length) > 0 do 102 let code = { bits = length land 0xFF; value = sorted_symbols.(!symbol) land 0xFFFF } in 103 incr symbol; 104 replicate_value root_table (!table + !key) !step !table_size code; 105 key := get_next_key !key length; 106 count.(length) <- count.(length) - 1 107 done; 108 step := !step lsl 1 109 done; 110 111 (* Fill in 2nd level tables and add pointers to root table *) 112 let mask = !total_size - 1 in 113 let low = ref (-1) in 114 step := 2; 115 let start_table = 0 in 116 117 for length = root_bits + 1 to max_length do 118 while count.(length) > 0 do 119 if (!key land mask) <> !low then begin 120 table := !table + !table_size; 121 table_bits := next_table_bit_size count length root_bits; 122 table_size := 1 lsl !table_bits; 123 total_size := !total_size + !table_size; 124 low := !key land mask; 125 root_table.(start_table + !low) <- { 126 bits = (!table_bits + root_bits) land 0xFF; 127 value = (!table - start_table - !low) land 0xFFFF 128 } 129 end; 130 let code = { bits = (length - root_bits) land 0xFF; value = sorted_symbols.(!symbol) land 0xFFFF } in 131 incr symbol; 132 replicate_value root_table (!table + (!key lsr root_bits)) !step !table_size code; 133 key := get_next_key !key length; 134 count.(length) <- count.(length) - 1 135 done; 136 step := !step lsl 1 137 done; 138 139 Array.sub root_table 0 !total_size 140 end 141 142(* Read a symbol from the bit stream using the Huffman table *) 143let[@inline] read_symbol table root_bits br = 144 let bits = Bit_reader.peek_bits br 15 in 145 let initial_idx = bits land ((1 lsl root_bits) - 1) in 146 let entry = table.(initial_idx) in 147 if entry.bits <= root_bits then begin 148 (* Symbol found in root table *) 149 Bit_reader.skip_bits br entry.bits; 150 entry.value 151 end 152 else begin 153 (* Need to look in 2nd level table *) 154 Bit_reader.skip_bits br root_bits; 155 let extra_bits = entry.bits - root_bits in 156 let idx2 = (bits lsr root_bits) land ((1 lsl extra_bits) - 1) in 157 let entry2 = table.(initial_idx + entry.value + idx2) in 158 Bit_reader.skip_bits br entry2.bits; 159 entry2.value 160 end 161 162(* Build Huffman table for simple prefix codes (1-4 symbols) *) 163let build_simple_table symbols num_symbols = 164 let table_size = 1 lsl Constants.huffman_max_table_bits in 165 let table = Array.make table_size { bits = 0; value = 0 } in 166 167 match num_symbols with 168 | 1 -> 169 (* Single symbol - use 0 bits *) 170 for i = 0 to table_size - 1 do 171 table.(i) <- { bits = 0; value = symbols.(0) } 172 done; 173 table 174 | 2 -> 175 (* Two symbols - 1 bit each *) 176 let half = table_size / 2 in 177 for i = 0 to half - 1 do 178 table.(i) <- { bits = 1; value = symbols.(0) } 179 done; 180 for i = half to table_size - 1 do 181 table.(i) <- { bits = 1; value = symbols.(1) } 182 done; 183 table 184 | 3 -> 185 (* Three symbols: 1, 2, 2 bits *) 186 let quarter = table_size / 4 in 187 for i = 0 to quarter - 1 do 188 table.(i) <- { bits = 1; value = symbols.(0) } 189 done; 190 for i = quarter to 2 * quarter - 1 do 191 table.(i) <- { bits = 2; value = symbols.(1) } 192 done; 193 for i = 2 * quarter to table_size - 1 do 194 table.(i) <- { bits = 2; value = symbols.(2) } 195 done; 196 table 197 | 4 -> 198 (* Four symbols: 2 bits each, with tree-select bit *) 199 let quarter = table_size / 4 in 200 for i = 0 to quarter - 1 do 201 table.(i) <- { bits = 2; value = symbols.(0) } 202 done; 203 for i = quarter to 2 * quarter - 1 do 204 table.(i) <- { bits = 2; value = symbols.(1) } 205 done; 206 for i = 2 * quarter to 3 * quarter - 1 do 207 table.(i) <- { bits = 2; value = symbols.(2) } 208 done; 209 for i = 3 * quarter to table_size - 1 do 210 table.(i) <- { bits = 2; value = symbols.(3) } 211 done; 212 table 213 | _ -> 214 raise Invalid_huffman_tree 215 216(* Maximum table sizes for different alphabet sizes *) 217let max_table_sizes = [| 218 256; 402; 436; 468; 500; 534; 566; 598; 219 630; 662; 694; 726; 758; 790; 822; 854; 220 886; 918; 950; 982; 1014; 1046; 1078; 1080 221|] 222 223(* Get maximum table size for a given alphabet size *) 224let max_table_size alphabet_size = 225 if alphabet_size <= 256 then 256 226 else if alphabet_size <= 704 then 1080 227 else 2048 (* Large alphabets *)