Pure OCaml implementation of the Brotli compression algorithm
1(* Canonical Huffman coding with 2-level lookup tables for Brotli *)
2
3let max_length = 15
4
5(* A Huffman code entry in the lookup table *)
6type code = {
7 bits : int; (* Number of bits used for this symbol, or bits in subtable *)
8 value : int; (* Symbol value, or offset to subtable *)
9}
10
11(* A Huffman lookup table - flat array with 2-level structure *)
12type table = code array
13
14exception Invalid_huffman_tree
15
16(* Returns reverse(reverse(key, len) + 1, len) for canonical code generation *)
17let get_next_key key length =
18 let rec loop step =
19 if key land step = 0 then
20 (key land (step - 1)) + step
21 else
22 loop (step lsr 1)
23 in
24 loop (1 lsl (length - 1))
25
26(* Store code in table[i], table[i+step], table[i+2*step], ... *)
27let replicate_value table base step table_end code =
28 let rec loop index =
29 if index >= base then begin
30 table.(index) <- code;
31 loop (index - step)
32 end
33 in
34 loop (base + table_end - step)
35
36(* Calculate the width of the next 2nd level table *)
37let next_table_bit_size count length root_bits =
38 let left = ref (1 lsl (length - root_bits)) in
39 let len = ref length in
40 while !len < max_length do
41 left := !left - count.(!len);
42 if !left <= 0 then
43 len := max_length (* Break *)
44 else begin
45 incr len;
46 left := !left lsl 1
47 end
48 done;
49 !len - root_bits
50
51(* Build a Huffman lookup table from code lengths *)
52let build_table ~code_lengths ~alphabet_size ~root_bits =
53 let count = Array.make (max_length + 1) 0 in
54 let offset = Array.make (max_length + 1) 0 in
55 let sorted_symbols = Array.make alphabet_size 0 in
56
57 (* Build histogram of code lengths *)
58 for symbol = 0 to alphabet_size - 1 do
59 let len = code_lengths.(symbol) in
60 count.(len) <- count.(len) + 1
61 done;
62
63 (* Generate offsets into sorted symbol table by code length *)
64 offset.(1) <- 0;
65 for length = 1 to max_length - 1 do
66 offset.(length + 1) <- offset.(length) + count.(length)
67 done;
68
69 (* Sort symbols by length, by symbol order within each length *)
70 for symbol = 0 to alphabet_size - 1 do
71 let length = code_lengths.(symbol) in
72 if length <> 0 then begin
73 sorted_symbols.(offset.(length)) <- symbol;
74 offset.(length) <- offset.(length) + 1
75 end
76 done;
77
78 let table_bits = ref root_bits in
79 let table_size = ref (1 lsl !table_bits) in
80 let total_size = ref !table_size in
81
82 (* Pre-allocate table with maximum possible size *)
83 let max_table_size = !table_size * 4 in (* Conservative estimate *)
84 let root_table = Array.make max_table_size { bits = 0; value = 0 } in
85
86 (* Special case: code with only one value *)
87 if offset.(max_length) = 1 then begin
88 for key = 0 to !total_size - 1 do
89 root_table.(key) <- { bits = 0; value = sorted_symbols.(0) land 0xFFFF }
90 done;
91 Array.sub root_table 0 !total_size
92 end
93 else begin
94 let table = ref 0 in
95 let key = ref 0 in
96 let symbol = ref 0 in
97 let step = ref 2 in
98
99 (* Fill in root table *)
100 for length = 1 to root_bits do
101 while count.(length) > 0 do
102 let code = { bits = length land 0xFF; value = sorted_symbols.(!symbol) land 0xFFFF } in
103 incr symbol;
104 replicate_value root_table (!table + !key) !step !table_size code;
105 key := get_next_key !key length;
106 count.(length) <- count.(length) - 1
107 done;
108 step := !step lsl 1
109 done;
110
111 (* Fill in 2nd level tables and add pointers to root table *)
112 let mask = !total_size - 1 in
113 let low = ref (-1) in
114 step := 2;
115 let start_table = 0 in
116
117 for length = root_bits + 1 to max_length do
118 while count.(length) > 0 do
119 if (!key land mask) <> !low then begin
120 table := !table + !table_size;
121 table_bits := next_table_bit_size count length root_bits;
122 table_size := 1 lsl !table_bits;
123 total_size := !total_size + !table_size;
124 low := !key land mask;
125 root_table.(start_table + !low) <- {
126 bits = (!table_bits + root_bits) land 0xFF;
127 value = (!table - start_table - !low) land 0xFFFF
128 }
129 end;
130 let code = { bits = (length - root_bits) land 0xFF; value = sorted_symbols.(!symbol) land 0xFFFF } in
131 incr symbol;
132 replicate_value root_table (!table + (!key lsr root_bits)) !step !table_size code;
133 key := get_next_key !key length;
134 count.(length) <- count.(length) - 1
135 done;
136 step := !step lsl 1
137 done;
138
139 Array.sub root_table 0 !total_size
140 end
141
142(* Read a symbol from the bit stream using the Huffman table *)
143let[@inline] read_symbol table root_bits br =
144 let bits = Bit_reader.peek_bits br 15 in
145 let initial_idx = bits land ((1 lsl root_bits) - 1) in
146 let entry = table.(initial_idx) in
147 if entry.bits <= root_bits then begin
148 (* Symbol found in root table *)
149 Bit_reader.skip_bits br entry.bits;
150 entry.value
151 end
152 else begin
153 (* Need to look in 2nd level table *)
154 Bit_reader.skip_bits br root_bits;
155 let extra_bits = entry.bits - root_bits in
156 let idx2 = (bits lsr root_bits) land ((1 lsl extra_bits) - 1) in
157 let entry2 = table.(initial_idx + entry.value + idx2) in
158 Bit_reader.skip_bits br entry2.bits;
159 entry2.value
160 end
161
162(* Build Huffman table for simple prefix codes (1-4 symbols) *)
163let build_simple_table symbols num_symbols =
164 let table_size = 1 lsl Constants.huffman_max_table_bits in
165 let table = Array.make table_size { bits = 0; value = 0 } in
166
167 match num_symbols with
168 | 1 ->
169 (* Single symbol - use 0 bits *)
170 for i = 0 to table_size - 1 do
171 table.(i) <- { bits = 0; value = symbols.(0) }
172 done;
173 table
174 | 2 ->
175 (* Two symbols - 1 bit each *)
176 let half = table_size / 2 in
177 for i = 0 to half - 1 do
178 table.(i) <- { bits = 1; value = symbols.(0) }
179 done;
180 for i = half to table_size - 1 do
181 table.(i) <- { bits = 1; value = symbols.(1) }
182 done;
183 table
184 | 3 ->
185 (* Three symbols: 1, 2, 2 bits *)
186 let quarter = table_size / 4 in
187 for i = 0 to quarter - 1 do
188 table.(i) <- { bits = 1; value = symbols.(0) }
189 done;
190 for i = quarter to 2 * quarter - 1 do
191 table.(i) <- { bits = 2; value = symbols.(1) }
192 done;
193 for i = 2 * quarter to table_size - 1 do
194 table.(i) <- { bits = 2; value = symbols.(2) }
195 done;
196 table
197 | 4 ->
198 (* Four symbols: 2 bits each, with tree-select bit *)
199 let quarter = table_size / 4 in
200 for i = 0 to quarter - 1 do
201 table.(i) <- { bits = 2; value = symbols.(0) }
202 done;
203 for i = quarter to 2 * quarter - 1 do
204 table.(i) <- { bits = 2; value = symbols.(1) }
205 done;
206 for i = 2 * quarter to 3 * quarter - 1 do
207 table.(i) <- { bits = 2; value = symbols.(2) }
208 done;
209 for i = 3 * quarter to table_size - 1 do
210 table.(i) <- { bits = 2; value = symbols.(3) }
211 done;
212 table
213 | _ ->
214 raise Invalid_huffman_tree
215
216(* Maximum table sizes for different alphabet sizes *)
217let max_table_sizes = [|
218 256; 402; 436; 468; 500; 534; 566; 598;
219 630; 662; 694; 726; 758; 790; 822; 854;
220 886; 918; 950; 982; 1014; 1046; 1078; 1080
221|]
222
223(* Get maximum table size for a given alphabet size *)
224let max_table_size alphabet_size =
225 if alphabet_size <= 256 then 256
226 else if alphabet_size <= 704 then 1080
227 else 2048 (* Large alphabets *)