Zstd compression in pure OCaml
1(** Zstandard format constants (RFC 8878) *)
2
3(** Magic numbers *)
4let zstd_magic_number = 0xFD2FB528l
5let dict_magic_number = 0xEC30A437l
6let skippable_magic_start = 0x184D2A50l
7let skippable_magic_mask = 0xFFFFFFF0l
8let skippable_header_size = 8
9
10(** Block size limits *)
11let block_size_max = 128 * 1024 (* 128 KB *)
12let max_literals_size = block_size_max
13
14(** Maximum values *)
15let max_window_log = 31
16let min_window_log = 10
17let max_huffman_bits = 11
18let max_fse_accuracy_log = 15
19let max_huffman_symbols = 256
20let max_fse_symbols = 256
21
22(** Block types *)
23type block_type =
24 | Raw_block
25 | RLE_block
26 | Compressed_block
27 | Reserved_block
28
29let block_type_of_int = function
30 | 0 -> Raw_block
31 | 1 -> RLE_block
32 | 2 -> Compressed_block
33 | _ -> Reserved_block
34
35(* Block type integer values for encoding *)
36let block_raw = 0
37let block_rle = 1
38let block_compressed = 2
39
40(** Literals block types *)
41type literals_block_type =
42 | Raw_literals
43 | RLE_literals
44 | Compressed_literals
45 | Treeless_literals
46
47let literals_block_type_of_int = function
48 | 0 -> Raw_literals
49 | 1 -> RLE_literals
50 | 2 -> Compressed_literals
51 | _ -> Treeless_literals
52
53(** Sequence compression modes *)
54type seq_mode =
55 | Predefined_mode
56 | RLE_mode
57 | FSE_mode
58 | Repeat_mode
59
60let seq_mode_of_int = function
61 | 0 -> Predefined_mode
62 | 1 -> RLE_mode
63 | 2 -> FSE_mode
64 | _ -> Repeat_mode
65
66(** Default FSE distribution tables for predefined mode *)
67
68(* Literals length default distribution (accuracy log 6, 64 states) *)
69let ll_default_distribution = [|
70 4; 3; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 1; 1; 1;
71 2; 2; 2; 2; 2; 2; 2; 2; 2; 3; 2; 1; 1; 1; 1; 1;
72 -1; -1; -1; -1
73|]
74let ll_default_accuracy_log = 6
75let ll_max_accuracy_log = 9
76
77(* Match length default distribution (accuracy log 6, 64 states) *)
78let ml_default_distribution = [|
79 1; 4; 3; 2; 2; 2; 2; 2; 2; 1; 1; 1; 1; 1; 1; 1;
80 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
81 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; -1; -1;
82 -1; -1; -1; -1; -1
83|]
84let ml_default_accuracy_log = 6
85let ml_max_accuracy_log = 9
86
87(* Offset default distribution (accuracy log 5, 32 states) *)
88let of_default_distribution = [|
89 1; 1; 1; 1; 1; 1; 2; 2; 2; 1; 1; 1; 1; 1; 1; 1;
90 1; 1; 1; 1; 1; 1; 1; 1; -1; -1; -1; -1; -1
91|]
92let of_default_accuracy_log = 5
93let of_max_accuracy_log = 8
94
95(** Sequence code baselines and extra bits *)
96
97(* Literals length: code 0-35 *)
98let ll_baselines = [|
99 0; 1; 2; 3; 4; 5; 6; 7; 8; 9; 10; 11;
100 12; 13; 14; 15; 16; 18; 20; 22; 24; 28; 32; 40;
101 48; 64; 128; 256; 512; 1024; 2048; 4096; 8192; 16384; 32768; 65536
102|]
103let ll_extra_bits = [|
104 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
105 0; 0; 0; 0; 1; 1; 1; 1; 2; 2; 3; 3;
106 4; 6; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16
107|]
108let ll_max_code = 35
109
110(* Match length: code 0-52 *)
111let ml_baselines = [|
112 3; 4; 5; 6; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16;
113 17; 18; 19; 20; 21; 22; 23; 24; 25; 26; 27; 28; 29; 30;
114 31; 32; 33; 34; 35; 37; 39; 41; 43; 47; 51; 59; 67; 83;
115 99; 131; 259; 515; 1027; 2051; 4099; 8195; 16387; 32771; 65539
116|]
117let ml_extra_bits = [|
118 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
119 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
120 0; 0; 0; 0; 1; 1; 1; 1; 2; 2; 3; 3; 4; 4;
121 5; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16
122|]
123let ml_max_code = 52
124
125(* Offset codes: the code is the number of bits to read *)
126let of_max_code = 31
127
128(** Initial repeat offsets *)
129let initial_repeat_offsets = [| 1; 4; 8 |]
130
131(** Error types *)
132type error =
133 | Invalid_magic_number
134 | Invalid_frame_header
135 | Invalid_block_type
136 | Invalid_block_size
137 | Invalid_literals_header
138 | Invalid_huffman_table
139 | Invalid_fse_table
140 | Invalid_sequence_header
141 | Invalid_offset
142 | Invalid_match_length
143 | Truncated_input
144 | Output_too_small
145 | Checksum_mismatch
146 | Dictionary_mismatch
147 | Corruption
148
149exception Zstd_error of error
150
151let error_message = function
152 | Invalid_magic_number -> "Invalid magic number"
153 | Invalid_frame_header -> "Invalid frame header"
154 | Invalid_block_type -> "Invalid block type"
155 | Invalid_block_size -> "Invalid block size"
156 | Invalid_literals_header -> "Invalid literals header"
157 | Invalid_huffman_table -> "Invalid Huffman table"
158 | Invalid_fse_table -> "Invalid FSE table"
159 | Invalid_sequence_header -> "Invalid sequence header"
160 | Invalid_offset -> "Invalid offset"
161 | Invalid_match_length -> "Invalid match length"
162 | Truncated_input -> "Truncated input"
163 | Output_too_small -> "Output buffer too small"
164 | Checksum_mismatch -> "Checksum mismatch"
165 | Dictionary_mismatch -> "Dictionary mismatch"
166 | Corruption -> "Data corruption detected"