Zstd compression in pure OCaml
at main 166 lines 4.6 kB view raw
1(** Zstandard format constants (RFC 8878) *) 2 3(** Magic numbers *) 4let zstd_magic_number = 0xFD2FB528l 5let dict_magic_number = 0xEC30A437l 6let skippable_magic_start = 0x184D2A50l 7let skippable_magic_mask = 0xFFFFFFF0l 8let skippable_header_size = 8 9 10(** Block size limits *) 11let block_size_max = 128 * 1024 (* 128 KB *) 12let max_literals_size = block_size_max 13 14(** Maximum values *) 15let max_window_log = 31 16let min_window_log = 10 17let max_huffman_bits = 11 18let max_fse_accuracy_log = 15 19let max_huffman_symbols = 256 20let max_fse_symbols = 256 21 22(** Block types *) 23type block_type = 24 | Raw_block 25 | RLE_block 26 | Compressed_block 27 | Reserved_block 28 29let block_type_of_int = function 30 | 0 -> Raw_block 31 | 1 -> RLE_block 32 | 2 -> Compressed_block 33 | _ -> Reserved_block 34 35(* Block type integer values for encoding *) 36let block_raw = 0 37let block_rle = 1 38let block_compressed = 2 39 40(** Literals block types *) 41type literals_block_type = 42 | Raw_literals 43 | RLE_literals 44 | Compressed_literals 45 | Treeless_literals 46 47let literals_block_type_of_int = function 48 | 0 -> Raw_literals 49 | 1 -> RLE_literals 50 | 2 -> Compressed_literals 51 | _ -> Treeless_literals 52 53(** Sequence compression modes *) 54type seq_mode = 55 | Predefined_mode 56 | RLE_mode 57 | FSE_mode 58 | Repeat_mode 59 60let seq_mode_of_int = function 61 | 0 -> Predefined_mode 62 | 1 -> RLE_mode 63 | 2 -> FSE_mode 64 | _ -> Repeat_mode 65 66(** Default FSE distribution tables for predefined mode *) 67 68(* Literals length default distribution (accuracy log 6, 64 states) *) 69let ll_default_distribution = [| 70 4; 3; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 1; 1; 1; 71 2; 2; 2; 2; 2; 2; 2; 2; 2; 3; 2; 1; 1; 1; 1; 1; 72 -1; -1; -1; -1 73|] 74let ll_default_accuracy_log = 6 75let ll_max_accuracy_log = 9 76 77(* Match length default distribution (accuracy log 6, 64 states) *) 78let ml_default_distribution = [| 79 1; 4; 3; 2; 2; 2; 2; 2; 2; 1; 1; 1; 1; 1; 1; 1; 80 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 81 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; -1; -1; 82 -1; -1; -1; -1; -1 83|] 84let ml_default_accuracy_log = 6 85let ml_max_accuracy_log = 9 86 87(* Offset default distribution (accuracy log 5, 32 states) *) 88let of_default_distribution = [| 89 1; 1; 1; 1; 1; 1; 2; 2; 2; 1; 1; 1; 1; 1; 1; 1; 90 1; 1; 1; 1; 1; 1; 1; 1; -1; -1; -1; -1; -1 91|] 92let of_default_accuracy_log = 5 93let of_max_accuracy_log = 8 94 95(** Sequence code baselines and extra bits *) 96 97(* Literals length: code 0-35 *) 98let ll_baselines = [| 99 0; 1; 2; 3; 4; 5; 6; 7; 8; 9; 10; 11; 100 12; 13; 14; 15; 16; 18; 20; 22; 24; 28; 32; 40; 101 48; 64; 128; 256; 512; 1024; 2048; 4096; 8192; 16384; 32768; 65536 102|] 103let ll_extra_bits = [| 104 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 105 0; 0; 0; 0; 1; 1; 1; 1; 2; 2; 3; 3; 106 4; 6; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16 107|] 108let ll_max_code = 35 109 110(* Match length: code 0-52 *) 111let ml_baselines = [| 112 3; 4; 5; 6; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16; 113 17; 18; 19; 20; 21; 22; 23; 24; 25; 26; 27; 28; 29; 30; 114 31; 32; 33; 34; 35; 37; 39; 41; 43; 47; 51; 59; 67; 83; 115 99; 131; 259; 515; 1027; 2051; 4099; 8195; 16387; 32771; 65539 116|] 117let ml_extra_bits = [| 118 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 119 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 120 0; 0; 0; 0; 1; 1; 1; 1; 2; 2; 3; 3; 4; 4; 121 5; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16 122|] 123let ml_max_code = 52 124 125(* Offset codes: the code is the number of bits to read *) 126let of_max_code = 31 127 128(** Initial repeat offsets *) 129let initial_repeat_offsets = [| 1; 4; 8 |] 130 131(** Error types *) 132type error = 133 | Invalid_magic_number 134 | Invalid_frame_header 135 | Invalid_block_type 136 | Invalid_block_size 137 | Invalid_literals_header 138 | Invalid_huffman_table 139 | Invalid_fse_table 140 | Invalid_sequence_header 141 | Invalid_offset 142 | Invalid_match_length 143 | Truncated_input 144 | Output_too_small 145 | Checksum_mismatch 146 | Dictionary_mismatch 147 | Corruption 148 149exception Zstd_error of error 150 151let error_message = function 152 | Invalid_magic_number -> "Invalid magic number" 153 | Invalid_frame_header -> "Invalid frame header" 154 | Invalid_block_type -> "Invalid block type" 155 | Invalid_block_size -> "Invalid block size" 156 | Invalid_literals_header -> "Invalid literals header" 157 | Invalid_huffman_table -> "Invalid Huffman table" 158 | Invalid_fse_table -> "Invalid FSE table" 159 | Invalid_sequence_header -> "Invalid sequence header" 160 | Invalid_offset -> "Invalid offset" 161 | Invalid_match_length -> "Invalid match length" 162 | Truncated_input -> "Truncated input" 163 | Output_too_small -> "Output buffer too small" 164 | Checksum_mismatch -> "Checksum mismatch" 165 | Dictionary_mismatch -> "Dictionary mismatch" 166 | Corruption -> "Data corruption detected"