Pure OCaml xxhash implementation

Initial commit of ocaml-mlxxhash

+845
+1
.gitignore
··· 1 + _build
+1
dune
··· 1 + (vendored_dirs vendor)
+19
dune-project
··· 1 + (lang dune 3.21) 2 + (name mlxxhash) 3 + 4 + (generate_opam_files true) 5 + 6 + (license ISC) 7 + (authors "Anil Madhavapeddy <anil@recoil.org>") 8 + (maintainers "Anil Madhavapeddy <anil@recoil.org>") 9 + (source (tangled anil.recoil.org/ocaml-xxhash)) 10 + 11 + (package 12 + (name mlxxhash) 13 + (synopsis "Pure OCaml implementation of xxHash-64") 14 + (description 15 + "A pure OCaml implementation of the xxHash-64 non-cryptographic hash algorithm. 16 + Provides both one-shot and streaming APIs for hashing bytes and strings.") 17 + (depends 18 + (ocaml (>= 5.2.0)) 19 + (alcotest (and :with-test (>= 1.7.0)))))
+33
mlxxhash.opam
··· 1 + # This file is generated by dune, edit dune-project instead 2 + opam-version: "2.0" 3 + synopsis: "Pure OCaml implementation of xxHash-64" 4 + description: """ 5 + A pure OCaml implementation of the xxHash-64 non-cryptographic hash algorithm. 6 + Provides both one-shot and streaming APIs for hashing bytes and strings.""" 7 + maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 8 + authors: ["Anil Madhavapeddy <anil@recoil.org>"] 9 + license: "ISC" 10 + homepage: "https://tangled.org/anil.recoil.org/ocaml-xxhash" 11 + bug-reports: "https://tangled.org/anil.recoil.org/ocaml-xxhash/issues" 12 + depends: [ 13 + "dune" {>= "3.21"} 14 + "ocaml" {>= "5.2.0"} 15 + "alcotest" {with-test & >= "1.7.0"} 16 + "odoc" {with-doc} 17 + ] 18 + build: [ 19 + ["dune" "subst"] {dev} 20 + [ 21 + "dune" 22 + "build" 23 + "-p" 24 + name 25 + "-j" 26 + jobs 27 + "@install" 28 + "@runtest" {with-test} 29 + "@doc" {with-doc} 30 + ] 31 + ] 32 + dev-repo: "git+https://tangled.org/anil.recoil.org/ocaml-xxhash" 33 + x-maintenance-intent: ["(latest)"]
+3
src/dune
··· 1 + (library 2 + (name xxhash) 3 + (public_name mlxxhash))
+264
src/xxhash.ml
(** xxHash-64 - Pure OCaml implementation.

    This implements the xxHash64 algorithm designed by Yann Collet.
    xxHash is an extremely fast non-cryptographic hash algorithm with
    excellent distribution properties.

    Layout of this file: algorithm constants, the mixing primitives, two
    private helpers shared by the one-shot and streaming paths
    ([merge_accumulators] and [finish_tail]), then the public API. *)

(* The five 64-bit primes used by the XXH64 rounds. *)
let prime64_1 = 0x9E3779B185EBCA87L
let prime64_2 = 0xC2B2AE3D27D4EB4FL
let prime64_3 = 0x165667B19E3779F9L
let prime64_4 = 0x85EBCA77C2B2AE63L
let prime64_5 = 0x27D4EB2F165667C5L

(* ---- Mixing primitives ---- *)

(* Rotate [x] left by [r] bits; every call site uses 0 < r < 64. *)
let[@inline] rotl64 x r =
  Int64.(logor (shift_left x r) (shift_right_logical x (64 - r)))

(* One accumulator round: fold the 64-bit lane [v] into [acc]. *)
let[@inline] mix1 acc v =
  let open Int64 in
  let acc = add acc (mul v prime64_2) in
  let acc = rotl64 acc 31 in
  mul acc prime64_1

(* Merge round: fold a finished accumulator [v] into the running hash. *)
let[@inline] mix2 acc v =
  let open Int64 in
  let v = mul v prime64_2 in
  let v = rotl64 v 31 in
  let v = mul v prime64_1 in
  let acc = logxor acc v in
  add (mul acc prime64_1) prime64_4

(* Final bit-scrambling step applied to every hash before it is returned. *)
let[@inline] avalanche h =
  let open Int64 in
  let h = logxor h (shift_right_logical h 33) in
  let h = mul h prime64_2 in
  let h = logxor h (shift_right_logical h 29) in
  let h = mul h prime64_3 in
  logxor h (shift_right_logical h 32)

(* ---- Private helpers ---- *)

(* Raise [Invalid_argument] unless [pos, pos + len) lies inside [src].
   Written as [pos > length - len] so the check itself cannot overflow. *)
let check_range fn src ~pos ~len =
  if pos < 0 || len < 0 || pos > Bytes.length src - len then
    invalid_arg ("Xxhash." ^ fn ^ ": invalid range")

(* Read a little-endian unsigned 32-bit value as an int64.
   Goes through [Int64.of_int32] rather than [Int32.to_int] so that the
   value is not truncated where the native [int] is narrower than 32 bits. *)
let[@inline] get_uint32_le src p =
  Int64.(logand (of_int32 (Bytes.get_int32_le src p)) 0xFFFFFFFFL)

(* Combine the four lane accumulators into a single 64-bit value. *)
let merge_accumulators v1 v2 v3 v4 =
  let open Int64 in
  let h =
    add
      (add (rotl64 v1 1) (rotl64 v2 7))
      (add (rotl64 v3 12) (rotl64 v4 18))
  in
  mix2 (mix2 (mix2 (mix2 h v1) v2) v3) v4

(* Consume the trailing bytes [src.[pos] .. src.[end_pos - 1]] (fewer than 32
   at every call site) into [h], then apply the avalanche. The tail is eaten
   in 8-byte words, then at most one 4-byte word, then single bytes. *)
let finish_tail h src ~pos ~end_pos =
  let open Int64 in
  let h = ref h in
  let p = ref pos in
  while !p + 8 <= end_pos do
    (* [mix1 0L k] is exactly the per-word round: rotl (k * P2) 31 * P1. *)
    let k = mix1 0L (Bytes.get_int64_le src !p) in
    h := rotl64 (logxor !h k) 27;
    h := add (mul !h prime64_1) prime64_4;
    p := !p + 8
  done;
  if !p + 4 <= end_pos then begin
    let k = get_uint32_le src !p in
    h := rotl64 (logxor !h (mul k prime64_1)) 23;
    h := add (mul !h prime64_2) prime64_3;
    p := !p + 4
  end;
  while !p < end_pos do
    let k = of_int (Bytes.get_uint8 src !p) in
    h := rotl64 (logxor !h (mul k prime64_5)) 11;
    h := mul !h prime64_1;
    incr p
  done;
  avalanche !h

(* ---- One-shot API ---- *)

(** [hash64 ?seed src ~pos ~len] computes the xxHash-64 of the [len] bytes of
    [src] starting at [pos].

    @param seed Optional seed value (default: 0)
    @raise Invalid_argument if [pos] and [len] do not designate a valid
    range of [src]. *)
let hash64 ?(seed=0L) src ~pos ~len =
  check_range "hash64" src ~pos ~len;
  let open Int64 in
  let end_pos = pos + len in
  (* [p] ends up just past the last full 32-byte block (or at [pos] when the
     input is shorter than one block). *)
  let p = ref pos in
  let h =
    if len >= 32 then begin
      let v1 = ref (add (add seed prime64_1) prime64_2) in
      let v2 = ref (add seed prime64_2) in
      let v3 = ref seed in
      let v4 = ref (sub seed prime64_1) in
      while !p + 32 <= end_pos do
        v1 := mix1 !v1 (Bytes.get_int64_le src !p);
        v2 := mix1 !v2 (Bytes.get_int64_le src (!p + 8));
        v3 := mix1 !v3 (Bytes.get_int64_le src (!p + 16));
        v4 := mix1 !v4 (Bytes.get_int64_le src (!p + 24));
        p := !p + 32
      done;
      merge_accumulators !v1 !v2 !v3 !v4
    end else
      (* Short input: the accumulators are never used. *)
      add seed prime64_5
  in
  finish_tail (add h (of_int len)) src ~pos:!p ~end_pos

(** [hash64_string ?seed s] computes the xxHash-64 of the whole string [s]. *)
let hash64_string ?seed s =
  (* Safe: the bytes view is only ever read, never written. *)
  let src = Bytes.unsafe_of_string s in
  hash64 ?seed src ~pos:0 ~len:(String.length s)

(** Compute xxHash-64 and return lower 32 bits (for zstd checksum) *)
let hash32 ?seed src ~pos ~len =
  let h = hash64 ?seed src ~pos ~len in
  Int64.to_int32 (Int64.logand h 0xFFFFFFFFL)

(** [hash32_string ?seed s] is the lower 32 bits of [hash64_string ?seed s]. *)
let hash32_string ?seed s =
  let src = Bytes.unsafe_of_string s in
  hash32 ?seed src ~pos:0 ~len:(String.length s)

(* ---- Streaming API ---- *)

(** Streaming hasher state *)
type state = {
  mutable v1 : int64;        (* lane accumulators *)
  mutable v2 : int64;
  mutable v3 : int64;
  mutable v4 : int64;
  mutable total_len : int;   (* total number of bytes fed so far *)
  buffer : bytes;            (* holds up to 31 bytes awaiting a full block *)
  mutable buf_len : int;     (* number of valid bytes in [buffer] *)
  mutable seed : int64;      (* seed in effect; kept in sync by [reset] *)
}

(** [create_state ?seed ()] creates a fresh streaming state.

    @param seed Optional seed value (default: 0) *)
let create_state ?(seed=0L) () =
  let open Int64 in
  {
    v1 = add (add seed prime64_1) prime64_2;
    v2 = add seed prime64_2;
    v3 = seed;
    v4 = sub seed prime64_1;
    total_len = 0;
    buffer = Bytes.create 32;
    buf_len = 0;
    seed;
  }

(** [reset ?seed state] returns [state] to its initial condition so it can be
    reused. The seed is replaced by [seed], which defaults to 0 (not to the
    seed the state was created with). *)
let reset ?(seed=0L) state =
  let open Int64 in
  state.v1 <- add (add seed prime64_1) prime64_2;
  state.v2 <- add seed prime64_2;
  state.v3 <- seed;
  state.v4 <- sub seed prime64_1;
  state.total_len <- 0;
  state.buf_len <- 0;
  state.seed <- seed

(** [copy_state state] returns an independent copy of [state], including its
    own copy of the pending-bytes buffer. *)
let copy_state state =
  { state with buffer = Bytes.copy state.buffer }

(* Run one 32-byte block of [src] starting at [p] through the four lanes. *)
let[@inline] consume_block state src p =
  state.v1 <- mix1 state.v1 (Bytes.get_int64_le src p);
  state.v2 <- mix1 state.v2 (Bytes.get_int64_le src (p + 8));
  state.v3 <- mix1 state.v3 (Bytes.get_int64_le src (p + 16));
  state.v4 <- mix1 state.v4 (Bytes.get_int64_le src (p + 24))

(** [update state src ~pos ~len] feeds the [len] bytes of [src] starting at
    [pos] into [state].

    @raise Invalid_argument if [pos] and [len] do not designate a valid
    range of [src]; in that case [state] is left untouched. *)
let update state src ~pos ~len =
  (* Validate before touching the state so a bad call cannot corrupt it. *)
  check_range "update" src ~pos ~len;
  let end_pos = pos + len in
  state.total_len <- state.total_len + len;

  let p = ref pos in

  (* Top up a partially filled buffer first; flush it once it holds a block. *)
  if state.buf_len > 0 then begin
    let to_copy = min (32 - state.buf_len) len in
    Bytes.blit src !p state.buffer state.buf_len to_copy;
    state.buf_len <- state.buf_len + to_copy;
    p := !p + to_copy;
    if state.buf_len = 32 then begin
      consume_block state state.buffer 0;
      state.buf_len <- 0
    end
  end;

  (* Process full blocks straight from the caller's bytes. *)
  while !p + 32 <= end_pos do
    consume_block state src !p;
    p := !p + 32
  done;

  (* Stash whatever is left (fewer than 32 bytes) for the next call. *)
  if !p < end_pos then begin
    let remaining = end_pos - !p in
    Bytes.blit src !p state.buffer state.buf_len remaining;
    state.buf_len <- state.buf_len + remaining
  end

(** [update_string state s] feeds the whole string [s] into [state]. *)
let update_string state s =
  let src = Bytes.unsafe_of_string s in
  update state src ~pos:0 ~len:(String.length s)

(** [finalize state] returns the 64-bit hash of everything fed so far.
    It does not modify [state], so more data may be fed afterwards. *)
let finalize state =
  let open Int64 in
  let h =
    if state.total_len >= 32 then
      merge_accumulators state.v1 state.v2 state.v3 state.v4
    else
      (* No block was ever processed, so only the seed contributes. *)
      add state.seed prime64_5
  in
  finish_tail
    (add h (of_int state.total_len))
    state.buffer ~pos:0 ~end_pos:state.buf_len

(** [finalize32 state] is the lower 32 bits of [finalize state]. *)
let finalize32 state =
  let h = finalize state in
  Int64.to_int32 (Int64.logand h 0xFFFFFFFFL)
+91
src/xxhash.mli
(** xxHash - Fast non-cryptographic hash functions.

    This is a pure OCaml implementation of the 64-bit xxHash function
    (xxHash-64), originally designed by Yann Collet. xxHash provides
    extremely fast hashing with excellent distribution properties. It is
    {b not} a cryptographic hash.

    {1 Quick Start}

    {[
      (* Hash a string *)
      let hash = Xxhash.hash64_string "Hello, World!"

      (* Hash bytes with explicit range *)
      let bytes = Bytes.of_string "Hello, World!"
      let hash = Xxhash.hash64 bytes ~pos:0 ~len:13

      (* Use streaming API for large data *)
      let hash =
        let state = Xxhash.create_state () in
        Xxhash.update state chunk1 ~pos:0 ~len:(Bytes.length chunk1);
        Xxhash.update state chunk2 ~pos:0 ~len:(Bytes.length chunk2);
        Xxhash.finalize state
    ]}

    {1 Hash Variants}

    - {!hash64}: 64-bit hash, best for general use
    - {!hash32}: Lower 32 bits of the 64-bit hash (used by zstd). Note that
      this is a truncation of xxHash-64, not a separate 32-bit algorithm.

    {1 Streaming API}

    For hashing data that doesn't fit in memory or arrives incrementally:
    - {!create_state}: Create a new streaming state
    - {!update}: Feed data into the state
    - {!finalize}: Get the final hash value *)

(** {1 One-shot Hashing} *)

val hash64 : ?seed:int64 -> bytes -> pos:int -> len:int -> int64
(** [hash64 ?seed bytes ~pos ~len] computes the xxHash-64 of [len] bytes
    from [bytes] starting at [pos].

    The caller is expected to pass a [pos]/[len] pair that designates a
    valid range of [bytes] (both non-negative, [pos + len] no greater than
    [Bytes.length bytes]).

    @param seed Optional seed value (default: 0) *)

val hash64_string : ?seed:int64 -> string -> int64
(** [hash64_string ?seed s] computes the xxHash-64 of the whole string [s].
    It returns the same value as {!hash64} applied to the bytes of [s].

    @param seed Optional seed value (default: 0) *)

val hash32 : ?seed:int64 -> bytes -> pos:int -> len:int -> int32
(** [hash32 ?seed bytes ~pos ~len] computes xxHash-64 and returns the
    lower 32 bits. This is the variant used by zstd for content checksums.

    The range expectations are the same as for {!hash64}. *)

val hash32_string : ?seed:int64 -> string -> int32
(** [hash32_string ?seed s] computes the lower 32 bits of the xxHash-64 of
    the whole string [s]. *)

(** {1 Streaming API} *)

(** Streaming hasher state. The state is mutable: {!update} and {!reset}
    modify it in place. *)
type state

val create_state : ?seed:int64 -> unit -> state
(** [create_state ?seed ()] creates a new streaming hash state.

    @param seed Optional seed value (default: 0) *)

val reset : ?seed:int64 -> state -> unit
(** [reset ?seed state] resets the state for reuse with a new hash,
    discarding any data fed so far.

    @param seed Optional new seed value (default: 0). When omitted the
    state is reset to seed 0, {e not} to the seed it was created with. *)

val update : state -> bytes -> pos:int -> len:int -> unit
(** [update state bytes ~pos ~len] feeds [len] bytes from [bytes]
    starting at [pos] into the hash state.

    Can be called multiple times to hash data incrementally; the result of
    {!finalize} depends only on the concatenation of the bytes fed, not on
    how they were split across calls. The caller is expected to pass a
    valid [pos]/[len] range of [bytes]. *)

val update_string : state -> string -> unit
(** [update_string state s] feeds the whole string [s] into the hash
    state. *)

val finalize : state -> int64
(** [finalize state] returns the 64-bit hash value of all data fed so far.

    The state can still be used after finalization - subsequent calls
    to {!finalize} return the same value until {!update} is called. *)

val finalize32 : state -> int32
(** [finalize32 state] returns the lower 32 bits of [finalize state]. *)

(** {1 Utilities} *)

val copy_state : state -> state
(** [copy_state state] creates an independent copy of the hash state.
    Useful for computing hashes of data with common prefixes: feed the
    prefix once, copy, then continue each copy with a different suffix. *)
+3
test/dune
··· 1 + (test 2 + (name test_xxhash) 3 + (libraries xxhash alcotest))
+430
test/test_xxhash.ml
(** Tests for xxHash-64 implementation.

    This test suite verifies:
    1. Internal consistency (streaming vs one-shot produce same results)
    2. Known reference values from the official xxHash test vectors,
       through the one-shot, streaming and byte-at-a-time code paths
    3. Boundary conditions (32-byte blocks, various lengths)
    4. State management (reset, copy) and basic hash properties *)

(* ===== Reference Test Buffer Generation =====

   The official xxHash test suite uses a deterministic pseudorandom buffer.
   This must match exactly:

     PRIME32 = 2654435761
     PRIME64 = 11400714785074694797

     buffer[i] = (byteGen >> 56) & 0xFF
     byteGen *= PRIME64

   Starting with byteGen = PRIME32 *)

let prime32 = 2654435761L  (* 0x9E3779B1 as unsigned 32-bit *)
(* PRIME64 for test buffer = 11400714785074694797 = 0x9e3779b185ebca8d *)
(* Note: This is different from the xxHash algorithm's prime constants! *)
let prime64_gen = 0x9e3779b185ebca8dL

(** Generate the reference test buffer used by the xxHash test suite *)
let fill_test_buffer len =
  let buf = Bytes.create len in
  let rec loop i gen =
    if i >= len then buf
    else begin
      (* The top byte of the generator is the next buffer byte. *)
      Bytes.set_uint8 buf i (Int64.(to_int (shift_right_logical gen 56)));
      loop (i + 1) Int64.(mul gen prime64_gen)
    end
  in
  loop 0 prime32

(* ===== Official XXH64 Test Vectors =====

   Format: (len, seed, expected_hash)
   From vendor/git/xxHash/tests/sanity_test_vectors.h

   Coverage: every length from 0 to 33, selected lengths around block
   boundaries up to 128 (48, 63, 64, 65, 96, 100, 127, 128), and the larger
   sizes 256, 512, 1024 and 4096. Each length is checked with seed 0 and
   with seed 0x9E3779B1. *)
let xxh64_test_vectors = [
  (* Lengths 0-31: small inputs < block size *)
  (0, 0x0000000000000000L, 0xEF46DB3751D8E999L);
  (0, 0x000000009E3779B1L, 0xAC75FDA2929B17EFL);
  (1, 0x0000000000000000L, 0xE934A84ADB052768L);
  (1, 0x000000009E3779B1L, 0x5014607643A9B4C3L);
  (2, 0x0000000000000000L, 0x5D48CD60A77E23FFL);
  (2, 0x000000009E3779B1L, 0x9E93152232D54A39L);
  (3, 0x0000000000000000L, 0xFF7E1959CB50794AL);
  (3, 0x000000009E3779B1L, 0xAA8584E83660F7D1L);
  (4, 0x0000000000000000L, 0x9136A0DCA57457EEL);
  (4, 0x000000009E3779B1L, 0xCAAB286BD8E9FDB5L);
  (5, 0x0000000000000000L, 0x9B046FB1397F09A5L);
  (5, 0x000000009E3779B1L, 0x2AF5249930F984ECL);
  (6, 0x0000000000000000L, 0xC72565B7154268A8L);
  (6, 0x000000009E3779B1L, 0xCA4C6723580E8EF6L);
  (7, 0x0000000000000000L, 0x6C83909A9F01ED25L);
  (7, 0x000000009E3779B1L, 0xF98D03B1AD6F9293L);
  (8, 0x0000000000000000L, 0xCDBCF538E71D1348L);
  (8, 0x000000009E3779B1L, 0xFE0C047A5353CDACL);
  (9, 0x0000000000000000L, 0x554B1AE991EDA6B6L);
  (9, 0x000000009E3779B1L, 0x7908265248F6D73FL);
  (10, 0x0000000000000000L, 0x5D00E7351392EA84L);
  (10, 0x000000009E3779B1L, 0x2A8AE16B86CD2F12L);
  (11, 0x0000000000000000L, 0x6345D5746F35DA70L);
  (11, 0x000000009E3779B1L, 0xEAA08A8C8BE3CCCFL);
  (12, 0x0000000000000000L, 0x0723BF50086EAD9AL);
  (12, 0x000000009E3779B1L, 0x8252819F4E506951L);
  (13, 0x0000000000000000L, 0xC2E5013E3C40BCF7L);
  (13, 0x000000009E3779B1L, 0x4DF437A291CB1039L);
  (14, 0x0000000000000000L, 0x8282DCC4994E35C8L);
  (14, 0x000000009E3779B1L, 0xC3BD6BF63DEB6DF0L);
  (15, 0x0000000000000000L, 0x180719316D622D84L);
  (15, 0x000000009E3779B1L, 0xD61105C20E91F99FL);
  (16, 0x0000000000000000L, 0x98C90B57FDFCB55CL);
  (16, 0x000000009E3779B1L, 0xC900AD2D536B607EL);
  (17, 0x0000000000000000L, 0x0D39A2D051A30C2CL);
  (17, 0x000000009E3779B1L, 0x495CD68A647C7A22L);
  (18, 0x0000000000000000L, 0x33E84A4333B2B2EBL);
  (18, 0x000000009E3779B1L, 0x2325A30CCA1A66DDL);
  (19, 0x0000000000000000L, 0xE91C6EF31FC08F82L);
  (19, 0x000000009E3779B1L, 0x06809662799B7D6FL);
  (20, 0x0000000000000000L, 0x5F8C68355769439EL);
  (20, 0x000000009E3779B1L, 0x97218696C2D29602L);
  (21, 0x0000000000000000L, 0x42B0B8EE353AC461L);
  (21, 0x000000009E3779B1L, 0x7FC0BB451B83A633L);
  (22, 0x0000000000000000L, 0x65C935C6978098B1L);
  (22, 0x000000009E3779B1L, 0xC4A0DD14BF835C13L);
  (23, 0x0000000000000000L, 0xD2460ECC840B74DDL);
  (23, 0x000000009E3779B1L, 0x4B44E8DE7A396773L);
  (24, 0x0000000000000000L, 0xF75A6DEA42DC5BF4L);
  (24, 0x000000009E3779B1L, 0x8B7C67EB59778E22L);
  (25, 0x0000000000000000L, 0x52FAA43C3F20B994L);
  (25, 0x000000009E3779B1L, 0xC4FEC92EAC2C3B8AL);
  (26, 0x0000000000000000L, 0x8DB7831EC345F9A3L);
  (26, 0x000000009E3779B1L, 0x2C2A80BCAD321466L);
  (27, 0x0000000000000000L, 0x88945AA08051FC2DL);
  (27, 0x000000009E3779B1L, 0x3401AF8EF28FD410L);
  (28, 0x0000000000000000L, 0x64CD9E8C96A9E2DDL);
  (28, 0x000000009E3779B1L, 0x8160FB8C20B48287L);
  (29, 0x0000000000000000L, 0x8C8F345B634AC2B9L);
  (29, 0x000000009E3779B1L, 0x5A327C78E4AD6678L);
  (30, 0x0000000000000000L, 0xE2677241D4C46CAFL);
  (30, 0x000000009E3779B1L, 0xB1B2B51C93AF4866L);
  (31, 0x0000000000000000L, 0x299B39A290E6D783L);
  (31, 0x000000009E3779B1L, 0xDA673D5FEB5C1D79L);
  (* Lengths 32-64: one to two blocks *)
  (32, 0x0000000000000000L, 0x18B216492BB44B70L);
  (32, 0x000000009E3779B1L, 0xB3F33BDF93ADE409L);
  (33, 0x0000000000000000L, 0x55C8DC3E578F5B59L);
  (33, 0x000000009E3779B1L, 0xE92C292F64BC3071L);
  (48, 0x0000000000000000L, 0xFD0FEEAC7A939933L);
  (48, 0x000000009E3779B1L, 0x6FFE2F43A24C2302L);
  (63, 0x0000000000000000L, 0xA9EFBE0FA0F3F4E7L);
  (63, 0x000000009E3779B1L, 0x6C911FADB05B6FC2L);
  (64, 0x0000000000000000L, 0xEF558F8ACAC2B5CDL);
  (64, 0x000000009E3779B1L, 0xB5EEBA99264CC44FL);
  (* Lengths 65-128: two to four blocks *)
  (65, 0x0000000000000000L, 0xDE0F20DC2631AF7AL);
  (65, 0x000000009E3779B1L, 0xD3F6FF3941E310CAL);
  (96, 0x0000000000000000L, 0x105064E743EDD1D9L);
  (96, 0x000000009E3779B1L, 0x8FF0B4ABEE6F03CCL);
  (100, 0x0000000000000000L, 0x4BFE019CD91D9EA4L);
  (100, 0x000000009E3779B1L, 0x4853706DC9625CAEL);
  (127, 0x0000000000000000L, 0x3C7A21119AA662B0L);
  (127, 0x000000009E3779B1L, 0xB0D6DC189C06CEEDL);
  (128, 0x0000000000000000L, 0x90CA021457D96DC5L);
  (128, 0x000000009E3779B1L, 0xED9340A202BCD1CFL);
  (* Larger sizes: multiple blocks *)
  (256, 0x0000000000000000L, 0x5E3F5BF94D574981L);
  (256, 0x000000009E3779B1L, 0x34733CBD9CC1B0D5L);
  (512, 0x0000000000000000L, 0x4358D2FDD62B58A7L);
  (512, 0x000000009E3779B1L, 0x0DED69C4804C47BAL);
  (1024, 0x0000000000000000L, 0x4775BF7CACE4D177L);
  (1024, 0x000000009E3779B1L, 0x238CF9296898B465L);
  (4096, 0x0000000000000000L, 0xAB77F4AF85F4E70BL);
  (4096, 0x000000009E3779B1L, 0xCB8B60CBA513125DL);
]

(* Create test buffer once - large enough for all tests *)
let test_buffer = fill_test_buffer 4200

(* Known reference value: xxhash64("") with seed 0 *)
let test_empty_string () =
  let hash = Xxhash.hash64_string "" in
  Alcotest.(check int64) "empty string" 0xef46db3751d8e999L hash

(* ===== Consistency tests =====

   Shared body: hashing [s] in one shot and feeding it to a streaming state
   in a single [update_string] call must give the same 64-bit value. *)
let check_streaming_matches_oneshot ?seed label s =
  let direct = Xxhash.hash64_string ?seed s in
  let state = Xxhash.create_state ?seed () in
  Xxhash.update_string state s;
  let streaming = Xxhash.finalize state in
  Alcotest.(check int64) label direct streaming

let test_consistency_short () =
  check_streaming_matches_oneshot "short string consistency" "Hello"

let test_consistency_medium () =
  check_streaming_matches_oneshot "medium string consistency" "Hello, World!"

let test_consistency_with_seed () =
  check_streaming_matches_oneshot ~seed:12345L "consistency with seed" "test data"

(* Boundary condition: exactly 32 bytes (one block) *)
let test_32_bytes () =
  check_streaming_matches_oneshot "32 bytes consistency" (String.make 32 'x')

(* Boundary condition: 33 bytes (one block + 1 byte) *)
let test_33_bytes () =
  check_streaming_matches_oneshot "33 bytes consistency" (String.make 33 'y')

(* Boundary condition: 64 bytes (two blocks) *)
let test_64_bytes () =
  check_streaming_matches_oneshot "64 bytes consistency" (String.make 64 'a')

(* hash32 consistency *)
let test_hash32_consistency () =
  let s = "Hello, World!" in
  let hash64 = Xxhash.hash64_string s in
  let hash32 = Xxhash.hash32_string s in
  let expected32 = Int64.to_int32 (Int64.logand hash64 0xFFFFFFFFL) in
  Alcotest.(check int32) "hash32 is lower 32 bits" expected32 hash32

let test_streaming_chunks () =
  (* Hash in multiple chunks, should match single hash *)
  let state = Xxhash.create_state () in
  Xxhash.update_string state "Hello";
  Xxhash.update_string state ", ";
  Xxhash.update_string state "World!";
  let hash = Xxhash.finalize state in
  let direct = Xxhash.hash64_string "Hello, World!" in
  Alcotest.(check int64) "chunked streaming" direct hash

let test_streaming_byte_at_a_time () =
  let s = "Hello, World!" in
  let state = Xxhash.create_state () in
  String.iter (fun c ->
    Xxhash.update_string state (String.make 1 c)
  ) s;
  let hash = Xxhash.finalize state in
  let direct = Xxhash.hash64_string s in
  Alcotest.(check int64) "byte-at-a-time" direct hash

let test_streaming_large () =
  (* Large data in chunks *)
  let chunk = String.make 100 'z' in
  let state = Xxhash.create_state () in
  for _ = 1 to 10 do
    Xxhash.update_string state chunk
  done;
  let hash = Xxhash.finalize state in
  let direct = Xxhash.hash64_string (String.make 1000 'z') in
  Alcotest.(check int64) "large streaming" direct hash

let test_streaming_across_boundary () =
  (* Feed data that crosses 32-byte block boundaries *)
  let state = Xxhash.create_state () in
  Xxhash.update_string state (String.make 20 'a');
  Xxhash.update_string state (String.make 20 'b');
  Xxhash.update_string state (String.make 20 'c');
  let hash = Xxhash.finalize state in
  let direct = Xxhash.hash64_string (String.make 20 'a' ^ String.make 20 'b' ^ String.make 20 'c') in
  Alcotest.(check int64) "across boundary" direct hash

let test_reset () =
  let state = Xxhash.create_state () in
  Xxhash.update_string state "first data";
  let _ = Xxhash.finalize state in
  Xxhash.reset state;
  Xxhash.update_string state "second data";
  let hash = Xxhash.finalize state in
  let direct = Xxhash.hash64_string "second data" in
  Alcotest.(check int64) "after reset" direct hash

let test_reset_with_new_seed () =
  let state = Xxhash.create_state ~seed:111L () in
  Xxhash.update_string state "first";
  let _ = Xxhash.finalize state in
  Xxhash.reset ~seed:222L state;
  Xxhash.update_string state "second";
  let hash = Xxhash.finalize state in
  let direct = Xxhash.hash64_string ~seed:222L "second" in
  Alcotest.(check int64) "reset with new seed" direct hash

let test_copy_state () =
  (* The copy and the original must evolve independently after the fork. *)
  let state1 = Xxhash.create_state () in
  Xxhash.update_string state1 "Hello";
  let state2 = Xxhash.copy_state state1 in
  Xxhash.update_string state1 ", World!";
  Xxhash.update_string state2 " there";
  let hash1 = Xxhash.finalize state1 in
  let hash2 = Xxhash.finalize state2 in
  let direct1 = Xxhash.hash64_string "Hello, World!" in
  let direct2 = Xxhash.hash64_string "Hello there" in
  Alcotest.(check int64) "original state" direct1 hash1;
  Alcotest.(check int64) "copied state" direct2 hash2

let test_finalize32 () =
  let state = Xxhash.create_state () in
  Xxhash.update_string state "test";
  let hash32 = Xxhash.finalize32 state in
  let hash64 = Xxhash.finalize state in
  let expected32 = Int64.to_int32 (Int64.logand hash64 0xFFFFFFFFL) in
  Alcotest.(check int32) "finalize32" expected32 hash32

let test_bytes_api () =
  let s = "Hello, World!" in
  let bytes = Bytes.of_string s in
  let from_string = Xxhash.hash64_string s in
  let from_bytes = Xxhash.hash64 bytes ~pos:0 ~len:(Bytes.length bytes) in
  Alcotest.(check int64) "bytes API consistency" from_string from_bytes

let test_bytes_partial () =
  let bytes = Bytes.of_string "XXXHello, World!YYY" in
  let hash = Xxhash.hash64 bytes ~pos:3 ~len:13 in
  let direct = Xxhash.hash64_string "Hello, World!" in
  Alcotest.(check int64) "partial bytes" direct hash

(* Determinism test *)
let test_deterministic () =
  let s = "The quick brown fox jumps over the lazy dog" in
  let hash1 = Xxhash.hash64_string s in
  let hash2 = Xxhash.hash64_string s in
  let hash3 = Xxhash.hash64_string s in
  Alcotest.(check int64) "hash1 = hash2" hash1 hash2;
  Alcotest.(check int64) "hash2 = hash3" hash2 hash3

(* Different inputs produce different hashes *)
let test_different_inputs () =
  let h1 = Xxhash.hash64_string "hello" in
  let h2 = Xxhash.hash64_string "Hello" in
  let h3 = Xxhash.hash64_string "hello " in
  Alcotest.(check bool) "hello != Hello" true (h1 <> h2);
  Alcotest.(check bool) "hello != 'hello '" true (h1 <> h3)

(* Different seeds produce different hashes *)
let test_different_seeds () =
  let s = "test" in
  let h1 = Xxhash.hash64_string ~seed:0L s in
  let h2 = Xxhash.hash64_string ~seed:1L s in
  let h3 = Xxhash.hash64_string ~seed:42L s in
  Alcotest.(check bool) "seed 0 != seed 1" true (h1 <> h2);
  Alcotest.(check bool) "seed 1 != seed 42" true (h2 <> h3)

(* ===== Reference Validation Tests ===== *)

(** Run [compute ~seed ~len] for every entry of [xxh64_test_vectors] and
    compare against the expected hash. All mismatches are printed to stderr
    before the test is failed, so a single run shows every broken length.

    [kind] is spliced into the messages: [""] for the one-shot path, or a
    word followed by a space (e.g. ["streaming "]) for the other paths. *)
let check_reference_vectors ~kind compute =
  let failed = ref [] in
  List.iteri (fun i (len, seed, expected) ->
    let actual = compute ~seed ~len in
    if not (Int64.equal actual expected) then
      failed := (i, len, seed, expected, actual) :: !failed
  ) xxh64_test_vectors;
  if !failed <> [] then begin
    List.iter (fun (i, len, seed, expected, actual) ->
      Printf.eprintf "FAIL %stest %d: len=%d seed=%016Lx expected=%016Lx got=%016Lx\n"
        kind i len seed expected actual
    ) (List.rev !failed);
    Alcotest.fail
      (Printf.sprintf "%d %sreference tests failed" (List.length !failed) kind)
  end

(** Test against official xxHash test vectors *)
let test_reference_vectors () =
  check_reference_vectors ~kind:"" (fun ~seed ~len ->
    Xxhash.hash64 ~seed test_buffer ~pos:0 ~len)

(** Same vectors through the streaming API, fed in a single update *)
let test_reference_streaming () =
  check_reference_vectors ~kind:"streaming " (fun ~seed ~len ->
    let state = Xxhash.create_state ~seed () in
    Xxhash.update state test_buffer ~pos:0 ~len;
    Xxhash.finalize state)

(** Test streaming with byte-by-byte updates *)
let test_reference_streaming_bytewise () =
  check_reference_vectors ~kind:"bytewise " (fun ~seed ~len ->
    let state = Xxhash.create_state ~seed () in
    for j = 0 to len - 1 do
      Xxhash.update state test_buffer ~pos:j ~len:1
    done;
    Xxhash.finalize state)

let () =
  Alcotest.run "xxhash" [
    "reference", [
      Alcotest.test_case "empty string" `Quick test_empty_string;
      Alcotest.test_case "official vectors" `Quick test_reference_vectors;
      Alcotest.test_case "streaming vectors" `Quick test_reference_streaming;
      Alcotest.test_case "bytewise streaming" `Quick test_reference_streaming_bytewise;
    ];
    "consistency", [
      Alcotest.test_case "short" `Quick test_consistency_short;
      Alcotest.test_case "medium" `Quick test_consistency_medium;
      Alcotest.test_case "with seed" `Quick test_consistency_with_seed;
      Alcotest.test_case "hash32" `Quick test_hash32_consistency;
      Alcotest.test_case "bytes API" `Quick test_bytes_api;
      Alcotest.test_case "partial bytes" `Quick test_bytes_partial;
    ];
    "boundaries", [
      Alcotest.test_case "32 bytes" `Quick test_32_bytes;
      Alcotest.test_case "33 bytes" `Quick test_33_bytes;
      Alcotest.test_case "64 bytes" `Quick test_64_bytes;
    ];
    "streaming", [
      Alcotest.test_case "chunks" `Quick test_streaming_chunks;
      Alcotest.test_case "byte at a time" `Quick test_streaming_byte_at_a_time;
      Alcotest.test_case "large" `Quick test_streaming_large;
      Alcotest.test_case "across boundary" `Quick test_streaming_across_boundary;
    ];
    "state", [
      Alcotest.test_case "reset" `Quick test_reset;
      Alcotest.test_case "reset with seed" `Quick test_reset_with_new_seed;
      Alcotest.test_case "copy" `Quick test_copy_state;
      Alcotest.test_case "finalize32" `Quick test_finalize32;
    ];
    "properties", [
      Alcotest.test_case "deterministic" `Quick test_deterministic;
      Alcotest.test_case "different inputs" `Quick test_different_inputs;
      Alcotest.test_case "different seeds" `Quick test_different_seeds;
    ];
  ]