···11+(** xxHash-64 - Pure OCaml implementation.
22+33+ This implements the xxHash64 algorithm designed by Yann Collet.
44+ xxHash is an extremely fast non-cryptographic hash algorithm with
55+ excellent distribution properties. *)
66+77+(* Constants *)
88+let prime64_1 = 0x9E3779B185EBCA87L
99+let prime64_2 = 0xC2B2AE3D27D4EB4FL
1010+let prime64_3 = 0x165667B19E3779F9L
1111+let prime64_4 = 0x85EBCA77C2B2AE63L
1212+let prime64_5 = 0x27D4EB2F165667C5L
1313+1414+(* Helper functions *)
1515+let[@inline] rotl64 x r =
1616+ Int64.(logor (shift_left x r) (shift_right_logical x (64 - r)))
1717+1818+let[@inline] mix1 acc v =
1919+ let open Int64 in
2020+ let acc = add acc (mul v prime64_2) in
2121+ let acc = rotl64 acc 31 in
2222+ mul acc prime64_1
2323+2424+let[@inline] mix2 acc v =
2525+ let open Int64 in
2626+ let v = mul v prime64_2 in
2727+ let v = rotl64 v 31 in
2828+ let v = mul v prime64_1 in
2929+ let acc = logxor acc v in
3030+ add (mul acc prime64_1) prime64_4
3131+3232+let[@inline] avalanche h =
3333+ let open Int64 in
3434+ let h = logxor h (shift_right_logical h 33) in
3535+ let h = mul h prime64_2 in
3636+ let h = logxor h (shift_right_logical h 29) in
3737+ let h = mul h prime64_3 in
3838+ logxor h (shift_right_logical h 32)
3939+4040+(** Compute xxHash-64 of bytes with given seed *)
4141+let hash64 ?(seed=0L) src ~pos ~len =
4242+ let open Int64 in
4343+ let end_pos = pos + len in
4444+4545+ let h = ref (
4646+ if len >= 32 then begin
4747+ (* Initialize accumulators *)
4848+ let v1 = ref (add (add seed prime64_1) prime64_2) in
4949+ let v2 = ref (add seed prime64_2) in
5050+ let v3 = ref seed in
5151+ let v4 = ref (sub seed prime64_1) in
5252+5353+ (* Process 32-byte blocks *)
5454+ let p = ref pos in
5555+ while !p + 32 <= end_pos do
5656+ v1 := mix1 !v1 (Bytes.get_int64_le src !p);
5757+ v2 := mix1 !v2 (Bytes.get_int64_le src (!p + 8));
5858+ v3 := mix1 !v3 (Bytes.get_int64_le src (!p + 16));
5959+ v4 := mix1 !v4 (Bytes.get_int64_le src (!p + 24));
6060+ p := !p + 32
6161+ done;
6262+6363+ (* Merge accumulators *)
6464+ let h = add
6565+ (add (rotl64 !v1 1) (rotl64 !v2 7))
6666+ (add (rotl64 !v3 12) (rotl64 !v4 18)) in
6767+ let h = mix2 h !v1 in
6868+ let h = mix2 h !v2 in
6969+ let h = mix2 h !v3 in
7070+ mix2 h !v4
7171+ end else
7272+ add seed prime64_5
7373+ ) in
7474+7575+ h := add !h (of_int len);
7676+7777+ (* Process remaining 8-byte chunks *)
7878+ let p = ref (if len >= 32 then pos + (len / 32) * 32 else pos) in
7979+ while !p + 8 <= end_pos do
8080+ let k = Bytes.get_int64_le src !p in
8181+ let k = mul k prime64_2 in
8282+ let k = rotl64 k 31 in
8383+ let k = mul k prime64_1 in
8484+ h := logxor !h k;
8585+ h := rotl64 !h 27;
8686+ h := add (mul !h prime64_1) prime64_4;
8787+ p := !p + 8
8888+ done;
8989+9090+ (* Process remaining 4-byte chunk *)
9191+ if !p + 4 <= end_pos then begin
9292+ let k = of_int (Bytes.get_int32_le src !p |> Int32.to_int) in
9393+ let k = logand k 0xFFFFFFFFL in (* Make unsigned *)
9494+ h := logxor !h (mul k prime64_1);
9595+ h := rotl64 !h 23;
9696+ h := add (mul !h prime64_2) prime64_3;
9797+ p := !p + 4
9898+ end;
9999+100100+ (* Process remaining bytes *)
101101+ while !p < end_pos do
102102+ let k = of_int (Bytes.get_uint8 src !p) in
103103+ h := logxor !h (mul k prime64_5);
104104+ h := rotl64 !h 11;
105105+ h := mul !h prime64_1;
106106+ incr p
107107+ done;
108108+109109+ avalanche !h
110110+111111+let hash64_string ?seed s =
112112+ let src = Bytes.unsafe_of_string s in
113113+ hash64 ?seed src ~pos:0 ~len:(String.length s)
114114+115115+(** Compute xxHash-64 and return lower 32 bits (for zstd checksum) *)
116116+let hash32 ?seed src ~pos ~len =
117117+ let h = hash64 ?seed src ~pos ~len in
118118+ Int64.to_int32 (Int64.logand h 0xFFFFFFFFL)
119119+120120+let hash32_string ?seed s =
121121+ let src = Bytes.unsafe_of_string s in
122122+ hash32 ?seed src ~pos:0 ~len:(String.length s)
123123+124124+(** Streaming hasher state *)
125125+type state = {
126126+ mutable v1 : int64;
127127+ mutable v2 : int64;
128128+ mutable v3 : int64;
129129+ mutable v4 : int64;
130130+ mutable total_len : int;
131131+ buffer : bytes;
132132+ mutable buf_len : int;
133133+ seed : int64;
134134+}
135135+136136+let create_state ?(seed=0L) () =
137137+ let open Int64 in
138138+ {
139139+ v1 = add (add seed prime64_1) prime64_2;
140140+ v2 = add seed prime64_2;
141141+ v3 = seed;
142142+ v4 = sub seed prime64_1;
143143+ total_len = 0;
144144+ buffer = Bytes.create 32;
145145+ buf_len = 0;
146146+ seed;
147147+ }
148148+149149+let reset ?(seed=0L) state =
150150+ let open Int64 in
151151+ state.v1 <- add (add seed prime64_1) prime64_2;
152152+ state.v2 <- add seed prime64_2;
153153+ state.v3 <- seed;
154154+ state.v4 <- sub seed prime64_1;
155155+ state.total_len <- 0;
156156+ state.buf_len <- 0
157157+158158+let copy_state state =
159159+ {
160160+ v1 = state.v1;
161161+ v2 = state.v2;
162162+ v3 = state.v3;
163163+ v4 = state.v4;
164164+ total_len = state.total_len;
165165+ buffer = Bytes.copy state.buffer;
166166+ buf_len = state.buf_len;
167167+ seed = state.seed;
168168+ }
169169+170170+let update state src ~pos ~len =
171171+ let end_pos = pos + len in
172172+ state.total_len <- state.total_len + len;
173173+174174+ let p = ref pos in
175175+176176+ (* Fill buffer if we have partial data *)
177177+ if state.buf_len > 0 then begin
178178+ let to_copy = min (32 - state.buf_len) len in
179179+ Bytes.blit src !p state.buffer state.buf_len to_copy;
180180+ state.buf_len <- state.buf_len + to_copy;
181181+ p := !p + to_copy;
182182+183183+ if state.buf_len = 32 then begin
184184+ state.v1 <- mix1 state.v1 (Bytes.get_int64_le state.buffer 0);
185185+ state.v2 <- mix1 state.v2 (Bytes.get_int64_le state.buffer 8);
186186+ state.v3 <- mix1 state.v3 (Bytes.get_int64_le state.buffer 16);
187187+ state.v4 <- mix1 state.v4 (Bytes.get_int64_le state.buffer 24);
188188+ state.buf_len <- 0
189189+ end
190190+ end;
191191+192192+ (* Process 32-byte blocks *)
193193+ while !p + 32 <= end_pos do
194194+ state.v1 <- mix1 state.v1 (Bytes.get_int64_le src !p);
195195+ state.v2 <- mix1 state.v2 (Bytes.get_int64_le src (!p + 8));
196196+ state.v3 <- mix1 state.v3 (Bytes.get_int64_le src (!p + 16));
197197+ state.v4 <- mix1 state.v4 (Bytes.get_int64_le src (!p + 24));
198198+ p := !p + 32
199199+ done;
200200+201201+ (* Buffer remaining *)
202202+ if !p < end_pos then begin
203203+ let remaining = end_pos - !p in
204204+ Bytes.blit src !p state.buffer state.buf_len remaining;
205205+ state.buf_len <- state.buf_len + remaining
206206+ end
207207+208208+let update_string state s =
209209+ let src = Bytes.unsafe_of_string s in
210210+ update state src ~pos:0 ~len:(String.length s)
211211+212212+let finalize state =
213213+ let open Int64 in
214214+215215+ let h = ref (
216216+ if state.total_len >= 32 then begin
217217+ let h = add
218218+ (add (rotl64 state.v1 1) (rotl64 state.v2 7))
219219+ (add (rotl64 state.v3 12) (rotl64 state.v4 18)) in
220220+ let h = mix2 h state.v1 in
221221+ let h = mix2 h state.v2 in
222222+ let h = mix2 h state.v3 in
223223+ mix2 h state.v4
224224+ end else
225225+ add state.v3 prime64_5 (* v3 holds seed *)
226226+ ) in
227227+228228+ h := add !h (of_int state.total_len);
229229+230230+ (* Process buffered data *)
231231+ let p = ref 0 in
232232+ while !p + 8 <= state.buf_len do
233233+ let k = Bytes.get_int64_le state.buffer !p in
234234+ let k = mul k prime64_2 in
235235+ let k = rotl64 k 31 in
236236+ let k = mul k prime64_1 in
237237+ h := logxor !h k;
238238+ h := rotl64 !h 27;
239239+ h := add (mul !h prime64_1) prime64_4;
240240+ p := !p + 8
241241+ done;
242242+243243+ if !p + 4 <= state.buf_len then begin
244244+ let k = of_int (Bytes.get_int32_le state.buffer !p |> Int32.to_int) in
245245+ let k = logand k 0xFFFFFFFFL in
246246+ h := logxor !h (mul k prime64_1);
247247+ h := rotl64 !h 23;
248248+ h := add (mul !h prime64_2) prime64_3;
249249+ p := !p + 4
250250+ end;
251251+252252+ while !p < state.buf_len do
253253+ let k = of_int (Bytes.get_uint8 state.buffer !p) in
254254+ h := logxor !h (mul k prime64_5);
255255+ h := rotl64 !h 11;
256256+ h := mul !h prime64_1;
257257+ incr p
258258+ done;
259259+260260+ avalanche !h
261261+262262+let finalize32 state =
263263+ let h = finalize state in
264264+ Int64.to_int32 (Int64.logand h 0xFFFFFFFFL)
+91
src/xxhash.mli
···11+(** xxHash - Fast non-cryptographic hash functions.
22+33+ This is a pure OCaml implementation of the xxHash family of hash functions,
44+ originally designed by Yann Collet. xxHash provides extremely fast hashing
55+ with excellent distribution properties.
66+77+ {1 Quick Start}
88+99+ {[
1010+ (* Hash a string *)
1111+ let hash = Xxhash.hash64_string "Hello, World!"
1212+1313+ (* Hash bytes with explicit range *)
1414+ let bytes = Bytes.of_string "Hello, World!"
1515+ let hash = Xxhash.hash64 bytes ~pos:0 ~len:13
1616+1717+ (* Use streaming API for large data *)
1818+ let state = Xxhash.create_state () in
1919+ Xxhash.update state chunk1 ~pos:0 ~len:(Bytes.length chunk1);
2020+ Xxhash.update state chunk2 ~pos:0 ~len:(Bytes.length chunk2);
2121+ let hash = Xxhash.finalize state
2222+ ]}
2323+2424+ {1 Hash Variants}
2525+2626+ - {!hash64}: 64-bit hash, best for general use
2727+ - {!hash32}: Lower 32 bits of 64-bit hash (used by zstd)
2828+2929+ {1 Streaming API}
3030+3131+ For hashing data that doesn't fit in memory or arrives incrementally:
3232+ - {!create_state}: Create a new streaming state
3333+ - {!update}: Feed data into the state
3434+ - {!finalize}: Get the final hash value *)
3535+3636+(** {1 One-shot Hashing} *)
3737+3838+val hash64 : ?seed:int64 -> bytes -> pos:int -> len:int -> int64
3939+(** [hash64 ?seed bytes ~pos ~len] computes the xxHash-64 of [len] bytes
4040+ from [bytes] starting at [pos].
4141+4242+ @param seed Optional seed value (default: 0) *)
4343+4444+val hash64_string : ?seed:int64 -> string -> int64
4545+(** [hash64_string ?seed s] computes the xxHash-64 of string [s]. *)
4646+4747+val hash32 : ?seed:int64 -> bytes -> pos:int -> len:int -> int32
4848+(** [hash32 ?seed bytes ~pos ~len] computes xxHash-64 and returns the
4949+ lower 32 bits. This is the variant used by zstd for content checksums. *)
5050+5151+val hash32_string : ?seed:int64 -> string -> int32
5252+(** [hash32_string ?seed s] computes the lower 32 bits of xxHash-64. *)
5353+5454+(** {1 Streaming API} *)
5555+5656+(** Streaming hasher state. *)
5757+type state
5858+5959+val create_state : ?seed:int64 -> unit -> state
6060+(** [create_state ?seed ()] creates a new streaming hash state.
6161+6262+ @param seed Optional seed value (default: 0) *)
6363+6464+val reset : ?seed:int64 -> state -> unit
6565+(** [reset ?seed state] resets the state for reuse with a new hash.
6666+6767+ @param seed Optional new seed value (default: 0) *)
6868+6969+val update : state -> bytes -> pos:int -> len:int -> unit
7070+(** [update state bytes ~pos ~len] feeds [len] bytes from [bytes]
7171+ starting at [pos] into the hash state.
7272+7373+ Can be called multiple times to hash data incrementally. *)
7474+7575+val update_string : state -> string -> unit
7676+(** [update_string state s] feeds string [s] into the hash state. *)
7777+7878+val finalize : state -> int64
7979+(** [finalize state] returns the 64-bit hash value.
8080+8181+ The state can still be used after finalization - subsequent calls
8282+ to {!finalize} return the same value until {!update} is called. *)
8383+8484+val finalize32 : state -> int32
8585+(** [finalize32 state] returns the lower 32 bits of the hash. *)
8686+8787+(** {1 Utilities} *)
8888+8989+val copy_state : state -> state
9090+(** [copy_state state] creates an independent copy of the hash state.
9191+ Useful for computing hashes of data with common prefixes. *)