this repo has no description
1//! hash functions used by spaCy/Thinc for token attribute IDs and hash embeddings.
2//!
3//! MurmurHash2_64A: string → uint64 (token attribute hashing)
4//! MurmurHash3_x86_128_uint64: uint64 → 4x uint32 (hash embedding bucket selection)
5
6const std = @import("std");
7
8/// MurmurHash2_64A — the hash function spaCy uses for string→uint64 attribute IDs.
9/// matches preshed/murmurhash.pyx hash_string().
10pub fn murmurhash2_64a(key: []const u8, seed: u64) u64 {
11 const m: u64 = 0xc6a4a7935bd1e995;
12 const r = 47;
13
14 var h: u64 = seed ^ (key.len *% m);
15
16 // process 8-byte chunks
17 const n_blocks = key.len / 8;
18 var i: usize = 0;
19 while (i < n_blocks) : (i += 1) {
20 var k = std.mem.readInt(u64, key[i * 8 ..][0..8], .little);
21 k *%= m;
22 k ^= k >> r;
23 k *%= m;
24 h ^= k;
25 h *%= m;
26 }
27
28 // process remaining bytes
29 const tail = key[n_blocks * 8 ..];
30 var remaining: u64 = 0;
31 switch (tail.len) {
32 7 => remaining = @as(u64, tail[6]) << 48 | @as(u64, tail[5]) << 40 | @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]),
33 6 => remaining = @as(u64, tail[5]) << 40 | @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]),
34 5 => remaining = @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]),
35 4 => remaining = @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]),
36 3 => remaining = @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]),
37 2 => remaining = @as(u64, tail[1]) << 8 | @as(u64, tail[0]),
38 1 => remaining = @as(u64, tail[0]),
39 else => {},
40 }
41 if (tail.len > 0) {
42 h ^= remaining;
43 h *%= m;
44 }
45
46 h ^= h >> r;
47 h *%= m;
48 h ^= h >> r;
49
50 return h;
51}
52
53/// MurmurHash3_x86_128 adapted for a single uint64 input.
54/// this is the exact hash function Thinc uses in hash embeddings
55/// to produce 4 bucket indices from a token attribute ID.
56///
57/// from thinc/backends/numpy_ops.pyx MurmurHash3_x86_128_uint64
58pub fn murmurhash3_128_uint64(val: u64, seed: u32) [4]u32 {
59 var h1: u64 = val;
60 h1 *%= 0x87c37b91114253d5;
61 h1 = std.math.rotl(u64, h1, 31);
62 h1 *%= 0x4cf5ad432745937f;
63 h1 ^= seed;
64 h1 ^= 8; // length = 8 bytes
65
66 var h2: u64 = seed;
67 h2 ^= 8;
68
69 // fmix64
70 h1 +%= h2;
71 h2 +%= h1;
72
73 h1 ^= h1 >> 33;
74 h1 *%= 0xff51afd7ed558ccd;
75 h1 ^= h1 >> 33;
76 h1 *%= 0xc4ceb9fe1a85ec53;
77 h1 ^= h1 >> 33;
78
79 h2 ^= h2 >> 33;
80 h2 *%= 0xff51afd7ed558ccd;
81 h2 ^= h2 >> 33;
82 h2 *%= 0xc4ceb9fe1a85ec53;
83 h2 ^= h2 >> 33;
84
85 h1 +%= h2;
86 h2 +%= h1;
87
88 return .{
89 @truncate(h1),
90 @truncate(h1 >> 32),
91 @truncate(h2),
92 @truncate(h2 >> 32),
93 };
94}
95
96/// convenience: hash a string to the uint64 attribute ID spaCy uses,
97/// with the default seed of 1.
98pub fn hashString(s: []const u8) u64 {
99 return murmurhash2_64a(s, 1);
100}
101
102// === tests ===
103
104const testing = std.testing;
105
106// cross-validated against spaCy's hash_string() (preshed/murmurhash.pyx, seed=1)
107test "murmurhash2_64a matches spacy hash_string" {
108 try testing.expectEqual(@as(u64, 0xdd6e45542c05f898), hashString("obama"));
109 try testing.expectEqual(@as(u64, 0xd58ee95da168bb57), hashString("Barack"));
110 try testing.expectEqual(@as(u64, 0x90b4b7068fc46e30), hashString("Paris"));
111 try testing.expectEqual(@as(u64, 0xa30ebbc9c2b3d425), hashString("visited"));
112 try testing.expectEqual(@as(u64, 0x6032b56374c05136), hashString("SpaceX"));
113 try testing.expectEqual(@as(u64, 0xa52be5b3f6674b2a), hashString("a"));
114}
115
116test "murmurhash2_64a empty string" {
117 // spaCy: hash_string("") = 0xc6a4a7935bd064dc (seed=1)
118 try testing.expectEqual(@as(u64, 0xc6a4a7935bd064dc), hashString(""));
119}
120
121// cross-validated against thinc NumpyOps.hash()
122test "murmurhash3_128_uint64 matches thinc" {
123 const r1 = murmurhash3_128_uint64(12345, 8);
124 try testing.expectEqual(@as(u32, 1415810048), r1[0]);
125 try testing.expectEqual(@as(u32, 2915517168), r1[1]);
126 try testing.expectEqual(@as(u32, 2123715072), r1[2]);
127 try testing.expectEqual(@as(u32, 78308456), r1[3]);
128
129 const r2 = murmurhash3_128_uint64(12345, 9);
130 try testing.expectEqual(@as(u32, 3518799221), r2[0]);
131 try testing.expectEqual(@as(u32, 2668567277), r2[1]);
132 try testing.expectEqual(@as(u32, 3200850228), r2[2]);
133 try testing.expectEqual(@as(u32, 3937369458), r2[3]);
134
135 const r3 = murmurhash3_128_uint64(42, 10);
136 try testing.expectEqual(@as(u32, 4049717780), r3[0]);
137 try testing.expectEqual(@as(u32, 353985546), r3[1]);
138 try testing.expectEqual(@as(u32, 1712375736), r3[2]);
139 try testing.expectEqual(@as(u32, 3784464606), r3[3]);
140
141 // edge case: input 0 → all zeros
142 const r4 = murmurhash3_128_uint64(0, 8);
143 try testing.expectEqual(@as(u32, 0), r4[0]);
144 try testing.expectEqual(@as(u32, 0), r4[1]);
145 try testing.expectEqual(@as(u32, 0), r4[2]);
146 try testing.expectEqual(@as(u32, 0), r4[3]);
147}