this repo has no description
at main 147 lines 5.2 kB view raw
1//! hash functions used by spaCy/Thinc for token attribute IDs and hash embeddings. 2//! 3//! MurmurHash2_64A: string → uint64 (token attribute hashing) 4//! MurmurHash3_x86_128_uint64: uint64 → 4x uint32 (hash embedding bucket selection) 5 6const std = @import("std"); 7 8/// MurmurHash2_64A — the hash function spaCy uses for string→uint64 attribute IDs. 9/// matches preshed/murmurhash.pyx hash_string(). 10pub fn murmurhash2_64a(key: []const u8, seed: u64) u64 { 11 const m: u64 = 0xc6a4a7935bd1e995; 12 const r = 47; 13 14 var h: u64 = seed ^ (key.len *% m); 15 16 // process 8-byte chunks 17 const n_blocks = key.len / 8; 18 var i: usize = 0; 19 while (i < n_blocks) : (i += 1) { 20 var k = std.mem.readInt(u64, key[i * 8 ..][0..8], .little); 21 k *%= m; 22 k ^= k >> r; 23 k *%= m; 24 h ^= k; 25 h *%= m; 26 } 27 28 // process remaining bytes 29 const tail = key[n_blocks * 8 ..]; 30 var remaining: u64 = 0; 31 switch (tail.len) { 32 7 => remaining = @as(u64, tail[6]) << 48 | @as(u64, tail[5]) << 40 | @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 33 6 => remaining = @as(u64, tail[5]) << 40 | @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 34 5 => remaining = @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 35 4 => remaining = @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 36 3 => remaining = @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 37 2 => remaining = @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 38 1 => remaining = @as(u64, tail[0]), 39 else => {}, 40 } 41 if (tail.len > 0) { 42 h ^= remaining; 43 h *%= m; 44 } 45 46 h ^= h >> r; 47 h *%= m; 48 h ^= h >> r; 49 50 return h; 51} 52 53/// MurmurHash3_x86_128 adapted for a single uint64 input. 54/// this is the exact hash function Thinc uses in hash embeddings 55/// to produce 4 bucket indices from a token attribute ID. 56/// 57/// from thinc/backends/numpy_ops.pyx MurmurHash3_x86_128_uint64 58pub fn murmurhash3_128_uint64(val: u64, seed: u32) [4]u32 { 59 var h1: u64 = val; 60 h1 *%= 0x87c37b91114253d5; 61 h1 = std.math.rotl(u64, h1, 31); 62 h1 *%= 0x4cf5ad432745937f; 63 h1 ^= seed; 64 h1 ^= 8; // length = 8 bytes 65 66 var h2: u64 = seed; 67 h2 ^= 8; 68 69 // fmix64 70 h1 +%= h2; 71 h2 +%= h1; 72 73 h1 ^= h1 >> 33; 74 h1 *%= 0xff51afd7ed558ccd; 75 h1 ^= h1 >> 33; 76 h1 *%= 0xc4ceb9fe1a85ec53; 77 h1 ^= h1 >> 33; 78 79 h2 ^= h2 >> 33; 80 h2 *%= 0xff51afd7ed558ccd; 81 h2 ^= h2 >> 33; 82 h2 *%= 0xc4ceb9fe1a85ec53; 83 h2 ^= h2 >> 33; 84 85 h1 +%= h2; 86 h2 +%= h1; 87 88 return .{ 89 @truncate(h1), 90 @truncate(h1 >> 32), 91 @truncate(h2), 92 @truncate(h2 >> 32), 93 }; 94} 95 96/// convenience: hash a string to the uint64 attribute ID spaCy uses, 97/// with the default seed of 1. 98pub fn hashString(s: []const u8) u64 { 99 return murmurhash2_64a(s, 1); 100} 101 102// === tests === 103 104const testing = std.testing; 105 106// cross-validated against spaCy's hash_string() (preshed/murmurhash.pyx, seed=1) 107test "murmurhash2_64a matches spacy hash_string" { 108 try testing.expectEqual(@as(u64, 0xdd6e45542c05f898), hashString("obama")); 109 try testing.expectEqual(@as(u64, 0xd58ee95da168bb57), hashString("Barack")); 110 try testing.expectEqual(@as(u64, 0x90b4b7068fc46e30), hashString("Paris")); 111 try testing.expectEqual(@as(u64, 0xa30ebbc9c2b3d425), hashString("visited")); 112 try testing.expectEqual(@as(u64, 0x6032b56374c05136), hashString("SpaceX")); 113 try testing.expectEqual(@as(u64, 0xa52be5b3f6674b2a), hashString("a")); 114} 115 116test "murmurhash2_64a empty string" { 117 // spaCy: hash_string("") = 0xc6a4a7935bd064dc (seed=1) 118 try testing.expectEqual(@as(u64, 0xc6a4a7935bd064dc), hashString("")); 119} 120 121// cross-validated against thinc NumpyOps.hash() 122test "murmurhash3_128_uint64 matches thinc" { 123 const r1 = murmurhash3_128_uint64(12345, 8); 124 try testing.expectEqual(@as(u32, 1415810048), r1[0]); 125 try testing.expectEqual(@as(u32, 2915517168), r1[1]); 126 try testing.expectEqual(@as(u32, 2123715072), r1[2]); 127 try testing.expectEqual(@as(u32, 78308456), r1[3]); 128 129 const r2 = murmurhash3_128_uint64(12345, 9); 130 try testing.expectEqual(@as(u32, 3518799221), r2[0]); 131 try testing.expectEqual(@as(u32, 2668567277), r2[1]); 132 try testing.expectEqual(@as(u32, 3200850228), r2[2]); 133 try testing.expectEqual(@as(u32, 3937369458), r2[3]); 134 135 const r3 = murmurhash3_128_uint64(42, 10); 136 try testing.expectEqual(@as(u32, 4049717780), r3[0]); 137 try testing.expectEqual(@as(u32, 353985546), r3[1]); 138 try testing.expectEqual(@as(u32, 1712375736), r3[2]); 139 try testing.expectEqual(@as(u32, 3784464606), r3[3]); 140 141 // edge case: input 0 → all zeros 142 const r4 = murmurhash3_128_uint64(0, 8); 143 try testing.expectEqual(@as(u32, 0), r4[0]); 144 try testing.expectEqual(@as(u32, 0), r4[1]); 145 try testing.expectEqual(@as(u32, 0), r4[2]); 146 try testing.expectEqual(@as(u32, 0), r4[3]); 147}