Skip to content

Commit d268106

Browse files
authored
upgrade rapidhash v1 to v3 and fix several bugs in _hash_integer translation (#59177)
The rapidhash v3 algorithm supports streaming data, which makes it much more suitable for use as a string hasher than the original. This is actually v3 nano, since it is more similar to the original v1, but it lacks many of the vectorization performance improvements of the full v3. This is intended to be the first in a sequence of several more PRs to improve upon this code to be usable in more generic situations. It also fixes several bugs in the translation of the algorithm to hashing integers. Initial version written by Claude, with full review and correction by myself afterwards (it caught none of the bugs, but got most of the updates right).
1 parent f701768 commit d268106

File tree

2 files changed

+56
-55
lines changed

2 files changed

+56
-55
lines changed

base/hashing.jl

Lines changed: 53 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ const HASH_SECRET = tuple(
55
0x2d358dccaa6c78a5,
66
0x8bb84b93962eacc9,
77
0x4b33a62ed433d4a3,
8+
0xaaaaaaaaaaaaaaaa,
89
)
910

1011
"""
@@ -73,74 +74,76 @@ hash_integer(x::Integer, h::UInt) = _hash_integer(x, UInt64(h)) % UInt
7374
function _hash_integer(
7475
x::Integer,
7576
seed::UInt64 = HASH_SEED,
76-
secret::NTuple{3, UInt64} = HASH_SECRET
77+
secret::NTuple{4, UInt64} = HASH_SECRET
7778
)
7879
seed ⊻= (x < 0)
79-
u = abs(x)
80+
u0 = abs(x) # n.b.: this hashes typemin(IntN) correctly even if abs fails
81+
u = u0
8082

8183
# always left-pad to full byte
8284
buflen = UInt(max(cld(top_set_bit(u), 8), 1))
83-
seed = seed ⊻ (hash_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen)
85+
seed = seed ⊻ hash_mix(seed ⊻ secret[3], secret[2])
8486

8587
a = zero(UInt64)
8688
b = zero(UInt64)
89+
i = buflen
8790

8891
if buflen ≤ 16
8992
if buflen ≥ 4
90-
a = (UInt64(u % UInt32) << 32) |
91-
UInt64((u >>> ((buflen - 4) * 8)) % UInt32)
92-
93-
delta = (buflen & 24) >>> (buflen >>> 3)
94-
95-
b = (UInt64((u >>> (8 * delta)) % UInt32) << 32) |
96-
UInt64((u >>> (8 * (buflen - 4 - delta))) % UInt32)
93+
seed ⊻= buflen
94+
if buflen ≥ 8
95+
a = UInt64(u % UInt64)
96+
b = UInt64((u >>> (8 * (buflen - 8))) % UInt64)
97+
else
98+
a = UInt64(u % UInt32)
99+
b = UInt64((u >>> (8 * (buflen - 4))) % UInt32)
100+
end
97101
else # buflen > 0
98102
b0 = u % UInt8
99103
b1 = (u >>> (8 * div(buflen, 2))) % UInt8
100104
b2 = (u >>> (8 * (buflen - 1))) % UInt8
101-
a = (UInt64(b0) << 56) |
102-
(UInt64(b1) << 32) |
103-
UInt64(b2)
105+
a = (UInt64(b0) << 45) | UInt64(b2)
106+
b = UInt64(b1)
104107
end
105108
else
106-
a = (u >>> 8(buflen - 16)) % UInt
107-
b = (u >>> 8(buflen - 8)) % UInt
108-
109-
i = buflen
110109
if i > 48
111110
see1 = seed
112111
see2 = seed
113-
while i ≥ 48
114-
l0 = u % UInt; u >>>= 64
115-
l1 = u % UInt; u >>>= 64
116-
l2 = u % UInt; u >>>= 64
117-
l3 = u % UInt; u >>>= 64
118-
l4 = u % UInt; u >>>= 64
119-
l5 = u % UInt; u >>>= 64
112+
while i > 48
113+
l0 = u % UInt64; u >>>= 64
114+
l1 = u % UInt64; u >>>= 64
115+
l2 = u % UInt64; u >>>= 64
116+
l3 = u % UInt64; u >>>= 64
117+
l4 = u % UInt64; u >>>= 64
118+
l5 = u % UInt64; u >>>= 64
120119

121120
seed = hash_mix(l0 ⊻ secret[1], l1 ⊻ seed)
122121
see1 = hash_mix(l2 ⊻ secret[2], l3 ⊻ see1)
123122
see2 = hash_mix(l4 ⊻ secret[3], l5 ⊻ see2)
124123
i -= 48
125124
end
126-
seed = seed ⊻ see1 ⊻ see2
125+
seed ⊻= see1
126+
seed ⊻= see2
127127
end
128128
if i > 16
129-
l0 = u % UInt; u >>>= 64
130-
l1 = u % UInt; u >>>= 64
131-
seed = hash_mix(l0 ⊻ secret[3], l1 ⊻ seed ⊻ secret[2])
129+
l0 = u % UInt64; u >>>= 64
130+
l1 = u % UInt64; u >>>= 64
131+
seed = hash_mix(l0 ⊻ secret[3], l1 ⊻ seed)
132132
if i > 32
133-
l2 = u % UInt; u >>>= 64
134-
l3 = u % UInt; u >>>= 64
133+
l2 = u % UInt64; u >>>= 64
134+
l3 = u % UInt64; u >>>= 64
135135
seed = hash_mix(l2 ⊻ secret[3], l3 ⊻ seed)
136136
end
137137
end
138+
139+
a = (u0 >>> 8(buflen - 16)) % UInt64 ⊻ i
140+
b = (u0 >>> 8(buflen - 8)) % UInt64
138141
end
139142

140143
a = a ⊻ secret[2]
141144
b = b ⊻ seed
142145
b, a = mul_parts(a, b)
143-
return hash_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2])
146+
return hash_mix(a ⊻ secret[4], b ⊻ secret[2] ⊻ i)
144147
end
145148

146149

@@ -266,43 +269,40 @@ hash(x::Symbol) = objectid(x)
266269
load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} =
267270
unsafe_load(convert(Ptr{T}, ptr + i - 1))
268271

269-
function read_small(ptr::Ptr{UInt8}, n::Int)
270-
return (UInt64(unsafe_load(ptr)) << 56) |
271-
(UInt64(unsafe_load(ptr, div(n, 2) + 1)) << 32) |
272-
UInt64(unsafe_load(ptr, n))
273-
end
274-
275272
@assume_effects :terminates_globally function hash_bytes(
276273
ptr::Ptr{UInt8},
277274
n::Int,
278275
seed::UInt64,
279-
secret::NTuple{3, UInt64}
276+
secret::NTuple{4, UInt64}
280277
)
281278
# Adapted with gratitude from [rapidhash](https://github.com/Nicoshev/rapidhash)
282279
buflen = UInt64(n)
283-
seed = seed ⊻ (hash_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen)
280+
seed = seed ⊻ hash_mix(seed ⊻ secret[3], secret[2])
284281

285282
a = zero(UInt64)
286283
b = zero(UInt64)
284+
i = buflen
287285

288286
if buflen ≤ 16
289287
if buflen ≥ 4
290-
a = (UInt64(load_le(UInt32, ptr, 1)) << 32) |
291-
UInt64(load_le(UInt32, ptr, n - 3))
292-
293-
delta = (buflen & 24) >>> (buflen >>> 3)
294-
b = (UInt64(load_le(UInt32, ptr, delta + 1)) << 32) |
295-
UInt64(load_le(UInt32, ptr, n - 3 - delta))
288+
seed ⊻= buflen
289+
if buflen ≥ 8
290+
a = load_le(UInt64, ptr, 1)
291+
b = load_le(UInt64, ptr, n - 7)
292+
else
293+
a = UInt64(load_le(UInt32, ptr, 1))
294+
b = UInt64(load_le(UInt32, ptr, n - 3))
295+
end
296296
elseif buflen > 0
297-
a = read_small(ptr, n)
297+
a = (UInt64(unsafe_load(ptr)) << 45) | UInt64(unsafe_load(ptr, n))
298+
b = UInt64(unsafe_load(ptr, div(n, 2) + 1))
298299
end
299300
else
300301
pos = 1
301-
i = buflen
302302
if i > 48
303303
see1 = seed
304304
see2 = seed
305-
while i ≥ 48
305+
while i > 48
306306
seed = hash_mix(
307307
load_le(UInt64, ptr, pos) ⊻ secret[1],
307307
load_le(UInt64, ptr, pos + 8) ⊻ seed
@@ -318,12 +318,13 @@ end
318318
pos += 48
319319
i -= 48
320320
end
321-
seed = seed ⊻ see1 ⊻ see2
321+
seed ⊻= see1
322+
seed ⊻= see2
322323
end
323324
if i > 16
324325
seed = hash_mix(
325326
load_le(UInt64, ptr, pos) ⊻ secret[3],
326-
load_le(UInt64, ptr, pos + 8) ⊻ seed ⊻ secret[2]
327+
load_le(UInt64, ptr, pos + 8) ⊻ seed
327328
)
328329
if i > 32
329330
seed = hash_mix(
@@ -333,14 +334,14 @@ end
333334
end
334335
end
335336

336-
a = load_le(UInt64, ptr, n - 15)
337+
a = load_le(UInt64, ptr, n - 15) ⊻ i
337338
b = load_le(UInt64, ptr, n - 7)
338339
end
339340

340341
a = a ⊻ secret[2]
341342
b = b ⊻ seed
342343
b, a = mul_parts(a, b)
343-
return hash_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2])
344+
return hash_mix(a ⊻ secret[4], b ⊻ secret[2] ⊻ i)
344345
end
345346

346347
@assume_effects :total hash(data::String, h::UInt) =

stdlib/TOML/test/print.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,18 @@ loaders = ["gzip", { driver = "csv", args = {delim = "\t"}}]
8383
@testset "vec with dicts and non-dicts" begin
8484
# https://github.com/JuliaLang/julia/issues/45340
8585
d = Dict("b" => Any[111, Dict("a" => 222, "d" => 333)])
86-
@test toml_str(d) == (sizeof(Int) == 8 ?
86+
@test toml_str(d) == (sizeof(Int) == 4 ?
8787
"b = [111, {a = 222, d = 333}]\n" :
8888
"b = [111, {d = 333, a = 222}]\n")
8989

9090

9191
d = Dict("b" => Any[Dict("a" => 222, "d" => 333), 111])
92-
@test toml_str(d) == (sizeof(Int) == 8 ?
92+
@test toml_str(d) == (sizeof(Int) == 4 ?
9393
"b = [{a = 222, d = 333}, 111]\n" :
9494
"b = [{d = 333, a = 222}, 111]\n")
9595

9696
d = Dict("b" => Any[Dict("a" => 222, "d" => 333)])
97-
@test toml_str(d) == (sizeof(Int) == 8 ?
97+
@test toml_str(d) == (sizeof(Int) == 4 ?
9898
"""
9999
[[b]]
100100
a = 222

0 commit comments

Comments
 (0)