Skip to content

Commit 4a3d736

Browse files
adienes, JeffBezanson, oscardssmith, giordano, IanButterworth
authored
use rapidhash (#57509)
closes #57235 todos: * are the test changes acceptable? * default seed going from `0` --> `0xbdd89aa982704029` * lots and lots of benchmarking * I don't really understand the effects stuff (for the `String` hash) * how configurable should the secret be? to address #37166 should the seed/secret be an env var? * proper attribution to creators of `rapidhash` and also to reference implementations * I changed some `h + seed` to `h ⊻ seed` only because I don't really get why `+` was used in the first place * should instances of `hash(x) - 3h` instead be `hash(x, h)` ? --------- Co-authored-by: Jeff Bezanson <[email protected]> Co-authored-by: Oscar Smith <[email protected]> Co-authored-by: Mosè Giordano <[email protected]> Co-authored-by: Ian Butterworth <[email protected]>
1 parent 75382d6 commit 4a3d736

File tree

18 files changed

+178
-101
lines changed

18 files changed

+178
-101
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ Language changes
1111
* `mod(x::AbstractFloat, -Inf)` now returns `x` (as long as `x` is finite), this aligns with C standard and
1212
is considered a bug fix ([#47102])
1313

14+
- The `hash` algorithm and its values have changed. Most `hash` specializations will remain correct and require no action. Types that reimplement the core hashing logic independently, such as some third-party string packages do, may require a migration to the new algorithm. ([#57509])
15+
1416
Compiler/Runtime improvements
1517
-----------------------------
1618

base/abstractarray.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3563,7 +3563,7 @@ sizehint!(a::AbstractVector, _) = a
35633563

35643564
const hash_abstractarray_seed = UInt === UInt64 ? 0x7e2d6fb6448beb77 : 0xd4514ce5
35653565
function hash(A::AbstractArray, h::UInt)
3566-
h += hash_abstractarray_seed
3566+
h ⊻= hash_abstractarray_seed
35673567
# Axes are themselves AbstractArrays, so hashing them directly would stack overflow
35683568
# Instead hash the tuple of firsts and lasts along each dimension
35693569
h = hash(map(first, axes(A)), h)

base/binaryplatforms.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ end
157157

158158
# Hash definition to ensure that it's stable
159159
function Base.hash(p::Platform, h::UInt)
160-
h += 0x506c6174666f726d % UInt
160+
h ⊻= 0x506c6174666f726d % UInt
161161
h = hash(p.tags, h)
162162
h = hash(p.compare_strategies, h)
163163
return h

base/char.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ in(x::AbstractChar, y::AbstractChar) = x == y
222222
==(x::Char, y::Char) = bitcast(UInt32, x) == bitcast(UInt32, y)
223223
isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y)
224224
hash(x::Char, h::UInt) =
225-
hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
225+
hash_finalizer(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) % UInt
226226

227227
# fallbacks:
228228
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))

base/gmp.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,7 @@ if Limb === UInt64 === UInt
892892
return hash(ldexp(flipsign(Float64(limb), sz), pow), h)
893893
end
894894
h = hash_integer(pow, h)
895-
h ⊻= hash_uint(flipsign(limb, sz) ⊻ h)
895+
h ⊻= hash_finalizer(flipsign(limb, sz) ⊻ h)
896896
for idx = idx+1:asz
897897
if shift == 0
898898
limb = unsafe_load(ptr, idx)
@@ -906,7 +906,7 @@ if Limb === UInt64 === UInt
906906
limb = limb2 << upshift | limb1 >> shift
907907
end
908908
end
909-
h ⊻= hash_uint(limb ⊻ h)
909+
h ⊻= hash_finalizer(limb ⊻ h)
910910
end
911911
return h
912912
end

base/hashing.jl

Lines changed: 127 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

3-
## hashing a single value ##
3+
const HASH_SEED = UInt == UInt64 ? 0xbdd89aa982704029 : 0xeabe9406
4+
const HASH_SECRET = tuple(
5+
0x2d358dccaa6c78a5,
6+
0x8bb84b93962eacc9,
7+
0x4b33a62ed433d4a3,
8+
)
49

510
"""
611
hash(x[, h::UInt])::UInt
@@ -17,75 +22,52 @@ The hash value may change when a new Julia process is started.
1722
1823
```jldoctest; filter = r"0x[0-9a-f]{16}"
1924
julia> a = hash(10)
20-
0x95ea2955abd45275
25+
0x759d18cc5346a65f
2126
2227
julia> hash(10, a) # only use the output of another hash function as the second argument
23-
0xd42bad54a8575b16
28+
0x03158cd61b1b0bd1
2429
```
2530
2631
See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref).
2732
"""
28-
hash(x::Any) = hash(x, zero(UInt))
33+
hash(data::Any) = hash(data, HASH_SEED)
2934
hash(w::WeakRef, h::UInt) = hash(w.value, h)
3035

3136
# Types can't be deleted, so marking as total allows the compiler to look up the hash
32-
hash(T::Type, h::UInt) = hash_uint(3h - @assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T))
37+
hash(T::Type, h::UInt) =
38+
hash((@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h)
39+
hash(@nospecialize(data), h::UInt) = hash(objectid(data), h)
3340

34-
## hashing general objects ##
35-
36-
hash(@nospecialize(x), h::UInt) = hash_uint(3h - objectid(x))
37-
38-
hash(x::Symbol) = objectid(x)
39-
40-
## core data hashing functions ##
41-
42-
function hash_64_64(n::UInt64)
43-
a::UInt64 = n
44-
a = ~a + a << 21
45-
a = a ⊻ a >> 24
46-
a = a + a << 3 + a << 8
47-
a = a ⊻ a >> 14
48-
a = a + a << 2 + a << 4
49-
a = a ⊻ a >> 28
50-
a = a + a << 31
51-
return a
41+
function mul_parts(a::UInt64, b::UInt64)
42+
p = widemul(a, b)
43+
return (p >> 64) % UInt64, p % UInt64
5244
end
53-
54-
function hash_64_32(n::UInt64)
55-
a::UInt64 = n
56-
a = ~a + a << 18
57-
a = a ⊻ a >> 31
58-
a = a * 21
59-
a = a ⊻ a >> 11
60-
a = a + a << 6
61-
a = a ⊻ a >> 22
62-
return a % UInt32
45+
hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...)
46+
47+
# faster-but-weaker than hash_mix intended for small keys
48+
hash_mix_linear(x::UInt64, h::UInt) = 3h - x
49+
function hash_finalizer(x::UInt64)
50+
x ⊻= (x >> 32)
51+
x *= 0x63652a4cd374b267
52+
x ⊻= (x >> 33)
53+
return x
6354
end
6455

65-
function hash_32_32(n::UInt32)
66-
a::UInt32 = n
67-
a = a + 0x7ed55d16 + a << 12
68-
a = a ⊻ 0xc761c23c ⊻ a >> 19
69-
a = a + 0x165667b1 + a << 5
70-
a = a + 0xd3a2646c ⊻ a << 9
71-
a = a + 0xfd7046c5 + a << 3
72-
a = a ⊻ 0xb55a4f09 ⊻ a >> 16
73-
return a
74-
end
56+
hash_64_64(data::UInt64) = hash_finalizer(data)
57+
hash_64_32(data::UInt64) = hash_64_64(data) % UInt32
58+
hash_32_32(data::UInt32) = hash_64_32(UInt64(data))
7559

7660
if UInt === UInt64
77-
hash_uint64(x::UInt64) = hash_64_64(x)
78-
hash_uint(x::UInt) = hash_64_64(x)
61+
const hash_uint64 = hash_64_64
62+
const hash_uint = hash_64_64
7963
else
80-
hash_uint64(x::UInt64) = hash_64_32(x)
81-
hash_uint(x::UInt) = hash_32_32(x)
64+
const hash_uint64 = hash_64_32
65+
const hash_uint = hash_32_32
8266
end
8367

84-
## efficient value-based hashing of integers ##
85-
86-
hash(x::Int64, h::UInt) = hash_uint64(bitcast(UInt64, x)) - 3h
87-
hash(x::UInt64, h::UInt) = hash_uint64(x) - 3h
88-
hash(x::Union{Bool,Int8,UInt8,Int16,UInt16,Int32,UInt32}, h::UInt) = hash(Int64(x), h)
68+
hash(x::UInt64, h::UInt) = hash_uint64(hash_mix_linear(x, h))
69+
hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h)
70+
hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h)
8971

9072
function hash_integer(n::Integer, h::UInt)
9173
h ⊻= hash_uint((n % UInt) ⊻ h)
@@ -100,7 +82,7 @@ end
10082

10183
## efficient value-based hashing of floats ##
10284

103-
const hx_NaN = hash_uint64(reinterpret(UInt64, NaN))
85+
const hx_NaN = hash(reinterpret(UInt64, NaN))
10486
function hash(x::Float64, h::UInt)
10587
# see comments on trunc and hash(Real, UInt)
10688
if typemin(Int64) <= x < typemax(Int64)
@@ -116,7 +98,7 @@ function hash(x::Float64, h::UInt)
11698
elseif isnan(x)
11799
return hx_NaN ⊻ h # NaN does not have a stable bit pattern
118100
end
119-
return hash_uint64(bitcast(UInt64, x)) - 3h
101+
return hash(bitcast(UInt64, x), h)
120102
end
121103

122104
hash(x::Float32, h::UInt) = hash(Float64(x), h)
@@ -131,7 +113,7 @@ function hash(x::Float16, h::UInt)
131113
elseif isnan(x)
132114
return hx_NaN ⊻ h # NaN does not have a stable bit pattern
133115
end
134-
return hash_uint64(bitcast(UInt64, Float64(x))) - 3h
116+
return hash(bitcast(UInt64, Float64(x)), h)
135117
end
136118

137119
## generic hashing for rational values ##
@@ -180,21 +162,100 @@ end
180162

181163

182164
## symbol & expression hashing ##
183-
184165
if UInt === UInt64
185-
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x83c7900696d26dc6))
186-
hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x2c97bf8b3de87020)
166+
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h ⊻ 0x83c7900696d26dc6))
167+
hash(x::QuoteNode, h::UInt) = hash(x.value, h ⊻ 0x2c97bf8b3de87020)
187168
else
188-
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x96d26dc6))
189-
hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x469d72af)
169+
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h ⊻ 0x469d72af))
170+
hash(x::QuoteNode, h::UInt) = hash(x.value, h ⊻ 0x469d72af)
190171
end
191172

192-
## hashing strings ##
173+
hash(x::Symbol) = objectid(x)
193174

194-
const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed
195-
const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81
196175

197-
@assume_effects :total function hash(s::String, h::UInt)
198-
h += memhash_seed
199-
ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, sizeof(s), h % UInt32) + h
176+
load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} =
177+
unsafe_load(convert(Ptr{T}, ptr + i - 1))
178+
179+
function read_small(ptr::Ptr{UInt8}, n::Int)
180+
return (UInt64(unsafe_load(ptr)) << 56) |
181+
(UInt64(unsafe_load(ptr, div(n, 2) + 1)) << 32) |
182+
UInt64(unsafe_load(ptr, n))
200183
end
184+
185+
@assume_effects :terminates_globally function hash_bytes(
186+
ptr::Ptr{UInt8},
187+
n::Int,
188+
seed::UInt64,
189+
secret::NTuple{3, UInt64}
190+
)
191+
# Adapted with gratitude from [rapidhash](https://github.com/Nicoshev/rapidhash)
192+
buflen = UInt64(n)
193+
seed = seed ⊻ (hash_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen)
194+
195+
a = zero(UInt64)
196+
b = zero(UInt64)
197+
198+
if buflen ≤ 16
199+
if buflen ≥ 4
200+
a = (UInt64(load_le(UInt32, ptr, 1)) << 32) |
201+
UInt64(load_le(UInt32, ptr, n - 3))
202+
203+
delta = (buflen & 24) >>> (buflen >>> 3)
204+
b = (UInt64(load_le(UInt32, ptr, delta + 1)) << 32) |
205+
UInt64(load_le(UInt32, ptr, n - 3 - delta))
206+
elseif buflen > 0
207+
a = read_small(ptr, n)
208+
end
209+
else
210+
pos = 1
211+
i = buflen
212+
while i ≥ 48
213+
see1 = seed
214+
see2 = seed
215+
while i ≥ 48
216+
seed = hash_mix(
217+
load_le(UInt64, ptr, pos) ⊻ secret[1],
218+
load_le(UInt64, ptr, pos + 8) ⊻ seed
219+
)
220+
see1 = hash_mix(
221+
load_le(UInt64, ptr, pos + 16) ⊻ secret[2],
222+
load_le(UInt64, ptr, pos + 24) ⊻ see1
223+
)
224+
see2 = hash_mix(
225+
load_le(UInt64, ptr, pos + 32) ⊻ secret[3],
226+
load_le(UInt64, ptr, pos + 40) ⊻ see2
227+
)
228+
pos += 48
229+
i -= 48
230+
end
231+
seed = seed ⊻ see1 ⊻ see2
232+
end
233+
if i > 16
234+
seed = hash_mix(
235+
load_le(UInt64, ptr, pos) ⊻ secret[3],
236+
load_le(UInt64, ptr, pos + 8) ⊻ seed ⊻ secret[2]
237+
)
238+
if i > 32
239+
seed = hash_mix(
240+
load_le(UInt64, ptr, pos + 16) ⊻ secret[3],
241+
load_le(UInt64, ptr, pos + 24) ⊻ seed
242+
)
243+
end
244+
end
245+
246+
a = load_le(UInt64, ptr, n - 15)
247+
b = load_le(UInt64, ptr, n - 7)
248+
end
249+
250+
a = a ⊻ secret[2]
251+
b = b ⊻ seed
252+
b, a = mul_parts(a, b)
253+
return hash_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2])
254+
end
255+
256+
@assume_effects :total hash(data::String, h::UInt) =
257+
GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) % UInt
258+
259+
# no longer used in Base, but a lot of packages access these internals
260+
const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed
261+
const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81

base/multidimensional.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ module IteratorsMD
148148
# hashing
149149
const cartindexhash_seed = UInt == UInt64 ? 0xd60ca92f8284b8b0 : 0xf2ea7c2e
150150
function Base.hash(ci::CartesianIndex, h::UInt)
151-
h += cartindexhash_seed
151+
h ⊻= cartindexhash_seed
152152
for i in ci.I
153153
h = hash(i, h)
154154
end

base/pkgid.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ end
1717
==(a::PkgId, b::PkgId) = a.uuid == b.uuid && a.name == b.name
1818

1919
function hash(pkg::PkgId, h::UInt)
20-
h += 0xc9f248583a0ca36c % UInt
20+
h ⊻= 0xc9f248583a0ca36c % UInt
2121
h = hash(pkg.uuid, h)
2222
h = hash(pkg.name, h)
2323
return h

base/regex.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ end
802802
## hash ##
803803
const hashre_seed = UInt === UInt64 ? 0x67e195eb8555e72d : 0xe32373e4
804804
function hash(r::Regex, h::UInt)
805-
h += hashre_seed
805+
h ⊻= hashre_seed
806806
h = hash(r.pattern, h)
807807
h = hash(r.compile_options, h)
808808
h = hash(r.match_options, h)

base/stacktraces.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ function ==(a::StackFrame, b::StackFrame)
9090
end
9191

9292
function hash(frame::StackFrame, h::UInt)
93-
h += 0xf4fbda67fe20ce88 % UInt
93+
h ⊻= 0xf4fbda67fe20ce88 % UInt
9494
h = hash(frame.line, h)
9595
h = hash(frame.file, h)
9696
h = hash(frame.func, h)

0 commit comments

Comments
 (0)