change AbstractString and Integer hashing to use generic hashing interface (#59691)

vtjnash · web-flow · commit afb44f57df8b · 2025-09-30T09:50:07.000-04:00
Hashing now has three interfaces -- pointer (unsafe), array (indexable), and
iterable -- in decreasing order of typical optimization and performance. Use
them instead of maintaining custom implementations for specific types. This
automatically opts every AbstractString subtype into fast hashing, provided it
correctly implements the `codeunit` string interface.
diff --git a/base/gmp.jl b/base/gmp.jl
@@ -864,21 +864,48 @@ if Limb === UInt64 === UInt
 
     using .Base: HASH_SECRET, hash_bytes, hash_finalizer
 
+    # UnsafeLimbView is a read-only byte vector over a window of a BigInt's limb data
+    struct UnsafeLimbView <: AbstractVector{UInt8}
+        bigint::BigInt   # owner of the limb buffer that is read through `bigint.d`
+        start_byte::Int  # 1-based offset of the window's first byte in the limb data
+        num_bytes::Int   # length of the window in bytes
+    end
+
+    function Base.size(view::UnsafeLimbView)  # one-dimensional: one entry per byte
+        return (view.num_bytes,)
+    end
+
+    function Base.getindex(view::UnsafeLimbView, i::Int)  # i-th byte of the window (1-based)
+        @boundscheck checkbounds(view, i)
+        GC.@preserve view begin  # keep `view` rooted for the raw-pointer load below
+            limb_index = div(view.start_byte + i - 2, 8) + 1  # both are 1-based, hence -2
+            byte_in_limb = (view.start_byte + i - 2) % 8  # 0-based byte inside that limb
+            limb = unsafe_load(view.bigint.d, limb_index)
+            return UInt8((limb >> (8 * byte_in_limb)) & 0xff)  # byte 0 is least significant
+        end
+    end
+
+    function Base.iterate(view::UnsafeLimbView, state::Int = 1)  # state: next 1-based index
+        state > view.num_bytes && return nothing  # past the last byte: done
+        return @inbounds(view[state]), state + 1  # index was range-checked just above
+    end
+
+    function Base.length(view::UnsafeLimbView)  # number of bytes in the window
+        return view.num_bytes
+    end
+
     function hash_integer(n::BigInt, h::UInt)
         iszero(n) && return hash_integer(0, h)
-        GC.@preserve n begin
-            s = n.size
-            h ⊻= (s < 0)
-
-            us = abs(s)
-            leading_zero_bytes = div(leading_zeros(unsafe_load(n.d, us)), 8)
-            hash_bytes(
-                Ptr{UInt8}(n.d),
-                8 * us - leading_zero_bytes,
-                h,
-                HASH_SECRET
-            )
-        end
+        s = n.size  # negative when n is negative; abs(s) indexes the top limb
+        h ⊻= (s < 0)  # fold the sign into the seed
+        # Root n for the raw-pointer load; unsafe_load alone does not keep it alive.
+        us = abs(s)
+        leading_zero_bytes = GC.@preserve n div(leading_zeros(unsafe_load(n.d, us)), 8)
+        num_bytes = 8 * us - leading_zero_bytes  # drop zero bytes above the top set bit
+
+        # Hash through a bounds-checked byte view instead of a raw pointer
+        limb_view = UnsafeLimbView(n, 1, num_bytes)
+        return hash_bytes(limb_view, h, HASH_SECRET)
     end
 
     function hash(x::BigInt, h::UInt)
@@ -913,12 +940,11 @@ if Limb === UInt64 === UInt
             h ⊻= (sz < 0)
             leading_zero_bytes = div(leading_zeros(unsafe_load(x.d, asz)), 8)
             trailing_zero_bytes = div(pow, 8)
-            return hash_bytes(
-                Ptr{UInt8}(x.d) + trailing_zero_bytes,
-                8 * asz - (leading_zero_bytes + trailing_zero_bytes),
-                h,
-                HASH_SECRET
-            )
+            num_bytes = 8 * asz - (leading_zero_bytes + trailing_zero_bytes)
+
+            # Use UnsafeLimbView for safe iterator-based access
+            limb_view = UnsafeLimbView(x, trailing_zero_bytes + 1, num_bytes)
+            return hash_bytes(limb_view, h, HASH_SECRET)
         end
     end
 end
diff --git a/base/hashing.jl b/base/hashing.jl
@@ -70,80 +70,100 @@ hash(x::UInt64, h::UInt) = hash_uint64(hash_mix_linear(x, h))
 hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h)
 hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h)
 
+# IntegerCodeUnits exposes the magnitude abs(x) of an integer as little-endian bytes
+struct IntegerCodeUnits{T<:Integer} <: AbstractVector{UInt8}
+    value::T         # the integer as given; its sign is ignored when indexing
+    num_bytes::Int   # vector length, fixed at construction
+    # Inner constructor: derives num_bytes from the magnitude of x
+    function IntegerCodeUnits(x::T) where {T<:Integer}
+        # Bytes up to the highest set bit of abs(x); at least 1, so zero yields one byte
+        u = abs(x)
+        num_bytes = max(cld(top_set_bit(u), 8), 1)
+        return new{T}(x, num_bytes)
+    end
+end
+
+function Base.size(units::IntegerCodeUnits)  # one-dimensional: one entry per byte
+    return (units.num_bytes,)
+end
+
+function Base.length(units::IntegerCodeUnits)  # number of bytes exposed
+    return units.num_bytes
+end
+
+function Base.getindex(units::IntegerCodeUnits, i::Int)  # byte i; 1 is least significant
+    @boundscheck checkbounds(units, i)
+    u = abs(units.value)  # bytes are taken from the magnitude only
+    byte_pos = i - 1  # 0-based byte position
+    return UInt8((u >>> (8 * byte_pos)) & 0xff)  # shift the byte down, then mask it
+end
+
+function Base.iterate(units::IntegerCodeUnits, state::Int = 1)  # state: next 1-based index
+    state > units.num_bytes && return nothing  # all bytes consumed
+    return units[state], state + 1
+end
+
+# Byte view of an integer's magnitude (little-endian); see IntegerCodeUnits
+codeunits(x::Integer) = IntegerCodeUnits(x)
+
+# UTF8Units is a lazy iterator over the UTF-8 bytes of any AbstractString
+struct UTF8Units{T<:AbstractString}
+    string::T  # the wrapped string, walked one character at a time
+end
+# utf8units: reuse the string's own code units when they are bytes, else wrap it
+utf8units(s::AbstractString) = codeunit(s) <: UInt8 ? codeunits(s) : UTF8Units(s)
+
+# Iterator state: (char_iter_state, remaining_utf8_bytes)
+function Base.iterate(units::UTF8Units)
+    char_result = iterate(units.string)
+    char_result === nothing && return nothing  # empty string: nothing to yield
+    char, char_state = char_result
+    # NOTE(review): reinterpret assumes iteration yields `Char` -- confirm for other AbstractChar
+    # Byte-swap the char's raw 32 bits so that its first byte lands in the low byte
+    u = bswap(reinterpret(UInt32, char))
+
+    # Yield the low byte now; keep the bytes above it in the state for later calls
+    first_byte = u % UInt8
+    remaining_bytes = u >> 8
+    return first_byte, (char_state, remaining_bytes)
+end
+
+function Base.iterate(units::UTF8Units, state)
+    char_state, remaining_bytes = state
+    # Bytes of the current char are still pending: yield the next one (zero means none left)
+    if remaining_bytes != 0
+        byte = remaining_bytes % UInt8
+        new_remaining = remaining_bytes >> 8
+        return byte, (char_state, new_remaining)
+    end
+
+    # Current char is exhausted: advance the underlying string iterator
+    char_result = iterate(units.string, char_state)
+    char_result === nothing && return nothing
+    char, new_char_state = char_result
+
+    # Byte-swap the new char's raw 32 bits so that its first byte lands in the low byte
+    u = bswap(reinterpret(UInt32, char))
+
+    # Yield the low byte now; keep the bytes above it in the state for later calls
+    first_byte = u % UInt8
+    remaining_bytes = u >> 8
+
+    return first_byte, (new_char_state, remaining_bytes)
+end
+
 hash_integer(x::Integer, h::UInt) = _hash_integer(x, UInt64(h)) % UInt
 function _hash_integer(
         x::Integer,
         seed::UInt64,
         secret::NTuple{4, UInt64} = HASH_SECRET
     )
+    # Fold the sign into the seed: XOR with 1 when x is negative
     seed ⊻= (x < 0)
-    u0 = abs(x) # n.b.: this hashes typemin(IntN) correctly even if abs fails
-    u = u0
-
-    # always left-pad to full byte
-    buflen = UInt(max(cld(top_set_bit(u), 8), 1))
-    seed = seed ⊻ hash_mix(seed ⊻ secret[3], secret[2])
-
-    a = zero(UInt64)
-    b = zero(UInt64)
-    i = buflen
-
-    if buflen ≤ 16
-        if buflen ≥ 4
-            seed ⊻= buflen
-            if buflen ≥ 8
-                a = UInt64(u % UInt64)
-                b = UInt64((u >>> (8 * (buflen - 8))) % UInt64)
-            else
-                a = UInt64(u % UInt32)
-                b = UInt64((u >>> (8 * (buflen - 4))) % UInt32)
-            end
-        else # buflen > 0
-            b0 = u % UInt8
-            b1 = (u >>> (8 * div(buflen, 2))) % UInt8
-            b2 = (u >>> (8 * (buflen - 1))) % UInt8
-            a = (UInt64(b0) << 45) | UInt64(b2)
-            b = UInt64(b1)
-        end
-    else
-        if i > 48
-            see1 = seed
-            see2 = seed
-            while i > 48
-                l0 = u % UInt64; u >>>= 64
-                l1 = u % UInt64; u >>>= 64
-                l2 = u % UInt64; u >>>= 64
-                l3 = u % UInt64; u >>>= 64
-                l4 = u % UInt64; u >>>= 64
-                l5 = u % UInt64; u >>>= 64
-
-                seed = hash_mix(l0 ⊻ secret[1], l1 ⊻ seed)
-                see1 = hash_mix(l2 ⊻ secret[2], l3 ⊻ see1)
-                see2 = hash_mix(l4 ⊻ secret[3], l5 ⊻ see2)
-                i -= 48
-            end
-            seed ⊻= see1
-            seed ⊻= see2
-        end
-        if i > 16
-            l0 = u % UInt64; u >>>= 64
-            l1 = u % UInt64; u >>>= 64
-            seed = hash_mix(l0 ⊻ secret[3], l1 ⊻ seed)
-            if i > 32
-                l2 = u % UInt64; u >>>= 64
-                l3 = u % UInt64; u >>>= 64
-                seed = hash_mix(l2 ⊻ secret[3], l3 ⊻ seed)
-            end
-        end
-
-        a = (u0 >>> 8(buflen - 16)) % UInt64 ⊻ i
-        b = (u0 >>> 8(buflen - 8)) % UInt64
-    end
-
-    a = a ⊻ secret[2]
-    b = b ⊻ seed
-    b, a = mul_parts(a, b)
-    return hash_mix(a ⊻ secret[4], b ⊻ secret[2] ⊻ i)
+    # Hash the magnitude of x: codeunits(::Integer) presents abs(x) as a
+    # little-endian byte vector, which is handed to hash_bytes with the seed
+    u = abs(x) # n.b.: this hashes typemin(IntN) correctly even if abs fails
+    return hash_bytes(codeunits(u), seed, secret)
 end
 
 
@@ -619,6 +639,8 @@ end
     return hash_mix(a ⊻ secret[4], b ⊻ secret[2] ⊻ bytes_chunk)
 end
 
+hash(data::AbstractString, h::UInt) =  # hash the string's bytes as given by utf8units
+    hash_bytes(utf8units(data), UInt64(h), HASH_SECRET) % UInt
 @assume_effects :total hash(data::String, h::UInt) =
     GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) % UInt
 
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
@@ -362,10 +362,6 @@ end
 
 isless(a::Symbol, b::Symbol) = cmp(a, b) < 0
 
-# hashing
-
-hash(s::AbstractString, h::UInt) = hash(String(s)::String, h)
-
 ## character index arithmetic ##
 
 """
diff --git a/base/strings/lazy.jl b/base/strings/lazy.jl
@@ -96,6 +96,7 @@ iterate(s::LazyString, i::Integer) = iterate(String(s), i)
 isequal(a::LazyString, b::LazyString) = isequal(String(a), String(b))
 ==(a::LazyString, b::LazyString) = (String(a) == String(b))
 ncodeunits(s::LazyString) = ncodeunits(String(s))
-codeunit(s::LazyString) = codeunit(String(s))
+codeunit(s::LazyString) = codeunit("") # UInt8, as for String; does not materialize s
 codeunit(s::LazyString, i::Integer) = codeunit(String(s), i)
+codeunits(s::LazyString) = codeunits(String(s))  # code units of the materialized String
 isvalid(s::LazyString, i::Integer) = isvalid(String(s), i)
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
@@ -1193,12 +1193,10 @@ end
     apple_uint8 = Vector{UInt8}("Apple")
     @test apple_uint8 == [0x41, 0x70, 0x70, 0x6c, 0x65]
 
-    apple_uint8 = Array{UInt8}("Apple")
-    @test apple_uint8 == [0x41, 0x70, 0x70, 0x6c, 0x65]
-
-    Base.String(::tstStringType) = "Test"
+    Base.codeunit(::tstStringType) = UInt8
+    Base.codeunits(t::tstStringType) = t.data
     abstract_apple = tstStringType(apple_uint8)
-    @test hash(abstract_apple, UInt(1)) == hash("Test", UInt(1))
+    @test hash(abstract_apple, UInt(1)) == hash("Apple", UInt(1))
 
     @test length("abc", 1, 3) == length("abc", UInt(1), UInt(3))