@@ -70,80 +70,100 @@ hash(x::UInt64, h::UInt) = hash_uint64(hash_mix_linear(x, h))
70
70
# Signed 64-bit integers are hashed through their bit pattern viewed as a
# UInt64, delegating to the UInt64 method.
function hash(x::Int64, h::UInt)
    return hash(bitcast(UInt64, x), h)
end
71
71
# Bool and the integer types narrower than 64 bits are widened to Int64 and
# then handed to the Int64 method.
function hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt)
    return hash(Int64(x), h)
end
72
72
73
# IntegerCodeUnits exposes the magnitude of an integer as a read-only vector
# of bytes, least-significant byte first. The sign is not represented: every
# byte is taken from `abs(value)`.
struct IntegerCodeUnits{T<:Integer} <: AbstractVector{UInt8}
    value::T        # the integer exactly as it was passed to the constructor
    num_bytes::Int  # number of bytes exposed; never less than 1

    function IntegerCodeUnits(x::T) where {T<:Integer}
        # Round the bit length of the magnitude up to whole bytes. Zero has no
        # set bits but is still reported as a single byte.
        nbits = top_set_bit(abs(x))
        nbytes = max(cld(nbits, 8), 1)
        return new{T}(x, nbytes)
    end
end

# One-dimensional shape: a single axis holding `num_bytes` entries.
Base.size(units::IntegerCodeUnits) = (units.num_bytes,)

# Number of bytes in the representation.
Base.length(units::IntegerCodeUnits) = units.num_bytes

# Fetch byte `i` (1-based); index 1 is the least-significant byte.
function Base.getindex(units::IntegerCodeUnits, i::Int)
    @boundscheck checkbounds(units, i)
    magnitude = abs(units.value)
    # Bring the requested byte down to the low 8 bits, then mask it off.
    shifted = magnitude >>> (8 * (i - 1))
    return UInt8(shifted & 0xff)
end

# Walk the bytes in index order; `state` is the index of the next byte.
function Base.iterate(units::IntegerCodeUnits, state::Int = 1)
    if state > units.num_bytes
        return nothing
    end
    return units[state], state + 1
end
105
+
106
# Entry point for viewing an integer as bytes: wraps `x` in an
# `IntegerCodeUnits`.
function codeunits(x::Integer)
    return IntegerCodeUnits(x)
end
108
+
109
# UTF8Units lazily iterates the UTF-8 bytes of an AbstractString by walking
# its characters and emitting the bytes of each character in order.
struct UTF8Units{T<:AbstractString}
    string::T
end

# Iteration traits. The byte count is not known without walking the string,
# and no `length` method exists, so the default `HasLength` trait would make
# `collect` and friends fail; declare the size unknown instead.
Base.IteratorSize(::Type{<:UTF8Units}) = Base.SizeUnknown()
Base.eltype(::Type{<:UTF8Units}) = UInt8

# Pick the byte source for `s`: strings whose code unit type is UInt8 use
# their code units directly; everything else goes through the per-character
# `UTF8Units` iterator.
# NOTE(review): this presumes that UInt8 code units are UTF-8 encoded; confirm
# for custom string types.
utf8units(s::AbstractString) = codeunit(s) <: UInt8 ? codeunits(s) : UTF8Units(s)

# Turn the result of iterating the wrapped string into the next
# `(byte, state)` pair, or `nothing` once the string is exhausted.
#
# Iterator state: (char_iter_state, remaining_utf8_bytes)
function _utf8units_next(char_result)
    char_result === nothing && return nothing
    char, char_state = char_result

    # Decode char to UTF-8 bytes (similar to the write function). The
    # conversion to `Char` is a no-op for `Char` and lets other AbstractChar
    # element types through `reinterpret`.
    u = bswap(reinterpret(UInt32, Char(char)))

    # After the byte swap the first encoded byte sits in the low 8 bits; the
    # rest stay queued in the state, with zero meaning "nothing left".
    # NOTE(review): an encoding with a zero byte ahead of a nonzero one would
    # be cut short; presumably string iteration never yields one - confirm.
    return u % UInt8, (char_state, u >> 8)
end

# Start iteration: emit the first byte of the first character, if any.
Base.iterate(units::UTF8Units) = _utf8units_next(iterate(units.string))

# Continue iteration: drain the bytes queued for the current character before
# advancing to the next one.
function Base.iterate(units::UTF8Units, state)
    char_state, remaining_bytes = state
    if remaining_bytes != 0
        return remaining_bytes % UInt8, (char_state, remaining_bytes >> 8)
    end
    return _utf8units_next(iterate(units.string, char_state))
end
154
+
73
155
# Hash an arbitrary integer: widen the seed to UInt64 for `_hash_integer`,
# then truncate the 64-bit digest back to UInt.
function hash_integer(x::Integer, h::UInt)
    digest = _hash_integer(x, UInt64(h))
    return digest % UInt
end
74
156
# Hash the integer `x` under `seed`, using `secret` as the mixing constants
# passed on to `hash_bytes`.
#
# The sign is folded into the seed; the magnitude is then hashed through its
# little-endian byte view (`codeunits(u)`). The earlier hand-unrolled
# implementation is dropped: it returned before this path could run.
function _hash_integer(
        x::Integer,
        seed::UInt64,
        secret::NTuple{4, UInt64} = HASH_SECRET
    )
    # Handle sign by XOR-ing with seed
    seed ⊻= (x < 0)
    # Get little-endian byte representation of absolute value
    # and hash using the hash_bytes function
    u = abs(x) # n.b.: this hashes typemin(IntN) correctly even if abs fails
    return hash_bytes(codeunits(u), seed, secret)
end
148
168
149
169
619
639
return hash_mix (a ⊻ secret[4 ], b ⊻ secret[2 ] ⊻ bytes_chunk)
620
640
end
621
641
642
# Generic string hashing: feed the bytes produced by `utf8units(data)` to
# `hash_bytes`, seeded with `h`, and truncate the digest to UInt.
function hash(data::AbstractString, h::UInt)
    digest = hash_bytes(utf8units(data), UInt64(h), HASH_SECRET)
    return digest % UInt
end
622
644
# String hashing straight from memory: `hash_bytes` reads `sizeof(data)` bytes
# starting at `pointer(data)`. `GC.@preserve` keeps `data` alive for as long
# as that raw pointer is in use.
@assume_effects :total function hash(data::String, h::UInt)
    return GC.@preserve data begin
        hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) % UInt
    end
end
624
646
0 commit comments