@@ -182,6 +182,35 @@ inline void mum(std::uint64_t* a, std::uint64_t* b) {
182182# endif
183183}
184184
// Full 64x64 -> 128 bit multiply of u and v.
// Writes the low 64 bits of the product to *rl and the high 64 bits to *rh.
// Uses the native 128-bit type or the MSVC x64 intrinsic when available,
// otherwise falls back to portable schoolbook multiplication on 32-bit halves.
inline void umul128(std::uint64_t const u, std::uint64_t const v, std::uint64_t* const rl, std::uint64_t* const rh) {
#    if defined(__SIZEOF_INT128__)
    __uint128_t r = u;
    r *= v;
    *rl = static_cast<std::uint64_t>(r);
    *rh = static_cast<std::uint64_t>(r >> 64U);
#    elif defined(_MSC_VER) && defined(_M_X64)
    // _umul128 returns the low half and stores the high half through rh.
    *rl = _umul128(u, v, rh);
#    else
    // Portable path. NOTE(fix): the locals below were previously named rl/rh,
    // which redeclared the function parameters in the outermost block
    // (ill-formed C++) and broke the final stores through *rl / *rh.
    std::uint64_t const ha = u >> 32U;
    std::uint64_t const hb = v >> 32U;
    std::uint64_t const la = static_cast<std::uint32_t>(u);
    std::uint64_t const lb = static_cast<std::uint32_t>(v);
    std::uint64_t const hh = ha * hb; // high * high
    std::uint64_t const hl = ha * lb; // cross terms
    std::uint64_t const lh = hb * la;
    std::uint64_t const ll = la * lb; // low * low
    // Accumulate the low 64 bits, tracking carries out of each addition.
    std::uint64_t const t = ll + (hl << 32U);
    auto carry = static_cast<std::uint64_t>(t < ll);
    std::uint64_t const lo = t + (lh << 32U);
    carry += static_cast<std::uint64_t>(lo < t);
    *rl = lo;
    *rh = hh + (hl >> 32U) + (lh >> 32U) + carry;
#    endif
}
213+
185214// multiply and xor mix function, aka MUM
186215[[nodiscard]] inline auto mix (std::uint64_t a, std::uint64_t b) -> std::uint64_t {
187216 mum (&a, &b);
@@ -196,6 +225,11 @@ inline void mum(std::uint64_t* a, std::uint64_t* b) {
196225 return (x << n) | (x >> ((-n) & 63U ));
197226}
198227
// Packs 1, 2, or 3 bytes starting at p into a 64-bit value (k must be in
// [1, 3]). First byte lands in bits 16..23, the middle byte (p[k/2]) in bits
// 8..15, the last byte in bits 0..7; for k < 3 some bytes are read twice,
// which is harmless for hashing and avoids branching on k.
[[nodiscard]] inline auto r3(const std::uint8_t* p, std::size_t k) -> std::uint64_t {
    auto const first = static_cast<std::uint64_t>(p[0]);
    auto const middle = static_cast<std::uint64_t>(p[k >> 1U]);
    auto const last = static_cast<std::uint64_t>(p[k - 1]);
    return (first << 16U) | (middle << 8U) | last;
}
232+
199233[[nodiscard]] inline auto r4 (const std::uint8_t * p) -> std::uint64_t {
200234 // return static_cast<std::uint64_t>(p[0]) << 0U | static_cast<std::uint64_t>(p[1]) << 8U |
201235 // static_cast<std::uint64_t>(p[2]) << 16U | static_cast<std::uint64_t>(p[3]) << 24U;
@@ -213,72 +247,59 @@ inline void mum(std::uint64_t* a, std::uint64_t* b) {
213247}
214248
// Hashes l bytes at key into a 64-bit value. The mixing structure resembles
// komihash-style hashing: two running seeds folded through full 128-bit
// multiplies (umul128), a 16-bytes-per-iteration bulk loop, and an
// overlapping-read tail for the last 0..16 bytes.
// NOTE(review): the exact statement order and constants define the hash
// output — do not reorder.
[[maybe_unused]] [[nodiscard]] inline auto hash(void const* key, std::size_t l) -> std::uint64_t {
    // Fixed seed; with use_seed == 0 the (use_seed & ...) masks below are
    // no-ops, but the structure keeps the door open for a runtime seed.
    static constexpr auto use_seed = UINT64_C(0);

    // The seeds are initialized to mantissa bits of PI, perturbed by the
    // input length so different lengths diverge immediately.
    auto seed1 = UINT64_C(0x243F6A8885A308D3) ^ l;
    auto seed2 = UINT64_C(0x452821E638D01377) ^ l;

    auto val01 = UINT64_C(0xAAAAAAAAAAAAAAAA); ///< `10` bit-pairs.
    auto val10 = UINT64_C(0x5555555555555555); ///< `01` bit-pairs.
    // Pre-mix the two seeds through one 128-bit multiply before any input
    // bytes are consumed.
    umul128(seed2 ^ (use_seed & val10), seed1 ^ (use_seed & val01), &seed1, &seed2);

    auto const* msg = static_cast<const uint8_t*>(key);

    // Bulk loop: consume 16 input bytes per iteration. Entered only for
    // inputs longer than 16 bytes; always leaves 1..16 bytes for the tail.
    if (ANKERL_UNORDERED_DENSE_UNLIKELY(l > 16))
    ANKERL_UNORDERED_DENSE_UNLIKELY_ATTR {
        // Turn the bit-pair constants into per-iteration additive offsets
        // derived from the mixed seeds; they are also folded into the
        // finalization below.
        val01 ^= seed1;
        val10 ^= seed2;

        do {
            // Fold two little-endian 8-byte lanes into the seeds via one
            // 128-bit multiply.
            umul128(r8(msg) ^ seed1, r8(msg + 8) ^ seed2, &seed1, &seed2);

            l -= 16;
            msg += 16;

            seed1 += val01;
            seed2 += val10;

        } while (ANKERL_UNORDERED_DENSE_LIKELY(l > 16));
    }

    // Tail: gather the remaining 0..16 bytes into a and b using overlapping
    // reads, so no per-byte loop is needed for l >= 4.
    std::uint64_t a = 0;
    std::uint64_t b = 0;
    if (ANKERL_UNORDERED_DENSE_LIKELY(l >= 4)) {
        // msg4 points at the last 4 bytes; reads from msg and msg4 may
        // overlap for l < 8, which is fine for hashing.
        const uint8_t* const msg4 = msg + l - 4;
        // mo selects inner 4-byte windows: 0 for l in [4,7], 1 for [8,15],
        // 2 for l == 16.
        const size_t mo = l >> 3U;

        a = r4(msg) << 32U | r4(msg4);
        b = r4(msg + (mo * 4)) << 32U | r4(msg4 - (mo * 4));
    } else {
        // 1..3 remaining bytes, packed little-endian into a (b stays 0).
        // Unlike the r3 helper (kept commented for reference), this reads
        // each byte exactly once.
        // a = r3(msg, l);
        if (l != 0) {
            a = msg[0];
            if (l != 1) {
                a |= static_cast<std::uint64_t>(msg[1]) << 8;
                if (l != 2) {
                    a |= static_cast<std::uint64_t>(msg[2]) << 16;
                }
            }
        }
    }
    // Finalization: fold the tail into the seeds, then one more multiply to
    // diffuse everything, and combine the two halves.
    umul128(a ^ seed1, b ^ seed2, &seed1, &seed2);
    umul128(val01 ^ seed1, seed2, &a, &b);

    return (a ^ b);
}
283304
284305[[nodiscard]] inline auto hash (std::uint64_t x) -> std::uint64_t {
0 commit comments