@@ -191,33 +191,36 @@ inline void mum(std::uint64_t* a, std::uint64_t* b) {
// This is a modified ChibiHash version 2: https://github.com/N-R-K/ChibiHash
// Hash results differ between little-endian and big-endian machines!
193193
/// Rotates the 64-bit value `x` left by `n` bit positions.
///
/// The rotation count is reduced modulo 64, so every value of `n` is accepted.
/// The complementary right-shift amount is computed as `(0 - n) mod 64`, which
/// keeps it in the range [0, 63]; in particular a count that is a multiple of 64
/// yields `x` unchanged without ever shifting a 64-bit value by 64 bits (which
/// would be undefined behavior) and without needing a branch.
///
/// @param x value to rotate
/// @param n number of bit positions to rotate left by (taken modulo 64)
/// @return `x` rotated left by `n % 64` bits
[[nodiscard]] auto constexpr rotl(std::uint64_t x, unsigned n) -> std::uint64_t {
    n &= 63U;
    // `0U - n` is the well-defined unsigned negation of n; masking it gives (64 - n) % 64.
    return (x << n) | (x >> ((0U - n) & 63U));
}
202198
/// Reads the 4 bytes starting at `p` and returns them zero-extended to 64 bits.
///
/// The bytes are copied with std::memcpy into a 32-bit integer, so `p` does not
/// need any particular alignment. The value is interpreted in the host's byte
/// order: on a little-endian machine the result equals
/// `p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24`, on a big-endian machine the byte
/// significance is reversed.
///
/// @param p pointer to at least 4 readable bytes
/// @return the 32-bit value stored at `p`, widened to std::uint64_t
[[nodiscard]] inline auto r4(const std::uint8_t* p) -> std::uint64_t {
    std::uint32_t v{};
    std::memcpy(&v, p, sizeof(v));
    return v;
}
207206
// Read functions. WARNING: we don't care about endianness, so results are different on big-endian machines!
/// Reads the 8 bytes starting at `p` as a 64-bit unsigned integer.
///
/// The bytes are copied with std::memcpy, so `p` does not need any particular
/// alignment. The value is interpreted in the host's byte order, so the result
/// for the same bytes differs between little-endian and big-endian machines.
///
/// @param p pointer to at least 8 readable bytes
/// @return the 64-bit value stored at `p`
[[nodiscard]] inline auto r8(const std::uint8_t* p) -> std::uint64_t {
    std::uint64_t v{};
    std::memcpy(&v, p, sizeof(v));
    return v;
}
212214
213- [[maybe_unused]] [[nodiscard]] inline auto hash (void const * key, std::size_t len) -> std::uint64_t {
214- auto const * p = static_cast <std::uint8_t const *>(key);
215- auto l = static_cast <std::ptrdiff_t >(len);
215+ [[maybe_unused]] [[nodiscard]] inline auto hash (void const * key, std::size_t l) -> std::uint64_t {
216+ static constexpr auto seed = UINT64_C (0xa0761d6478bd642f );
216217
217- static constexpr std::uint64_t seed = UINT64_C (0xa0761d6478bd642f );
218- static constexpr std::uint64_t k = UINT64_C (0x2B7E151628AED2A7 ); // digits of e
219- static constexpr std::uint64_t seed2 = rotl (seed - k, 15 ) + rotl (seed - k, 47 );
220- std::array<std::uint64_t , 4 > h = {seed, seed + k, seed2, seed2 + (k * k ^ k)};
218+ static constexpr auto k = UINT64_C (0x2B7E151628AED2A7 ); // digits of e
219+ static constexpr auto seed2 = rotl (seed - k, 15 ) + rotl (seed - k, 47 );
220+ auto h0 = seed;
221+ auto h1 = seed + k;
222+ auto h2 = seed2;
223+ auto h3 = seed2 + ((k * k) ^ k);
221224
222225 // depending on your system unrolling might (or might not) make things
223226 // a tad bit faster on large strings. on my system, it actually makes
@@ -227,39 +230,49 @@ inline void mum(std::uint64_t* a, std::uint64_t* b) {
227230 // but depending on your needs, you may want to uncomment the pragma
228231 // below to unroll the loop.
229232 // # pragma GCC unroll 2
230- for (; l >= 32 ; l -= 32 ) {
231- for (unsigned i = 0 ; i < 4 ; ++i, p += 8 ) {
232- auto stripe = r8 (p);
233- h[i] = (stripe + h[i]) * k;
234- h[(i + 1 ) & 3U ] += rotl (stripe, 27 );
235- }
236- }
233+ auto const * p = static_cast <std::uint8_t const *>(key);
234+
235+ if (ANKERL_UNORDERED_DENSE_UNLIKELY (l >= 32 )) {
236+ do {
237+ auto const stripe0 = r8 (p);
238+ auto const stripe1 = r8 (p + 8 );
239+ auto const stripe2 = r8 (p + 16 );
240+ auto const stripe3 = r8 (p + 24 );
237241
238- for (; l >= 8 ; l -= 8 , p += 8 ) {
239- h[0 ] ^= r4 (p + 0 );
240- h[0 ] *= k;
241- h[1 ] ^= r4 (p + 4 );
242- h[1 ] *= k;
242+ h0 = (stripe0 + h0) * k + rotl (stripe3, 27 );
243+ h1 = (stripe1 + h1 + rotl (stripe0, 27 )) * k;
244+ h2 = (stripe2 + h2 + rotl (stripe1, 27 )) * k;
245+ h3 = (stripe3 + h3 + rotl (stripe2, 27 )) * k;
246+
247+ l -= 32 ;
248+ p += 32 ;
249+ } while (ANKERL_UNORDERED_DENSE_LIKELY (l >= 32 ));
250+ }
251+ while (ANKERL_UNORDERED_DENSE_LIKELY (l >= 8 )) {
252+ h0 = (h0 ^ r4 (p + 0 )) * k;
253+ h1 = (h1 ^ r4 (p + 4 )) * k;
254+ l -= 8 ;
255+ p += 8 ;
243256 }
244257
245258 if (l >= 4 ) {
246- h[ 2 ] ^= r4 (p);
247- h[ 3 ] ^= r4 (p + l - 4 );
259+ h2 ^= r4 (p);
260+ h3 ^= r4 (p + l - 4 );
248261 } else if (l > 0 ) {
249- h[ 2 ] ^= p[0 ];
250- h[ 3 ] ^= p[l / 2 ] | (static_cast <std::uint64_t >(p[l - 1 ]) << 8U );
262+ h2 ^= p[0 ];
263+ h3 ^= p[l >> 1U ] | (static_cast <std::uint64_t >(p[l - 1 ]) << 8U );
251264 }
252265
253- h[ 0 ] += rotl (h[ 2 ] * k, 31 ) ^ (h[ 2 ] >> 31U );
254- h[ 1 ] += rotl (h[ 3 ] * k, 31 ) ^ (h[ 3 ] >> 31U );
255- h[ 0 ] *= k;
256- h[ 0 ] ^= h[ 0 ] >> 31U ;
257- h[ 1 ] += h[ 0 ] ;
266+ h0 += rotl (h2 * k, 31 ) ^ (h2 >> 31U );
267+ h1 += rotl (h3 * k, 31 ) ^ (h3 >> 31U );
268+ h0 *= k;
269+ h0 ^= h0 >> 31U ;
270+ h1 += h0 ;
258271
259- auto x = static_cast <std::uint64_t >(len ) * k;
272+ auto x = static_cast <std::uint64_t >(l ) * k;
260273 x ^= rotl (x, 29 );
261274 x += seed;
262- x ^= h[ 1 ] ;
275+ x ^= h1 ;
263276
264277 x ^= rotl (x, 15 ) ^ rotl (x, 42 );
265278 x *= k;
0 commit comments