Skip to content

Commit 4bd9ee2

Browse files
goldvitaly authored and copybara-github committed
Special implementation for string hash with sizes greater than 64.
AES instructions are used when available. We load blocks of 64 bytes of the string into 4 independently hashed 128-bit vectors. We use AES encrypt and decrypt to mix the bits. The instructions run in parallel. The last <=64 bytes are loaded into 4 (or 2 if the remaining length is <=32) overlapping vectors and additionally encrypted. At the end we mix with another round of encryption, similar to the 33-64 byte case.

```
name                                        CYCLES/op     CYCLES/op     vs base
BM_HASHING_Combine_contiguous_Fleet_hot     479.0m ± 1%   437.0m ± 0%    -8.77% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_Fleet_cold    1.700 ± 2%    1.526 ± 2%    -10.24% (p=0.000 n=30)

arcadia-rome:
BM_HASHING_Combine_contiguous_Fleet_hot     465.0m ± 1%   452.0m ± 1%    -2.80% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_Fleet_cold    4.024 ± 1%    3.676 ± 0%     -8.66% (p=0.000 n=30)
```

ASM analysis (https://godbolt.org/z/5EzEnT46j) shows a saving of 8 cycles for a 128-byte string. We also perform 2x fewer load operations.

PiperOrigin-RevId: 842818076
Change-Id: Ib89f25e0bae2c8ba9ed340350408c27afe6fd222
1 parent 5b1e199 commit 4bd9ee2

File tree

3 files changed

+110
-40
lines changed

3 files changed

+110
-40
lines changed

absl/hash/hash_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1284,7 +1284,7 @@ TEST(SwisstableCollisions, LowEntropyStrings) {
12841284
constexpr char kMinChar = 0;
12851285
constexpr char kMaxChar = 64;
12861286
// These sizes cover the different hashing cases.
1287-
for (size_t size : {8u, 16u, 32u, 64u}) {
1287+
for (size_t size : {8u, 16u, 32u, 64u, 128u}) {
12881288
for (size_t b = 0; b < size - 1; ++b) {
12891289
absl::flat_hash_set<std::string> set;
12901290
std::string s(size, '\0');

absl/hash/internal/hash.cc

Lines changed: 87 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include "absl/base/prefetch.h"
2727
#include "absl/hash/internal/city.h"
2828

29-
3029
#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
3130
#error ABSL_AES_INTERNAL_HAVE_X86_SIMD cannot be directly set
3231
#elif defined(__SSE4_2__) && defined(__AES__)
@@ -46,18 +45,20 @@ namespace hash_internal {
4645

4746
namespace {
4847

49-
uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
50-
uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
51-
uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
52-
uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
53-
uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
54-
55-
uint64_t cs0 = Mix(a ^ kStaticRandomData[1], b ^ current_state);
56-
uint64_t cs1 = Mix(c ^ kStaticRandomData[2], d ^ current_state);
57-
return cs0 ^ cs1;
48+
// Issues a prefetch for the address five cache lines
// (5 * ABSL_CACHELINE_SIZE bytes) past `ptr`. The hashing loops call this at
// the top of each iteration, before the bytes at `ptr` are mixed.
void PrefetchFutureDataToLocalCache(const uint8_t* ptr) {
49+
PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
5850
}
5951

6052
#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
53+
// Folds four 128-bit vectors into one 64-bit value. Two AES rounds (one
// encrypt, one decrypt) combine the inputs, and the two 64-bit lanes of the
// resulting vector are XORed together to produce the return value.
uint64_t Mix4x16Vectors(__m128i a, __m128i b, __m128i c, __m128i d) {
54+
// res128 = encrypt(a + c, d) + decrypt(b - d, a)
55+
auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(a, c), d),
56+
_mm_aesdec_si128(_mm_sub_epi64(b, d), a));
57+
// Extract the low (x64) and high (y64) 64-bit lanes of the 128-bit result.
auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128));
58+
auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1));
59+
return x64 ^ y64;
60+
}
61+
6162
uint64_t LowLevelHash33To64(uint64_t seed, const uint8_t* ptr, size_t len) {
6263
assert(len > 32);
6364
assert(len <= 64);
@@ -84,21 +85,89 @@ uint64_t LowLevelHash33To64(uint64_t seed, const uint8_t* ptr, size_t len) {
8485

8586
// We perform another round of encryption to mix bits between two halves of
8687
// the input.
87-
auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(na, nc), nd),
88-
_mm_aesdec_si128(_mm_sub_epi64(nb, nd), na));
89-
auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128));
90-
auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1));
91-
return x64 ^ y64;
88+
return Mix4x16Vectors(na, nb, nc, nd);
89+
}
90+
91+
// AES-based hash for inputs longer than 64 bytes (this variant is compiled
// when ABSL_AES_INTERNAL_HAVE_X86_SIMD is defined).
//
// `seed` and `len` initialize four 128-bit states. `data` is consumed in
// 64-byte chunks, 16 bytes per state; the remaining tail is covered by reads
// that may overlap bytes already hashed. The four states are finally reduced
// to a 64-bit result by Mix4x16Vectors. Requires len > 64.
[[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t
92+
LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) {
93+
assert(len > 64);
94+
const uint8_t* ptr = static_cast<const uint8_t*>(data);
95+
// Captured before `ptr` and `len` are advanced by the loop below, so this
// always addresses the final 32 bytes of the whole input.
const uint8_t* last_32_ptr = ptr + len - 32;
96+
97+
// If we have more than 64 bytes, we're going to handle chunks of 64
98+
// bytes at a time. We're going to build up four separate hash states
99+
// which we will then hash together. This avoids short dependency chains.
100+
// All four states start from the same vector built from `seed` and `len`.
__m128i state0 =
101+
_mm_set_epi64x(static_cast<int64_t>(seed), static_cast<int64_t>(len));
102+
__m128i state1 = state0;
103+
__m128i state2 = state1;
104+
__m128i state3 = state2;
105+
106+
// Mixing two 128-bit vectors at a time with corresponding states.
107+
// All variables are mixed slightly differently to avoid hash collision
108+
// due to trivial byte rotation.
109+
// We combine state and data with _mm_add_epi64/_mm_sub_epi64 before applying
110+
// AES encryption to make hash function dependent on the order of the blocks.
111+
// See comments in LowLevelHash33To64 for more considerations.
112+
// mix_ab reads 32 bytes at `p` and updates only state0 and state1.
auto mix_ab = [&state0,
113+
&state1](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE {
114+
// i128 a = *p;
115+
// i128 b = *(p + 16);
116+
// state0 = decrypt(state0 + a, state0);
117+
// state1 = decrypt(state1 - b, state1);
118+
auto a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
119+
auto b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
120+
state0 = _mm_aesdec_si128(_mm_add_epi64(state0, a), state0);
121+
state1 = _mm_aesdec_si128(_mm_sub_epi64(state1, b), state1);
122+
};
123+
// mix_cd reads 32 bytes at `p` and updates only state2 and state3.
auto mix_cd = [&state2,
124+
&state3](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE {
125+
// i128 c = *p;
126+
// i128 d = *(p + 16);
127+
// state2 = encrypt(state2 + c, state2);
128+
// state3 = encrypt(state3 - d, state3);
129+
auto c = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
130+
auto d = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
131+
state2 = _mm_aesenc_si128(_mm_add_epi64(state2, c), state2);
132+
state3 = _mm_aesenc_si128(_mm_sub_epi64(state3, d), state3);
133+
};
134+
135+
// Main loop: consume one 64-byte chunk per iteration (first 32 bytes via
// mix_ab, second 32 bytes via mix_cd) until at most 64 bytes remain.
do {
136+
PrefetchFutureDataToLocalCache(ptr);
137+
mix_ab(ptr);
138+
mix_cd(ptr + 32);
139+
140+
ptr += 64;
141+
len -= 64;
142+
} while (len > 64);
143+
144+
// `ptr` now points at the remaining tail of 1 to 64 bytes.
// If more than 32 bytes remain, the first 32 of them go through mix_ab.
// The final 32 bytes of the input always go through mix_cd; that read may
// overlap bytes that were already mixed, and stays in bounds because the
// original length exceeded 64.
145+
if (len > 32) {
146+
mix_ab(ptr);
147+
}
148+
mix_cd(last_32_ptr);
149+
150+
return Mix4x16Vectors(state0, state1, state2, state3);
92151
}
93152
#else
153+
// Mixes 32 bytes starting at `ptr` into `current_state` and returns the new
// state. The bytes are read as four unaligned 64-bit words; each pair is
// combined with a kStaticRandomData constant and `current_state` through
// Mix(), and the two Mix() results are XORed together.
uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
154+
uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
155+
uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
156+
uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
157+
uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
158+
159+
// Both Mix() calls depend only on the loaded words and `current_state`,
// not on each other.
uint64_t cs0 = Mix(a ^ kStaticRandomData[1], b ^ current_state);
160+
uint64_t cs1 = Mix(c ^ kStaticRandomData[2], d ^ current_state);
161+
return cs0 ^ cs1;
162+
}
163+
94164
// Non-AES variant (compiled when ABSL_AES_INTERNAL_HAVE_X86_SIMD is not
// defined) for inputs of 33 to 64 bytes. The initial state is derived from
// `seed`, kStaticRandomData[0] and `len`; the first 32 bytes are mixed in,
// then the last 32 bytes. The two 32-byte windows overlap when len < 64.
uint64_t LowLevelHash33To64(uint64_t seed, const uint8_t* ptr, size_t len) {
95165
assert(len > 32);
96166
assert(len <= 64);
97167
uint64_t current_state = seed ^ kStaticRandomData[0] ^ len;
98168
const uint8_t* last_32_ptr = ptr + len - 32;
99169
return Mix32Bytes(last_32_ptr, Mix32Bytes(ptr, current_state));
100170
}
101-
#endif // ABSL_AES_INTERNAL_HAVE_X86_SIMD
102171

103172
[[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t
104173
LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) {
@@ -114,7 +183,7 @@ LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) {
114183
uint64_t duplicated_state2 = current_state;
115184

116185
do {
117-
PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
186+
PrefetchFutureDataToLocalCache(ptr);
118187

119188
uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
120189
uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
@@ -148,6 +217,7 @@ LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) {
148217
// safely read from `ptr + len - 32`.
149218
return Mix32Bytes(last_32_ptr, current_state);
150219
}
220+
#endif // ABSL_AES_INTERNAL_HAVE_X86_SIMD
151221

152222
[[maybe_unused]] uint64_t LowLevelHashLenGt32(uint64_t seed, const void* data,
153223
size_t len) {

absl/hash/internal/low_level_hash_test.cc

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -380,28 +380,28 @@ TEST(LowLevelHashTest, VerifyGolden) {
380380
0xe4c78173c7ea537b, 0x0bbdc2bcabdb50b1, 0xd9aa134df2d87623,
381381
0x6c4907c9477a9409, 0xc3e418a5dbda52e5, 0x4d24f3e9d0dda93a,
382382
0xcdb565a363dbe45f, 0xa95f228c8ee57478, 0x6b8f00bab5130227,
383-
0x2d05a0f44818b67a, 0xa64b55b071afbbea, 0xa205bfe6c724ce4d,
384-
0x69dd26ca8ac21744, 0xef80e2ff2f6a9bc0, 0xde266c0baa202c20,
385-
0xfa3463080ac74c50, 0x379d968a40125c2b, 0x4cbbd0a7b3c7d648,
386-
0xc92afd93f4c665d2, 0x6e28f5adb7ae38dc, 0x7c689c9c237be35e,
387-
0xaea41b29bd9d0f73, 0x832cef631d77e59f, 0x70cac8e87bc37dd3,
388-
0x8e8c98bbde68e764, 0xd6117aeb3ddedded, 0xd796ab808e766240,
389-
0x8953d0ea1a7d9814, 0xa212eba4281b391c, 0x21a555a8939ce597,
390-
0x809d31660f6d81a8, 0x2356524b20ab400f, 0x5bc611e1e49d0478,
391-
0xba9c065e2f385ce2, 0xb0a0fd12f4e83899, 0x14d076a35b1ff2ca,
392-
0x8acd0bb8cf9a93c0, 0xe62e8ec094039ee4, 0x38a536a7072bdc61,
393-
0xca256297602524f8, 0xfc62ebfb3530caeb, 0x8d8b0c05520569f6,
394-
0xbbaca65cf154c59d, 0x3739b5ada7e338d3, 0xdb9ea31f47365340,
395-
0x410b5c9c1da56755, 0x7e0abc03dbd10283, 0x136f87be70ed442e,
396-
0x6b727d4feddbe1e9, 0x074ebb21183b01df, 0x3fe92185b1985484,
397-
0xc5d8efd3c68305ca, 0xd9bada21b17e272e, 0x64d73133e1360f83,
398-
0xeb8563aa993e21f9, 0xe5e8da50cceab28f, 0x7a6f92eb3223d2f3,
399-
0xbdaf98370ea9b31b, 0x1682a84457f077bc, 0x4abd2d33b6e3be37,
400-
0xb35bc81a7c9d4c04, 0x3e5bde3fb7cfe63d, 0xff3abe6e2ffec974,
401-
0xb8116dd26cf6feec, 0x7a77a6e4ed0cf081, 0xb71eec2d5a184316,
402-
0x6fa932f77b4da817, 0x795f79b33909b2c4, 0x1b8755ef6b5eb34e,
403-
0x2255b72d7d6b2d79, 0xf2bdafafa90bd50a, 0x442a578f02cb1fc8,
404-
0xc25aefe55ecf83db, 0x3114c056f9c5a676,
383+
0x2d05a0f44818b67a, 0xd6bf7d990b5f44cb, 0xa3608bdb4712861a,
384+
0xf20c33e5e355330b, 0xbc86e1b13130180d, 0x0848221b397b839a,
385+
0x17cc0acf44a7e210, 0xc18c6dc584fe0f62, 0x896c7858a59f991d,
386+
0xeab1e6d7d2856ed7, 0x7e4b2d99c23edc51, 0x9aeeeb7fa46e7cf0,
387+
0x161b9f2e3611790f, 0x5f82aae18d971b36, 0x8d0dd9965881e162,
388+
0x56700ea26285895a, 0xcd919c86c29a053e, 0x3e5d589282d9a722,
389+
0x92caee9f48a66604, 0x7e1a2fd9b06f14b0, 0xce1d5293f95b0178,
390+
0x8101361290e70a11, 0x570e3e9c9eafc1c6, 0x77b6241926a7a568,
391+
0x313e5cb34f346699, 0xab8ebeab0514b82b, 0x6e0a43763a310408,
392+
0x761b76ec22b2e440, 0x4238c84a9ec00528, 0xb9ea1f6d4d5552af,
393+
0xd21f8f110b9dc060, 0xb3d3842b69ac3689, 0xd0a88aa1dcf59869,
394+
0xf3f69f637b123403, 0xf5f34b1068cac7da, 0xe69a08d604774abf,
395+
0x57648d3a73332437, 0x9762947f5013d00d, 0x35c5d734a0015922,
396+
0xbee2fe5a104ce209, 0xedb060efa6efca34, 0x5ccf0f4786d97bc2,
397+
0x1ef8ed72e80d7bef, 0x58522deb49c5e30f, 0xde97cd2a6f8bd13b,
398+
0x3fae37c6f9855d09, 0xea99ae786feca261, 0x8c6d1d46670b0943,
399+
0x84658b2a232c7bfb, 0x7058b7a7968de394, 0x0d44fba68e25aa8f,
400+
0xc7f687020f8eb00b, 0xbf9671e1196153d6, 0x1009be891b7f83e7,
401+
0x4f9457fb4aa12865, 0x30a49d9563643b32, 0x0302e2c5b46d5a3a,
402+
0x77553f42fb0bfbf7, 0x26b95e89f0077110, 0x76ce68ebe01191ba,
403+
0x724110fb509e4376, 0xebe74b016b5cfb88, 0x3b0fe11dcf175fc9,
404+
0x20b737b9c0490538, 0x0db21c429b45fd17,
405405
};
406406
#else
407407
constexpr uint64_t kGolden[kNumGoldenOutputs] = {

0 commit comments

Comments
 (0)