
Commit 60b607b

goldvitaly authored and copybara-github committed
CRC32 version of CombineContiguous for length <= 32.
For length in [17, 32] we compute two chains of dependent CRC32 operations to get good entropy in the resulting two 32-bit numbers:

1. x := CRC32(CRC32(state, A), D)
2. y := CRC32(CRC32(bswap(state), C), B)

On ARM: CRC32 has 2 cycles of latency and a throughput of 1, so the computations are pipelined without any wait.
On x86: CRC32 has 3 cycles of latency and a throughput of 1, so there is 1 extra cycle of wait, but we can do the `cmp` in parallel.

At the end we multiply (mul - x) * (y - mul). `mul` is added to fill the upper 32 bits of the CRC results with good entropy bits: `mul = rotr(kMul, len)`.

We also mix the length differently:
1. `state + 8 * len` (a single `lea` instruction); one or two later CRCs shuffle these bits well into the low 32 bits.
2. `rotr(kMul, len)` fills the high 32 bits before the multiplication in `Mix`. This avoids reading from `kStaticRandomData`.

For smaller strings we aggressively minimize binary size and register pressure: a CRC instruction fused with the memory read is used. llvm-mca reports 1 cycle lower latency compared to a separate `mov` + `crc`.

ASM analysis https://godbolt.org/z/e1xrKzhdc:
1. 100+ bytes of binary size saved (per inline instance).
2. 25+ instructions saved.
3. 2 registers are left unused (r8 and r9).

Latencies in isolation, without accounting for the length comparison, are mixed:
1. Latency for 8 bytes in isolation is 1 cycle better: https://godbolt.org/z/zc39eM3K9
2. Latency for 1-3 bytes in isolation is 2 cycles better: https://godbolt.org/z/qMKfbv438
3. Latency for 16 bytes in isolation is 3 cycles worse: https://godbolt.org/z/vcqr8oGv3
4. Latency for 32 bytes in isolation is 5 cycles worse: https://godbolt.org/z/nEPP5jP58

PiperOrigin-RevId: 850659551
Change-Id: I02a2434f2d98473b099c171ef1c56adffa821c60
1 parent 7b40ebf commit 60b607b
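
For reference, a minimal standalone sketch of the [17, 32]-byte path described in the message above, specialized to the x86-64 SSE4.2 intrinsic (the committed code dispatches between x86 and ARM via the ABSL_HASH_INTERNAL_CRC32_* macros). Load64, Mix, HashShort17to32 and `salt` are illustrative stand-ins for Read8, Mix, CombineContiguousImpl and kMul in absl/hash/internal/hash.h; this is a sketch, not the committed implementation. Assumes GCC/Clang (for unsigned __int128 and __builtin_bswap64) and -std=c++20 -msse4.2.

#include <bit>          // std::rotr (C++20)
#include <cstddef>      // size_t
#include <cstdint>
#include <cstring>      // std::memcpy
#include <nmmintrin.h>  // _mm_crc32_u64

// Unaligned 8-byte load; stand-in for Read8.
inline uint64_t Load64(const unsigned char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// 128-bit multiply folded to 64 bits; same shape as Mix in hash.h.
inline uint64_t Mix(uint64_t lhs, uint64_t rhs) {
  unsigned __int128 m = static_cast<unsigned __int128>(lhs) * rhs;
  return static_cast<uint64_t>(m >> 64) ^ static_cast<uint64_t>(m);
}

// Sketch of the 17..32-byte case; `salt` plays the role of kMul.
uint64_t HashShort17to32(uint64_t state, const unsigned char* p, size_t len,
                         uint64_t salt) {
  // Length is mixed twice: rotated into the multiplier's high bits here, and
  // added into the seed of the first chain below (a single lea on x86).
  uint64_t mul = std::rotr(salt, static_cast<int>(len));
  // Chain x: seed state + 8 * len, then bytes [0, 8) and [len - 16, len - 8).
  uint64_t x = _mm_crc32_u64(state + 8 * len, Load64(p));
  x = _mm_crc32_u64(x, Load64(p + len - 16));
  // Chain y: seed bswap(state) so it sees the other 32 state bits, then bytes
  // [len - 8, len) and [8, 16). Independent of chain x, so the two chains
  // pipeline on both x86 and ARM.
  uint64_t y = _mm_crc32_u64(__builtin_bswap64(state), Load64(p + len - 8));
  y = _mm_crc32_u64(y, Load64(p + 8));
  // Each CRC32 result has zero high 32 bits; `mul` refills them and the
  // subtractions keep both multiplicands of Mix non-zero.
  return Mix(mul - x, y - mul);
}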

File tree

1 file changed: +108 −3 lines changed

absl/hash/internal/hash.h

Lines changed: 108 additions & 3 deletions
@@ -74,6 +74,7 @@
 #include <vector>
 
 #include "absl/base/attributes.h"
+#include "absl/base/internal/endian.h"
 #include "absl/base/internal/unaligned_access.h"
 #include "absl/base/optimization.h"
 #include "absl/base/port.h"
@@ -93,6 +94,38 @@
 #include <filesystem>  // NOLINT
 #endif
 
+// 32-bit builds with SSE 4.2 do not have _mm_crc32_u64, so the
+// __x86_64__ condition is necessary.
+#if defined(__SSE4_2__) && defined(__x86_64__)
+
+#include <x86intrin.h>
+#define ABSL_HASH_INTERNAL_HAS_CRC32
+#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
+#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
+#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
+
+#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__)
+
+// MSVC AVX (/arch:AVX) implies SSE 4.2.
+#include <intrin.h>
+#define ABSL_HASH_INTERNAL_HAS_CRC32
+#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
+#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
+#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
+
+#elif defined(__ARM_FEATURE_CRC32)
+
+#include <arm_acle.h>
+#define ABSL_HASH_INTERNAL_HAS_CRC32
+// Casting to uint32_t to be consistent with x86 intrinsic (_mm_crc32_u64
+// accepts crc as 64 bit integer).
+#define ABSL_HASH_INTERNAL_CRC32_U64(crc, data) \
+  __crc32cd(static_cast<uint32_t>(crc), data)
+#define ABSL_HASH_INTERNAL_CRC32_U32 __crc32cw
+#define ABSL_HASH_INTERNAL_CRC32_U8 __crc32cb
+
+#endif
+
 namespace absl {
 ABSL_NAMESPACE_BEGIN
 
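
As a side note (not part of the diff), a tiny standalone check, assuming an x86-64 build with -msse4.2, of the property the implementation below relies on: a 64-bit CRC32 step returns its result in the low 32 bits only, which is why the high bits are later refilled from `mul` before mixing.

#include <cassert>
#include <cstdint>
#include <nmmintrin.h>  // _mm_crc32_u64

int main() {
  uint64_t crc = _mm_crc32_u64(0x0123456789abcdefULL, 0xfedcba9876543210ULL);
  assert((crc >> 32) == 0);  // CRC32-C leaves the upper 32 bits zero.
  return 0;
}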
@@ -965,18 +998,20 @@ ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) {
   return Uint128High64(m) ^ Uint128Low64(m);
 }
 
-// Reads 8 bytes from p.
-inline uint64_t Read8(const unsigned char* p) {
 // Suppress erroneous array bounds errors on GCC.
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
+inline uint32_t Read4(const unsigned char* p) {
+  return absl::base_internal::UnalignedLoad32(p);
+}
+inline uint64_t Read8(const unsigned char* p) {
   return absl::base_internal::UnalignedLoad64(p);
+}
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic pop
 #endif
-}
 
 // Reads 9 to 16 bytes from p.
 // The first 8 bytes are in .first, and the rest of the bytes are in .second
@@ -1096,6 +1131,70 @@ inline uint64_t CombineContiguousImpl(
   return CombineLargeContiguousImplOn32BitLengthGt8(state, first, len);
 }
 
+#ifdef ABSL_HASH_INTERNAL_HAS_CRC32
+inline uint64_t CombineContiguousImpl(
+    uint64_t state, const unsigned char* first, size_t len,
+    std::integral_constant<int, 8> /* sizeof_size_t */) {
+  if (ABSL_PREDICT_FALSE(len > 32)) {
+    return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
+  }
+  // `mul` is the salt that is used for final mixing. It is important to fill
+  // high 32 bits because CRC wipes out high 32 bits.
+  // `rotr` is important to mix `len` into high 32 bits.
+  uint64_t mul = absl::rotr(kMul, static_cast<int>(len));
+  // Only low 32 bits of each uint64_t are used in CRC32 so we use gbswap_64 to
+  // move high 32 bits to low 32 bits. It has slightly smaller binary size than
+  // `>> 32`. `state + 8 * len` is a single instruction on both x86 and ARM, so
+  // we use it to better mix length. Although only the low 32 bits of the pair
+  // elements are used, we use pair<uint64_t, uint64_t> for better generated
+  // code.
+  std::pair<uint64_t, uint64_t> crcs = {state + 8 * len,
+                                        absl::gbswap_64(state)};
+
+  // All CRC operations here directly read bytes from the memory.
+  // Single fused instructions are used, like `crc32 rcx, qword ptr [rsi]`.
+  // On x86, llvm-mca reports latency `R + 2` for such fused instructions, while
+  // `R + 3` for two separate `mov` + `crc` instructions. `R` is the latency of
+  // reading the memory. Fused instructions also reduce register pressure
+  // allowing surrounding code to be more efficient when this code is inlined.
+  if (len > 8) {
+    crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first)),
+            ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + len - 8))};
+    if (len > 16) {
+      // We compute the second round of dependent CRC32 operations.
+      crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first + len - 16)),
+              ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + 8))};
+    }
+  } else {
+    if (len >= 4) {
+      // We use CRC for 4 bytes to benefit from the fused instruction and better
+      // hash quality.
+      // Using `xor` or `add` may reduce latency for this case, but would
+      // require more registers, more instructions and will have worse hash
+      // quality.
+      crcs = {ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.first),
+                                           Read4(first)),
+              ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.second),
+                                           Read4(first + len - 4))};
+    } else if (len >= 1) {
+      // We mix three bytes all into different output registers.
+      // This way, we do not need shifting of these bytes (so they don't overlap
+      // with each other).
+      crcs = {ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.first),
+                                          first[0]),
+              ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.second),
+                                          first[len - 1])};
+      // Middle byte is mixed weaker. It is a new byte only for len == 3.
+      // Mixing is independent from CRC operations so it is scheduled ASAP.
+      mul += first[len / 2];
+    }
+  }
+  // `mul` is mixed into both sides of `Mix` to guarantee non-zero values for
+  // both multiplicands. Using Mix instead of just multiplication here improves
+  // hash quality, especially for short strings.
+  return Mix(mul - crcs.first, crcs.second - mul);
+}
+#else
 inline uint64_t CombineContiguousImpl(
     uint64_t state, const unsigned char* first, size_t len,
     std::integral_constant<int, 8> /* sizeof_size_t */) {
@@ -1118,6 +1217,7 @@ inline uint64_t CombineContiguousImpl(
   // to calling CombineLargeContiguousImpl once with 2 * PiecewiseChunkSize().
   return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
 }
+#endif  // ABSL_HASH_INTERNAL_HAS_CRC32
 
 #if defined(ABSL_INTERNAL_LEGACY_HASH_NAMESPACE) && \
     ABSL_META_INTERNAL_STD_HASH_SFINAE_FRIENDLY_
@@ -1452,4 +1552,9 @@ H PiecewiseCombiner::finalize(H state) {
 ABSL_NAMESPACE_END
 }  // namespace absl
 
+#undef ABSL_HASH_INTERNAL_HAS_CRC32
+#undef ABSL_HASH_INTERNAL_CRC32_U64
+#undef ABSL_HASH_INTERNAL_CRC32_U32
+#undef ABSL_HASH_INTERNAL_CRC32_U8
+
 #endif  // ABSL_HASH_INTERNAL_HASH_H_
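
Usage stays unchanged; for example, hashing a short string through absl::Hash exercises the new <= 32-byte path on targets where the CRC32 macros above are defined (an illustrative example, not part of the commit):

#include <cstddef>
#include <string>
#include "absl/hash/hash.h"

// Strings of 32 bytes or fewer go through the CRC32-based
// CombineContiguousImpl on CRC32-capable targets.
size_t HashShortString(const std::string& s) {
  return absl::Hash<std::string>{}(s);
}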
