
Commit 4e5beaf

ezbr authored and copybara-github committed
Change PrecombineLengthMix to sample data from kStaticRandomData.
The idea is based on Carbon hashing's SampleRandomData(). This also fixes an issue that we found when adding debug asserts to detect cases of long probe sequences in absl hash tables. In that case, the keys were length=29 strings with all the entropy in the 2nd and 3rd bytes, and we saw many collisions due to low entropy in the last byte of the hash. It seems that this was caused by the input seed having low entropy, and doing the length mixing this way solves that issue.

Also:
- Fix a bug in HashtableDebugAccess::GetNumProbes.
- Move the DoubleRange test from flat_hash_set_test.cc to hash_test.cc since it's a test for hash quality.
- Fix using declaration style in hash_test.cc to comply with Google C++ style guide.

PiperOrigin-RevId: 774891965
Change-Id: I22f18a0288e9bfb2734f7b8c6e788b623a1db0f5
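For illustration, here is a minimal standalone sketch of the before/after length mixing. The function names and the abbreviated kTable are stand-ins for this write-up only; the actual change is the PrecombineLengthMix hunk in absl/hash/internal/hash.h below.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Abbreviated stand-in for kStaticRandomData; the real table is longer and
// lives in absl/hash/internal/hash.h.
constexpr uint64_t kTable[] = {0x082e'fa98'ec4e'6c89, 0x4528'21e6'38d0'1377};

// Old scheme: the length only ever lands in the fourth byte of the state
// (bits 24-31), so a low-entropy seed stays low-entropy in most bits.
uint64_t OldLengthMix(uint64_t state, size_t len) {
  return state + (uint64_t{len} << 24);
}

// New scheme: use the length to select an 8-byte window of static random
// data and XOR it in, so every bit of the state gets perturbed.
// Caller must guarantee len + 8 <= sizeof(kTable).
uint64_t NewLengthMix(uint64_t state, size_t len) {
  uint64_t data;
  std::memcpy(&data, reinterpret_cast<const unsigned char*>(kTable) + len,
              sizeof(data));
  return state ^ data;
}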
1 parent 2b320cb · commit 4e5beaf

5 files changed (+65, -39 lines)

absl/container/flat_hash_set_test.cc

Lines changed: 0 additions & 15 deletions
@@ -397,21 +397,6 @@ TEST(FlatHashSet, IsDefaultHash) {
             false);
 }
 
-// Test that we don't cause excessive collisions on the hash table for
-// doubles in the range [-1024, 1024]. See cl/773069881 for more information.
-TEST(FlatHashSet, DoubleRange) {
-  using absl::container_internal::hashtable_debug_internal::
-      HashtableDebugAccess;
-  absl::flat_hash_set<double> set;
-  for (double t = -1024.0; t < 1024.0; t += 1.0) {
-    set.insert(t);
-  }
-  for (double t = -1024.0; t < 1024.0; t += 1.0) {
-    ASSERT_LT(HashtableDebugAccess<decltype(set)>::GetNumProbes(set, t), 64)
-        << t;
-  }
-}
-
 }  // namespace
 }  // namespace container_internal
 ABSL_NAMESPACE_END

absl/container/internal/raw_hash_set.h

Lines changed: 1 addition & 1 deletion
@@ -3713,7 +3713,7 @@ struct HashtableDebugAccess<Set, absl::void_t<typename Set::raw_hash_set>> {
 
   static size_t GetNumProbes(const Set& set,
                              const typename Set::key_type& key) {
-    if (set.is_soo()) return 0;
+    if (set.is_small()) return 0;
     size_t num_probes = 0;
     const size_t hash = set.hash_of(key);
     auto seq = probe(set.common(), hash);

absl/hash/hash_test.cc

Lines changed: 50 additions & 4 deletions
@@ -58,17 +58,17 @@
 
 namespace {
 
+using ::absl::Hash;
+using ::absl::container_internal::hashtable_debug_internal::
+    HashtableDebugAccess;
+using ::absl::hash_internal::SpyHashState;
 using ::absl::hash_test_internal::is_hashable;
 using ::absl::hash_test_internal::TypeErasedContainer;
 using ::absl::hash_test_internal::TypeErasedValue;
-using ::testing::SizeIs;
 
 template <typename T>
 using TypeErasedVector = TypeErasedContainer<std::vector<T>>;
 
-using absl::Hash;
-using absl::hash_internal::SpyHashState;
-
 template <typename T>
 class HashValueIntTest : public testing::Test {
 };
@@ -1239,4 +1239,50 @@ TEST(HashOf, DoubleSignCollision) {
   EXPECT_NE(absl::HashOf(-1.0), absl::HashOf(1.0));
 }
 
+// Test that we don't cause excessive collisions on the hash table for
+// doubles in the range [-1024, 1024]. See cl/773069881 for more information.
+TEST(SwisstableCollisions, DoubleRange) {
+#ifdef GOOGLE_UNSUPPORTED_OS_LOONIX
+  // TODO(b/424834054): make this test pass on Loonix.
+  GTEST_SKIP() << "Test fails on Loonix.";
+#endif
+
+  absl::flat_hash_set<double> set;
+  for (double t = -1024.0; t < 1024.0; t += 1.0) {
+    set.insert(t);
+    ASSERT_LT(HashtableDebugAccess<decltype(set)>::GetNumProbes(set, t), 64)
+        << t;
+  }
+}
+
+// Test that for each pair of adjacent bytes in a string, if there's only
+// entropy in those two bytes, then we don't have excessive collisions.
+TEST(SwisstableCollisions, LowEntropyStrings) {
+  if (sizeof(size_t) < 8) {
+    // TODO(b/424834054): make this test pass on 32-bit platforms. We need to
+    // make 32-bit Mix() stronger.
+    GTEST_SKIP() << "Test fails on 32-bit platforms";
+  }
+
+  constexpr char kMinChar = 0;
+  constexpr char kMaxChar = 64;
+  // These sizes cover the different hashing cases.
+  for (size_t size : {8u, 16u, 32u, 64u}) {
+    for (size_t b = 0; b < size - 1; ++b) {
+      absl::flat_hash_set<std::string> set;
+      std::string s(size, '\0');
+      for (char c1 = kMinChar; c1 < kMaxChar; ++c1) {
+        for (char c2 = kMinChar; c2 < kMaxChar; ++c2) {
+          s[b] = c1;
+          s[b + 1] = c2;
+          set.insert(s);
+          ASSERT_LT(HashtableDebugAccess<decltype(set)>::GetNumProbes(set, s),
+                    64)
+              << size << " " << b;
+        }
+      }
+    }
+  }
+}
+
 }  // namespace

absl/hash/internal/hash.cc

Lines changed: 4 additions & 4 deletions
@@ -100,10 +100,10 @@ uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
 ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t HashBlockOn32Bit(
     const unsigned char* data, size_t len, uint64_t state) {
   // TODO(b/417141985): expose and use CityHash32WithSeed.
-  return Mix(
-      PrecombineLengthMix(state, len) ^
-          hash_internal::CityHash32(reinterpret_cast<const char*>(data), len),
-      kMul);
+  // Note: we can't use PrecombineLengthMix here because len can be up to 1024.
+  return Mix((state + len) ^ hash_internal::CityHash32(
+                                 reinterpret_cast<const char*>(data), len),
+             kMul);
 }
 
 ABSL_ATTRIBUTE_NOINLINE uint64_t

absl/hash/internal/hash.h

Lines changed: 10 additions & 15 deletions
@@ -935,21 +935,6 @@ hash_range_or_bytes(H hash_state, const T* data, size_t size) {
                             hash_internal::WeaklyMixedInteger{size});
 }
 
-// Extremely weak mixture of length that is added to the state before combining
-// the data. It is used only for small strings.
-inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) {
-  // The length is always one byte here. We place it to 4th byte for the
-  // following reasons:
-  // 1. 4th byte is unused for very short strings 0-3 bytes.
-  // 2. 4th byte is duplicated for 4 bytes string.
-  // 3. 4th byte is in the middle and mixed well for 5-8 bytes strings.
-  //
-  // There were experiments with adding just `len` here.
-  // Also seems have slightly better performance overall, that gives collisions
-  // for small strings.
-  return state + (uint64_t{len} << 24);
-}
-
 inline constexpr uint64_t kMul = uint64_t{0xdcb22ca68cb134ed};
 
 // Random data taken from the hexadecimal digits of Pi's fractional component.
@@ -959,6 +944,16 @@ ABSL_CACHELINE_ALIGNED inline constexpr uint64_t kStaticRandomData[] = {
     0x082e'fa98'ec4e'6c89, 0x4528'21e6'38d0'1377,
 };
 
+// Extremely weak mixture of length that is mixed into the state before
+// combining the data. It is used only for small strings. This also ensures that
+// we have high entropy in all bits of the state.
+inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) {
+  ABSL_ASSUME(len + sizeof(uint64_t) <= sizeof(kStaticRandomData));
+  uint64_t data = absl::base_internal::UnalignedLoad64(
+      reinterpret_cast<const unsigned char*>(&kStaticRandomData[0]) + len);
+  return state ^ data;
+}
+
 ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) {
   // For 32 bit platforms we are trying to use all 64 lower bits.
   if constexpr (sizeof(size_t) < 8) {
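As a quick sanity check of the window arithmetic behind the new ABSL_ASSUME, here is a sketch assuming kStaticRandomData holds the five Pi-derived constants; only the last two are visible in the hunk above, so the first three entries below are an assumption, not verified against the full header.

#include <cstddef>
#include <cstdint>

// Assumed full contents of kStaticRandomData (hex digits of Pi's fractional
// part); only the last two constants appear in the diff above.
constexpr uint64_t kStaticRandomData[] = {
    0x243f'6a88'85a3'08d3, 0x1319'8a2e'0370'7344, 0xa409'3822'299f'31d0,
    0x082e'fa98'ec4e'6c89, 0x4528'21e6'38d0'1377,
};

// PrecombineLengthMix reads 8 bytes starting at byte offset `len`, so the
// ABSL_ASSUME above is equivalent to len <= 32 for a 40-byte table.
constexpr bool WindowInBounds(size_t len) {
  return len + sizeof(uint64_t) <= sizeof(kStaticRandomData);
}
static_assert(WindowInBounds(0) && WindowInBounds(32),
              "small-string lengths keep the 8-byte window in bounds");
static_assert(!WindowInBounds(33),
              "anything longer must take a different mixing path");

Under that assumption, the hash.cc change above is consistent: HashBlockOn32Bit handles blocks of up to 1024 bytes, far beyond the table, which is why it mixes the length in directly instead of calling PrecombineLengthMix.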
