Skip to content

Commit 7b47a42

Browse files
Merge pull request ClickHouse#79800 from dkratunov/faster-blooms
bloomfilter: use libdivide to compute the bit location
2 parents 8220d89 + 2b1cd06 commit 7b47a42

File tree

2 files changed

+28
-9
lines changed

2 files changed

+28
-9
lines changed

src/Interpreters/BloomFilter.cpp

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
#include <DataTypes/DataTypeArray.h>
77
#include <DataTypes/DataTypeNullable.h>
88
#include <DataTypes/DataTypeLowCardinality.h>
9+
#include <libdivide.h>
910

1011

1112
namespace DB
@@ -39,7 +40,8 @@ BloomFilter::BloomFilter(const BloomFilterParameters & params)
3940
}
4041

4142
BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_)
42-
: size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0)
43+
: size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)),
44+
modulus(8 * size_), divider(modulus), filter(words, 0)
4345
{
4446
chassert(size != 0);
4547
chassert(hashes != 0);
@@ -49,6 +51,8 @@ void BloomFilter::resize(size_t size_)
4951
{
5052
size = size_;
5153
words = ((size + sizeof(UnderType) - 1) / sizeof(UnderType));
54+
modulus = 8 * size;
55+
divider = libdivide::divider<size_t, libdivide::BRANCHFREE>(modulus);
5256
filter.resize(words);
5357
}
5458

@@ -57,11 +61,15 @@ bool BloomFilter::find(const char * data, size_t len)
5761
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
5862
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
5963

64+
size_t acc = hash1;
6065
for (size_t i = 0; i < hashes; ++i)
6166
{
62-
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
63-
if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))))
67+
/// It accumulates in the loop as follows:
68+
/// pos = (hash1 + hash2 * i + i * i) % (8 * size)
69+
size_t pos = fastMod(acc + i * i);
70+
if (!(filter[pos / word_bits] & (1ULL << (pos % word_bits))))
6471
return false;
72+
acc += hash2;
6573
}
6674
return true;
6775
}
@@ -71,10 +79,14 @@ void BloomFilter::add(const char * data, size_t len)
7179
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
7280
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
7381

82+
size_t acc = hash1;
7483
for (size_t i = 0; i < hashes; ++i)
7584
{
76-
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
77-
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
85+
/// It accumulates in the loop as follows:
86+
/// pos = (hash1 + hash2 * i + i * i) % (8 * size)
87+
size_t pos = fastMod(acc + i * i);
88+
filter[pos / word_bits] |= (1ULL << (pos % word_bits));
89+
acc += hash2;
7890
}
7991
}
8092

@@ -116,14 +128,14 @@ bool operator== (const BloomFilter & a, const BloomFilter & b)
116128

117129
void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
118130
{
119-
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
120-
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
131+
size_t pos = fastMod(CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)));
132+
filter[pos / word_bits] |= (1ULL << (pos % word_bits));
121133
}
122134

123135
bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
124136
{
125-
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
126-
return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))));
137+
size_t pos = fastMod(CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)));
138+
return bool(filter[pos / word_bits] & (1ULL << (pos % word_bits)));
127139
}
128140

129141
DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type)

src/Interpreters/BloomFilter.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <base/types.h>
44
#include <Columns/IColumn_fwd.h>
55
#include <DataTypes/IDataType.h>
6+
#include <libdivide.h>
67

78
#include <vector>
89

@@ -57,12 +58,18 @@ class BloomFilter
5758
friend bool operator== (const BloomFilter & a, const BloomFilter & b);
5859
private:
5960

61+
static constexpr size_t word_bits = 8 * sizeof(UnderType);
62+
6063
size_t size;
6164
size_t hashes;
6265
size_t seed;
6366
size_t words;
67+
size_t modulus; /// 8 * size, cached for fast modulo.
68+
libdivide::divider<size_t, libdivide::BRANCHFREE> divider; /// Divider for fast modulo by modulus.
6469
Container filter;
6570

71+
inline size_t fastMod(size_t value) const { return value - (value / divider) * modulus; }
72+
6673
public:
6774
static ColumnPtr getPrimitiveColumn(const ColumnPtr & column);
6875
static DataTypePtr getPrimitiveType(const DataTypePtr & data_type);

0 commit comments

Comments
 (0)