Skip to content

Commit c9a5876

Browse files
committed
Speed up Bloom filter construction (only for the 100M case so far)
1 parent 5cb6749 commit c9a5876

File tree

2 files changed

+71
-1
lines changed

2 files changed

+71
-1
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ struct FilterAPI<BloomFilter<ItemType, bits_per_item, HashFamily>> {
338338
table->Add(key);
339339
}
340340
static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
341-
throw std::runtime_error("Unsupported");
341+
table->AddAll(keys, start, end);
342342
}
343343

344344
CONTAIN_ATTRIBUTES
@@ -807,6 +807,28 @@ int main(int argc, char * argv[]) {
807807
}
808808

809809

810+
if (algorithmId == 37 || algorithmId < 0) {
811+
auto cf = FilterBenchmark<
812+
BloomFilter<uint64_t, 8, SimpleMixSplit>>(
813+
add_count, to_add, to_lookup, seed, true);
814+
cout << setw(NAME_WIDTH) << "Bloom8" << cf << endl;
815+
}
816+
817+
if (algorithmId == 38 || algorithmId < 0) {
818+
auto cf = FilterBenchmark<
819+
BloomFilter<uint64_t, 12, SimpleMixSplit>>(
820+
add_count, to_add, to_lookup, seed, true);
821+
cout << setw(NAME_WIDTH) << "Bloom12" << cf << endl;
822+
}
823+
824+
if (algorithmId == 39 || algorithmId < 0) {
825+
auto cf = FilterBenchmark<
826+
BloomFilter<uint64_t, 16, SimpleMixSplit>>(
827+
add_count, to_add, to_lookup, seed, true);
828+
cout << setw(NAME_WIDTH) << "Bloom16" << cf << endl;
829+
}
830+
831+
810832
// broken algorithms (don't always find all keys)
811833
/*
812834
if (algorithmId == 25) {

src/bloom.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ class BloomFilter {
6060
// Add an item to the filter.
6161
Status Add(const ItemType &item);
6262

63+
// Add multiple items to the filter.
64+
Status AddAll(const vector<ItemType> data, const size_t start, const size_t end);
65+
6366
// Report if the item is inserted, with false positive rate.
6467
Status Contain(const ItemType &item) const;
6568

@@ -92,6 +95,51 @@ Status BloomFilter<ItemType, bits_per_item, HashFamily, k>::Add(
9295
return Ok;
9396
}
9497

98+
// Each staging buffer used by AddAll holds BLOCK_LEN (2^BLOCK_SHIFT) 64-bit
// slots; the same shift maps a word index of the filter to its buffer number.
#define BLOCK_SHIFT 18
#define BLOCK_LEN (1 << BLOCK_SHIFT)

// Flushes the staged updates of one block into the filter's word array.
//
// tmp   - base of all staging buffers; block `block` starts at slot
//         block * BLOCK_LEN.
// block - number of the staging buffer to flush.
// len   - number of used slots in that buffer. Slots come in pairs:
//         (word index into data, bit mask to OR into that word).
// data  - the filter's 64-bit word array, modified in place.
//
// Declared inline because this is a non-template function defined in a header;
// without it, including the header from two translation units breaks the link.
inline void applyBlock(uint64_t* tmp, int block, int len, uint64_t *data) {
  // Compute the offset in size_t so a large block number cannot overflow int.
  const uint64_t* entries = tmp + (static_cast<size_t>(block) << BLOCK_SHIFT);
  // `i + 1 < len` skips a trailing unpaired slot instead of reading past it.
  for (int i = 0; i + 1 < len; i += 2) {
    // Keep the index unsigned and full width rather than narrowing it to int.
    const size_t index = static_cast<size_t>(entries[i]);
    data[index] |= entries[i + 1];
  }
}
108+
109+
template <typename ItemType, size_t bits_per_item,
110+
typename HashFamily, int k>
111+
Status BloomFilter<ItemType, bits_per_item, HashFamily, k>::AddAll(
112+
const vector<ItemType> keys, const size_t start, const size_t end) {
113+
int blocks = 1 + arrayLength / BLOCK_LEN;
114+
uint64_t* tmp = new uint64_t[blocks * BLOCK_LEN];
115+
int* tmpLen = new int[blocks]();
116+
for(size_t i = start; i < end; i++) {
117+
uint64_t key = keys[i];
118+
uint64_t hash = hasher(key);
119+
uint32_t a = (uint32_t) (hash >> 32);
120+
uint32_t bb = (uint32_t) hash;
121+
for (int j = 0; j < k; j++) {
122+
int index = reduce(a, this->arrayLength);
123+
int block = index >> BLOCK_SHIFT;
124+
int len = tmpLen[block];
125+
tmp[(block << BLOCK_SHIFT) + len] = index;
126+
tmp[(block << BLOCK_SHIFT) + len + 1] = getBit(a);
127+
tmpLen[block] = len + 2;
128+
if (len + 2 == BLOCK_LEN) {
129+
applyBlock(tmp, block, len + 2, data);
130+
tmpLen[block] = 0;
131+
}
132+
a += bb;
133+
}
134+
}
135+
for (int block = 0; block < blocks; block++) {
136+
applyBlock(tmp, block, tmpLen[block], data);
137+
}
138+
delete[] tmp;
139+
delete[] tmpLen;
140+
return Ok;
141+
}
142+
95143
template <typename ItemType, size_t bits_per_item,
96144
typename HashFamily, int k>
97145
Status BloomFilter<ItemType, bits_per_item, HashFamily, k>::Contain(

0 commit comments

Comments
 (0)