Skip to content

Commit c95e162

Browse files
committed
Added succinct counting blocked Bloom using rank instead of select
1 parent 905ab96 commit c95e162

File tree

3 files changed

+483
-44
lines changed

3 files changed

+483
-44
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -666,6 +666,25 @@ struct FilterAPI<SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, Has
666666
}
667667
};
668668

669+
template <typename ItemType, size_t bits_per_item, typename HashFamily>
670+
struct FilterAPI<SuccinctCountingBlockedBloomRankFilter<ItemType, bits_per_item, HashFamily>> {
671+
using Table = SuccinctCountingBlockedBloomRankFilter<ItemType, bits_per_item, HashFamily>;
672+
static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
673+
static void Add(uint64_t key, Table* table) {
674+
table->Add(key);
675+
}
676+
static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
677+
throw std::runtime_error("Unsupported");
678+
// table->AddAll(keys, start, end);
679+
}
680+
static void Remove(uint64_t key, Table * table) {
681+
table->Remove(key);
682+
}
683+
CONTAIN_ATTRIBUTES static bool Contain(uint64_t key, const Table * table) {
684+
return table->Contain(key);
685+
}
686+
};
687+
669688
// assuming that first1,last1 and first2, last2 are sorted,
670689
// this tries to find out how many of first1,last1 can be
671690
// found in first2, last2, this includes duplicates
@@ -942,6 +961,7 @@ int main(int argc, char * argv[]) {
942961
{60, "CountingBloom10 (addall)"},
943962
{61, "SuccCountingBloom10 (addall)"},
944963
{62, "SuccCountBlockBloom10"},
964+
{63, "SuccCountBlockBloomRank10"},
945965

946966
{70, "Xor8-singleheader"},
947967
{80, "Morton"},
@@ -1394,6 +1414,13 @@ int main(int argc, char * argv[]) {
13941414
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, false, true);
13951415
cout << setw(NAME_WIDTH) << names[a] << cf << endl;
13961416
}
1417+
a = 63;
1418+
if (algorithmId == a || (algos.find(a) != algos.end())) {
1419+
auto cf = FilterBenchmark<
1420+
SuccinctCountingBlockedBloomRankFilter<uint64_t, 10, SimpleMixSplit>>(
1421+
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, false, true);
1422+
cout << setw(NAME_WIDTH) << names[a] << cf << endl;
1423+
}
13971424

13981425
a = 70;
13991426
if (algorithmId == a || (algos.find(a) != algos.end())) {

src/bloom/bloom.h

Lines changed: 41 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,8 @@ BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::Info() const {
192192
return ss.str();
193193
}
194194

195+
196+
195197
/***************
196198
* Simple block filter (naive implementation)
197199
***************/
@@ -200,82 +202,81 @@ template <size_t blocksize, int k,
200202
typename HashFamily = ::hashing::TwoIndependentMultiplyShift>
201203
// Simple blocked filter. In the added (+) lines the storage is a flat array
// of 64-bit words (`data`, `arrayLength` entries) instead of the former
// Bucket directory shown in the removed (-) lines.
// NOTE(review): the class owns `data` through a raw pointer and declares a
// destructor, but no copy/move operations are declared here; an implicit copy
// would share the pointer. Confirm instances are never copied.
class SimpleBlockFilter {
202204
private:
203-
// The filter is divided up into Buckets:
204-
using Bucket = uint64_t[blocksize];
205-
206-
const int bucketCount;
207-
208-
Bucket *directory_;
209-
205+
const size_t arrayLength;  // number of 64-bit words in `data`
206+
uint64_t* data;  // word array, allocated in the constructor
210207
HashFamily hasher_;
211-
212208
public:
213209
// Sizes the word array from the argument. The out-of-line definition names
// the parameter `capacity` and allocates (capacity * 10) / 64 + 8 words:
214210
explicit SimpleBlockFilter(const int bits);
215211
~SimpleBlockFilter() noexcept;
216212
void Add(const uint64_t key) noexcept;
217-
218213
bool Find(const uint64_t key) const noexcept;
219-
uint64_t SizeInBytes() const { return sizeof(Bucket) * bucketCount; }
214+
uint64_t SizeInBytes() const {
215+
return arrayLength * 8;  // 8 bytes per uint64_t word
216+
}
220217
};
221218

222219
template <size_t blocksize, int k, typename HashFamily>
223220
SimpleBlockFilter<blocksize, k, HashFamily>::SimpleBlockFilter(
224221
const int capacity)
225-
: bucketCount(capacity * k / (blocksize * 47)), directory_(nullptr),
222+
: arrayLength((capacity * 10) / 64 + 8),
226223
hasher_() {
227-
const size_t alloc_size = bucketCount * sizeof(Bucket);
228-
const int malloc_failed =
229-
posix_memalign(reinterpret_cast<void **>(&directory_), 64, alloc_size);
230-
if (malloc_failed)
231-
throw ::std::bad_alloc();
232-
memset(directory_, 0, alloc_size);
224+
data = new uint64_t[arrayLength]();
233225
}
234226

235227
template <size_t blocksize, int k, typename HashFamily>
236228
SimpleBlockFilter<blocksize, k, HashFamily>::~SimpleBlockFilter() noexcept {
237-
free(directory_);
238-
directory_ = nullptr;
229+
free(data);
230+
data = nullptr;
239231
}
240232

241233
// Rotates n left by c bit positions and returns the result. c is first
// reduced with `mask`, so any rotation count is accepted.
static inline uint64_t rotl64(uint64_t n, unsigned int c) {
242234
// assumes width is a power of 2
243235
const unsigned int mask = (CHAR_BIT * sizeof(n) - 1);  // bit width of n, minus one
244-
// assert ( (c<=mask) &&"rotate by type width or more");
245236
c &= mask;
246237
// (-c) & mask is the complementary shift; it is 0 when c is 0, so neither
// shift ever uses a count equal to the full width.
return (n << c) | (n >> ((-c) & mask));
247238
}
248239

249-
char setbit64(uint64_t *t, uint64_t bit) { return *t |= (1L << (bit & 63)); }
250-
251240
template <size_t blocksize, int k, typename HashFamily>
252241
inline void
253242
SimpleBlockFilter<blocksize, k, HashFamily>::Add(const uint64_t key) noexcept {
254243
const auto hash = hasher_(key);
255-
const uint32_t bucket_idx = reduce(rotl64(hash, 32), bucketCount);
256-
Bucket *bucket = directory_ + bucket_idx;
257-
uint32_t a = (uint32_t)(hash >> 32);
258-
uint32_t b = (uint32_t)hash;
259-
for (int i = 0; i < k; i++) {
260-
setbit64((uint64_t *)bucket + (a % blocksize), a / blocksize);
261-
a += b;
262-
}
244+
const uint32_t idx = reduce(hash, arrayLength);
245+
uint64_t *bucket = data + idx;
246+
// uint32_t a = (uint32_t)(hash ^ (hash >> 32));
247+
248+
// *bucket++ |= (uint64_t) ((1L << (a & 63)) | (1L << ((a >> 6) & 63)));
249+
// *bucket |= (uint64_t) ((1L << ((a >> 12) & 63)) | (1L << ((a >> 18) & 63)));
250+
251+
// *bucket++ |= (uint64_t) (a & (a >> 1));
252+
uint64_t m1 = 1L << hash;
253+
uint64_t m2 = 1L << (hash >> 8);
254+
uint64_t m = m1 | m2;
255+
*bucket |= m;
256+
263257
}
264258
template <size_t blocksize, int k, typename HashFamily>
265259
inline bool
266260
SimpleBlockFilter<blocksize, k, HashFamily>::Find(const uint64_t key) const
267261
noexcept {
268262
const auto hash = hasher_(key);
269-
const uint32_t bucket_idx = reduce(rotl64(hash, 32), bucketCount);
270-
const Bucket *bucket = directory_ + bucket_idx;
271-
uint32_t a = (uint32_t)(hash >> 32);
272-
uint32_t b = (uint32_t)hash;
273-
char ok = 1;
274-
for (int i = 0; i < k; i++) {
275-
ok &= bittest64(((const uint64_t *)bucket) + (a % blocksize), a / blocksize);
276-
a += b;
277-
}
278-
return ok;
263+
const uint32_t idx = reduce(hash, arrayLength);
264+
uint64_t *bucket = data + idx;
265+
// uint32_t a = (uint32_t)(hash ^ (hash >> 32));
266+
// uint64_t m1 = (uint64_t) ((1L << (a & 63)) | (1L << ((a >> 6) & 63)));
267+
// uint64_t m2 = (uint64_t) ((1L << ((a >> 12) & 63)) | (1L << ((a >> 18) & 63)));
268+
uint64_t m1 = 1L << hash;
269+
uint64_t m2 = 1L << (hash >> 8);
270+
uint64_t m = m1 | m2;
271+
return !((m & *bucket) - m);
272+
273+
/*
274+
uint64_t x = *bucket++;
275+
// a += b;
276+
// x = *bucket++;
277+
// y &= (x >> (a & 63)) & (x >> ((a >> 8) & 63));
278+
return y & 1;
279+
*/
279280
}
280281

281282
} // namespace bloomfilter

0 commit comments

Comments
 (0)