Skip to content

Commit d026b8a

Browse files
committed
New: succinct counting blocked Bloom filter
1 parent 610b5f6 commit d026b8a

File tree

2 files changed

+308
-1
lines changed

2 files changed

+308
-1
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,25 @@ struct FilterAPI<SuccinctCountingBloomFilter<ItemType, bits_per_item, branchless
547547
}
548548
};
549549

550+
template <typename ItemType, size_t bits_per_item, typename HashFamily>
551+
struct FilterAPI<SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily>> {
552+
using Table = SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily>;
553+
static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
554+
static void Add(uint64_t key, Table* table) {
555+
table->Add(key);
556+
}
557+
static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
558+
throw std::runtime_error("Unsupported");
559+
// table->AddAll(keys, start, end);
560+
}
561+
static void Remove(uint64_t key, Table * table) {
562+
table->Remove(key);
563+
}
564+
CONTAIN_ATTRIBUTES static bool Contain(uint64_t key, const Table * table) {
565+
return table->Contain(key);
566+
}
567+
};
568+
550569
// assuming that first1,last1 and first2, last2 are sorted,
551570
// this tries to find out how many of first1,last1 can be
552571
// found in first2, last2, this includes duplicates
@@ -822,6 +841,7 @@ int main(int argc, char * argv[]) {
822841
// Counting Bloom
823842
{60, "CountingBloom10 (addall)"},
824843
{61, "SuccCountingBloom10 (addall)"},
844+
{62, "SuccCountBlockBloom10"},
825845
// Sort
826846
{100, "Sort"},
827847
};
@@ -1263,6 +1283,13 @@ int main(int argc, char * argv[]) {
12631283
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true, true);
12641284
cout << setw(NAME_WIDTH) << names[a] << cf << endl;
12651285
}
1286+
a = 62;
1287+
if (algorithmId == a || algorithmId < 0 || (algos.find(a) != algos.end())) {
1288+
auto cf = FilterBenchmark<
1289+
SuccinctCountingBlockedBloomFilter<uint64_t, 10, SimpleMixSplit>>(
1290+
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, false, true);
1291+
cout << setw(NAME_WIDTH) << names[a] << cf << endl;
1292+
}
12661293

12671294
// Sort ----------------------------------------------------------
12681295
a = 100;

src/counting_bloom.h

Lines changed: 281 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ inline uint32_t reduce(uint32_t hash, uint32_t n) {
6565
return (uint32_t)(((uint64_t)hash * n) >> 32);
6666
}
6767

68+
// CountingBloomFilter --------------------------------------------------------------------------------------
69+
6870
template <typename ItemType, size_t bits_per_item, bool branchless,
6971
typename HashFamily = TwoIndependentMultiplyShift,
7072
int k = (int)((double)bits_per_item * 0.693147180559945 + 0.5)>
@@ -183,7 +185,7 @@ Status CountingBloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::
183185
return Ok;
184186
}
185187

186-
// --------------------------------------------------------------------------------------
188+
// SuccinctCountingBloomFilter --------------------------------------------------------------------------------------
187189

188190
// #define VERIFY_COUNT
189191

@@ -467,5 +469,283 @@ Status SuccinctCountingBloomFilter<ItemType, bits_per_item, branchless, HashFami
467469
return Ok;
468470
}
469471

472+
// SuccinctCountingBlockedBloomFilter --------------------------------------------------------------------------------------
473+
474+
475+
// #define VERIFY_COUNT
476+
477+
// Counting variant of a blocked Bloom filter. Each bucket is 512 bits
// (eight 64-bit words) of presence bits in `data`; every data word has a
// companion word in `counts` that stores the per-bit counters, and words whose
// counters no longer fit inline are moved to 8-word slots in `overflow`.
//
// NOTE(review): the class owns raw buffers released in the destructor and
// declares no copy/move operations, so copying an instance would free the
// same buffers twice -- avoid copies.
template <typename ItemType, size_t bits_per_item, typename HashFamily,
          int k = (int)((double)bits_per_item * 0.693147180559945 + 0.5)>
class SuccinctCountingBlockedBloomFilter {
 private:
  // Number of 512-bit buckets.
  const int bucketCount;
  HashFamily hasher;
  // Presence bits: 8 words per bucket, 64-byte aligned.
  uint64_t *data;
  // One word per data word: either inline counters, or (top bit set) a
  // reference into `overflow`.
  uint64_t *counts;
  // Pool of 8-word slots holding one 8-bit counter per bit of a data word.
  // Free slots are chained through their first word.
  uint64_t *overflow;
  size_t overflowLength;
  // Index of the first free overflow slot.
  size_t nextFreeOverflow;
#ifdef VERIFY_COUNT
  // Shadow counters used only for self-checking.
  uint8_t *realCount;
#endif

  void Increment(size_t group, int bit);
  void Decrement(size_t group, int bit);
  int ReadCount(size_t group, int bit);

 public:
  explicit SuccinctCountingBlockedBloomFilter(const int capacity);
  ~SuccinctCountingBlockedBloomFilter() noexcept;
  void Add(const uint64_t key) noexcept;
  void Remove(const uint64_t key) noexcept;
  bool Contain(const uint64_t key) const noexcept;

  // Memory used by the presence bits, the counter words and the overflow
  // pool. The arithmetic is done in 64 bits so a large bucket count cannot
  // overflow `int`.
  uint64_t SizeInBytes() const {
    return 2 * 64 * static_cast<uint64_t>(bucketCount) + 8 * overflowLength;
  }
};
506+
507+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
508+
SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
509+
SuccinctCountingBlockedBloomFilter(const int capacity)
510+
: bucketCount(capacity * bits_per_item / 512), hasher() {
511+
const size_t alloc_size = bucketCount * (512 / 8);
512+
const int malloc_failed =
513+
posix_memalign(reinterpret_cast<void **>(&data), 64, alloc_size);
514+
if (malloc_failed)
515+
throw ::std::bad_alloc();
516+
memset(data, 0, alloc_size);
517+
size_t arrayLength = bucketCount * 8;
518+
overflowLength = 100 + arrayLength / 100 * 36;
519+
counts = new uint64_t[arrayLength]();
520+
overflow = new uint64_t[overflowLength]();
521+
#ifdef VERIFY_COUNT
522+
realCount = new uint8_t[arrayLength * 64]();
523+
#endif
524+
nextFreeOverflow = 0;
525+
for (size_t i = 0; i < overflowLength; i += 8) {
526+
overflow[i] = i + 8;
527+
}
528+
}
529+
530+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
531+
SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
532+
~SuccinctCountingBlockedBloomFilter() noexcept {
533+
free(data);
534+
delete[] counts;
535+
delete[] overflow;
536+
}
537+
538+
// Rotates the 64-bit value `n` left by `c` bits. The count is reduced modulo
// 64, so any `c` (including 0 and values >= 64) is well defined.
static inline uint64_t rotl64(uint64_t n, unsigned int c) {
  // uint64_t is exactly 64 bits wide, so the rotate mask is a fixed 63 and no
  // CHAR_BIT / <climits> dependency is needed.
  const unsigned int mask = 63;
  c &= mask;
  // (-c) & mask equals (64 - c) % 64, which avoids a shift by 64 when c == 0.
  return (n << c) | (n >> ((-c) & mask));
}
545+
546+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
547+
void SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
548+
Add(const uint64_t key) noexcept {
549+
const auto hash = hasher(key);
550+
const uint32_t bucket_start = reduce(rotl64(hash, 32), bucketCount) * 8;
551+
uint32_t a = (uint32_t)hash;
552+
if (k >= 3) {
553+
Increment(bucket_start + ((a >> 0) & 7), (a >> 3) & 0x3f);
554+
Increment(bucket_start + ((a >> 9) & 7), (a >> 12) & 0x3f);
555+
Increment(bucket_start + ((a >> 18) & 7), (a >> 21) & 0x3f);
556+
// data[bucket_start + ((a >> 0) & 7)] |= 1ULL << ((a >> 3) & 0x3f);
557+
// data[bucket_start + ((a >> 9) & 7)] |= 1ULL << ((a >> 12) & 0x3f);
558+
// data[bucket_start + ((a >> 18) & 7)] |= 1ULL << ((a >> 21) & 0x3f);
559+
}
560+
uint32_t b = (uint32_t)(hash >> 32);
561+
for (int i = 3; i < k; i++) {
562+
a += b;
563+
Increment(bucket_start + (a & 7), (a >> 3) & 0x3f);
564+
// data[bucket_start + (a & 7)] |= 1ULL << ((a >> 3) & 0x3f);
565+
}
566+
}
567+
568+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
569+
void SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
570+
Increment(size_t group, int bit) {
571+
#ifdef VERIFY_COUNT
572+
realCount[(group << 6) + bit]++;
573+
#endif
574+
uint64_t m = data[group];
575+
uint64_t c = counts[group];
576+
if ((c & 0xc000000000000000ULL) != 0) {
577+
// an overflow entry, or overflowing now
578+
size_t index;
579+
if ((c & 0x8000000000000000ULL) == 0) {
580+
// convert to an overflow entry
581+
// allocate overflow
582+
index = nextFreeOverflow;
583+
if (index >= overflowLength) {
584+
::std::cout << "ERROR: overflow too small\n";
585+
data[group] |= 1ULL << bit;
586+
return;
587+
}
588+
nextFreeOverflow = (size_t) overflow[index];
589+
for (int i = 0; i < 8; i++) {
590+
overflow[index + i] = 0;
591+
}
592+
// convert to a pointer
593+
for (int i = 0; i < 64; i++) {
594+
int n = ReadCount(group, i);
595+
overflow[index + i / 8] += n * (1ULL << (i * 8));
596+
}
597+
uint64_t count = 64;
598+
c = 0x8000000000000000ULL | (count << 32) | index;
599+
counts[group] = c;
600+
} else {
601+
// already
602+
index = (size_t) (c & 0x0fffffffULL);
603+
c += 1ULL << 32;
604+
counts[group] = c;
605+
}
606+
overflow[index + bit / 8] += (1ULL << (bit * 8));
607+
data[group] |= 1ULL << bit;
608+
} else {
609+
data[group] |= 1ULL << bit;
610+
int bitsBefore = bitCount64(m & (0xffffffffffffffffULL >> (63 - bit)));
611+
int before = select64((c << 1) | 1, bitsBefore);
612+
int d = (m >> bit) & 1;
613+
int insertAt = before - d;
614+
uint64_t mask = (1ULL << insertAt) - 1;
615+
uint64_t left = c & ~mask;
616+
uint64_t right = c & mask;
617+
c = (left << 1) | ((1ULL ^ d) << insertAt) | right;
618+
counts[group] = c;
619+
}
620+
#ifdef VERIFY_COUNT
621+
for(int b = 0; b < 64; b++) {
622+
if (realCount[(group << 6) + b] != ReadCount(group, b)) {
623+
::std::cout << "group " << group << "/" << b << " of " << bit << "\n";
624+
}
625+
}
626+
#endif
627+
}
628+
629+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
630+
int SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
631+
ReadCount(size_t group, int bit) {
632+
uint64_t m = data[group];
633+
uint64_t d = (m >> bit) & 1;
634+
if (d == 0) {
635+
return 0;
636+
}
637+
uint64_t c = counts[group];
638+
if ((c & 0x8000000000000000ULL) != 0) {
639+
size_t index = (size_t) (c & 0x0fffffffULL);
640+
uint64_t n = overflow[index + bit / 8];
641+
n >>= 8 * (bit & 0xff);
642+
return (int) (n & 0xff);
643+
}
644+
int bitsBefore = bitCount64(m & (0xffffffffffffffffULL >> (63 - bit)));
645+
int bitPos = select64(c, bitsBefore - 1);
646+
uint64_t y = ((c << (63 - bitPos)) << 1) | (1ULL << (63 - bitPos));
647+
return numberOfLeadingZeros64(y) + 1;
648+
}
649+
650+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
651+
void SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
652+
Remove(const uint64_t key) noexcept {
653+
const auto hash = hasher(key);
654+
const uint32_t bucket_start = reduce(rotl64(hash, 32), bucketCount) * 8;
655+
uint32_t a = (uint32_t)hash;
656+
if (k >= 3) {
657+
Decrement(bucket_start + ((a >> 0) & 7), (a >> 3) & 0x3f);
658+
Decrement(bucket_start + ((a >> 9) & 7), (a >> 12) & 0x3f);
659+
Decrement(bucket_start + ((a >> 18) & 7), (a >> 21) & 0x3f);
660+
}
661+
uint32_t b = (uint32_t)(hash >> 32);
662+
for (int i = 3; i < k; i++) {
663+
a += b;
664+
Decrement(bucket_start + (a & 7), (a >> 3) & 0x3f);
665+
}
666+
}
667+
668+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
669+
void SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
670+
Decrement(size_t group, int bit) {
671+
#ifdef VERIFY_COUNT
672+
realCount[(group << 6) + bit]--;
673+
#endif
674+
uint64_t m = data[group];
675+
uint64_t c = counts[group];
676+
if ((c & 0x8000000000000000ULL) != 0) {
677+
// an overflow entry
678+
size_t index = (size_t) (c & 0x0fffffffULL);
679+
size_t count = (size_t) (c >> 32) & 0x0fffffffULL;
680+
c -= 1ULL << 32;
681+
counts[group] = c;
682+
uint64_t n = overflow[index + bit / 8];
683+
overflow[index + bit / 8] = n - (1ULL << (bit * 8));
684+
n >>= 8 * (bit & 0xf);
685+
if ((n & 0xff) == 1) {
686+
data[group] &= ~(1ULL << bit);
687+
}
688+
if (count < 64) {
689+
// convert back to an inline entry, and free up the overflow entry
690+
uint64_t c2 = 0;
691+
for (int j = 63; j >= 0; j--) {
692+
int cj = (int) ((overflow[index + j / 8] >> (8 * j)) & 0xff);
693+
if (cj > 0) {
694+
c2 = ((c2 << 1) | 1) << (cj - 1);
695+
}
696+
}
697+
counts[group] = c2;
698+
// free overflow
699+
overflow[index] = nextFreeOverflow;
700+
nextFreeOverflow = index;
701+
}
702+
} else {
703+
int bitsBefore = bitCount64(m & (0xffffffffffffffffULL >> (63 - bit)));
704+
int before = select64((c << 1) | 1, bitsBefore) - 1;
705+
int removeAt = max(0, before - 1);
706+
// remove the bit from the counter
707+
uint64_t mask = (1ULL << removeAt) - 1;
708+
uint64_t left = (c >> 1) & ~mask;
709+
uint64_t right= c & mask;
710+
counts[group] = left | right;
711+
uint64_t removed = (c >> removeAt) & 1;
712+
// possibly reset the data bit
713+
data[group] = m & ~(removed << bit);
714+
}
715+
#ifdef VERIFY_COUNT
716+
for(int b = 0; b < 64; b++) {
717+
if (realCount[(group << 6) + b] != ReadCount(group, b)) {
718+
::std::cout << "group- " << group << "/" << b << " of " << bit << "\n";
719+
}
720+
}
721+
#endif
722+
}
723+
724+
template <typename ItemType, size_t bits_per_item, typename HashFamily, int k>
725+
bool SuccinctCountingBlockedBloomFilter<ItemType, bits_per_item, HashFamily, k>::
726+
Contain(const uint64_t key) const noexcept {
727+
const auto hash = hasher(key);
728+
const uint32_t bucket_start = reduce(rotl64(hash, 32), bucketCount) * 8;
729+
uint32_t a = (uint32_t)hash;
730+
char ok = 1;
731+
if (k >= 3) {
732+
ok &= data[bucket_start + ((a >> 0) & 7)] >> ((a >> 3) & 0x3f);
733+
ok &= data[bucket_start + ((a >> 9) & 7)] >> ((a >> 12) & 0x3f);
734+
ok &= data[bucket_start + ((a >> 18) & 7)] >> ((a >> 21) & 0x3f);
735+
}
736+
if (!ok) {
737+
return ok;
738+
}
739+
uint32_t b = (uint32_t)(hash >> 32);
740+
for (int i = 3; i < k; i++) {
741+
a += b;
742+
ok &= data[bucket_start + (a & 7)] >> ((a >> 3) & 63);
743+
if (!ok) {
744+
return ok;
745+
}
746+
}
747+
return ok;
748+
}
749+
470750
}
471751
#endif

0 commit comments

Comments
 (0)