Skip to content

Commit 0b9d948

Browse files
committed
Implemented a naive block Bloom
1 parent 3e76710 commit 0b9d948

File tree

3 files changed

+129
-41
lines changed

3 files changed

+129
-41
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 35 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,28 @@ struct FilterAPI<XorFilter<ItemType, FingerprintType>> {
291291
}
292292
};
293293

294+
295+
template<size_t blocksize, int k, typename HashFamily>
296+
struct FilterAPI<SimpleBlockFilter<blocksize,k,HashFamily>> {
297+
using Table = SimpleBlockFilter<blocksize,k,HashFamily>;
298+
static Table ConstructFromAddCount(size_t add_count) {
299+
Table ans(ceil(add_count * 8.0 / CHAR_BIT));
300+
return ans;
301+
}
302+
static void Add(uint64_t key, Table* table) {
303+
table->Add(key);
304+
}
305+
static void AddAll(const vector<uint64_t> keys, const size_t start, const size_t end, Table* table) {
306+
throw std::runtime_error("Unsupported");
307+
}
308+
309+
CONTAIN_ATTRIBUTES
310+
static bool Contain(uint64_t key, const Table * table) {
311+
return table->Find(key);
312+
}
313+
};
314+
315+
294316
template <typename ItemType, typename FingerprintType, typename HashFamily>
295317
struct FilterAPI<XorFilter<ItemType, FingerprintType, HashFamily>> {
296318
using Table = XorFilter<ItemType, FingerprintType, HashFamily>;
@@ -678,43 +700,6 @@ void parse_comma_separated(char * c, std::set<int> & answer ) {
678700
}
679701
}
680702

681-
/*
682-
#define MUL 1625L
683-
#define MUL2 (MUL*MUL)
684-
// (1<<64) / MUL
685-
#define INVMUL 11351842506898185L
686-
// (1<<64) / MUL2
687-
#define INVMUL2 6985749235014L
688-
int main() {
689-
printf("start\n");
690-
for (int a = 0; a < MUL; a++) {
691-
for (int b = 0; b < MUL; b++) {
692-
for (int c = 0; c < MUL; c++) {
693-
uint32_t x = a * MUL2 + b * MUL + c;
694-
int aa = (int) (((__uint128_t) x * (INVMUL2 + 1)) >> 64);
695-
if (aa != a) {
696-
printf("wrong a");
697-
return -1;
698-
}
699-
int bb = (int) (((__uint128_t) x * (INVMUL + 1)) >> 64);
700-
int rb = bb % MUL;
701-
if (rb != b) {
702-
printf("wrong b");
703-
return -1;
704-
}
705-
int expected = (a + b + c) % MUL;
706-
int got = (aa + bb + x) % MUL;
707-
if (expected != got) {
708-
printf("wrong modulo");
709-
return -1;
710-
}
711-
}
712-
}
713-
}
714-
printf("end\n");
715-
return 0;
716-
}
717-
*/
718703

719704
int main(int argc, char * argv[]) {
720705
#ifdef __aarch64__
@@ -726,7 +711,8 @@ int main(int argc, char * argv[]) {
726711
{14,"GCS"}, {22, "Xor10 (NBitArray)"}, {23, "Xor14 (NBitArray)"},
727712
{25, "Xor10"},{26, "Xor10.666"}, {37,"Bloom8 (addall)"},
728713
{38,"Bloom12 (addall)"},
729-
{40,"BlockedBloom (addall)"}
714+
{40,"BlockedBloom (addall)"},
715+
{70,"SimpleBlockedBloom"}
730716
};
731717
#elif defined( __AVX2__)
732718
std::map<int,std::string> names = {{0,"Xor8"},{1,"Xor12"},
@@ -737,7 +723,8 @@ int main(int argc, char * argv[]) {
737723
{14,"GCS"}, {15,"CQF"}, {22, "Xor10 (NBitArray)"}, {23, "Xor14 (NBitArray)"},
738724
{25, "Xor10"},{26, "Xor10.666"}, {37,"Bloom8 (addall)"},
739725
{38,"Bloom12 (addall)"},{39,"Bloom16 (addall)"},
740-
{40,"BlockedBloom (addall)"}, {63,"BlockedBloom16"}, {64,"BlockedBloom64"}
726+
{40,"BlockedBloom (addall)"}, {63,"BlockedBloom16"}, {64,"BlockedBloom64"},
727+
{70,"SimpleBlockedBloom"}
741728
};
742729
#else
743730
std::map<int,std::string> names = {{0,"Xor8"},{1,"Xor12"},
@@ -747,7 +734,8 @@ int main(int argc, char * argv[]) {
747734
{11,"sort"}, {12,"Xor+8"}, {13,"Xor+16"},
748735
{14,"GCS"}, {22, "Xor10 (NBitArray)"}, {23, "Xor14 (NBitArray)"},
749736
{25, "Xor10"},{26, "Xor10.666"}, {37,"Bloom8 (addall)"},
750-
{38,"Bloom12 (addall)"},{39,"Bloom16 (addall)"}
737+
{38,"Bloom12 (addall)"},{39,"Bloom16 (addall)"},
738+
{70,"SimpleBlockedBloom"}
751739
};
752740
#endif
753741

@@ -1091,6 +1079,13 @@ int main(int argc, char * argv[]) {
10911079
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
10921080
cout << setw(NAME_WIDTH) << names[23] << cf << endl;
10931081
}
1082+
if (algorithmId == 70 || (algos.find(70) != algos.end())) {
1083+
auto cf = FilterBenchmark<
1084+
SimpleBlockFilter<8, 8, SimpleMixSplit>>(
1085+
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, false);
1086+
cout << setw(NAME_WIDTH) << names[70] << cf << endl;
1087+
}
1088+
10941089

10951090
// this algo overflows and crashes
10961091
/*if (algorithmId == 24 || (algos.find(24) != algos.end())) {

src/bloom.h

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,5 +172,98 @@ std::string BloomFilter<ItemType, bits_per_item, HashFamily, k>::Info() const {
172172
}
173173
return ss.str();
174174
}
175+
176+
177+
178+
179+
/***************
180+
* Simple block filter (naive implementation)
181+
***************/
182+
183+
/// Simple blocked Bloom filter (naive implementation).
///
/// Storage is an array of Buckets, each made of `blocksize` 64-bit words.
/// A key is hashed once; the hash selects one Bucket and then `k` probe
/// positions inside that Bucket.
///
/// The object owns a raw heap buffer, so copying is disabled. It can be
/// move-constructed (e.g. when returned from a factory function); a
/// moved-from filter may only be destroyed.
template<size_t blocksize, int k, typename HashFamily = ::hashing::TwoIndependentMultiplyShift>
class SimpleBlockFilter {
 private:
  // The filter is divided up into Buckets:
  using Bucket = uint64_t[blocksize];

  // Number of Buckets held by directory_.
  const int bucketCount;

  // Heap array of bucketCount Buckets; released with free() in the destructor.
  Bucket* directory_;

  HashFamily hasher_;

 public:
  /// @param capacity sizing parameter; the constructor derives the number of
  ///        Buckets from capacity * k / (blocksize * 64).
  explicit SimpleBlockFilter(const int capacity);
  ~SimpleBlockFilter() noexcept;

  // A copy would share directory_ and free it twice.
  SimpleBlockFilter(const SimpleBlockFilter&) = delete;
  SimpleBlockFilter& operator=(const SimpleBlockFilter&) = delete;

  /// Takes over the buffer of `other`, leaving it with a null directory.
  SimpleBlockFilter(SimpleBlockFilter&& other) noexcept
      : bucketCount(other.bucketCount),
        directory_(other.directory_),
        hasher_(other.hasher_) {
    other.directory_ = nullptr;
  }
  // bucketCount is const, so assignment cannot be supported.
  SimpleBlockFilter& operator=(SimpleBlockFilter&&) = delete;

  /// Inserts `key` into the filter.
  void Add(const uint64_t key) noexcept;

  /// Returns false if any probed bit for `key` is unset, true otherwise.
  bool Find(const uint64_t key) const noexcept;

  /// Size of the bucket array in bytes.
  uint64_t SizeInBytes() const { return sizeof(Bucket) * bucketCount; }
};
206+
207+
template<size_t blocksize,int k, typename HashFamily>
208+
SimpleBlockFilter<blocksize,k,HashFamily>::SimpleBlockFilter(const int capacity)
209+
: bucketCount(capacity * k / ( blocksize * 64)),
210+
directory_(nullptr),
211+
hasher_() {
212+
const size_t alloc_size = bucketCount * sizeof(Bucket);
213+
const int malloc_failed =
214+
posix_memalign(reinterpret_cast<void**>(&directory_), 64, alloc_size);
215+
if (malloc_failed) throw ::std::bad_alloc();
216+
memset(directory_, 0, alloc_size);
217+
}
218+
219+
template<size_t blocksize, int k, typename HashFamily>
220+
SimpleBlockFilter<blocksize,k,HashFamily>::~SimpleBlockFilter() noexcept {
221+
free(directory_);
222+
directory_ = nullptr;
223+
}
224+
225+
/// Rotates the 64-bit value `n` left by `c` bit positions.
///
/// The count is reduced modulo the bit width (which must be a power of two),
/// so neither shift below is ever performed by 64 or more.
static inline uint64_t rotl64(uint64_t n, unsigned int c) {
  constexpr unsigned int kWidthMask = CHAR_BIT * sizeof(uint64_t) - 1;
  const unsigned int left = c & kWidthMask;
  // Complementary right shift; unsigned negation wraps, and the mask folds
  // the left == 0 case back to a shift by 0.
  const unsigned int right = (0u - left) & kWidthMask;
  return (n << left) | (n >> right);
}
232+
233+
template<size_t blocksize,int k, typename HashFamily>
234+
inline void
235+
SimpleBlockFilter<blocksize,k, HashFamily>::Add(const uint64_t key) noexcept {
236+
const auto hash = hasher_(key);
237+
const uint32_t bucket_idx = reduce(rotl64(hash, 32), bucketCount);
238+
Bucket * bucket = directory_ + bucket_idx;
239+
uint32_t a = (uint32_t) (hash >> 32);
240+
uint32_t b = (uint32_t) hash;
241+
for (int i = 0; i < k; i++) {
242+
((uint64_t *)bucket)[reduce(a, blocksize)] |= getBit(a);
243+
a += b;
244+
}
245+
246+
}
247+
248+
249+
template<size_t blocksize, int k, typename HashFamily>
250+
inline bool
251+
SimpleBlockFilter<blocksize,k,HashFamily>::Find(const uint64_t key) const noexcept {
252+
const auto hash = hasher_(key);
253+
const uint32_t bucket_idx = reduce(rotl64(hash, 32), bucketCount);
254+
const Bucket * bucket = directory_ + bucket_idx;
255+
uint32_t a = (uint32_t) (hash >> 32);
256+
uint32_t b = (uint32_t) hash;
257+
for (int i = 0; i < k; i++) {
258+
if ((((const uint64_t *)bucket)[reduce(a, blocksize)] & getBit(a)) == 0) {
259+
return false;
260+
}
261+
a += b;
262+
}
263+
return true;
264+
}
265+
266+
267+
175268
} // namespace bloomfilter
176269
#endif // BLOOM_FILTER_BLOOM_FILTER_H_

src/simd-block-fixed-fpp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ inline uint32_t reduce(uint32_t hash, uint32_t n) {
3030
return (uint32_t) (((uint64_t) hash * n) >> 32);
3131
}
3232

33-
inline uint64_t rotl64(uint64_t n, unsigned int c) {
33+
static inline uint64_t rotl64(uint64_t n, unsigned int c) {
3434
// assumes width is a power of 2
3535
const unsigned int mask = (CHAR_BIT * sizeof(n) - 1);
3636
// assert ( (c<=mask) &&"rotate by type width or more");

0 commit comments

Comments
 (0)