Skip to content

Commit 846e10f

Browse files
committed
Prototype
1 parent 5028dcd commit 846e10f

File tree

8 files changed

+2056
-25
lines changed

8 files changed

+2056
-25
lines changed

.gitmodules

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
11
[submodule "dependencies/fastfilter_cpp"]
22
path = dependencies/fastfilter_cpp
33
url = https://github.com/FastFilter/fastfilter_cpp.git
4-
[submodule "dependencies/xor_singleheader"]
5-
path = dependencies/xor_singleheader
6-
url = https://github.com/FastFilter/xor_singleheader.git

Makefile

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
all: build_filter query_filter
22

3-
dependencies/fastfilter_cpp/src/xorfilter.h:
3+
dependencies/fastfilter_cpp/src/xorfilter/xorfilter.h:
44
git submodule update --init --recursive
55

6-
dependencies/xor_singleheader/include/xorfilter.h:
7-
git submodule update --init --recursive
86

9-
query_filter: src/query_filter.cpp src/hexutil.h dependencies/xor_singleheader/include/xorfilter.h
7+
query_filter: src/query_filter.cpp src/hexutil.h dependencies/xor_singleheader/include/xorfilter.h
108
c++ -O3 -o query_filter src/query_filter.cpp -Wall -std=c++11 -Idependencies/fastfilter_cpp/src -Idependencies
119

12-
build_filter: src/build_filter.cpp dependencies/fastfilter_cpp/src/xorfilter/xorfilter.h dependencies/fastfilter_cpp/src/xorfilter/xorfilter_plus.h src/hexutil.h dependencies/xor_singleheader/include/xorfilter.h
10+
build_filter: src/build_filter.cpp dependencies/fastfilter_cpp/src/xorfilter/xorfilter.h dependencies/fastfilter_cpp/src/xorfilter/xorfilter_plus.h src/hexutil.h dependencies/xor_singleheader/include/xorfilter.h
1311
c++ -O3 -o build_filter src/build_filter.cpp -std=c++11 -Wall -Idependencies/fastfilter_cpp/src -Idependencies
1412

1513
clean:

dependencies/xor_singleheader

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 354 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
#ifndef BINARYFUSEFILTER_H
2+
#define BINARYFUSEFILTER_H
3+
#include <math.h>
4+
#include <stdbool.h>
5+
#include <stddef.h>
6+
#include <stdint.h>
7+
#include <stdio.h>
8+
#include <stdlib.h>
9+
#include <string.h>
10+
#ifndef XOR_MAX_ITERATIONS
11+
#define XOR_MAX_ITERATIONS \
12+
100 // probability of success should always be > 0.5 so 100 iterations is
13+
// highly unlikely
14+
#endif
15+
16+
/**
17+
* We start with a few utilities.
18+
***/
19+
// 64-bit finalizer-style mixer: three xor-shift steps by 33 bits interleaved
// with two multiplications by odd constants. Maps 0 to 0.
static inline uint64_t binary_fuse_murmur64(uint64_t h) {
  uint64_t x = h;
  x = (x ^ (x >> 33)) * UINT64_C(0xff51afd7ed558ccd);
  x = (x ^ (x >> 33)) * UINT64_C(0xc4ceb9fe1a85ec53);
  return x ^ (x >> 33);
}
27+
// Hash a key under a given seed: the seed is added to the key (wrapping
// 64-bit addition) and the sum is passed through binary_fuse_murmur64.
static inline uint64_t binary_fuse_mix_split(uint64_t key, uint64_t seed) {
  const uint64_t seeded = key + seed;
  return binary_fuse_murmur64(seeded);
}
30+
// Rotate n left by c bits. Only the low 6 bits of c are used, so a count of
// 0 or 64 returns n unchanged and no shift ever reaches the type width.
static inline uint64_t binary_fuse_rotl64(uint64_t n, unsigned int c) {
  const unsigned int left = c & 63;
  const unsigned int right = (0u - c) & 63;
  const uint64_t high_part = n << left;
  const uint64_t low_part = n >> right;
  return high_part | low_part;
}
33+
// Map a 32-bit hash into the range [0, n) without a modulo: take the high
// 32 bits of the 64-bit product hash * n. Returns 0 when n is 0.
// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
static inline uint32_t binary_fuse_reduce(uint32_t hash, uint32_t n) {
  const uint64_t wide = (uint64_t)hash * n;
  return (uint32_t)(wide >> 32);
}
37+
// Fold the upper 32 bits of the hash into the lower 32 bits. The result is
// still 64 bits wide; callers narrow it to the fingerprint width they store.
static inline uint64_t binary_fuse8_fingerprint(uint64_t hash) {
  const uint64_t upper = hash >> 32;
  return hash ^ upper;
}
40+
41+
/**
42+
* We need a decent random number generator.
43+
**/
44+
45+
// SplitMix64-style generator step. Advances *seed by a fixed odd increment
// (the caller's state is modified) and returns a scrambled copy of the new
// state.
static inline uint64_t binary_fuse_rng_splitmix64(uint64_t *seed) {
  *seed += UINT64_C(0x9E3779B97F4A7C15);
  uint64_t z = *seed;
  z ^= z >> 30;
  z *= UINT64_C(0xBF58476D1CE4E5B9);
  z ^= z >> 27;
  z *= UINT64_C(0x94D049BB133111EB);
  z ^= z >> 31;
  return z;
}
52+
53+
// An 8-bit binary fuse filter. The geometry fields are filled in by
// binary_fuse8_allocate, the seed by binary_fuse8_populate.
typedef struct binary_fuse8_s {
  uint64_t Seed;               // seed added to every key before hashing
  uint32_t SegmentLength;      // slots per segment (a power of two)
  uint32_t SegmentLengthMask;  // SegmentLength - 1
  uint32_t SegmentCount;       // number of segments a key's first slot may land in
  uint32_t SegmentCountLength; // SegmentCount * SegmentLength
  uint32_t ArrayLength;        // number of entries in Fingerprints
  uint8_t *Fingerprints;       // heap array of ArrayLength one-byte fingerprints
} binary_fuse8_t;
62+
63+
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
#include <intrin.h> // declares __umulh on 64-bit Visual Studio targets
#endif

// Returns the upper 64 bits of the 128-bit product a * b.
//
// Declared static inline so that this header can be included from several
// translation units without producing duplicate-symbol link errors.
// Three implementations are selected at compile time:
//  - 64-bit MSVC targets: the __umulh intrinsic;
//  - compilers providing a 128-bit integer type: a widening multiplication;
//  - everything else (e.g. 32-bit Windows): portable 32-bit limb arithmetic.
static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) {
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
  return __umulh(a, b);
#elif defined(__SIZEOF_INT128__)
  return (uint64_t)(((__uint128_t)a * b) >> 64);
#else
  // Schoolbook multiplication on 32-bit halves; only the carries that reach
  // the upper 64 bits are kept.
  const uint64_t a_lo = (uint32_t)a;
  const uint64_t a_hi = a >> 32;
  const uint64_t b_lo = (uint32_t)b;
  const uint64_t b_hi = b >> 32;
  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t hi_hi = a_hi * b_hi;
  const uint64_t carry =
      ((lo_lo >> 32) + (uint32_t)lo_hi + (uint32_t)hi_lo) >> 32;
  return hi_hi + (lo_hi >> 32) + (hi_lo >> 32) + carry;
#endif
}
71+
72+
// The three slot indexes computed for one key by binary_fuse_hash_batch.
typedef struct binary_hashes_s {
  uint32_t h0; // index of the first slot
  uint32_t h1; // index of the second slot
  uint32_t h2; // index of the third slot
} binary_hashes_t;
77+
78+
static inline binary_hashes_t binary_fuse_hash_batch(uint64_t hash,
79+
const binary_fuse8_t *filter) {
80+
uint64_t hi = binary_fuse_mulhi(hash, filter->SegmentCountLength);
81+
binary_hashes_t ans;
82+
ans.h0 = (uint32_t)hi;
83+
ans.h1 = ans.h0 + filter->SegmentLength;
84+
ans.h2 = ans.h1 + filter->SegmentLength;
85+
ans.h1 ^= (uint32_t)(hash >> 18) & filter->SegmentLengthMask;
86+
ans.h2 ^= (uint32_t)(hash)&filter->SegmentLengthMask;
87+
return ans;
88+
}
89+
static inline uint32_t binary_fuse_hash(int index, uint64_t hash,
90+
const binary_fuse8_t *filter) {
91+
uint64_t h = binary_fuse_mulhi(hash, filter->SegmentCountLength);
92+
h += index * filter->SegmentLength;
93+
// keep the lower 36 bits
94+
uint64_t hh = hash & ((1UL << 36) - 1);
95+
// index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
96+
h ^= (size_t)((hh >> (36 - 18 * index)) & filter->SegmentLengthMask);
97+
return h;
98+
}
99+
100+
// Report if the key is in the set, with false positive rate.
101+
static inline bool binary_fuse8_contain(uint64_t key,
102+
const binary_fuse8_t *filter) {
103+
uint64_t hash = binary_fuse_mix_split(key, filter->Seed);
104+
uint8_t f = binary_fuse8_fingerprint(hash);
105+
binary_hashes_t hashes = binary_fuse_hash_batch(hash, filter);
106+
f ^= filter->Fingerprints[hashes.h0] ^ filter->Fingerprints[hashes.h1] ^
107+
filter->Fingerprints[hashes.h2];
108+
return f == 0;
109+
}
110+
111+
static inline uint32_t binary_fuse8_calculate_segment_length(uint32_t arity,
112+
uint32_t size) {
113+
// These parameters are very sensitive. Replacing 'floor' by 'round' can
114+
// substantially affect the construction time.
115+
if (arity == 3) {
116+
return ((uint32_t)2) << (int)(floor(0.831 * log((double)(size)) + 0.75 +
117+
0.5));
118+
} else if (arity == 4) {
119+
return ((uint32_t)1) << (int)(floor(0.936 * log((double)(size)) - 1 + 0.5));
120+
} else {
121+
return 65536;
122+
}
123+
}
124+
125+
// Returns the larger of a and b; returns a when the comparison a < b is
// false (which includes the case where either argument is NaN).
// Declared static inline so that this header can be included from several
// translation units without producing duplicate-symbol link errors.
static inline double binary_fuse8_max(double a, double b) {
  return (a < b) ? b : a;
}
131+
132+
// Ratio between the number of fingerprint slots and the number of keys for a
// filter of the given arity. The ratio is never below 1.125 (arity 3) or
// 1.075 (arity 4); other arities get a fixed 2.0.
// Note: for size == 1, log(size) is 0 and the quotient is infinite.
static inline double binary_fuse8_calculate_size_factor(uint32_t arity,
                                                        uint32_t size) {
  switch (arity) {
  case 3:
    return binary_fuse8_max(1.125, 0.4 + 9.3 / log((double)size));
  case 4:
    return binary_fuse8_max(1.075, 0.77 + 4.06 / log((double)size));
  default:
    return 2.0;
  }
}
142+
143+
// allocate enough capacity for a set containing up to 'size' elements
144+
// caller is responsible to call binary_fuse8_free(filter)
145+
static inline bool binary_fuse8_allocate(uint32_t size,
146+
binary_fuse8_t *filter) {
147+
uint32_t arity = 3;
148+
filter->SegmentLength = binary_fuse8_calculate_segment_length(arity, size);
149+
if (filter->SegmentLength > 262144) {
150+
filter->SegmentLength = 262144;
151+
}
152+
filter->SegmentLengthMask = filter->SegmentLength - 1;
153+
double sizeFactor = binary_fuse8_calculate_size_factor(arity, size);
154+
uint32_t capacity = (uint32_t)(round((double)size * sizeFactor));
155+
uint32_t initSegmentCount =
156+
(capacity + filter->SegmentLength - 1) / filter->SegmentLength -
157+
(arity - 1);
158+
filter->ArrayLength = (initSegmentCount + arity - 1) * filter->SegmentLength;
159+
filter->SegmentCount =
160+
(filter->ArrayLength + filter->SegmentLength - 1) / filter->SegmentLength;
161+
if (filter->SegmentCount <= arity - 1) {
162+
filter->SegmentCount = 1;
163+
} else {
164+
filter->SegmentCount = filter->SegmentCount - (arity - 1);
165+
}
166+
filter->ArrayLength =
167+
(filter->SegmentCount + arity - 1) * filter->SegmentLength;
168+
filter->SegmentCountLength = filter->SegmentCount * filter->SegmentLength;
169+
filter->Fingerprints = (uint8_t*)malloc(filter->ArrayLength);
170+
return filter->Fingerprints != NULL;
171+
}
172+
173+
// report memory usage
174+
static inline size_t binary_fuse8_size_in_bytes(const binary_fuse8_t *filter) {
175+
return filter->ArrayLength * sizeof(uint8_t) + sizeof(binary_fuse8_t);
176+
}
177+
178+
// release memory
179+
static inline void binary_fuse8_free(binary_fuse8_t *filter) {
180+
free(filter->Fingerprints);
181+
filter->Fingerprints = NULL;
182+
filter->Seed = 0;
183+
filter->SegmentLength = 0;
184+
filter->SegmentLengthMask = 0;
185+
filter->SegmentCount = 0;
186+
filter->SegmentCountLength = 0;
187+
filter->ArrayLength = 0;
188+
}
189+
190+
// Reduce x modulo 3 with a single conditional subtraction. Only exact for
// x in 0..5; larger inputs just have 3 subtracted once.
static inline uint8_t binary_fuse8_mod3(uint8_t x) {
  if (x > 2) {
    return (uint8_t)(x - 3);
  }
  return x;
}
193+
194+
// construct the filter, returns true on success, false on failure.
195+
// most likely, a failure is due to too high a memory usage
196+
// size is the number of keys
197+
// The caller is responsable for calling binary_fuse8_allocate(size,filter)
198+
// before. The caller is responsible to ensure that there are no duplicated
199+
// keys. The inner loop will run up to XOR_MAX_ITERATIONS times (default on
200+
// 100), it should never fail, except if there are duplicated keys. If it fails,
201+
// a return value of false is provided.
202+
//
203+
bool binary_fuse8_populate(const uint64_t *keys, uint32_t size,
204+
binary_fuse8_t *filter) {
205+
uint64_t rng_counter = 0x726b2b9d438b9d4d;
206+
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
207+
uint64_t *reverseOrder = (uint64_t *)calloc((size + 1), sizeof(uint64_t));
208+
uint32_t capacity = filter->ArrayLength;
209+
uint32_t *alone = (uint32_t *)malloc(capacity * sizeof(uint32_t));
210+
uint8_t *t2count = (uint8_t *)calloc(capacity, sizeof(uint8_t));
211+
uint8_t *reverseH = (uint8_t *)malloc(size * sizeof(uint8_t));
212+
uint64_t *t2hash = (uint64_t *)calloc(capacity, sizeof(uint64_t));
213+
214+
uint32_t blockBits = 1;
215+
while (((uint32_t)1 << blockBits) < filter->SegmentCount) {
216+
blockBits += 1;
217+
}
218+
uint32_t block = ((uint32_t)1 << blockBits);
219+
uint32_t *startPos = (uint32_t *)malloc((1 << blockBits) * sizeof(uint32_t));
220+
uint32_t h012[5];
221+
222+
if ((alone == NULL) || (t2count == NULL) || (reverseH == NULL) ||
223+
(t2hash == NULL) || (reverseOrder == NULL) || (startPos == NULL)) {
224+
free(alone);
225+
free(t2count);
226+
free(reverseH);
227+
free(t2hash);
228+
free(reverseOrder);
229+
free(startPos);
230+
return false;
231+
}
232+
reverseOrder[size] = 1;
233+
for (int loop = 0; true; ++loop) {
234+
if (loop + 1 > XOR_MAX_ITERATIONS) {
235+
fprintf(stderr, "Too many iterations. Are all your keys unique?");
236+
free(alone);
237+
free(t2count);
238+
free(reverseH);
239+
free(t2hash);
240+
free(reverseOrder);
241+
free(startPos);
242+
return false;
243+
}
244+
245+
for (uint32_t i = 0; i < block; i++) {
246+
// important : i * size would overflow as a 32-bit number in some
247+
// cases.
248+
startPos[i] = ((uint64_t)i * size) >> blockBits;
249+
}
250+
251+
uint64_t maskblock = block - 1;
252+
for (uint32_t i = 0; i < size; i++) {
253+
uint64_t hash = binary_fuse_murmur64(keys[i] + filter->Seed);
254+
uint64_t segment_index = hash >> (64 - blockBits);
255+
while (reverseOrder[startPos[segment_index]] != 0) {
256+
segment_index++;
257+
segment_index &= maskblock;
258+
}
259+
reverseOrder[startPos[segment_index]] = hash;
260+
startPos[segment_index]++;
261+
}
262+
int error = 0;
263+
for (uint32_t i = 0; i < size; i++) {
264+
uint64_t hash = reverseOrder[i];
265+
uint32_t h0 = binary_fuse_hash(0, hash, filter);
266+
t2count[h0] += 4;
267+
t2hash[h0] ^= hash;
268+
uint32_t h1= binary_fuse_hash(1, hash, filter);
269+
t2count[h1] += 4;
270+
t2count[h1] ^= 1;
271+
t2hash[h1] ^= hash;
272+
uint32_t h2 = binary_fuse_hash(2, hash, filter);
273+
t2count[h2] += 4;
274+
t2hash[h2] ^= hash;
275+
t2count[h2] ^= 2;
276+
error = (t2count[h0] < 4) ? 1 : error;
277+
error = (t2count[h1] < 4) ? 1 : error;
278+
error = (t2count[h2] < 4) ? 1 : error;
279+
}
280+
if(error) { continue; }
281+
282+
// End of key addition
283+
uint32_t Qsize = 0;
284+
// Add sets with one key to the queue.
285+
for (uint32_t i = 0; i < capacity; i++) {
286+
alone[Qsize] = i;
287+
Qsize += ((t2count[i] >> 2) == 1) ? 1 : 0;
288+
}
289+
uint32_t stacksize = 0;
290+
while (Qsize > 0) {
291+
Qsize--;
292+
uint32_t index = alone[Qsize];
293+
if ((t2count[index] >> 2) == 1) {
294+
uint64_t hash = t2hash[index];
295+
296+
//h012[0] = binary_fuse_hash(0, hash, filter);
297+
h012[1] = binary_fuse_hash(1, hash, filter);
298+
h012[2] = binary_fuse_hash(2, hash, filter);
299+
h012[3] = binary_fuse_hash(0, hash, filter); // == h012[0];
300+
h012[4] = h012[1];
301+
uint8_t found = t2count[index] & 3;
302+
reverseH[stacksize] = found;
303+
reverseOrder[stacksize] = hash;
304+
stacksize++;
305+
uint32_t other_index1 = h012[found + 1];
306+
alone[Qsize] = other_index1;
307+
Qsize += ((t2count[other_index1] >> 2) == 2 ? 1 : 0);
308+
309+
t2count[other_index1] -= 4;
310+
t2count[other_index1] ^= binary_fuse8_mod3(found + 1);
311+
t2hash[other_index1] ^= hash;
312+
313+
uint32_t other_index2 = h012[found + 2];
314+
alone[Qsize] = other_index2;
315+
Qsize += ((t2count[other_index2] >> 2) == 2 ? 1 : 0);
316+
t2count[other_index2] -= 4;
317+
t2count[other_index2] ^= binary_fuse8_mod3(found + 2);
318+
t2hash[other_index2] ^= hash;
319+
}
320+
}
321+
if (stacksize == size) {
322+
// success
323+
break;
324+
}
325+
memset(reverseOrder, 0, sizeof(uint64_t[size]));
326+
memset(t2count, 0, sizeof(uint8_t[capacity]));
327+
memset(t2hash, 0, sizeof(uint64_t[capacity]));
328+
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
329+
}
330+
331+
for (uint32_t i = size - 1; i < size; i--) {
332+
// the hash of the key we insert next
333+
uint64_t hash = reverseOrder[i];
334+
uint8_t xor2 = binary_fuse8_fingerprint(hash);
335+
uint8_t found = reverseH[i];
336+
h012[0] = binary_fuse_hash(0, hash, filter);
337+
h012[1] = binary_fuse_hash(1, hash, filter);
338+
h012[2] = binary_fuse_hash(2, hash, filter);
339+
h012[3] = h012[0];
340+
h012[4] = h012[1];
341+
filter->Fingerprints[h012[found]] = xor2 ^
342+
filter->Fingerprints[h012[found + 1]] ^
343+
filter->Fingerprints[h012[found + 2]];
344+
}
345+
free(alone);
346+
free(t2count);
347+
free(reverseH);
348+
free(t2hash);
349+
free(reverseOrder);
350+
free(startPos);
351+
return true;
352+
}
353+
354+
#endif

0 commit comments

Comments
 (0)