Skip to content

Commit 64e5eae

Browse files
committed
Prepare for fuse filter (implementation currently is a copy of the xor filter)
1 parent 755cd46 commit 64e5eae

File tree

2 files changed

+352
-0
lines changed

2 files changed

+352
-0
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "xorfilter_2n.h"
3434
#include "xorfilter_plus.h"
3535
#include "xorfilter_singleheader.h"
36+
#include "xor_fuse_filter.h"
3637
#include "bloom.h"
3738
#include "counting_bloom.h"
3839
#include "gcs.h"
@@ -54,6 +55,7 @@ using namespace xorfilter;
5455
using namespace xorfilter2;
5556
using namespace xorfilter2n;
5657
using namespace xorfilter_plus;
58+
using namespace xorfusefilter;
5759
using namespace bloomfilter;
5860
using namespace counting_bloomfilter;
5961
using namespace gcsfilter;
@@ -313,6 +315,24 @@ struct FilterAPI<XorFilter<ItemType, FingerprintType>> {
313315
}
314316
};
315317

318+
template <typename ItemType, typename FingerprintType>
319+
struct FilterAPI<XorFuseFilter<ItemType, FingerprintType>> {
320+
using Table = XorFuseFilter<ItemType, FingerprintType>;
321+
static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
322+
static void Add(uint64_t key, Table* table) {
323+
throw std::runtime_error("Unsupported");
324+
}
325+
static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
326+
table->AddAll(keys, start, end);
327+
}
328+
static void Remove(uint64_t key, Table * table) {
329+
throw std::runtime_error("Unsupported");
330+
}
331+
CONTAIN_ATTRIBUTES static bool Contain(uint64_t key, const Table * table) {
332+
return (0 == table->Contain(key));
333+
}
334+
};
335+
316336
class MortonFilter {
317337
Morton3_8* filter;
318338
size_t size;
@@ -981,6 +1001,8 @@ int main(int argc, char * argv[]) {
9811001
{70, "Xor8-singleheader"},
9821002
{80, "Morton"},
9831003

1004+
{90, "XorFuse8"},
1005+
9841006
// Sort
9851007
{100, "Sort"},
9861008
};
@@ -1453,6 +1475,15 @@ int main(int argc, char * argv[]) {
14531475
cout << setw(NAME_WIDTH) << names[a] << cf << endl;
14541476
}
14551477

1478+
// Xor Fuse Filter ----------------------------------------------------------
1479+
a = 90;
1480+
if (algorithmId == a || algorithmId < 0 || (algos.find(a) != algos.end())) {
1481+
auto cf = FilterBenchmark<
1482+
XorFuseFilter<uint64_t, uint8_t>>(
1483+
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
1484+
cout << setw(NAME_WIDTH) << names[a] << cf << endl;
1485+
}
1486+
14561487
// Sort ----------------------------------------------------------
14571488
a = 100;
14581489
if (algorithmId == a || algorithmId < 0 || (algos.find(a) != algos.end())) {

src/xorfilter/xor_fuse_filter.h

Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
#ifndef XOR_FUSE_FILTER_XOR_FILTER_H_
2+
#define XOR_FUSE_FILTER_XOR_FILTER_H_
3+
4+
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "hashutil.h"
7+
8+
using namespace std;
9+
using namespace hashing;
10+
11+
namespace xorfusefilter {
12+
// Result codes reported by the filter operations in this namespace.
// The numeric values are spelled out because callers may compare a Status
// against a plain integer.
enum Status {
  Ok = 0,              // operation succeeded / key reported as present
  NotFound = 1,        // key reported as absent
  NotEnoughSpace = 2,
  NotSupported = 3,
};
19+
20+
// Rotates the 64-bit value n to the left by c bit positions.
// Only the low bits of c are used, so a rotation by 64 (or any multiple of
// the width) returns n unchanged.
inline uint64_t rotl64(uint64_t n, unsigned int c) {
  // Width of n in bits, minus one. Because the width is a power of two this
  // value also works as a "modulo width" mask.
  const unsigned int widthMask = (CHAR_BIT * sizeof(n) - 1);
  const unsigned int leftShift = c & widthMask;
  // Complementary right shift; masking keeps it in range when leftShift is 0,
  // which avoids shifting by the full width.
  const unsigned int rightShift = (0u - leftShift) & widthMask;
  return (n >> rightShift) | (n << leftShift);
}
27+
28+
// Maps a 32-bit hash onto the range [0, n) without a division: the hash is
// multiplied by n in 64-bit arithmetic and the upper 32 bits of the product
// are returned.
// See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
__attribute__((always_inline))
inline uint32_t reduce(uint32_t hash, uint32_t n) {
  const uint64_t product = static_cast<uint64_t>(hash) * static_cast<uint64_t>(n);
  return static_cast<uint32_t>(product >> 32);
}
33+
34+
size_t getHashFromHash(uint64_t hash, int index, int blockLength) {
35+
uint32_t r = rotl64(hash, index * 21);
36+
return (size_t) reduce(r, blockLength) + index * blockLength;
37+
}
38+
39+
template <typename ItemType, typename FingerprintType,
40+
typename HashFamily = TwoIndependentMultiplyShift>
41+
class XorFuseFilter {
42+
public:
43+
44+
size_t size;
45+
size_t arrayLength;
46+
size_t blockLength;
47+
FingerprintType *fingerprints;
48+
49+
HashFamily* hasher;
50+
51+
inline FingerprintType fingerprint(const uint64_t hash) const {
52+
return (FingerprintType) hash ^ (hash >> 32);
53+
}
54+
55+
explicit XorFuseFilter(const size_t size) {
56+
hasher = new HashFamily();
57+
this->size = size;
58+
this->arrayLength = 32 + 1.23 * size;
59+
this->blockLength = arrayLength / 3;
60+
fingerprints = new FingerprintType[arrayLength]();
61+
std::fill_n(fingerprints, arrayLength, 0);
62+
}
63+
64+
~XorFuseFilter() {
65+
delete[] fingerprints;
66+
delete hasher;
67+
}
68+
69+
Status AddAll(const vector<ItemType> &data, const size_t start, const size_t end) {
70+
return AddAll(data.data(),start,end);
71+
}
72+
73+
Status AddAll(const ItemType* data, const size_t start, const size_t end);
74+
75+
// Report if the item is inserted, with false positive rate.
76+
Status Contain(const ItemType &item) const;
77+
78+
/* methods for providing stats */
79+
// summary infomation
80+
std::string Info() const;
81+
82+
// number of current inserted items;
83+
size_t Size() const { return size; }
84+
85+
// size of the filter in bytes.
86+
size_t SizeInBytes() const { return arrayLength * sizeof(FingerprintType); }
87+
};
88+
89+
// Per-slot bookkeeping used while the filter is being built.
typedef struct t2val {
  uint64_t t2;       // XOR of the hashes currently mapped to the slot
  uint64_t t2count;  // number of hashes currently mapped to the slot
} t2val_t;
95+
96+
// log2 of the number of uint64_t words reserved for each block of the
// temporary update buffer; slot indexes are mapped to a block by shifting
// them right by this amount.
static const int blockShift = 18;
97+
98+
void applyBlock(uint64_t* tmp, int b, int len, t2val_t * t2vals) {
99+
for (int i = 0; i < len; i += 2) {
100+
uint64_t x = tmp[(b << blockShift) + i];
101+
int index = (int) tmp[(b << blockShift) + i + 1];
102+
t2vals[index].t2count++;
103+
t2vals[index].t2 ^= x;
104+
}
105+
}
106+
107+
int applyBlock2(uint64_t* tmp, int b, int len, t2val_t * t2vals, int* alone, int alonePos) {
108+
for (int i = 0; i < len; i += 2) {
109+
uint64_t hash = tmp[(b << blockShift) + i];
110+
int index = (int) tmp[(b << blockShift) + i + 1];
111+
int oldCount = t2vals[index].t2count;
112+
if (oldCount >= 1) {
113+
int newCount = oldCount - 1;
114+
t2vals[index].t2count = newCount;
115+
if (newCount == 1) {
116+
alone[alonePos++] = index;
117+
}
118+
t2vals[index].t2 ^= hash;
119+
}
120+
}
121+
return alonePos;
122+
}
123+
124+
template <typename ItemType, typename FingerprintType,
125+
typename HashFamily>
126+
Status XorFuseFilter<ItemType, FingerprintType, HashFamily>::AddAll(
127+
const ItemType* keys, const size_t start, const size_t end) {
128+
129+
int m = arrayLength;
130+
uint64_t* reverseOrder = new uint64_t[size];
131+
uint8_t* reverseH = new uint8_t[size];
132+
size_t reverseOrderPos;
133+
int hashIndex = 0;
134+
t2val_t * t2vals = new t2val_t[m];
135+
while (true) {
136+
memset(t2vals, 0, sizeof(t2val_t[m]));
137+
int blocks = 1 + ((3 * blockLength) >> blockShift);
138+
uint64_t* tmp = new uint64_t[blocks << blockShift];
139+
int* tmpc = new int[blocks]();
140+
for(size_t i = start; i < end; i++) {
141+
uint64_t k = keys[i];
142+
uint64_t hash = (*hasher)(k);
143+
for (int hi = 0; hi < 3; hi++) {
144+
int index = getHashFromHash(hash, hi, blockLength);
145+
int b = index >> blockShift;
146+
int i2 = tmpc[b];
147+
tmp[(b << blockShift) + i2] = hash;
148+
tmp[(b << blockShift) + i2 + 1] = index;
149+
tmpc[b] += 2;
150+
if (i2 + 2 == (1 << blockShift)) {
151+
applyBlock(tmp, b, i2 + 2, t2vals);
152+
tmpc[b] = 0;
153+
}
154+
}
155+
156+
}
157+
for (int b = 0; b < blocks; b++) {
158+
applyBlock(tmp, b, tmpc[b], t2vals);
159+
}
160+
delete[] tmp;
161+
delete[] tmpc;
162+
reverseOrderPos = 0;
163+
164+
int* alone = new int[arrayLength];
165+
int alonePos = 0;
166+
for (size_t i = 0; i < arrayLength; i++) {
167+
if (t2vals[i].t2count == 1) {
168+
alone[alonePos++] = i;
169+
}
170+
}
171+
tmp = new uint64_t[blocks << blockShift];
172+
tmpc = new int[blocks]();
173+
reverseOrderPos = 0;
174+
int bestBlock = -1;
175+
while (reverseOrderPos < size) {
176+
if (alonePos == 0) {
177+
// we need to apply blocks until we have an entry that is alone
178+
// (that is, until alonePos > 0)
179+
// so, find a large block (the larger the better)
180+
// but don't need to search very long
181+
// start searching where we stopped the last time
182+
// (to make it more even)
183+
for (int i = 0, b = bestBlock + 1, best = -1; i < blocks; i++) {
184+
if (b >= blocks) {
185+
b = 0;
186+
}
187+
if (tmpc[b] > best) {
188+
best = tmpc[b];
189+
bestBlock = b;
190+
if (best > (1 << (blockShift - 1))) {
191+
// sufficiently large: stop
192+
break;
193+
}
194+
}
195+
}
196+
if (tmpc[bestBlock] > 0) {
197+
alonePos = applyBlock2(tmp, bestBlock, tmpc[bestBlock], t2vals, alone, alonePos);
198+
tmpc[bestBlock] = 0;
199+
}
200+
// applying a block may not actually result in a new entry that is alone
201+
if (alonePos == 0) {
202+
for (int b = 0; b < blocks && alonePos == 0; b++) {
203+
if (tmpc[b] > 0) {
204+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
205+
tmpc[b] = 0;
206+
}
207+
}
208+
}
209+
}
210+
if (alonePos == 0) {
211+
break;
212+
}
213+
int i = alone[--alonePos];
214+
int b = i >> blockShift;
215+
if (tmpc[b] > 0) {
216+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
217+
tmpc[b] = 0;
218+
}
219+
uint8_t found = -1;
220+
if (t2vals[i].t2count == 0) {
221+
continue;
222+
}
223+
long hash = t2vals[i].t2;
224+
for (int hi = 0; hi < 3; hi++) {
225+
int h = getHashFromHash(hash, hi, blockLength);
226+
if (h == i) {
227+
found = (uint8_t) hi;
228+
t2vals[i].t2count = 0;
229+
} else {
230+
int b = h >> blockShift;
231+
int i2 = tmpc[b];
232+
tmp[(b << blockShift) + i2] = hash;
233+
tmp[(b << blockShift) + i2 + 1] = h;
234+
tmpc[b] += 2;
235+
if (tmpc[b] >= 1 << blockShift) {
236+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
237+
tmpc[b] = 0;
238+
}
239+
}
240+
}
241+
reverseOrder[reverseOrderPos] = hash;
242+
reverseH[reverseOrderPos] = found;
243+
reverseOrderPos++;
244+
}
245+
delete[] tmp;
246+
delete[] tmpc;
247+
delete[] alone;
248+
249+
if (reverseOrderPos == size) {
250+
break;
251+
}
252+
253+
std::cout << "WARNING: hashIndex " << hashIndex << "\n";
254+
if (hashIndex >= 0) {
255+
std::cout << (end - start) << " keys; arrayLength " << arrayLength
256+
<< " blockLength " << blockLength
257+
<< " reverseOrderPos " << reverseOrderPos << "\n";
258+
}
259+
260+
hashIndex++;
261+
262+
// use a new random numbers
263+
delete hasher;
264+
hasher = new HashFamily();
265+
266+
}
267+
268+
for (int i = reverseOrderPos - 1; i >= 0; i--) {
269+
// the hash of the key we insert next
270+
uint64_t hash = reverseOrder[i];
271+
int found = reverseH[i];
272+
// which entry in the table we can change
273+
int change = -1;
274+
// we set table[change] to the fingerprint of the key,
275+
// unless the other two entries are already occupied
276+
FingerprintType xor2 = fingerprint(hash);
277+
for (int hi = 0; hi < 3; hi++) {
278+
size_t h = getHashFromHash(hash, hi, blockLength);
279+
if (found == hi) {
280+
change = h;
281+
} else {
282+
// this is different from BDZ: using xor to calculate the
283+
// fingerprint
284+
xor2 ^= fingerprints[h];
285+
}
286+
}
287+
fingerprints[change] = xor2;
288+
}
289+
delete [] t2vals;
290+
delete [] reverseOrder;
291+
delete [] reverseH;
292+
293+
return Ok;
294+
}
295+
296+
template <typename ItemType, typename FingerprintType,
297+
typename HashFamily>
298+
Status XorFuseFilter<ItemType, FingerprintType, HashFamily>::Contain(
299+
const ItemType &key) const {
300+
uint64_t hash = (*hasher)(key);
301+
FingerprintType f = fingerprint(hash);
302+
uint32_t r0 = (uint32_t) hash;
303+
uint32_t r1 = (uint32_t) rotl64(hash, 21);
304+
uint32_t r2 = (uint32_t) rotl64(hash, 42);
305+
uint32_t h0 = reduce(r0, blockLength);
306+
uint32_t h1 = reduce(r1, blockLength) + blockLength;
307+
uint32_t h2 = reduce(r2, blockLength) + 2 * blockLength;
308+
f ^= fingerprints[h0] ^ fingerprints[h1] ^ fingerprints[h2];
309+
return f == 0 ? Ok : NotFound;
310+
}
311+
312+
template <typename ItemType, typename FingerprintType,
313+
typename HashFamily>
314+
std::string XorFuseFilter<ItemType, FingerprintType, HashFamily>::Info() const {
315+
std::stringstream ss;
316+
ss << "XorFuseFilter Status:\n"
317+
<< "\t\tKeys stored: " << Size() << "\n";
318+
return ss.str();
319+
}
320+
} // namespace xorfusefilter
321+
#endif // XOR_FUSE_FILTER_XOR_FILTER_H_

0 commit comments

Comments
 (0)