Skip to content

Commit bc582a5

Browse files
committed
New Xor10 variant (fast, but needs more space as only 30 of 32 bits are used)
1 parent 6200806 commit bc582a5

File tree

2 files changed

+308
-1
lines changed

2 files changed

+308
-1
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "cuckoofilter.h"
2323
#include "cuckoofilter_stable.h"
2424
#include "xorfilter.h"
25+
#include "xorfilter_10bit.h"
2526
#include "xorfilter_2.h"
2627
#include "xorfilter_2n.h"
2728
#include "xorfilter_plus.h"
@@ -260,6 +261,23 @@ struct FilterAPI<XorFilter2<ItemType, FingerprintType, FingerprintStorageType, H
260261
}
261262
};
262263

264+
template <typename ItemType, typename HashFamily>
265+
struct FilterAPI<XorFilter10<ItemType, HashFamily>> {
266+
using Table = XorFilter10<ItemType, HashFamily>;
267+
static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
268+
static void Add(uint64_t key, Table* table) {
269+
throw std::runtime_error("Unsupported");
270+
}
271+
static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
272+
table->AddAll(keys, start, end);
273+
}
274+
275+
CONTAIN_ATTRIBUTES
276+
static bool Contain(uint64_t key, const Table * table) {
277+
return (0 == table->Contain(key));
278+
}
279+
};
280+
263281
template <typename ItemType, typename FingerprintType, typename FingerprintStorageType, typename HashFamily>
264282
struct FilterAPI<XorFilter2n<ItemType, FingerprintType, FingerprintStorageType, HashFamily>> {
265283
using Table = XorFilter2n<ItemType, FingerprintType, FingerprintStorageType, HashFamily>;
@@ -560,7 +578,7 @@ int main(int argc, char * argv[]) {
560578
{5,"Cuckoo16"}, {6,"CuckooSemiSort13" }, {7,"Bloom8"},
561579
{8,"Bloom12" }, {9,"Bloom16"}, {10,"BlockedBloom"},
562580
{11,"sort"}, {12,"Xor+8"}, {13,"Xor+16"},
563-
{14,"GCS"}, {15,"CQF"}, {37,"Bloom8 (addall)"},
581+
{14,"GCS"}, {15,"CQF"}, {25, "Xor10"}, {37,"Bloom8 (addall)"},
564582
{38,"Bloom12 (addall)"},{39,"Bloom16 (addall)"},
565583
{40,"BlockedBloom (addall)"}
566584
};
@@ -877,6 +895,13 @@ int main(int argc, char * argv[]) {
877895
cout << setw(NAME_WIDTH) << "Xor10.x" << cf << endl;
878896
}
879897

898+
if (algorithmId == 25 || (algos.find(25) != algos.end())) {
899+
auto cf = FilterBenchmark<
900+
XorFilter10<uint64_t, SimpleMixSplit>>(
901+
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
902+
cout << setw(NAME_WIDTH) << "Xor10" << cf << endl;
903+
}
904+
880905

881906
if (algorithmId == 37 || algorithmId < 0 || (algos.find(37) != algos.end())) {
882907
auto cf = FilterBenchmark<

src/xorfilter_10bit.h

Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
#ifndef XOR_FILTER10_XOR_FILTER_H_
2+
#define XOR_FILTER10_XOR_FILTER_H_
3+
4+
#include <assert.h>
5+
#include <algorithm>
6+
#include <xorfilter.h>
7+
#include "hashutil.h"
8+
9+
using namespace std;
10+
using namespace hashing;
11+
12+
namespace xorfilter {
13+
14+
size_t getHashFromHash10(uint64_t hash, int index, int blockLength) {
15+
uint32_t r = rotl64(hash, index * 21);
16+
return (size_t) reduce(r, blockLength);
17+
}
18+
19+
template <typename ItemType, typename HashFamily = TwoIndependentMultiplyShift>
20+
class XorFilter10 {
21+
22+
size_t size;
23+
size_t arrayLength;
24+
size_t blockLength;
25+
uint32_t *fingerprints;
26+
27+
HashFamily* hasher;
28+
29+
inline uint32_t fingerprint(const uint64_t hash) const {
30+
return (uint32_t) (hash ^ (hash >> 32));
31+
}
32+
33+
public:
34+
explicit XorFilter10(const size_t size) {
35+
hasher = new HashFamily();
36+
this->size = size;
37+
this->arrayLength = 32 + 1.23 * size;
38+
this->blockLength = arrayLength / 3;
39+
fingerprints = new uint32_t[blockLength]();
40+
}
41+
42+
~XorFilter10() {
43+
delete[] fingerprints;
44+
delete hasher;
45+
}
46+
47+
Status AddAll(const vector<ItemType> data, const size_t start, const size_t end);
48+
49+
// Report if the item is inserted, with false positive rate.
50+
Status Contain(const ItemType &item) const;
51+
52+
// number of current inserted items;
53+
size_t Size() const { return size; }
54+
55+
// size of the filter in bytes.
56+
size_t SizeInBytes() const { return blockLength * sizeof(uint32_t); }
57+
};
58+
59+
template <typename ItemType, typename HashFamily>
60+
Status XorFilter10<ItemType, HashFamily>::AddAll(
61+
const vector<ItemType> keys, const size_t start, const size_t end) {
62+
63+
int m = arrayLength;
64+
uint64_t* reverseOrder = new uint64_t[size];
65+
uint8_t* reverseH = new uint8_t[size];
66+
size_t reverseOrderPos;
67+
int hashIndex = 0;
68+
t2val_t * t2vals = new t2val_t[m];
69+
while (true) {
70+
memset(t2vals, 0, sizeof(t2val_t[m]));
71+
int blocks = 1 + ((3 * blockLength) >> blockShift);
72+
uint64_t* tmp = new uint64_t[blocks << blockShift];
73+
int* tmpc = new int[blocks]();
74+
for(size_t i = start; i < end; i++) {
75+
uint64_t k = keys[i];
76+
uint64_t hash = (*hasher)(k);
77+
for (int hi = 0; hi < 3; hi++) {
78+
int index = getHashFromHash(hash, hi, blockLength);
79+
int b = index >> blockShift;
80+
int i2 = tmpc[b];
81+
tmp[(b << blockShift) + i2] = hash;
82+
tmp[(b << blockShift) + i2 + 1] = index;
83+
tmpc[b] += 2;
84+
if (i2 + 2 == (1 << blockShift)) {
85+
applyBlock(tmp, b, i2 + 2, t2vals);
86+
tmpc[b] = 0;
87+
}
88+
}
89+
90+
}
91+
for (int b = 0; b < blocks; b++) {
92+
applyBlock(tmp, b, tmpc[b], t2vals);
93+
}
94+
delete[] tmp;
95+
delete[] tmpc;
96+
reverseOrderPos = 0;
97+
98+
int* alone = new int[arrayLength];
99+
int alonePos = 0;
100+
for (size_t i = 0; i < arrayLength; i++) {
101+
if (t2vals[i].t2count == 1) {
102+
alone[alonePos++] = i;
103+
}
104+
}
105+
tmp = new uint64_t[blocks << blockShift];
106+
tmpc = new int[blocks]();
107+
reverseOrderPos = 0;
108+
int bestBlock = -1;
109+
while (reverseOrderPos < size) {
110+
if (alonePos == 0) {
111+
// we need to apply blocks until we have an entry that is alone
112+
// (that is, until alonePos > 0)
113+
// so, find a large block (the larger the better)
114+
// but don't need to search very long
115+
// start searching where we stopped the last time
116+
// (to make it more even)
117+
for (int i = 0, b = bestBlock + 1, best = -1; i < blocks; i++) {
118+
if (b >= blocks) {
119+
b = 0;
120+
}
121+
if (tmpc[b] > best) {
122+
best = tmpc[b];
123+
bestBlock = b;
124+
if (best > (1 << (blockShift - 1))) {
125+
// sufficiently large: stop
126+
break;
127+
}
128+
}
129+
}
130+
if (tmpc[bestBlock] > 0) {
131+
alonePos = applyBlock2(tmp, bestBlock, tmpc[bestBlock], t2vals, alone, alonePos);
132+
tmpc[bestBlock] = 0;
133+
}
134+
// applying a block may not actually result in a new entry that is alone
135+
if (alonePos == 0) {
136+
for (int b = 0; b < blocks && alonePos == 0; b++) {
137+
if (tmpc[b] > 0) {
138+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
139+
tmpc[b] = 0;
140+
}
141+
}
142+
}
143+
}
144+
if (alonePos == 0) {
145+
break;
146+
}
147+
int i = alone[--alonePos];
148+
int b = i >> blockShift;
149+
if (tmpc[b] > 0) {
150+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
151+
tmpc[b] = 0;
152+
}
153+
uint8_t found = -1;
154+
if (t2vals[i].t2count == 0) {
155+
continue;
156+
}
157+
long hash = t2vals[i].t2;
158+
for (int hi = 0; hi < 3; hi++) {
159+
int h = getHashFromHash(hash, hi, blockLength);
160+
if (h == i) {
161+
found = (uint8_t) hi;
162+
t2vals[i].t2count = 0;
163+
} else {
164+
int b = h >> blockShift;
165+
int i2 = tmpc[b];
166+
tmp[(b << blockShift) + i2] = hash;
167+
tmp[(b << blockShift) + i2 + 1] = h;
168+
tmpc[b] += 2;
169+
if (tmpc[b] >= 1 << blockShift) {
170+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
171+
tmpc[b] = 0;
172+
}
173+
}
174+
}
175+
reverseOrder[reverseOrderPos] = hash;
176+
reverseH[reverseOrderPos] = found;
177+
reverseOrderPos++;
178+
}
179+
delete[] tmp;
180+
delete[] tmpc;
181+
delete[] alone;
182+
183+
/*
184+
185+
int* alone = new int[arrayLength];
186+
int alonePos = 0;
187+
for (size_t i = 0; i < arrayLength; i++) {
188+
if (t2vals[i].t2count == 1) {
189+
alone[alonePos++] = i;
190+
}
191+
}
192+
reverseOrderPos = 0;
193+
while (alonePos > 0 && reverseOrderPos < size) {
194+
int i = alone[--alonePos];
195+
if (t2vals[i].t2count == 0) {
196+
continue;
197+
}
198+
long hash = t2vals[i].t2;
199+
uint8_t found = -1;
200+
for (int hi = 0; hi < 3; hi++) {
201+
int h = getHashFromHash(hash, hi, blockLength);
202+
int newCount = --t2vals[h].t2count;
203+
if (newCount == 0) {
204+
found = (uint8_t) hi;
205+
} else {
206+
if (newCount == 1) {
207+
alone[alonePos++] = h;
208+
}
209+
t2vals[h].t2 ^= hash;
210+
}
211+
}
212+
reverseOrder[reverseOrderPos] = hash;
213+
reverseH[reverseOrderPos] = found;
214+
reverseOrderPos++;
215+
}
216+
delete [] alone;
217+
*/
218+
219+
if (reverseOrderPos == size) {
220+
break;
221+
}
222+
223+
std::cout << "WARNING: hashIndex " << hashIndex << "\n";
224+
if (hashIndex >= 0) {
225+
std::cout << (end - start) << " keys; arrayLength " << arrayLength
226+
<< " blockLength " << blockLength
227+
<< " reverseOrderPos " << reverseOrderPos << "\n";
228+
}
229+
230+
hashIndex++;
231+
232+
// use a new random numbers
233+
delete hasher;
234+
hasher = new HashFamily();
235+
236+
}
237+
238+
for (int i = reverseOrderPos - 1; i >= 0; i--) {
239+
// the hash of the key we insert next
240+
uint64_t hash = reverseOrder[i];
241+
int found = reverseH[i];
242+
// which entry in the table we can change
243+
int change = -1;
244+
// we set table[change] to the fingerprint of the key,
245+
// unless the other two entries are already occupied
246+
uint32_t xor2 = fingerprint(hash);
247+
for (int hi = 0; hi < 3; hi++) {
248+
size_t h = getHashFromHash10(hash, hi, blockLength);
249+
if (found == hi) {
250+
change = h;
251+
} else {
252+
// this is different from BDZ: using xor to calculate the
253+
// fingerprint
254+
xor2 ^= fingerprints[h] >> (10 * hi);
255+
}
256+
}
257+
fingerprints[change] |= (xor2 & 0x3ff) << (10 * found);
258+
}
259+
delete [] t2vals;
260+
delete [] reverseOrder;
261+
delete [] reverseH;
262+
263+
return Ok;
264+
}
265+
266+
template <typename ItemType, typename HashFamily>
267+
Status XorFilter10<ItemType, HashFamily>::Contain(
268+
const ItemType &key) const {
269+
uint64_t hash = (*hasher)(key);
270+
uint32_t f = fingerprint(hash);
271+
uint32_t r0 = (uint32_t) hash;
272+
uint32_t r1 = (uint32_t) rotl64(hash, 21);
273+
uint32_t r2 = (uint32_t) rotl64(hash, 42);
274+
uint32_t h0 = reduce(r0, blockLength);
275+
uint32_t h1 = reduce(r1, blockLength);
276+
uint32_t h2 = reduce(r2, blockLength);
277+
f ^= fingerprints[h0] ^ (fingerprints[h1] >> 10) ^ (fingerprints[h2] >> 20);
278+
return (f & 0x3ff) == 0 ? Ok : NotFound;
279+
}
280+
281+
} // namespace xorfilter
282+
#endif // XOR_FILTER10_XOR_FILTER_H_

0 commit comments

Comments
 (0)