Skip to content

Commit 992c083

Browse files
committed
Xor 13: using 39 of 40 bits
1 parent 1235478 commit 992c083

File tree

2 files changed

+315
-0
lines changed

2 files changed

+315
-0
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "cuckoofilter_stable.h"
2424
#include "xorfilter.h"
2525
#include "xorfilter_10bit.h"
26+
#include "xorfilter_13bit.h"
2627
#include "xorfilter_10_666bit.h"
2728
#include "xorfilter_2.h"
2829
#include "xorfilter_2n.h"
@@ -279,6 +280,23 @@ struct FilterAPI<XorFilter10<ItemType, HashFamily>> {
279280
}
280281
};
281282

283+
template <typename ItemType, typename HashFamily>
284+
struct FilterAPI<XorFilter13<ItemType, HashFamily>> {
285+
using Table = XorFilter13<ItemType, HashFamily>;
286+
static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
287+
static void Add(uint64_t key, Table* table) {
288+
throw std::runtime_error("Unsupported");
289+
}
290+
static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
291+
table->AddAll(keys, start, end);
292+
}
293+
294+
CONTAIN_ATTRIBUTES
295+
static bool Contain(uint64_t key, const Table * table) {
296+
return (0 == table->Contain(key));
297+
}
298+
};
299+
282300
template <typename ItemType, typename HashFamily>
283301
struct FilterAPI<XorFilter10_666<ItemType, HashFamily>> {
284302
using Table = XorFilter10_666<ItemType, HashFamily>;
@@ -965,6 +983,12 @@ int main(int argc, char * argv[]) {
965983
cout << setw(NAME_WIDTH) << "Xor10.666" << cf << endl;
966984
}
967985

986+
if (algorithmId == 27 || (algos.find(27) != algos.end())) {
987+
auto cf = FilterBenchmark<
988+
XorFilter13<uint64_t, SimpleMixSplit>>(
989+
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
990+
cout << setw(NAME_WIDTH) << "Xor13" << cf << endl;
991+
}
968992

969993
if (algorithmId == 37 || algorithmId < 0 || (algos.find(37) != algos.end())) {
970994
auto cf = FilterBenchmark<

src/xorfilter_13bit.h

Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
#ifndef XOR_FILTER13_XOR_FILTER_H_
2+
#define XOR_FILTER13_XOR_FILTER_H_
3+
4+
#include <assert.h>
5+
#include <algorithm>
6+
#include <xorfilter_10bit.h>
7+
#include "hashutil.h"
8+
9+
using namespace std;
10+
using namespace hashing;
11+
12+
namespace xorfilter {
13+
14+
template <typename ItemType, typename HashFamily = TwoIndependentMultiplyShift>
15+
class XorFilter13 {
16+
17+
size_t size;
18+
size_t arrayLength;
19+
size_t blockLength;
20+
size_t byteCount;
21+
uint8_t *fingerprints;
22+
23+
HashFamily* hasher;
24+
25+
inline uint32_t fingerprint(const uint64_t hash) const {
26+
return (uint32_t) (hash ^ (hash >> 32));
27+
}
28+
29+
public:
30+
explicit XorFilter13(const size_t size) {
31+
hasher = new HashFamily();
32+
this->size = size;
33+
this->arrayLength = 32 + 1.23 * size;
34+
this->blockLength = arrayLength / 3;
35+
byteCount = blockLength * 5 + 4;
36+
fingerprints = new uint8_t[byteCount]();
37+
}
38+
39+
~XorFilter13() {
40+
delete[] fingerprints;
41+
delete hasher;
42+
}
43+
44+
Status AddAll(const vector<ItemType> data, const size_t start, const size_t end);
45+
46+
// Report if the item is inserted, with false positive rate.
47+
Status Contain(const ItemType &item) const;
48+
49+
// number of current inserted items;
50+
size_t Size() const { return size; }
51+
52+
// size of the filter in bytes.
53+
size_t SizeInBytes() const { return byteCount; }
54+
};
55+
56+
// Number of bytes transferred by each memcpy between the fingerprint array
// and a 64-bit scratch word. A slot is only 5 bytes wide, so an 8-byte access
// at offset 5 * slot also touches the first 3 bytes of the following slot;
// the constructor allocates 4 spare bytes at the end of the array so that the
// access to the last slot stays inside the allocation.
#define COPY sizeof(uint64_t)
57+
// Alternative kept for reference: transfer exactly the 5 bytes of one slot.
//#define COPY 5
58+
59+
template <typename ItemType, typename HashFamily>
60+
Status XorFilter13<ItemType, HashFamily>::AddAll(
61+
const vector<ItemType> keys, const size_t start, const size_t end) {
62+
63+
int m = arrayLength;
64+
uint64_t* reverseOrder = new uint64_t[size];
65+
uint8_t* reverseH = new uint8_t[size];
66+
size_t reverseOrderPos;
67+
int hashIndex = 0;
68+
t2val_t * t2vals = new t2val_t[m];
69+
while (true) {
70+
memset(t2vals, 0, sizeof(t2val_t[m]));
71+
int blocks = 1 + ((3 * blockLength) >> blockShift);
72+
uint64_t* tmp = new uint64_t[blocks << blockShift];
73+
int* tmpc = new int[blocks]();
74+
for(size_t i = start; i < end; i++) {
75+
uint64_t k = keys[i];
76+
uint64_t hash = (*hasher)(k);
77+
for (int hi = 0; hi < 3; hi++) {
78+
int index = getHashFromHash(hash, hi, blockLength);
79+
int b = index >> blockShift;
80+
int i2 = tmpc[b];
81+
tmp[(b << blockShift) + i2] = hash;
82+
tmp[(b << blockShift) + i2 + 1] = index;
83+
tmpc[b] += 2;
84+
if (i2 + 2 == (1 << blockShift)) {
85+
applyBlock(tmp, b, i2 + 2, t2vals);
86+
tmpc[b] = 0;
87+
}
88+
}
89+
90+
}
91+
for (int b = 0; b < blocks; b++) {
92+
applyBlock(tmp, b, tmpc[b], t2vals);
93+
}
94+
delete[] tmp;
95+
delete[] tmpc;
96+
reverseOrderPos = 0;
97+
98+
int* alone = new int[arrayLength];
99+
int alonePos = 0;
100+
for (size_t i = 0; i < arrayLength; i++) {
101+
if (t2vals[i].t2count == 1) {
102+
alone[alonePos++] = i;
103+
}
104+
}
105+
tmp = new uint64_t[blocks << blockShift];
106+
tmpc = new int[blocks]();
107+
reverseOrderPos = 0;
108+
int bestBlock = -1;
109+
while (reverseOrderPos < size) {
110+
if (alonePos == 0) {
111+
// we need to apply blocks until we have an entry that is alone
112+
// (that is, until alonePos > 0)
113+
// so, find a large block (the larger the better)
114+
// but don't need to search very long
115+
// start searching where we stopped the last time
116+
// (to make it more even)
117+
for (int i = 0, b = bestBlock + 1, best = -1; i < blocks; i++) {
118+
if (b >= blocks) {
119+
b = 0;
120+
}
121+
if (tmpc[b] > best) {
122+
best = tmpc[b];
123+
bestBlock = b;
124+
if (best > (1 << (blockShift - 1))) {
125+
// sufficiently large: stop
126+
break;
127+
}
128+
}
129+
}
130+
if (tmpc[bestBlock] > 0) {
131+
alonePos = applyBlock2(tmp, bestBlock, tmpc[bestBlock], t2vals, alone, alonePos);
132+
tmpc[bestBlock] = 0;
133+
}
134+
// applying a block may not actually result in a new entry that is alone
135+
if (alonePos == 0) {
136+
for (int b = 0; b < blocks && alonePos == 0; b++) {
137+
if (tmpc[b] > 0) {
138+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
139+
tmpc[b] = 0;
140+
}
141+
}
142+
}
143+
}
144+
if (alonePos == 0) {
145+
break;
146+
}
147+
int i = alone[--alonePos];
148+
int b = i >> blockShift;
149+
if (tmpc[b] > 0) {
150+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
151+
tmpc[b] = 0;
152+
}
153+
uint8_t found = -1;
154+
if (t2vals[i].t2count == 0) {
155+
continue;
156+
}
157+
long hash = t2vals[i].t2;
158+
for (int hi = 0; hi < 3; hi++) {
159+
int h = getHashFromHash(hash, hi, blockLength);
160+
if (h == i) {
161+
found = (uint8_t) hi;
162+
t2vals[i].t2count = 0;
163+
} else {
164+
int b = h >> blockShift;
165+
int i2 = tmpc[b];
166+
tmp[(b << blockShift) + i2] = hash;
167+
tmp[(b << blockShift) + i2 + 1] = h;
168+
tmpc[b] += 2;
169+
if (tmpc[b] >= 1 << blockShift) {
170+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
171+
tmpc[b] = 0;
172+
}
173+
}
174+
}
175+
reverseOrder[reverseOrderPos] = hash;
176+
reverseH[reverseOrderPos] = found;
177+
reverseOrderPos++;
178+
}
179+
delete[] tmp;
180+
delete[] tmpc;
181+
delete[] alone;
182+
183+
/*
184+
185+
int* alone = new int[arrayLength];
186+
int alonePos = 0;
187+
for (size_t i = 0; i < arrayLength; i++) {
188+
if (t2vals[i].t2count == 1) {
189+
alone[alonePos++] = i;
190+
}
191+
}
192+
reverseOrderPos = 0;
193+
while (alonePos > 0 && reverseOrderPos < size) {
194+
int i = alone[--alonePos];
195+
if (t2vals[i].t2count == 0) {
196+
continue;
197+
}
198+
long hash = t2vals[i].t2;
199+
uint8_t found = -1;
200+
for (int hi = 0; hi < 3; hi++) {
201+
int h = getHashFromHash(hash, hi, blockLength);
202+
int newCount = --t2vals[h].t2count;
203+
if (newCount == 0) {
204+
found = (uint8_t) hi;
205+
} else {
206+
if (newCount == 1) {
207+
alone[alonePos++] = h;
208+
}
209+
t2vals[h].t2 ^= hash;
210+
}
211+
}
212+
reverseOrder[reverseOrderPos] = hash;
213+
reverseH[reverseOrderPos] = found;
214+
reverseOrderPos++;
215+
}
216+
delete [] alone;
217+
*/
218+
219+
if (reverseOrderPos == size) {
220+
break;
221+
}
222+
223+
std::cout << "WARNING: hashIndex " << hashIndex << "\n";
224+
if (hashIndex >= 0) {
225+
std::cout << (end - start) << " keys; arrayLength " << arrayLength
226+
<< " blockLength " << blockLength
227+
<< " reverseOrderPos " << reverseOrderPos << "\n";
228+
}
229+
230+
hashIndex++;
231+
232+
// use a new random numbers
233+
delete hasher;
234+
hasher = new HashFamily();
235+
236+
}
237+
238+
for (int i = reverseOrderPos - 1; i >= 0; i--) {
239+
// the hash of the key we insert next
240+
uint64_t hash = reverseOrder[i];
241+
int found = reverseH[i];
242+
// which entry in the table we can change
243+
int change = -1;
244+
// we set table[change] to the fingerprint of the key,
245+
// unless the other two entries are already occupied
246+
uint64_t xor2 = fingerprint(hash);
247+
for (int hi = 0; hi < 3; hi++) {
248+
size_t h = getHashFromHash10(hash, hi, blockLength);
249+
if (found == hi) {
250+
change = h;
251+
} else {
252+
// this is different from BDZ: using xor to calculate the
253+
// fingerprint
254+
uint64_t x;
255+
memcpy(&x, fingerprints + 5 * h, COPY);
256+
xor2 ^= x >> (0 + 13 * hi);
257+
}
258+
}
259+
uint64_t x = 0;
260+
memcpy(&x, fingerprints + 5 * change, COPY);
261+
x |= (xor2 & 0x1fff) << (0 + 13 * found);
262+
memcpy(fingerprints + 5 * change, &x, COPY);
263+
}
264+
delete [] t2vals;
265+
delete [] reverseOrder;
266+
delete [] reverseH;
267+
268+
return Ok;
269+
}
270+
271+
template <typename ItemType, typename HashFamily>
272+
Status XorFilter13<ItemType, HashFamily>::Contain(
273+
const ItemType &key) const {
274+
uint64_t hash = (*hasher)(key);
275+
uint64_t f = fingerprint(hash);
276+
uint32_t r0 = (uint32_t) hash;
277+
uint32_t r1 = (uint32_t) rotl64(hash, 21);
278+
uint32_t r2 = (uint32_t) rotl64(hash, 42);
279+
uint32_t h0 = reduce(r0, blockLength);
280+
uint32_t h1 = reduce(r1, blockLength);
281+
uint32_t h2 = reduce(r2, blockLength);
282+
uint64_t x0, x1, x2;
283+
memcpy(&x0, fingerprints + 5 * h0, COPY);
284+
memcpy(&x1, fingerprints + 5 * h1, COPY);
285+
memcpy(&x2, fingerprints + 5 * h2, COPY);
286+
f ^= x0 ^ (x1 >> 13) ^ (x2 >> 26);
287+
return (f & 0x1fff) == 0 ? Ok : NotFound;
288+
}
289+
290+
} // namespace xorfilter
291+
#endif // XOR_FILTER13_XOR_FILTER_H_

0 commit comments

Comments
 (0)