Skip to content

Commit 05cd1ec

Browse files
committed
Xor 10.666: new variant, a bit slow
1 parent c29e59e commit 05cd1ec

File tree

2 files changed

+368
-0
lines changed

2 files changed

+368
-0
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "cuckoofilter_stable.h"
2424
#include "xorfilter.h"
2525
#include "xorfilter_10bit.h"
26+
#include "xorfilter_10_666bit.h"
2627
#include "xorfilter_2.h"
2728
#include "xorfilter_2n.h"
2829
#include "xorfilter_plus.h"
@@ -278,6 +279,23 @@ struct FilterAPI<XorFilter10<ItemType, HashFamily>> {
278279
}
279280
};
280281

282+
template <typename ItemType, typename HashFamily>
283+
struct FilterAPI<XorFilter10_666<ItemType, HashFamily>> {
284+
using Table = XorFilter10_666<ItemType, HashFamily>;
285+
static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
286+
static void Add(uint64_t key, Table* table) {
287+
throw std::runtime_error("Unsupported");
288+
}
289+
static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
290+
table->AddAll(keys, start, end);
291+
}
292+
293+
CONTAIN_ATTRIBUTES
294+
static bool Contain(uint64_t key, const Table * table) {
295+
return (0 == table->Contain(key));
296+
}
297+
};
298+
281299
template <typename ItemType, typename FingerprintType, typename FingerprintStorageType, typename HashFamily>
282300
struct FilterAPI<XorFilter2n<ItemType, FingerprintType, FingerprintStorageType, HashFamily>> {
283301
using Table = XorFilter2n<ItemType, FingerprintType, FingerprintStorageType, HashFamily>;
@@ -572,6 +590,44 @@ void parse_comma_separated(char * c, std::set<int> & answer ) {
572590
}
573591
}
574592

593+
/*
594+
#define MUL 1625L
595+
#define MUL2 (MUL*MUL)
596+
// (1<<64) / MUL
597+
#define INVMUL 11351842506898185L
598+
// (1<<64) / MUL2
599+
#define INVMUL2 6985749235014L
600+
int main() {
601+
printf("start\n");
602+
for (int a = 0; a < MUL; a++) {
603+
for (int b = 0; b < MUL; b++) {
604+
for (int c = 0; c < MUL; c++) {
605+
uint32_t x = a * MUL2 + b * MUL + c;
606+
int aa = (int) (((__uint128_t) x * (INVMUL2 + 1)) >> 64);
607+
if (aa != a) {
608+
printf("wrong a");
609+
return -1;
610+
}
611+
int bb = (int) (((__uint128_t) x * (INVMUL + 1)) >> 64);
612+
int rb = bb % MUL;
613+
if (rb != b) {
614+
printf("wrong b");
615+
return -1;
616+
}
617+
int expected = (a + b + c) % MUL;
618+
int got = (aa + bb + x) % MUL;
619+
if (expected != got) {
620+
printf("wrong modulo");
621+
return -1;
622+
}
623+
}
624+
}
625+
}
626+
printf("end\n");
627+
return 0;
628+
}
629+
*/
630+
575631
int main(int argc, char * argv[]) {
576632
std::map<int,std::string> names = {{0,"Xor8"},{1,"Xor12"},
577633
{2,"Xor16"}, {3,"Cuckoo8"}, {4,"Cuckoo12"},
@@ -902,6 +958,13 @@ int main(int argc, char * argv[]) {
902958
cout << setw(NAME_WIDTH) << "Xor10" << cf << endl;
903959
}
904960

961+
if (algorithmId == 26 || (algos.find(26) != algos.end())) {
962+
auto cf = FilterBenchmark<
963+
XorFilter10_666<uint64_t, SimpleMixSplit>>(
964+
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
965+
cout << setw(NAME_WIDTH) << "Xor10.666" << cf << endl;
966+
}
967+
905968

906969
if (algorithmId == 37 || algorithmId < 0 || (algos.find(37) != algos.end())) {
907970
auto cf = FilterBenchmark<

src/xorfilter_10_666bit.h

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
#ifndef XOR_FILTER10_666_XOR_FILTER_H_
2+
#define XOR_FILTER10_666_XOR_FILTER_H_
3+
4+
#include <assert.h>
5+
#include <algorithm>
6+
#include <xorfilter.h>
7+
#include <xorfilter_10bit.h>
8+
#include "hashutil.h"
9+
10+
using namespace std;
11+
using namespace hashing;
12+
13+
namespace xorfilter {
14+
15+
// Radix used to pack three fingerprints into one 32-bit word: each
// fingerprint is a digit in [0, fingerMul). (A power-of-two radix of 1024
// was the alternative tried here.)
constexpr uint32_t fingerMul = 1625;
// fingerMul squared: weight of the third packed digit.
constexpr uint32_t fingerMul2 = fingerMul * fingerMul;
// floor(2^64 / fingerMul), used to divide by multiplication and a shift.
constexpr __uint128_t invFingerMul = 11351842506898185ULL;
// floor(2^64 / fingerMul2), used to divide by multiplication and a shift.
constexpr __uint128_t invFingerMul2 = 6985749235014ULL;
22+
23+
template <typename ItemType, typename HashFamily = TwoIndependentMultiplyShift>
24+
class XorFilter10_666 {
25+
26+
size_t size;
27+
size_t arrayLength;
28+
size_t blockLength;
29+
uint32_t *fingerprints;
30+
31+
HashFamily* hasher;
32+
33+
inline uint32_t fingerprint(const uint64_t hash) const {
34+
return (uint32_t) reduce(hash ^ (hash >> 32), fingerMul);
35+
}
36+
37+
public:
38+
explicit XorFilter10_666(const size_t size) {
39+
hasher = new HashFamily();
40+
this->size = size;
41+
this->arrayLength = 32 + 1.23 * size;
42+
this->blockLength = arrayLength / 3;
43+
fingerprints = new uint32_t[blockLength]();
44+
}
45+
46+
~XorFilter10_666() {
47+
delete[] fingerprints;
48+
delete hasher;
49+
}
50+
51+
Status AddAll(const vector<ItemType> data, const size_t start, const size_t end);
52+
53+
// Report if the item is inserted, with false positive rate.
54+
Status Contain(const ItemType &item) const;
55+
56+
// number of current inserted items;
57+
size_t Size() const { return size; }
58+
59+
// size of the filter in bytes.
60+
size_t SizeInBytes() const { return blockLength * sizeof(uint32_t); }
61+
};
62+
63+
int getFinger(uint32_t x, int i) {
64+
if (i == 0) {
65+
return x;
66+
}
67+
if (i == 1) {
68+
// return x >> 10;
69+
return (int) (((__uint128_t) x * (invFingerMul + 1)) >> 64);
70+
}
71+
if (i == 2) {
72+
// return x >> 20;
73+
return (int) (((__uint128_t) x * (invFingerMul2 + 1)) >> 64);
74+
}
75+
exit(0);
76+
}
77+
78+
template <typename ItemType, typename HashFamily>
79+
Status XorFilter10_666<ItemType, HashFamily>::AddAll(
80+
const vector<ItemType> keys, const size_t start, const size_t end) {
81+
82+
int m = arrayLength;
83+
uint64_t* reverseOrder = new uint64_t[size];
84+
uint8_t* reverseH = new uint8_t[size];
85+
size_t reverseOrderPos;
86+
int hashIndex = 0;
87+
t2val_t * t2vals = new t2val_t[m];
88+
while (true) {
89+
memset(t2vals, 0, sizeof(t2val_t[m]));
90+
int blocks = 1 + ((3 * blockLength) >> blockShift);
91+
uint64_t* tmp = new uint64_t[blocks << blockShift];
92+
int* tmpc = new int[blocks]();
93+
for(size_t i = start; i < end; i++) {
94+
uint64_t k = keys[i];
95+
uint64_t hash = (*hasher)(k);
96+
for (int hi = 0; hi < 3; hi++) {
97+
int index = getHashFromHash(hash, hi, blockLength);
98+
int b = index >> blockShift;
99+
int i2 = tmpc[b];
100+
tmp[(b << blockShift) + i2] = hash;
101+
tmp[(b << blockShift) + i2 + 1] = index;
102+
tmpc[b] += 2;
103+
if (i2 + 2 == (1 << blockShift)) {
104+
applyBlock(tmp, b, i2 + 2, t2vals);
105+
tmpc[b] = 0;
106+
}
107+
}
108+
109+
}
110+
for (int b = 0; b < blocks; b++) {
111+
applyBlock(tmp, b, tmpc[b], t2vals);
112+
}
113+
delete[] tmp;
114+
delete[] tmpc;
115+
reverseOrderPos = 0;
116+
117+
int* alone = new int[arrayLength];
118+
int alonePos = 0;
119+
for (size_t i = 0; i < arrayLength; i++) {
120+
if (t2vals[i].t2count == 1) {
121+
alone[alonePos++] = i;
122+
}
123+
}
124+
tmp = new uint64_t[blocks << blockShift];
125+
tmpc = new int[blocks]();
126+
reverseOrderPos = 0;
127+
int bestBlock = -1;
128+
while (reverseOrderPos < size) {
129+
if (alonePos == 0) {
130+
// we need to apply blocks until we have an entry that is alone
131+
// (that is, until alonePos > 0)
132+
// so, find a large block (the larger the better)
133+
// but don't need to search very long
134+
// start searching where we stopped the last time
135+
// (to make it more even)
136+
for (int i = 0, b = bestBlock + 1, best = -1; i < blocks; i++) {
137+
if (b >= blocks) {
138+
b = 0;
139+
}
140+
if (tmpc[b] > best) {
141+
best = tmpc[b];
142+
bestBlock = b;
143+
if (best > (1 << (blockShift - 1))) {
144+
// sufficiently large: stop
145+
break;
146+
}
147+
}
148+
}
149+
if (tmpc[bestBlock] > 0) {
150+
alonePos = applyBlock2(tmp, bestBlock, tmpc[bestBlock], t2vals, alone, alonePos);
151+
tmpc[bestBlock] = 0;
152+
}
153+
// applying a block may not actually result in a new entry that is alone
154+
if (alonePos == 0) {
155+
for (int b = 0; b < blocks && alonePos == 0; b++) {
156+
if (tmpc[b] > 0) {
157+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
158+
tmpc[b] = 0;
159+
}
160+
}
161+
}
162+
}
163+
if (alonePos == 0) {
164+
break;
165+
}
166+
int i = alone[--alonePos];
167+
int b = i >> blockShift;
168+
if (tmpc[b] > 0) {
169+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
170+
tmpc[b] = 0;
171+
}
172+
uint8_t found = -1;
173+
if (t2vals[i].t2count == 0) {
174+
continue;
175+
}
176+
long hash = t2vals[i].t2;
177+
for (int hi = 0; hi < 3; hi++) {
178+
int h = getHashFromHash(hash, hi, blockLength);
179+
if (h == i) {
180+
found = (uint8_t) hi;
181+
t2vals[i].t2count = 0;
182+
} else {
183+
int b = h >> blockShift;
184+
int i2 = tmpc[b];
185+
tmp[(b << blockShift) + i2] = hash;
186+
tmp[(b << blockShift) + i2 + 1] = h;
187+
tmpc[b] += 2;
188+
if (tmpc[b] >= 1 << blockShift) {
189+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
190+
tmpc[b] = 0;
191+
}
192+
}
193+
}
194+
reverseOrder[reverseOrderPos] = hash;
195+
reverseH[reverseOrderPos] = found;
196+
reverseOrderPos++;
197+
}
198+
delete[] tmp;
199+
delete[] tmpc;
200+
delete[] alone;
201+
202+
/*
203+
204+
int* alone = new int[arrayLength];
205+
int alonePos = 0;
206+
for (size_t i = 0; i < arrayLength; i++) {
207+
if (t2vals[i].t2count == 1) {
208+
alone[alonePos++] = i;
209+
}
210+
}
211+
reverseOrderPos = 0;
212+
while (alonePos > 0 && reverseOrderPos < size) {
213+
int i = alone[--alonePos];
214+
if (t2vals[i].t2count == 0) {
215+
continue;
216+
}
217+
long hash = t2vals[i].t2;
218+
uint8_t found = -1;
219+
for (int hi = 0; hi < 3; hi++) {
220+
int h = getHashFromHash(hash, hi, blockLength);
221+
int newCount = --t2vals[h].t2count;
222+
if (newCount == 0) {
223+
found = (uint8_t) hi;
224+
} else {
225+
if (newCount == 1) {
226+
alone[alonePos++] = h;
227+
}
228+
t2vals[h].t2 ^= hash;
229+
}
230+
}
231+
reverseOrder[reverseOrderPos] = hash;
232+
reverseH[reverseOrderPos] = found;
233+
reverseOrderPos++;
234+
}
235+
delete [] alone;
236+
*/
237+
238+
if (reverseOrderPos == size) {
239+
break;
240+
}
241+
242+
std::cout << "WARNING: hashIndex " << hashIndex << "\n";
243+
if (hashIndex >= 0) {
244+
std::cout << (end - start) << " keys; arrayLength " << arrayLength
245+
<< " blockLength " << blockLength
246+
<< " reverseOrderPos " << reverseOrderPos << "\n";
247+
}
248+
249+
hashIndex++;
250+
251+
// use a new random numbers
252+
delete hasher;
253+
hasher = new HashFamily();
254+
255+
}
256+
257+
for (int i = reverseOrderPos - 1; i >= 0; i--) {
258+
// the hash of the key we insert next
259+
uint64_t hash = reverseOrder[i];
260+
int found = reverseH[i];
261+
// which entry in the table we can change
262+
int change = -1;
263+
// we set table[change] to the fingerprint of the key,
264+
// unless the other two entries are already occupied
265+
uint32_t sum = fingerprint(hash);
266+
for (int hi = 0; hi < 3; hi++) {
267+
size_t h = getHashFromHash10(hash, hi, blockLength);
268+
if (found == hi) {
269+
change = h;
270+
} else {
271+
// this is different from BDZ: using xor to calculate the
272+
// fingerprint
273+
sum += getFinger(fingerprints[h], hi) % fingerMul;
274+
}
275+
}
276+
// set such that if the right fingerprint is added, the result is 0
277+
uint32_t set = (fingerMul - (sum % fingerMul)) % fingerMul;
278+
fingerprints[change] += found == 0 ? set : found == 1 ? (set * fingerMul) : (set * fingerMul2);
279+
}
280+
delete [] t2vals;
281+
delete [] reverseOrder;
282+
delete [] reverseH;
283+
284+
return Ok;
285+
}
286+
287+
template <typename ItemType, typename HashFamily>
288+
Status XorFilter10_666<ItemType, HashFamily>::Contain(
289+
const ItemType &key) const {
290+
uint64_t hash = (*hasher)(key);
291+
uint32_t f = fingerprint(hash);
292+
uint32_t r0 = (uint32_t) hash;
293+
uint32_t r1 = (uint32_t) rotl64(hash, 21);
294+
uint32_t r2 = (uint32_t) rotl64(hash, 42);
295+
uint32_t h0 = reduce(r0, blockLength);
296+
uint32_t h1 = reduce(r1, blockLength);
297+
uint32_t h2 = reduce(r2, blockLength);
298+
f += fingerprints[h0];
299+
f += (((__uint128_t) fingerprints[h1] * (invFingerMul + 1)) >> 64);
300+
f += (((__uint128_t) fingerprints[h2] * (invFingerMul2 + 1)) >> 64);
301+
return (f % fingerMul) == 0 ? Ok : NotFound;
302+
}
303+
304+
} // namespace xorfilter
305+
#endif // XOR_FILTER10_666_XOR_FILTER_H_

0 commit comments

Comments
 (0)