Skip to content

Commit 3ded030

Browse files
committed
Faster construction for xorfilter_2
1 parent bc582a5 commit 3ded030

File tree

1 file changed

+115
-10
lines changed

1 file changed

+115
-10
lines changed

src/xorfilter_2.h

Lines changed: 115 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -104,18 +104,34 @@ struct t2val {
104104

105105
typedef struct t2val t2val_t;
106106

107-
#define BLOCK_SHIFT 18
108-
#define BLOCK_LEN (1 << BLOCK_SHIFT)
107+
const int blockShift = 18;
109108

110109
// Flushes the buffered entries of block `b` into the `t2vals` table.
//
// `tmp` is read as consecutive pairs of 64-bit words starting at offset
// (b << shift): the word at the even position is a value and the word at the
// following odd position is a table index (narrowed to int). For each pair,
// the slot `t2vals[index]` gets its `t2count` incremented and the value
// XOR-ed into its `t2` field.
//
// Parameters:
//   tmp    - staging buffer holding the (value, index) pairs of all blocks
//   b      - block number; selects the region of `tmp` that is read
//   len    - number of 64-bit words to consume from the block (stepped by 2,
//            so len / 2 pairs are processed)
//   t2vals - table that is updated in place
//
// No bounds checking is done on `index` or on the `tmp` region.
// NOTE: this view is a diff; the `BLOCK_SHIFT` lines are the removed version
// and the `blockShift` lines are their replacement.
void applyBlock(uint64_t* tmp, int b, int len, t2val_t * t2vals) {
111110
for (int i = 0; i < len; i += 2) {
112-
uint64_t x = tmp[(b << BLOCK_SHIFT) + i];
113-
int index = (int) tmp[(b << BLOCK_SHIFT) + i + 1];
111+
uint64_t x = tmp[(b << blockShift) + i];
112+
int index = (int) tmp[(b << blockShift) + i + 1];
114113
// one more entry maps to this slot; fold its value into the running XOR
t2vals[index].t2count++;
115114
t2vals[index].t2 ^= x;
116115
}
117116
}
118117

118+
// Applies the buffered removals of block `b` to the `t2vals` table and
// records slots whose count drops to exactly one.
//
// `tmp` is read as consecutive (hash, index) pairs of 64-bit words starting
// at offset (b << blockShift). For each pair whose slot currently has
// `t2count >= 1`, the count is decremented and `hash` is XOR-ed into the
// slot's `t2` field. If the decremented count equals 1, the slot index is
// appended to `alone`. Pairs whose slot count is already 0 are ignored.
//
// Parameters:
//   tmp      - staging buffer holding the (hash, index) pairs of all blocks
//   b        - block number; selects the region of `tmp` that is read
//   len      - number of 64-bit words to consume (len / 2 pairs)
//   t2vals   - table that is updated in place
//   alone    - output array receiving indices of slots that now hold one entry
//   alonePos - current number of entries in `alone` (next write position)
//
// Returns the new number of entries in `alone`.
// No bounds checking is done on `alone`, `index` or the `tmp` region; the
// caller must size them appropriately.
int applyBlock2(uint64_t* tmp, int b, int len, t2val_t * t2vals, int* alone, int alonePos) {
119+
for (int i = 0; i < len; i += 2) {
120+
uint64_t hash = tmp[(b << blockShift) + i];
121+
int index = (int) tmp[(b << blockShift) + i + 1];
122+
int oldCount = t2vals[index].t2count;
123+
// slots with a count of 0 are left untouched
if (oldCount >= 1) {
124+
int newCount = oldCount - 1;
125+
t2vals[index].t2count = newCount;
126+
// exactly one entry is left in this slot: remember it in `alone`
if (newCount == 1) {
127+
alone[alonePos++] = index;
128+
}
129+
// XOR-ing the same hash again cancels its earlier contribution to t2
t2vals[index].t2 ^= hash;
130+
}
131+
}
132+
return alonePos;
133+
}
134+
119135
template <typename ItemType, typename FingerprintType,
120136
typename FingerprintStorageType, typename HashFamily>
121137
Status XorFilter2<ItemType, FingerprintType, FingerprintStorageType, HashFamily>::AddAll(
@@ -128,20 +144,20 @@ Status XorFilter2<ItemType, FingerprintType, FingerprintStorageType, HashFamily>
128144
t2val_t * t2vals = new t2val_t[m];
129145
while (true) {
130146
memset(t2vals, 0, sizeof(t2val_t[m]));
131-
int blocks = 1 + (3 * blockLength) / BLOCK_LEN;
132-
uint64_t* tmp = new uint64_t[blocks * BLOCK_LEN];
147+
int blocks = 1 + ((3 * blockLength) >> blockShift);
148+
uint64_t* tmp = new uint64_t[blocks << blockShift];
133149
int* tmpc = new int[blocks]();
134150
for(size_t i = start; i < end; i++) {
135151
uint64_t k = keys[i];
136152
uint64_t hash = (*hasher)(k);
137153
for (int hi = 0; hi < 3; hi++) {
138154
int index = getHashFromHash(hash, hi, blockLength);
139-
int b = index >> BLOCK_SHIFT;
155+
int b = index >> blockShift;
140156
int i2 = tmpc[b];
141-
tmp[(b << BLOCK_SHIFT) + i2] = hash;
142-
tmp[(b << BLOCK_SHIFT) + i2 + 1] = index;
157+
tmp[(b << blockShift) + i2] = hash;
158+
tmp[(b << blockShift) + i2 + 1] = index;
143159
tmpc[b] += 2;
144-
if (i2 + 2 == BLOCK_LEN) {
160+
if (i2 + 2 == (1 << blockShift)) {
145161
applyBlock(tmp, b, i2 + 2, t2vals);
146162
tmpc[b] = 0;
147163
}
@@ -153,8 +169,94 @@ Status XorFilter2<ItemType, FingerprintType, FingerprintStorageType, HashFamily>
153169
}
154170
delete[] tmp;
155171
delete[] tmpc;
172+
reverseOrderPos = 0;
156173

174+
int* alone = new int[arrayLength];
175+
int alonePos = 0;
176+
for (size_t i = 0; i < arrayLength; i++) {
177+
if (t2vals[i].t2count == 1) {
178+
alone[alonePos++] = i;
179+
}
180+
}
181+
tmp = new uint64_t[blocks << blockShift];
182+
tmpc = new int[blocks]();
157183
reverseOrderPos = 0;
184+
int bestBlock = -1;
185+
while (reverseOrderPos < size) {
186+
if (alonePos == 0) {
187+
// we need to apply blocks until we have an entry that is alone
188+
// (that is, until alonePos > 0)
189+
// so, find a large block (the larger the better)
190+
// but don't need to search very long
191+
// start searching where we stopped the last time
192+
// (to make it more even)
193+
for (int i = 0, b = bestBlock + 1, best = -1; i < blocks; i++) {
194+
if (b >= blocks) {
195+
b = 0;
196+
}
197+
if (tmpc[b] > best) {
198+
best = tmpc[b];
199+
bestBlock = b;
200+
if (best > (1 << (blockShift - 1))) {
201+
// sufficiently large: stop
202+
break;
203+
}
204+
}
205+
}
206+
if (tmpc[bestBlock] > 0) {
207+
alonePos = applyBlock2(tmp, bestBlock, tmpc[bestBlock], t2vals, alone, alonePos);
208+
tmpc[bestBlock] = 0;
209+
}
210+
// applying a block may not actually result in a new entry that is alone
211+
if (alonePos == 0) {
212+
for (int b = 0; b < blocks && alonePos == 0; b++) {
213+
if (tmpc[b] > 0) {
214+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
215+
tmpc[b] = 0;
216+
}
217+
}
218+
}
219+
}
220+
if (alonePos == 0) {
221+
break;
222+
}
223+
int i = alone[--alonePos];
224+
int b = i >> blockShift;
225+
if (tmpc[b] > 0) {
226+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
227+
tmpc[b] = 0;
228+
}
229+
uint8_t found = -1;
230+
if (t2vals[i].t2count == 0) {
231+
continue;
232+
}
233+
long hash = t2vals[i].t2;
234+
for (int hi = 0; hi < 3; hi++) {
235+
int h = getHashFromHash(hash, hi, blockLength);
236+
if (h == i) {
237+
found = (uint8_t) hi;
238+
t2vals[i].t2count = 0;
239+
} else {
240+
int b = h >> blockShift;
241+
int i2 = tmpc[b];
242+
tmp[(b << blockShift) + i2] = hash;
243+
tmp[(b << blockShift) + i2 + 1] = h;
244+
tmpc[b] += 2;
245+
if (tmpc[b] >= 1 << blockShift) {
246+
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
247+
tmpc[b] = 0;
248+
}
249+
}
250+
}
251+
reverseOrder[reverseOrderPos] = hash;
252+
reverseH[reverseOrderPos] = found;
253+
reverseOrderPos++;
254+
}
255+
delete[] tmp;
256+
delete[] tmpc;
257+
delete[] alone;
258+
259+
/*
158260
int* alone = new int[arrayLength];
159261
int alonePos = 0;
160262
reverseOrderPos = 0;
@@ -190,6 +292,9 @@ Status XorFilter2<ItemType, FingerprintType, FingerprintStorageType, HashFamily>
190292
}
191293
}
192294
delete [] alone;
295+
296+
*/
297+
193298
if (reverseOrderPos == size) {
194299
break;
195300
}

0 commit comments

Comments
 (0)