Skip to content

Commit b0cd845

Browse files
committed
Speed up construction (work in progress)
1 parent 011006e commit b0cd845

File tree

1 file changed

+23
-117
lines changed

1 file changed

+23
-117
lines changed

src/xorfilter.h

Lines changed: 23 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class XorFilter {
4848
HashFamily* hasher;
4949

5050
inline FingerprintType fingerprint(const uint64_t hash) const {
51-
return (FingerprintType) hash;
51+
return (FingerprintType) hash ^ (hash >> 32);
5252
}
5353

5454
public:
@@ -106,15 +106,6 @@ int applyBlock2(uint64_t* tmp, int b, int len, t2val_t * t2vals, int* alone, int
106106
uint64_t hash = tmp[(b << BLOCK_SHIFT) + i];
107107
int index = (int) tmp[(b << BLOCK_SHIFT) + i + 1];
108108
int oldCount = t2vals[index].t2count;
109-
// std::cout << " consume index " << index << " hash " << hash << " oldCount " << oldCount << " i " << i << "\n";
110-
/*
111-
int newCount = --t2vals[h].t2count;
112-
if (newCount == 1) {
113-
alone[alonePos++] = h;
114-
}
115-
t2vals[h].t2 ^= hash;
116-
*/
117-
118109
if (oldCount >= 1) {
119110
int newCount = oldCount - 1;
120111
t2vals[index].t2count = newCount;
@@ -174,25 +165,36 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
174165
alone[alonePos++] = i;
175166
}
176167
}
177-
178168
tmp = new uint64_t[blocks * BLOCK_LEN];
179169
tmpc = new int[blocks]();
180-
181170
reverseOrderPos = 0;
171+
int bestBlock = -1;
182172
while (reverseOrderPos < size) {
183-
184173
if (alonePos == 0) {
185-
int bestb = -1, bb = -1;
186-
for (int b = 0; b < blocks && alonePos == 0; b++) {
187-
if (tmpc[b] > bestb) {
188-
bestb = tmpc[b];
189-
bb = b;
174+
// we need to apply blocks until we have an entry that is alone
175+
// (that is, until alonePos > 0)
176+
// so, find a large block (the larger the better)
177+
// but we don't need to search for long: stop at the first block that is more than half full
178+
// start searching where we stopped the last time
179+
// (so that the blocks are picked more evenly over time)
180+
for (int i = 0, b = bestBlock + 1, best = -1; i < blocks; i++) {
181+
if (b >= blocks) {
182+
b = 0;
183+
}
184+
if (tmpc[b] > best) {
185+
best = tmpc[b];
186+
bestBlock = b;
187+
if (best > BLOCK_LEN / 2) {
188+
// sufficiently large: stop
189+
break;
190+
}
190191
}
191192
}
192-
if (tmpc[bb] > 0) {
193-
alonePos = applyBlock2(tmp, bb, tmpc[bb], t2vals, alone, alonePos);
194-
tmpc[bb] = 0;
193+
if (tmpc[bestBlock] > 0) {
194+
alonePos = applyBlock2(tmp, bestBlock, tmpc[bestBlock], t2vals, alone, alonePos);
195+
tmpc[bestBlock] = 0;
195196
}
197+
// applying a block may not actually result in a new entry that is alone
196198
if (alonePos == 0) {
197199
for (int b = 0; b < blocks && alonePos == 0; b++) {
198200
if (tmpc[b] > 0) {
@@ -201,44 +203,27 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
201203
}
202204
}
203205
}
204-
// std::cout << "now alone " << alonePos << "\n";
205206
}
206-
207207
if (alonePos == 0) {
208208
break;
209209
}
210-
211210
int i = alone[--alonePos];
212-
213211
int b = i >> BLOCK_SHIFT;
214212
if (tmpc[b] > 0) {
215213
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
216214
tmpc[b] = 0;
217215
}
218-
219216
uint8_t found = -1;
220217
if (t2vals[i].t2count == 0) {
221218
continue;
222219
}
223-
// if (t2vals[i].t2count > 100 || t2vals[i].t2count < 0) {
224-
//std::cout << "UNEXPECTED " << i << " = " << t2vals[i].t2count << "\n";
225-
//}
226220
long hash = t2vals[i].t2;
227-
//if (hash == 0) {
228-
// std::cout << "UNEXPECTED hash " << i << " = " << t2vals[i].t2count << "\n";
229-
//}
230-
231221
for (int hi = 0; hi < 3; hi++) {
232222
int h = getHashFromHash(hash, hi, blockLength);
233223
if (h == i) {
234224
found = (uint8_t) hi;
235-
//if (t2vals[i].t2count != 1) {
236-
// std::cout << " NOT 1 " << t2vals[i].t2count << "\n";
237-
//}
238225
t2vals[i].t2count = 0;
239226
} else {
240-
//std::cout << " add index " << h << " hash " << hash << " hi " << hi << "\n";
241-
242227
int b = h >> BLOCK_SHIFT;
243228
int i2 = tmpc[b];
244229
tmp[(b << BLOCK_SHIFT) + i2] = hash;
@@ -248,35 +233,16 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
248233
alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
249234
tmpc[b] = 0;
250235
}
251-
/*
252-
253-
int newCount = --t2vals[h].t2count;
254-
if (newCount == 1) {
255-
alone[alonePos++] = h;
256-
}
257-
t2vals[h].t2 ^= hash;
258-
*/
259236
}
260237
}
261-
// std::cout << " add " << hash << " found " << (int) found << "\n";
262-
263238
reverseOrder[reverseOrderPos] = hash;
264-
265-
//if (found < 0) {
266-
// std::cout << " NOT FOUND " << hash << "\n";
267-
//}
268239
reverseH[reverseOrderPos] = found;
269240
reverseOrderPos++;
270-
271-
272241
}
273-
274242
delete[] tmp;
275243
delete[] tmpc;
276-
277244
delete [] alone;
278245

279-
280246
/*
281247
int* alone = new int[arrayLength];
282248
int alonePos = 0;
@@ -312,65 +278,6 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
312278
delete [] alone;
313279
*/
314280

315-
316-
317-
/*
318-
int* alone = new int[blocks * BLOCK_LEN];
319-
int* alonePos = new int[blocks]();
320-
for (size_t i = 0; i < arrayLength; i++) {
321-
if (t2vals[i].t2count == 1) {
322-
int b = i >> BLOCK_SHIFT;
323-
// TODO could in theory go over the limit
324-
int p = alonePos[b]++;
325-
alone[(b << BLOCK_SHIFT) + p] = i;
326-
}
327-
}
328-
reverseOrderPos = 0;
329-
330-
int currentBlock = 0;
331-
while (reverseOrderPos < size) {
332-
if (alonePos[currentBlock] == 0) {
333-
for(int i=0, b=currentBlock + 1; i<blocks; i++, b++) {
334-
if (b > blocks) {
335-
b = 0;
336-
}
337-
if (alonePos[b] > 0) {
338-
currentBlock = b;
339-
break;
340-
}
341-
}
342-
}
343-
if (alonePos[currentBlock] == 0) {
344-
break;
345-
}
346-
int i = (b << BLOCK_SHIFT) + alone[--alonePos[currentBlock]];
347-
if (t2vals[i].t2count == 0) {
348-
continue;
349-
}
350-
long hash = t2vals[i].t2;
351-
uint8_t found = -1;
352-
for (int hi = 0; hi < 3; hi++) {
353-
int h = getHashFromHash(hash, hi, blockLength);
354-
int newCount = --t2vals[h].t2count;
355-
if (newCount == 0) {
356-
found = (uint8_t) hi;
357-
} else {
358-
if (newCount == 1) {
359-
alone[alonePos++] = h;
360-
}
361-
t2vals[h].t2 ^= hash;
362-
}
363-
}
364-
reverseOrder[reverseOrderPos] = hash;
365-
reverseH[reverseOrderPos] = found;
366-
reverseOrderPos++;
367-
}
368-
delete [] alone;
369-
*/
370-
371-
372-
373-
374281
if (reverseOrderPos == size) {
375282
break;
376283
}
@@ -389,7 +296,6 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
389296
hasher = new HashFamily();
390297

391298
}
392-
393299
for (int i = reverseOrderPos - 1; i >= 0; i--) {
394300
// the hash of the key we insert next
395301
uint64_t hash = reverseOrder[i];

0 commit comments

Comments
 (0)