@@ -48,7 +48,7 @@ class XorFilter {
4848 HashFamily* hasher;
4949
5050 inline FingerprintType fingerprint (const uint64_t hash) const {
51- return (FingerprintType) hash;
51+ return static_cast<FingerprintType>(hash ^ (hash >> 32)); // fold the upper 32 bits into the lower ones, then narrow once, explicitly
5252 }
5353
5454 public:
@@ -106,15 +106,6 @@ int applyBlock2(uint64_t* tmp, int b, int len, t2val_t * t2vals, int* alone, int
106106 uint64_t hash = tmp[(b << BLOCK_SHIFT) + i];
107107 int index = (int ) tmp[(b << BLOCK_SHIFT) + i + 1 ];
108108 int oldCount = t2vals[index].t2count ;
109- // std::cout << " consume index " << index << " hash " << hash << " oldCount " << oldCount << " i " << i << "\n";
110- /*
111- int newCount = --t2vals[h].t2count;
112- if (newCount == 1) {
113- alone[alonePos++] = h;
114- }
115- t2vals[h].t2 ^= hash;
116- */
117-
118109 if (oldCount >= 1 ) {
119110 int newCount = oldCount - 1 ;
120111 t2vals[index].t2count = newCount;
@@ -174,25 +165,36 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
174165 alone[alonePos++] = i;
175166 }
176167 }
177-
178168 tmp = new uint64_t [blocks * BLOCK_LEN];
179169 tmpc = new int [blocks]();
180-
181170 reverseOrderPos = 0 ;
171+ int bestBlock = -1 ;
182172 while (reverseOrderPos < size) {
183-
184173 if (alonePos == 0 ) {
185- int bestb = -1 , bb = -1 ;
186- for (int b = 0 ; b < blocks && alonePos == 0 ; b++) {
187- if (tmpc[b] > bestb) {
188- bestb = tmpc[b];
189- bb = b;
174+ // we need to apply blocks until we have an entry that is alone
175+ // (that is, until alonePos > 0)
176+ // so, find a large block (the larger the better)
177+ // but don't need to search very long
178+ // start searching where we stopped the last time
179+ // (to make it more even)
180+ for (int i = 0 , b = bestBlock + 1 , best = -1 ; i < blocks; i++) {
181+ if (b >= blocks) {
182+ b = 0 ;
183+ }
184+ if (tmpc[b] > best) {
185+ best = tmpc[b];
186+ bestBlock = b;
187+ if (best > BLOCK_LEN / 2 ) {
188+ // sufficiently large: stop
189+ break ;
190+ }
190191 }
191192 }
192- if (tmpc[bb ] > 0 ) {
193- alonePos = applyBlock2 (tmp, bb , tmpc[bb ], t2vals, alone, alonePos);
194- tmpc[bb ] = 0 ;
193+ if (tmpc[bestBlock ] > 0 ) {
194+ alonePos = applyBlock2 (tmp, bestBlock , tmpc[bestBlock ], t2vals, alone, alonePos);
195+ tmpc[bestBlock ] = 0 ;
195196 }
197+ // applying a block may not actually result in a new entry that is alone
196198 if (alonePos == 0 ) {
197199 for (int b = 0 ; b < blocks && alonePos == 0 ; b++) {
198200 if (tmpc[b] > 0 ) {
@@ -201,44 +203,27 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
201203 }
202204 }
203205 }
204- // std::cout << "now alone " << alonePos << "\n";
205206 }
206-
207207 if (alonePos == 0 ) {
208208 break ;
209209 }
210-
211210 int i = alone[--alonePos];
212-
213211 int b = i >> BLOCK_SHIFT;
214212 if (tmpc[b] > 0 ) {
215213 alonePos = applyBlock2 (tmp, b, tmpc[b], t2vals, alone, alonePos);
216214 tmpc[b] = 0 ;
217215 }
218-
219216 uint8_t found = -1 ;
220217 if (t2vals[i].t2count == 0 ) {
221218 continue ;
222219 }
223- // if (t2vals[i].t2count > 100 || t2vals[i].t2count < 0) {
224- // std::cout << "UNEXPECTED " << i << " = " << t2vals[i].t2count << "\n";
225- // }
226220 long hash = t2vals[i].t2 ;
227- // if (hash == 0) {
228- // std::cout << "UNEXPECTED hash " << i << " = " << t2vals[i].t2count << "\n";
229- // }
230-
231221 for (int hi = 0 ; hi < 3 ; hi++) {
232222 int h = getHashFromHash (hash, hi, blockLength);
233223 if (h == i) {
234224 found = (uint8_t ) hi;
235- // if (t2vals[i].t2count != 1) {
236- // std::cout << " NOT 1 " << t2vals[i].t2count << "\n";
237- // }
238225 t2vals[i].t2count = 0 ;
239226 } else {
240- // std::cout << " add index " << h << " hash " << hash << " hi " << hi << "\n";
241-
242227 int b = h >> BLOCK_SHIFT;
243228 int i2 = tmpc[b];
244229 tmp[(b << BLOCK_SHIFT) + i2] = hash;
@@ -248,35 +233,16 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
248233 alonePos = applyBlock2 (tmp, b, tmpc[b], t2vals, alone, alonePos);
249234 tmpc[b] = 0 ;
250235 }
251- /*
252-
253- int newCount = --t2vals[h].t2count;
254- if (newCount == 1) {
255- alone[alonePos++] = h;
256- }
257- t2vals[h].t2 ^= hash;
258- */
259236 }
260237 }
261- // std::cout << " add " << hash << " found " << (int) found << "\n";
262-
263238 reverseOrder[reverseOrderPos] = hash;
264-
265- // if (found < 0) {
266- // std::cout << " NOT FOUND " << hash << "\n";
267- // }
268239 reverseH[reverseOrderPos] = found;
269240 reverseOrderPos++;
270-
271-
272241 }
273-
274242 delete[] tmp;
275243 delete[] tmpc;
276-
277244 delete [] alone;
278245
279-
280246/*
281247 int* alone = new int[arrayLength];
282248 int alonePos = 0;
@@ -312,65 +278,6 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
312278 delete [] alone;
313279*/
314280
315-
316-
317- /*
318- int* alone = new int[blocks * BLOCK_LEN];
319- int* alonePos = new int[blocks]();
320- for (size_t i = 0; i < arrayLength; i++) {
321- if (t2vals[i].t2count == 1) {
322- int b = i >> BLOCK_SHIFT;
323- // TODO could in theory go over the limit
324- int p = alonePos[b]++;
325- alone[(b << BLOCK_SHIFT) + p] = i;
326- }
327- }
328- reverseOrderPos = 0;
329-
330- int currentBlock = 0;
331- while (reverseOrderPos < size) {
332- if (alonePos[currentBlock] == 0) {
333- for(int i=0, b=currentBlock + 1; i<blocks; i++, b++) {
334- if (b > blocks) {
335- b = 0;
336- }
337- if (alonePos[b] > 0) {
338- currentBlock = b;
339- break;
340- }
341- }
342- }
343- if (alonePos[currentBlock] == 0) {
344- break;
345- }
346- int i = (b << BLOCK_SHIFT) + alone[--alonePos[currentBlock]];
347- if (t2vals[i].t2count == 0) {
348- continue;
349- }
350- long hash = t2vals[i].t2;
351- uint8_t found = -1;
352- for (int hi = 0; hi < 3; hi++) {
353- int h = getHashFromHash(hash, hi, blockLength);
354- int newCount = --t2vals[h].t2count;
355- if (newCount == 0) {
356- found = (uint8_t) hi;
357- } else {
358- if (newCount == 1) {
359- alone[alonePos++] = h;
360- }
361- t2vals[h].t2 ^= hash;
362- }
363- }
364- reverseOrder[reverseOrderPos] = hash;
365- reverseH[reverseOrderPos] = found;
366- reverseOrderPos++;
367- }
368- delete [] alone;
369- */
370-
371-
372-
373-
374281 if (reverseOrderPos == size) {
375282 break ;
376283 }
@@ -389,7 +296,6 @@ Status XorFilter<ItemType, FingerprintType, HashFamily>::AddAll(
389296 hasher = new HashFamily ();
390297
391298 }
392-
393299 for (int i = reverseOrderPos - 1 ; i >= 0 ; i--) {
394300 // the hash of the key we insert next
395301 uint64_t hash = reverseOrder[i];
0 commit comments