Skip to content

Commit f5f4a07

Browse files
Merge pull request #30 from scalyr/master
Cleanup XorPlus8 + guard against infinite loop in table construction
2 parents 1f92499 + 7be7c18 commit f5f4a07

File tree

1 file changed

+32
-26
lines changed

1 file changed

+32
-26
lines changed

fastfilter/src/main/java/org/fastfilter/xorplus/XorPlus8.java

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ public class XorPlus8 implements Filter {
2525

2626
// TODO compression; now we have 9% / 11.5% / 36.5% free entries
2727

28+
// the number of hash slots which are XORed to compute the stored hash for each key
2829
private static final int HASHES = 3;
2930

31+
// this figure is added when computing the size of the table, i.e. it is the 32 in "1.23 * size + 32"
3032
private static final int OFFSET = 32;
3133

3234
// the table needs to be 1.23 times the number of keys to store
@@ -38,19 +40,19 @@ public class XorPlus8 implements Filter {
3840
// the number of keys in the filter
3941
private final int size;
4042

41-
// the table (array) length, that is size * 1.23
43+
// the table (array) length, that is size * 1.23 + 32
4244
private final int arrayLength;
4345

44-
// if the table is divided into 3 blocks (one block for each hash)
45-
// this allows to better compress the filter,
46-
// because the last block contains more zero entries than the first two
46+
// the table is divided into 3 (HASHES) blocks, one block for each hash. Each block holds this number of entries.
47+
// this allows to better compress the filter, because the last block contains more zero entries than the first two.
4748
private final int blockLength;
4849

4950
private long seed;
5051

51-
// the fingerprints (internally an array of long)
52+
// the fingerprints
5253
private byte[] fingerprints;
5354

55+
// the table (array) length, in bits
5456
private int bitCount;
5557

5658
private Rank9 rank;
@@ -65,27 +67,21 @@ public long getBitCount() {
6567
}
6668

6769
/**
68-
* Calculate the table (array) length. This is 1.23 times the size, plus an offset of 32 (see paper, Fig. 1)
70+
* Calculate the table (array) length. This is 1.23 times the size, plus an offset of 32 (see paper, Fig. 1).
71+
* We round down to a multiple of HASHES, as any excess entries couldn't be used.
6972
*
7073
* @param size the number of entries
7174
* @return the table length
7275
*/
7376
private static int getArrayLength(int size) {
74-
return (int) (OFFSET + (long) FACTOR_TIMES_100 * size / 100);
77+
int arrayLength = (int) (OFFSET + (long) FACTOR_TIMES_100 * size / 100);
78+
return arrayLength - (arrayLength % HASHES);
7579
}
7680

7781
public static XorPlus8 construct(long[] keys) {
7882
return new XorPlus8(keys);
7983
}
8084

81-
public XorPlus8(int size, byte[] fingerprints) {
82-
this.size = size;
83-
this.arrayLength = getArrayLength(size);
84-
bitCount = arrayLength * BITS_PER_FINGERPRINT;
85-
this.blockLength = arrayLength / HASHES;
86-
this.fingerprints = fingerprints;
87-
}
88-
8985
/**
9086
* Construct the filter. This is basically the BDZ algorithm. The algorithm
9187
* itself is basically the same as BDZ, except that xor is used to store the
@@ -105,9 +101,8 @@ public XorPlus8(int size, byte[] fingerprints) {
105101
*/
106102
public XorPlus8(long[] keys) {
107103
this.size = keys.length;
108-
arrayLength = getArrayLength(size);
109-
bitCount = arrayLength * BITS_PER_FINGERPRINT;
110-
blockLength = arrayLength / HASHES;
104+
this.arrayLength = getArrayLength(size);
105+
this.blockLength = arrayLength / HASHES;
111106
int m = arrayLength;
112107

113108
// the order in which the fingerprints are inserted, where
@@ -120,10 +115,17 @@ public XorPlus8(long[] keys) {
120115
int reverseOrderPos;
121116

122117
// == mapping step ==
123-
// hashIndex is usually 0; only if we detect a cycle
124-
// (which is extremely unlikely) we would have to use a larger hashIndex
118+
// we usually execute this loop just once. If we detect a cycle (which is extremely unlikely)
119+
// then we try again, with a new random seed.
125120
long seed = 0;
121+
int attempts = 0;
126122
do {
123+
attempts++;
124+
if (attempts >= 100) {
125+
// if the same key appears more than once in the keys array, every attempt to build the table will yield a collision
126+
throw new IllegalArgumentException("Unable to construct the table after 100 attempts; likely indicates duplicate keys");
127+
}
128+
127129
seed = Hash.randomSeed();
128130
// we use an second table t2 to keep the list of all keys that map
129131
// to a given entry (with a broken hash function, all keys could map
@@ -140,8 +142,9 @@ public XorPlus8(long[] keys) {
140142
int h = getHash(k, seed, hi);
141143
t2[h] ^= k;
142144
if (t2count[h] > 120) {
143-
// probably something wrong with the hash function
144-
throw new IllegalArgumentException();
145+
// probably something wrong with the hash function; or, the keys[] array contains many copies
146+
// of the same value
147+
throw new IllegalArgumentException("More than 120 keys hashed to the same location; indicates duplicate keys, or a bad hash function");
145148
}
146149
t2count[h]++;
147150
}
@@ -183,7 +186,10 @@ public XorPlus8(long[] keys) {
183186
break;
184187
}
185188
if (t2count[i] <= 0) {
186-
continue;
189+
continue; // if a key is the sole occupant for more than one of its hashes, it will wind up
190+
// being listed in multiple slots of the "alone" table; in that case, when we come
191+
// to the second or third "alone" entry for that key, it will already have been
192+
// removed, and so t2count will be 0.
187193
}
188194
long k = t2[i];
189195
if (t2count[i] != 1) {
@@ -251,9 +257,9 @@ public XorPlus8(long[] keys) {
251257
set.set(i);
252258
}
253259
}
254-
rank = new Rank9(set, blockLength);
260+
this.rank = new Rank9(set, blockLength);
255261

256-
fingerprints = new byte[2 * blockLength + set.cardinality()];
262+
this.fingerprints = new byte[2 * blockLength + set.cardinality()];
257263
if (2 * blockLength >= 0) {
258264
System.arraycopy(fp, 0, fingerprints, 0, 2 * blockLength);
259265
}
@@ -358,10 +364,10 @@ public XorPlus8(InputStream in) {
358364
DataInputStream din = new DataInputStream(in);
359365
size = din.readInt();
360366
arrayLength = getArrayLength(size);
361-
bitCount = arrayLength * BITS_PER_FINGERPRINT;
362367
blockLength = arrayLength / HASHES;
363368
seed = din.readLong();
364369
int fingerprintLength = din.readInt();
370+
bitCount = fingerprintLength * BITS_PER_FINGERPRINT;
365371
fingerprints = new byte[fingerprintLength];
366372
din.readFully(fingerprints);
367373
rank = new Rank9(din);

0 commit comments

Comments
 (0)