@@ -25,8 +25,10 @@ public class XorPlus8 implements Filter {
2525
2626 // TODO compression; now we have 9% / 11.5% / 36.5% free entries
2727
28+ // the number of hash slots which are XORed to compute the stored hash for each key
2829 private static final int HASHES = 3 ;
2930
31+ // this figure is added when computing the size of the table, i.e. it is the 32 in "1.23 * size + 32"
3032 private static final int OFFSET = 32 ;
3133
3234 // the table needs to be 1.23 times the number of keys to store
@@ -38,19 +40,19 @@ public class XorPlus8 implements Filter {
3840 // the number of keys in the filter
3941 private final int size ;
4042
41- // the table (array) length, that is size * 1.23
43+ // the table (array) length, that is size * 1.23 + 32
4244 private final int arrayLength ;
4345
44- // if the table is divided into 3 blocks (one block for each hash)
45- // this allows to better compress the filter,
46- // because the last block contains more zero entries than the first two
46+ // the table is divided into 3 (HASHES) blocks, one block for each hash. Each block holds this number of entries.
47+ // this allows to better compress the filter, because the last block contains more zero entries than the first two.
4748 private final int blockLength ;
4849
4950 private long seed ;
5051
51- // the fingerprints (internally an array of long)
52+ // the fingerprints
5253 private byte [] fingerprints ;
5354
55+ // the table (array) length, in bits
5456 private int bitCount ;
5557
5658 private Rank9 rank ;
@@ -65,27 +67,21 @@ public long getBitCount() {
6567 }
6668
6769 /**
68- * Calculate the table (array) length. This is 1.23 times the size, plus an offset of 32 (see paper, Fig. 1)
70+ * Calculate the table (array) length. This is 1.23 times the size, plus an offset of 32 (see paper, Fig. 1).
71+ * We round down to a multiple of HASHES, as any excess entries couldn't be used.
6972 *
7073 * @param size the number of entries
7174 * @return the table length
7275 */
7376 private static int getArrayLength (int size ) {
74- return (int ) (OFFSET + (long ) FACTOR_TIMES_100 * size / 100 );
77+ int arrayLength = (int ) (OFFSET + (long ) FACTOR_TIMES_100 * size / 100 );
78+ return arrayLength - (arrayLength % HASHES );
7579 }
7680
7781 public static XorPlus8 construct (long [] keys ) {
7882 return new XorPlus8 (keys );
7983 }
8084
81- public XorPlus8 (int size , byte [] fingerprints ) {
82- this .size = size ;
83- this .arrayLength = getArrayLength (size );
84- bitCount = arrayLength * BITS_PER_FINGERPRINT ;
85- this .blockLength = arrayLength / HASHES ;
86- this .fingerprints = fingerprints ;
87- }
88-
8985 /**
9086 * Construct the filter. This is basically the BDZ algorithm. The algorithm
9187 * itself is basically the same as BDZ, except that xor is used to store the
@@ -105,9 +101,8 @@ public XorPlus8(int size, byte[] fingerprints) {
105101 */
106102 public XorPlus8 (long [] keys ) {
107103 this .size = keys .length ;
108- arrayLength = getArrayLength (size );
109- bitCount = arrayLength * BITS_PER_FINGERPRINT ;
110- blockLength = arrayLength / HASHES ;
104+ this .arrayLength = getArrayLength (size );
105+ this .blockLength = arrayLength / HASHES ;
111106 int m = arrayLength ;
112107
113108 // the order in which the fingerprints are inserted, where
@@ -120,10 +115,17 @@ public XorPlus8(long[] keys) {
120115 int reverseOrderPos ;
121116
122117 // == mapping step ==
123- // hashIndex is usually 0; only if we detect a cycle
124- // (which is extremely unlikely) we would have to use a larger hashIndex
118+ // we usually execute this loop just once. If we detect a cycle (which is extremely unlikely)
119+ // then we try again, with a new random seed.
125120 long seed = 0 ;
121+ int attempts = 0 ;
126122 do {
123+ attempts ++;
124+ if (attempts >= 100 ) {
125+ // if the same key appears more than once in the keys array, every attempt to build the table will yield a collision
126+ throw new IllegalArgumentException ("Unable to construct the table after 100 attempts; likely indicates duplicate keys" );
127+ }
128+
127129 seed = Hash .randomSeed ();
128130 // we use an second table t2 to keep the list of all keys that map
129131 // to a given entry (with a broken hash function, all keys could map
@@ -140,8 +142,9 @@ public XorPlus8(long[] keys) {
140142 int h = getHash (k , seed , hi );
141143 t2 [h ] ^= k ;
142144 if (t2count [h ] > 120 ) {
143- // probably something wrong with the hash function
144- throw new IllegalArgumentException ();
145+ // probably something wrong with the hash function; or, the keys[] array contains many copies
146+ // of the same value
147+ throw new IllegalArgumentException ("More than 120 keys hashed to the same location; indicates duplicate keys, or a bad hash function" );
145148 }
146149 t2count [h ]++;
147150 }
@@ -183,7 +186,10 @@ public XorPlus8(long[] keys) {
183186 break ;
184187 }
185188 if (t2count [i ] <= 0 ) {
186- continue ;
189+ continue ; // if a key is the sole occupant for more than one of its hashes, it will wind up
190+ // being listed in multiple slots of the "alone" table; in that case, when we come
191+ // to the second or third "alone" entry for that key, it will already have been
192+ // removed, and so t2count will be 0.
187193 }
188194 long k = t2 [i ];
189195 if (t2count [i ] != 1 ) {
@@ -251,9 +257,9 @@ public XorPlus8(long[] keys) {
251257 set .set (i );
252258 }
253259 }
254- rank = new Rank9 (set , blockLength );
260+ this . rank = new Rank9 (set , blockLength );
255261
256- fingerprints = new byte [2 * blockLength + set .cardinality ()];
262+ this . fingerprints = new byte [2 * blockLength + set .cardinality ()];
257263 if (2 * blockLength >= 0 ) {
258264 System .arraycopy (fp , 0 , fingerprints , 0 , 2 * blockLength );
259265 }
@@ -358,10 +364,10 @@ public XorPlus8(InputStream in) {
358364 DataInputStream din = new DataInputStream (in );
359365 size = din .readInt ();
360366 arrayLength = getArrayLength (size );
361- bitCount = arrayLength * BITS_PER_FINGERPRINT ;
362367 blockLength = arrayLength / HASHES ;
363368 seed = din .readLong ();
364369 int fingerprintLength = din .readInt ();
370+ bitCount = fingerprintLength * BITS_PER_FINGERPRINT ;
365371 fingerprints = new byte [fingerprintLength ];
366372 din .readFully (fingerprints );
367373 rank = new Rank9 (din );
0 commit comments