Skip to content

Commit 6d850d5

Browse files
committed
added XorBinaryFuse32
1 parent b245da3 commit 6d850d5

File tree

4 files changed

+302
-7
lines changed

4 files changed

+302
-7
lines changed

fastfilter/src/main/java/org/fastfilter/FilterType.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ public Filter construct(long[] keys, int setting) {
8787
return XorBinaryFuse8.construct(keys);
8888
}
8989
},
90+
// Filter type that builds a XorBinaryFuse32 from the keys; the setting argument is not used.
XOR_BINARY_FUSE_32 {
91+
@Override
92+
public Filter construct(long[] keys, int setting) {
93+
return XorBinaryFuse32.construct(keys);
94+
}
95+
},
9096
XOR_PLUS_8 {
9197
@Override
9298
public Filter construct(long[] keys, int setting) {
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
package org.fastfilter.xor;
2+
3+
import java.util.Arrays;
4+
5+
import org.fastfilter.Filter;
6+
import org.fastfilter.utils.Hash;
7+
8+
/**
9+
* The xor binary fuse filter, a new algorithm that can replace a Bloom filter.
10+
*/
11+
public class XorBinaryFuse32 implements Filter {
12+
13+
private static final int ARITY = 3;
14+
15+
private final int segmentCount;
16+
private final int segmentCountLength;
17+
private final int segmentLength;
18+
private final int segmentLengthMask;
19+
private final int arrayLength;
20+
private final int[] fingerprints;
21+
private long seed;
22+
23+
public XorBinaryFuse32(int segmentCount, int segmentLength) {
24+
if (segmentLength < 0 || Integer.bitCount(segmentLength) != 1) {
25+
throw new IllegalArgumentException("Segment length needs to be a power of 2, is " + segmentLength);
26+
}
27+
if (segmentCount <= 0) {
28+
throw new IllegalArgumentException("Illegal segment count: " + segmentCount);
29+
}
30+
this.segmentLength = segmentLength;
31+
this.segmentCount = segmentCount;
32+
this.segmentLengthMask = segmentLength - 1;
33+
this.segmentCountLength = segmentCount * segmentLength;
34+
this.arrayLength = (segmentCount + ARITY - 1) * segmentLength;
35+
this.fingerprints = new int[arrayLength];
36+
}
37+
38+
public long getBitCount() {
39+
return ((long) (arrayLength)) * Integer.SIZE;
40+
}
41+
42+
static int calculateSegmentLength(int arity, int size) {
43+
int segmentLength;
44+
if (arity == 3) {
45+
segmentLength = 1 << (int) Math.floor(Math.log(size) / Math.log(3.33) + 2.11);
46+
} else if (arity == 4) {
47+
segmentLength = 1 << (int) Math.floor(Math.log(size) / Math.log(2.91) - 0.5);
48+
} else {
49+
// not supported
50+
segmentLength = 65536;
51+
}
52+
return segmentLength;
53+
}
54+
55+
static double calculateSizeFactor(int arity, int size) {
56+
double sizeFactor;
57+
if (arity == 3) {
58+
sizeFactor = Math.max(1.125, 0.875 + 0.25 * Math.log(1000000) / Math.log(size));
59+
} else if (arity == 4) {
60+
sizeFactor = Math.max(1.075, 0.77 + 0.305 * Math.log(600000) / Math.log(size));
61+
} else {
62+
// not supported
63+
sizeFactor = 2.0;
64+
}
65+
return sizeFactor;
66+
}
67+
68+
private static int mod3(int x) {
69+
if (x > 2) {
70+
x -= 3;
71+
}
72+
return x;
73+
}
74+
75+
public static XorBinaryFuse32 construct(long[] keys) {
76+
int size = keys.length;
77+
int segmentLength = calculateSegmentLength(ARITY, size);
78+
// the current implementation hardcodes a 18-bit limit to
79+
// to the segment length.
80+
if (segmentLength > (1 << 18)) {
81+
segmentLength = (1 << 18);
82+
}
83+
double sizeFactor = calculateSizeFactor(ARITY, size);
84+
int capacity = (int) (size * sizeFactor);
85+
int segmentCount = (capacity + segmentLength - 1) / segmentLength - (ARITY - 1);
86+
int arrayLength = (segmentCount + ARITY - 1) * segmentLength;
87+
segmentCount = (arrayLength + segmentLength - 1) / segmentLength;
88+
segmentCount = segmentCount <= ARITY - 1 ? 1 : segmentCount - (ARITY - 1);
89+
XorBinaryFuse32 filter = new XorBinaryFuse32(segmentCount, segmentLength);
90+
filter.addAll(keys);
91+
return filter;
92+
}
93+
94+
private void addAll(long[] keys) {
95+
int size = keys.length;
96+
long[] reverseOrder = new long[size + 1];
97+
int[] reverseH = new int[size];
98+
int reverseOrderPos = 0;
99+
100+
// the lowest 2 bits are the h index (0, 1, or 2)
101+
// so we only have 6 bits for counting;
102+
// but that's sufficient
103+
int[] t2count = new int[arrayLength];
104+
long[] t2hash = new long[arrayLength];
105+
int[] alone = new int[arrayLength];
106+
int hashIndex = 0;
107+
// the array h0, h1, h2, h0, h1, h2
108+
int[] h012 = new int[5];
109+
int blockBits = 1;
110+
while ((1 << blockBits) < segmentCount) {
111+
blockBits++;
112+
}
113+
int block = 1 << blockBits;
114+
mainloop:
115+
while (true) {
116+
reverseOrder[size] = 1;
117+
int[] startPos = new int[block];
118+
for (int i = 0; i < 1 << blockBits; i++) {
119+
startPos[i] = (int) ((long) i * size / block);
120+
}
121+
// counting sort
122+
123+
for (long key : keys) {
124+
long hash = Hash.hash64(key, seed);
125+
int segmentIndex = (int) (hash >>> (64 - blockBits));
126+
// We only overwrite when the hash was zero. Zero hash values
127+
// may be misplaced (unlikely).
128+
while (reverseOrder[startPos[segmentIndex]] != 0) {
129+
segmentIndex++;
130+
segmentIndex &= (1 << blockBits) - 1;
131+
}
132+
reverseOrder[startPos[segmentIndex]] = hash;
133+
startPos[segmentIndex]++;
134+
}
135+
int countMask = 0;
136+
for (int i = 0; i < size; i++) {
137+
long hash = reverseOrder[i];
138+
for (int hi = 0; hi < 3; hi++) {
139+
int index = getHashFromHash(hash, hi);
140+
t2count[index] += 4;
141+
t2count[index] ^= hi;
142+
t2hash[index] ^= hash;
143+
countMask |= t2count[index];
144+
}
145+
}
146+
startPos = null;
147+
if (countMask < 0) {
148+
// we have a possible counter overflow
149+
continue mainloop;
150+
}
151+
152+
reverseOrderPos = 0;
153+
int alonePos = 0;
154+
for (int i = 0; i < arrayLength; i++) {
155+
alone[alonePos] = i;
156+
int inc = (t2count[i] >> 2) == 1 ? 1 : 0;
157+
alonePos += inc;
158+
}
159+
160+
while (alonePos > 0) {
161+
alonePos--;
162+
int index = alone[alonePos];
163+
if ((t2count[index] >> 2) == 1) {
164+
// It is still there!
165+
long hash = t2hash[index];
166+
int found = t2count[index] & 3;
167+
168+
reverseH[reverseOrderPos] = found;
169+
reverseOrder[reverseOrderPos] = hash;
170+
171+
h012[0] = getHashFromHash(hash, 0);
172+
h012[1] = getHashFromHash(hash, 1);
173+
h012[2] = getHashFromHash(hash, 2);
174+
175+
int index3 = h012[mod3(found + 1)];
176+
alone[alonePos] = index3;
177+
alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
178+
t2count[index3] -= 4;
179+
t2count[index3] ^= mod3(found + 1);
180+
t2hash[index3] ^= hash;
181+
182+
index3 = h012[mod3(found + 2)];
183+
alone[alonePos] = index3;
184+
alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
185+
t2count[index3] -= 4;
186+
t2count[index3] ^= mod3(found + 2);
187+
t2hash[index3] ^= hash;
188+
189+
reverseOrderPos++;
190+
}
191+
}
192+
193+
if (reverseOrderPos == size) {
194+
break;
195+
}
196+
hashIndex++;
197+
Arrays.fill(t2count, 0);
198+
Arrays.fill(t2hash, 0);
199+
Arrays.fill(reverseOrder, 0);
200+
201+
if (hashIndex > 100) {
202+
// if construction doesn't succeed eventually,
203+
// then there is likely a problem with the hash function
204+
// let us not crash the system:
205+
for (int i = 0; i < fingerprints.length; i++) {
206+
fingerprints[i] = (int) 0xFFFFFFFF;
207+
}
208+
return;
209+
}
210+
// use a new random numbers
211+
seed = Hash.randomSeed();
212+
}
213+
alone = null;
214+
t2count = null;
215+
t2hash = null;
216+
217+
for (int i = reverseOrderPos - 1; i >= 0; i--) {
218+
long hash = reverseOrder[i];
219+
int found = reverseH[i];
220+
int xor2 = fingerprint(hash);
221+
h012[0] = getHashFromHash(hash, 0);
222+
h012[1] = getHashFromHash(hash, 1);
223+
h012[2] = getHashFromHash(hash, 2);
224+
h012[3] = h012[0];
225+
h012[4] = h012[1];
226+
fingerprints[h012[found]] = (xor2 ^ fingerprints[h012[found + 1]] ^ fingerprints[h012[found + 2]]);
227+
}
228+
}
229+
230+
@Override
231+
public boolean mayContain(long key) {
232+
long hash = Hash.hash64(key, seed);
233+
int f = fingerprint(hash);
234+
int h0 = Hash.reduce((int) (hash >>> 32), segmentCountLength);
235+
int h1 = h0 + segmentLength;
236+
int h2 = h1 + segmentLength;
237+
long hh = hash;
238+
h1 ^= (int) ((hh >> 18) & segmentLengthMask);
239+
h2 ^= (int) ((hh) & segmentLengthMask);
240+
f ^= fingerprints[h0] ^ fingerprints[h1] ^ fingerprints[h2];
241+
return (f & 0xff) == 0;
242+
}
243+
244+
@Override
245+
public String toString() {
246+
return "segmentLength " + segmentLength + " segmentCount " + segmentCount;
247+
}
248+
249+
int getHashFromHash(long hash, int index) {
250+
long h = Hash.reduce((int) (hash >>> 32), segmentCountLength);
251+
// long h = Hash.multiplyHighUnsigned(hash, segmentCountLength);
252+
h += index * segmentLength;
253+
// keep the lower 36 bits
254+
long hh = hash & ((1L << 36) - 1);
255+
// index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
256+
h ^= (int) ((hh >>> (36 - 18 * index)) & segmentLengthMask);
257+
return (int) h;
258+
}
259+
260+
private int fingerprint(long hash) {
261+
return (int) hash;
262+
}
263+
264+
}

fastfilter/src/test/java/org/fastfilter/TestFilterType.java

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111
import org.fastfilter.gcs.GolombCompressedSet;
1212
import org.fastfilter.gcs.GolombCompressedSet2;
1313
import org.fastfilter.mphf.MPHFilter;
14-
import org.fastfilter.xor.Xor16;
15-
import org.fastfilter.xor.Xor8;
16-
import org.fastfilter.xor.XorBinaryFuse8;
17-
import org.fastfilter.xor.XorSimple;
18-
import org.fastfilter.xor.XorSimple2;
14+
import org.fastfilter.xor.*;
1915
import org.fastfilter.xorplus.XorPlus8;
2016

2117
/**
@@ -100,6 +96,12 @@ public Filter construct(long[] keys, int setting) {
10096
return XorBinaryFuse8.construct(keys);
10197
}
10298
},
99+
// Test filter type that builds a XorBinaryFuse32 from the keys; the setting argument is not used.
XOR_BINARY_FUSE_32 {
100+
@Override
101+
public Filter construct(long[] keys, int setting) {
102+
return XorBinaryFuse32.construct(keys);
103+
}
104+
},
103105
CUCKOO_8 {
104106
@Override
105107
public Filter construct(long[] keys, int setting) {
@@ -146,7 +148,7 @@ public Filter construct(long[] keys, int setting) {
146148
/**
147149
* Construct the filter with the given keys and the setting.
148150
*
149-
* @param keys the keys
151+
* @param keys the keys
150152
* @param setting the setting (roughly bits per fingerprint)
151153
* @return the constructed filter
152154
*/

fastfilter/src/test/java/org/fastfilter/xor/SmallSetTest.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ public void small() {
1313
Xor8.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1414
Xor16.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1515
XorBinaryFuse8.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
16+
XorBinaryFuse32.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1617
XorSimple.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1718
XorSimple2.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1819
}
@@ -27,7 +28,29 @@ public void verySmallSizes() {
2728
testWithSize(n);
2829
}
2930
}
30-
31+
32+
33+
@Test
34+
public void smallSizes32() {
35+
long lastTime = System.currentTimeMillis();
36+
for (int n = 1; n < 1_500_000; n = (int) ((n * 1.01) + 7)) {
37+
XorBinaryFuse32 f = testWithSize32(n);
38+
long now = System.currentTimeMillis();
39+
if (now - lastTime > 5000) {
40+
lastTime = now;
41+
System.out.println("n=" + n + " " + f.toString());
42+
}
43+
}
44+
}
45+
46+
private static XorBinaryFuse32 testWithSize32(int n) {
47+
long[] keys = new long[n];
48+
for (int i = 0; i < n; i++) {
49+
keys[i] = i;
50+
}
51+
return XorBinaryFuse32.construct(keys);
52+
}
53+
3154
@Test
3255
public void smallSizes() {
3356
long lastTime = System.currentTimeMillis();

0 commit comments

Comments
 (0)