Skip to content

Commit 399fc9f

Browse files
committed
C-fuse filters, but density is lower only at the end of the array (not at the start)
1 parent cd62dab commit 399fc9f

File tree

1 file changed

+263
-0
lines changed

1 file changed

+263
-0
lines changed
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
package org.fastfilter.xor;
2+
3+
import java.util.Locale;
4+
5+
import org.fastfilter.utils.Hash;
6+
import org.fastfilter.utils.RandomGenerator;
7+
8+
/**
9+
* Calculate the best segment length for various sizes, and the probability of
10+
* mapping, for the fuse filter. Especially interesting are "small" set sizes
11+
* between 100 and 1 million.
12+
*
13+
* Unlike the regular fuse filter, which has less density at the beginning and
14+
* the end of the array, here density is only lower at the end of the array.
15+
* Index computation is slower, but load should be slightly higher.
16+
*
17+
* See also "Peeling Close to the Orientability Threshold - Spatial Coupling in
18+
* Hashing-Based Data Structures"
19+
*
20+
*/
21+
public class ProbabilityCFuse2 {
22+
23+
// Number of array positions computed per key; testMapping calls getHash with index 0 .. HASHES - 1.
private static final int HASHES = 3;
24+
// Number of low hash bits that fingerprint() keeps (its mask is (1 << BITS_PER_FINGERPRINT) - 1).
private static final int BITS_PER_FINGERPRINT = 8;
25+
26+
// size 100 load 0.45 segmentLength 64 bits/key 17.8 p 0.86
27+
// size 1000 load 0.70 segmentLength 256 bits/key 11.4 p 0.88
28+
// size 10000 load 0.80 segmentLength 1024 bits/key 10.0 p 0.90
29+
// size 100000 load 0.85 segmentLength 4096 bits/key 9.4 p 0.87
30+
31+
private static void testProb() {
32+
int segmentLength = 16;
33+
int segmentCount = 100;
34+
int[] counts = new int[200];
35+
for(int i=0; i<1000000; i++) {
36+
for(int index = 0; index < 3; index++) {
37+
long seed = 0;
38+
long key = i;
39+
long hash = Hash.hash64(key, seed);
40+
41+
// int r0 = (int) Hash.hash64(hash, 1);
42+
// int x = Hash.reduce(r0, segmentCount);
43+
// int h0 = x + (int) (Hash.hash64(hash, 2) & (segmentLength - 1));
44+
// int h1 = x + (int) (Hash.hash64(hash, 3) & (segmentLength - 1));
45+
// int h2 = x + (int) (Hash.hash64(hash, 4) & (segmentLength - 1));
46+
//
47+
int r0 = (int) Hash.hash64(hash, 1);
48+
int x = Hash.reduce(r0, segmentCount * 2 + segmentLength - 1);
49+
int h0 = x + (int) (Hash.hash64(hash, 2) & (segmentLength - 1));
50+
int h1 = x + (int) (Hash.hash64(hash, 3) & (segmentLength - 1));
51+
int h2 = x + (int) (Hash.hash64(hash, 4) & (segmentLength - 1));
52+
h0 = Math.abs(h0 - segmentCount - segmentLength + 1);
53+
h1 = Math.abs(h1 - segmentCount - segmentLength + 1);
54+
h2 = Math.abs(h2 - segmentCount - segmentLength + 1);
55+
int idx = index == 0 ? h0 : index == 1 ? h1 : h2;
56+
counts[idx]++;
57+
}
58+
}
59+
for(int i=0; i<counts.length; i++) {
60+
System.out.println(i + " " + counts[i]);
61+
}
62+
63+
}
64+
65+
public static void main(String... args) {
66+
//testProb();
67+
//if(true)return;
68+
69+
70+
// for(int size = 100_000; size < 1_000_000; size *= 10) {
71+
for(int size = 1; size < 1_000_000; size *= 10) {
72+
// for(int size = 1; size < 1_000_000; size = (size < 100) ? (size + 1) : (int) (size * 1.1)) {
73+
Data best = null;
74+
for (int segmentLengthBits = 3; segmentLengthBits <= 12; segmentLengthBits++) {
75+
// for (int segmentLengthBits = 3; segmentLengthBits < 14; segmentLengthBits++) {
76+
int segmentLength = 1 << segmentLengthBits;
77+
if (segmentLength > size) {
78+
break;
79+
}
80+
for(double load = 0.85; load > 0.3; load-= 0.05) {
81+
Data d = getProbability(size, segmentLengthBits, load, best);
82+
if (d != null && d.p > 0.85) {
83+
if (best == null || d.bitsPerKey < best.bitsPerKey) {
84+
best = d;
85+
}
86+
break;
87+
}
88+
}
89+
}
90+
if (best != null) {
91+
System.out.println(best);
92+
// for(int i=0; i<100; i++) {
93+
// System.out.println(i + ": " + best.data[i]);
94+
// }
95+
}
96+
}
97+
}
98+
99+
static Data getProbability(int size, int segmentLengthBits, double load, Data best) {
100+
int segmentLength = 1 << segmentLengthBits;
101+
int arrayLength = (int) (size / load);
102+
if (arrayLength <= 0) {
103+
return null;
104+
}
105+
int segmentCount = arrayLength - 1 * segmentLength;
106+
if (segmentCount <= 0) {
107+
return null;
108+
}
109+
Data d = new Data();
110+
d.size = size;
111+
d.load = load;
112+
d.segmentLength = segmentLength;
113+
d.bitsPerKey = (double) arrayLength * 8 / size;
114+
if (best != null && d.bitsPerKey > best.bitsPerKey) {
115+
return null;
116+
}
117+
// System.out.println(" test " + d);
118+
int successCount = 0;
119+
int testCount = Math.max(10, 10_000_000 / size);
120+
for(int seed = 0; seed < testCount; seed++) {
121+
long[] keys = new long[size];
122+
RandomGenerator.createRandomUniqueListFast(keys, seed);
123+
int[] success = testMapping(keys, segmentLengthBits, segmentCount, arrayLength, seed);
124+
if (success != null) {
125+
d.data = success;
126+
successCount++;
127+
}
128+
}
129+
double p = 1.0 * successCount / testCount;
130+
d.p = p;
131+
return d;
132+
}
133+
134+
public static int[] testMapping(long[] keys, int segmentLengthBits, int segmentCount, int arrayLength, long seed) {
135+
int segmentLength = 1 << segmentLengthBits;
136+
int size = keys.length;
137+
int m = arrayLength;
138+
long[] reverseOrder = new long[size];
139+
byte[] reverseH = new byte[size];
140+
int reverseOrderPos;
141+
seed = Hash.randomSeed();
142+
byte[] t2count = new byte[m];
143+
long[] t2 = new long[m];
144+
for (long k : keys) {
145+
for (int hi = 0; hi < HASHES; hi++) {
146+
int h = getHash(segmentLengthBits, segmentLength, segmentCount, k, seed, hi);
147+
t2[h] ^= k;
148+
if (t2count[h] > 120) {
149+
// probably something wrong with the hash function
150+
throw new IllegalArgumentException();
151+
}
152+
t2count[h]++;
153+
}
154+
}
155+
reverseOrderPos = 0;
156+
int[] alone = new int[arrayLength];
157+
int alonePos = 0;
158+
for (int i = 0; i < arrayLength; i++) {
159+
if (t2count[ i] == 1) {
160+
alone[alonePos++] = i;
161+
}
162+
}
163+
int found = -1;
164+
while (alonePos > 0) {
165+
int i = alone[--alonePos];
166+
if (t2count[i] <= 0) {
167+
continue;
168+
}
169+
if (t2count[i] != 1) {
170+
throw new AssertionError();
171+
}
172+
--t2count[i];
173+
long k = t2[i];
174+
for (int hi = 0; hi < HASHES; hi++) {
175+
int h = getHash(segmentLengthBits, segmentLength, segmentCount, k, seed, hi);
176+
int newCount = --t2count[h];
177+
if (h == i) {
178+
found = hi;
179+
} else {
180+
if (newCount == 1) {
181+
alone[alonePos++] = h;
182+
}
183+
t2[h] ^= k;
184+
}
185+
}
186+
reverseOrder[reverseOrderPos] = k;
187+
reverseH[reverseOrderPos] = (byte) found;
188+
reverseOrderPos++;
189+
}
190+
if (reverseOrderPos != size) {
191+
return null;
192+
}
193+
byte[] fp = new byte[m];
194+
for (int i = reverseOrderPos - 1; i >= 0; i--) {
195+
long k = reverseOrder[i];
196+
found = reverseH[i];
197+
int change = -1;
198+
long hash = Hash.hash64(k, seed);
199+
int xor = fingerprint(hash);
200+
for (int hi = 0; hi < HASHES; hi++) {
201+
int h = getHash(segmentLengthBits, segmentLength, segmentCount, k, seed, hi);
202+
if (found == hi) {
203+
change = h;
204+
} else {
205+
xor ^= fp[h];
206+
}
207+
}
208+
fp[change] = (byte) xor;
209+
}
210+
int[] nonZero = new int[100];
211+
for(int i=0; i<fp.length; i++) {
212+
if (fp[i] != 0) {
213+
nonZero[i * 100 / fp.length]++;
214+
}
215+
}
216+
return nonZero;
217+
}
218+
219+
private static int fingerprint(long hash) {
220+
return (int) (hash & ((1 << BITS_PER_FINGERPRINT) - 1));
221+
}
222+
223+
private static int getHash(int segmentLengthBits, int segmentLength, int segmentCount, long key, long seed, int index) {
224+
long hash = Hash.hash64(key, seed);
225+
int r0 = (int) Hash.hash64(hash, 1);
226+
/*
227+
int x = Hash.reduce(r0, segmentCount * 2 - 1);
228+
int h0 = x + (int) (Hash.hash64(hash, 2) & (2 * segmentLength - 1));
229+
int h1 = x + (int) (Hash.hash64(hash, 3) & (2 * segmentLength - 1));
230+
int h2 = x + (int) (Hash.hash64(hash, 4) & (2 * segmentLength - 1));
231+
h0 = Math.abs(h0 - segmentCount - segmentLength + 1);
232+
h1 = Math.abs(h1 - segmentCount - segmentLength + 1);
233+
h2 = Math.abs(h2 - segmentCount - segmentLength + 1);
234+
*/
235+
236+
int x = Hash.reduce(r0, segmentCount * 2 + segmentLength - 1);
237+
int h0 = x + (int) (Hash.hash64(hash, 2) & (segmentLength - 1));
238+
int h1 = x + (int) (Hash.hash64(hash, 3) & (segmentLength - 1));
239+
int h2 = x + (int) (Hash.hash64(hash, 4) & (segmentLength - 1));
240+
h0 = Math.abs(h0 - segmentCount - segmentLength + 1);
241+
h1 = Math.abs(h1 - segmentCount - segmentLength + 1);
242+
h2 = Math.abs(h2 - segmentCount - segmentLength + 1);
243+
244+
return index == 0 ? h0 : index == 1 ? h1 : h2;
245+
}
246+
247+
static class Data {
248+
int size;
249+
double load;
250+
int segmentLength;
251+
double bitsPerKey;
252+
double p;
253+
int[] data;
254+
255+
public String toString() {
256+
return String.format(Locale.ENGLISH, "size %d load %.2f " +
257+
"segmentLength %d bits/key %.1f p %.2f"
258+
, size, load, segmentLength, bitsPerKey, p);
259+
}
260+
261+
}
262+
263+
}

0 commit comments

Comments
 (0)