Skip to content

Commit cd62dab

Browse files
committed
Calculating the mapping probability and best segment length for fuse and c-fuse filters
1 parent 84aba92 commit cd62dab

File tree

2 files changed

+255
-74
lines changed

2 files changed

+255
-74
lines changed
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
package org.fastfilter.xor;
2+
3+
import java.util.Locale;
4+
5+
import org.fastfilter.utils.Hash;
6+
import org.fastfilter.utils.RandomGenerator;
7+
8+
/**
9+
* Calculate the best segment length for various sizes, and the probability of
10+
* mapping, for the fuse filter. Specially interesting are "small" set sizes
11+
* between 100 and 1 million.
12+
*
13+
* See also "Peeling Close to the Orientability Threshold - Spatial Coupling in Hashing-Based Data Structures"
14+
15+
*/
16+
public class ProbabilityCFuse {
17+
18+
private static final int HASHES = 3;
19+
20+
// size 100 load 0.45 segmentLength 64 bits/key 17.8 p 0.88
21+
// size 1000 load 0.65 segmentLength 256 bits/key 12.3 p 0.92
22+
// size 10000 load 0.75 segmentLength 1024 bits/key 10.7 p 0.94
23+
// size 100000 load 0.80 segmentLength 4096 bits/key 10.0 p 0.92
24+
25+
public static void main(String... args) {
26+
for(int size = 1; size < 1_000_000; size *= 10) {
27+
// for(int size = 1; size < 1_000_000; size = (size < 100) ? (size + 1) : (int) (size * 1.1)) {
28+
Data best = null;
29+
for (int segmentLengthBits = 3; segmentLengthBits < 14; segmentLengthBits++) {
30+
int segmentLength = 1 << segmentLengthBits;
31+
if (segmentLength > size) {
32+
break;
33+
}
34+
for(double load = 0.85; load > 0.3; load-= 0.05) {
35+
Data d = getProbability(size, segmentLengthBits, load, best);
36+
if (d != null && d.p > 0.85) {
37+
if (best == null || d.bitsPerKey < best.bitsPerKey) {
38+
best = d;
39+
}
40+
break;
41+
}
42+
}
43+
}
44+
if (best != null) {
45+
System.out.println(best);
46+
}
47+
}
48+
}
49+
50+
static Data getProbability(int size, int segmentLengthBits, double load, Data best) {
51+
int segmentLength = 1 << segmentLengthBits;
52+
int arrayLength = (int) (size / load);
53+
if (arrayLength <= 0) {
54+
return null;
55+
}
56+
int segmentCount = arrayLength - 1 * segmentLength;
57+
if (segmentCount <= 0) {
58+
return null;
59+
}
60+
Data d = new Data();
61+
d.size = size;
62+
d.load = load;
63+
d.segmentLength = segmentLength;
64+
d.bitsPerKey = (double) arrayLength * 8 / size;
65+
if (best != null && d.bitsPerKey > best.bitsPerKey) {
66+
return null;
67+
}
68+
// System.out.println(" test " + d);
69+
int successCount = 0;
70+
int testCount = Math.max(10, 10_000_000 / size);
71+
for(int seed = 0; seed < testCount; seed++) {
72+
long[] keys = new long[size];
73+
RandomGenerator.createRandomUniqueListFast(keys, seed);
74+
boolean success = testMapping(keys, segmentLengthBits, segmentCount, arrayLength, seed);
75+
if (success) {
76+
successCount++;
77+
}
78+
}
79+
double p = 1.0 * successCount / testCount;
80+
d.p = p;
81+
return d;
82+
}
83+
84+
public static boolean testMapping(long[] keys, int segmentLengthBits, int segmentCount, int arrayLength, long seed) {
85+
int segmentLength = 1 << segmentLengthBits;
86+
int size = keys.length;
87+
int m = arrayLength;
88+
long[] reverseOrder = new long[size];
89+
byte[] reverseH = new byte[size];
90+
int reverseOrderPos;
91+
seed = Hash.randomSeed();
92+
byte[] t2count = new byte[m];
93+
long[] t2 = new long[m];
94+
for (long k : keys) {
95+
for (int hi = 0; hi < HASHES; hi++) {
96+
int h = getHash(segmentLengthBits, segmentLength, segmentCount, k, seed, hi);
97+
t2[h] ^= k;
98+
if (t2count[h] > 120) {
99+
// probably something wrong with the hash function
100+
throw new IllegalArgumentException();
101+
}
102+
t2count[h]++;
103+
}
104+
}
105+
reverseOrderPos = 0;
106+
int[] alone = new int[arrayLength];
107+
int alonePos = 0;
108+
for (int i = 0; i < arrayLength; i++) {
109+
if (t2count[ i] == 1) {
110+
alone[alonePos++] = i;
111+
}
112+
}
113+
int found = -1;
114+
while (alonePos > 0) {
115+
int i = alone[--alonePos];
116+
if (t2count[i] <= 0) {
117+
continue;
118+
}
119+
if (t2count[i] != 1) {
120+
throw new AssertionError();
121+
}
122+
--t2count[i];
123+
long k = t2[i];
124+
for (int hi = 0; hi < HASHES; hi++) {
125+
int h = getHash(segmentLengthBits, segmentLength, segmentCount, k, seed, hi);
126+
int newCount = --t2count[h];
127+
if (h == i) {
128+
found = hi;
129+
} else {
130+
if (newCount == 1) {
131+
alone[alonePos++] = h;
132+
}
133+
t2[h] ^= k;
134+
}
135+
}
136+
reverseOrder[reverseOrderPos] = k;
137+
reverseH[reverseOrderPos] = (byte) found;
138+
reverseOrderPos++;
139+
}
140+
return reverseOrderPos == size;
141+
}
142+
143+
private static int getHash(int segmentLengthBits, int segmentLength, int segmentCount, long key, long seed, int index) {
144+
long hash = Hash.hash64(key, seed);
145+
int r0 = (int) Hash.hash64(hash, 1);
146+
int x = Hash.reduce(r0, segmentCount);
147+
int h0 = x + (int) (Hash.hash64(hash, 2) & (segmentLength - 1));
148+
int h1 = x + (int) (Hash.hash64(hash, 3) & (segmentLength - 1));
149+
int h2 = x + (int) (Hash.hash64(hash, 4) & (segmentLength - 1));
150+
return index == 0 ? h0 : index == 1 ? h1 : h2;
151+
}
152+
153+
static class Data {
154+
int size;
155+
double load;
156+
int segmentLength;
157+
double bitsPerKey;
158+
double p;
159+
160+
public String toString() {
161+
return String.format(Locale.ENGLISH, "size %d load %.2f " +
162+
"segmentLength %d bits/key %.1f p %.2f"
163+
, size, load, segmentLength, bitsPerKey, p);
164+
}
165+
166+
}
167+
168+
}

fastfilter/src/test/java/org/fastfilter/xor/ProbabilityFuse.java

Lines changed: 87 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -5,83 +5,85 @@
55
import org.fastfilter.utils.Hash;
66
import org.fastfilter.utils.RandomGenerator;
77

8+
/**
9+
* Calculate the best segment length for various sizes, and the probability of
10+
* mapping, for the fuse filter. Especially interesting are "small" set sizes
11+
* between 100 and 1 million.
12+
*
13+
* See also "Dense Peelable Random Uniform Hypergraphs"
14+
*/
815
public class ProbabilityFuse {
9-
16+
1017
private static final int HASHES = 3;
11-
private static final int FUSE_ARITY = 3;
12-
private static final int FUSE_SEGMENT_COUNT = 100;
13-
private static final int FUSE_SLOTS = FUSE_SEGMENT_COUNT + FUSE_ARITY - 1;
18+
19+
// size 10 load 0.40 segmentLength 8 bits/key 20.0 p 0.90
20+
// size 100 load 0.60 segmentLength 32 bits/key 13.3 p 0.93
21+
// size 1000 load 0.70 segmentLength 64 bits/key 11.4 p 0.89
22+
// size 10000 load 0.80 segmentLength 256 bits/key 10.0 p 0.86
23+
// size 100000 load 0.85 segmentLength 1024 bits/key 9.4 p 0.98
1424

1525
public static void main(String... args) {
16-
for(int size = 10; size < 500000; size *= 1.1) {
17-
System.out.print("size " + size);
18-
double start = Math.max(0.1, Math.min(0.8, Math.log10(size / 100) /4));
19-
double change = 0.1;
20-
int lastDirection = 1;
21-
double p = 0;
22-
double factor = start + 0.1;
23-
for(; factor > 0.0;) {
24-
int successCount = 0;
25-
int testCount = Math.max(10, 1000000 / size);
26-
for(int seed = 0; seed < testCount; seed++) {
27-
long[] keys = new long[size];
28-
RandomGenerator.createRandomUniqueListFast(keys, seed);
29-
boolean success = testMapping(keys, factor, seed);
30-
if (success) {
31-
successCount++;
32-
}
26+
for(int size = 1; size < 1_000_000; size *= 10) {
27+
// for(int size = 1; size < 1_000_000; size = (size < 100) ? (size + 1) : (int) (size * 1.1)) {
28+
Data best = null;
29+
for (int segmentLengthBits = 3; segmentLengthBits < 14; segmentLengthBits++) {
30+
int segmentLength = 1 << segmentLengthBits;
31+
if (segmentLength > size) {
32+
break;
3333
}
34-
p = 1.0 * successCount / testCount;
35-
double minP = 0.01;
36-
if (p < minP && factor > 0.1) {
37-
factor -= change;
38-
if (lastDirection != -1) {
39-
lastDirection = -1;
40-
change = change / 2;
41-
}
42-
} else if (p > minP * 1.1) {
43-
if (change < 0.0001) {
34+
for(double load = 0.85; load > 0.3; load-= 0.05) {
35+
Data d = getProbability(size, segmentLengthBits, load, best);
36+
if (d != null && d.p > 0.85) {
37+
if (best == null || d.bitsPerKey < best.bitsPerKey) {
38+
best = d;
39+
}
4440
break;
4541
}
46-
if (factor > 0.8) {
47-
break;
48-
}
49-
factor += change;
50-
if (lastDirection != 1) {
51-
lastDirection = 1;
52-
change = change / 2;
53-
}
54-
} else {
55-
break;
5642
}
57-
// System.out.printf(Locale.ENGLISH, " %2.5f %2.3f %2.20f\n", factor, p, change);
5843
}
59-
System.out.printf(Locale.ENGLISH, " %2.5f %2.3f\n", factor, p);
44+
if (best != null) {
45+
System.out.println(best);
46+
}
6047
}
6148
}
6249

63-
/**
64-
* Get the fill rate for a certain size for a 95% probability.
65-
*
66-
* @param size the size
67-
* @return the factor
68-
*/
69-
public static double getFactor(int size) {
70-
if (size < 100) {
71-
return 0.13;
50+
static Data getProbability(int size, int segmentLengthBits, double load, Data best) {
51+
int segmentLength = 1 << segmentLengthBits;
52+
int arrayLength = (int) (size / load);
53+
if (arrayLength <= 0) {
54+
return null;
55+
}
56+
int segmentCount = (arrayLength - 2 * segmentLength) / segmentLength;
57+
if (segmentCount <= 0) {
58+
return null;
7259
}
73-
if (size > 170000) {
74-
return 0.879;
60+
Data d = new Data();
61+
d.size = size;
62+
d.load = load;
63+
d.segmentLength = segmentLength;
64+
d.bitsPerKey = (double) arrayLength * 8 / size;
65+
if (best != null && d.bitsPerKey > best.bitsPerKey) {
66+
return null;
7567
}
76-
// this formula is weird, using cosine and log base 10, but it works.
77-
// it was found manually trying to fit the curve
78-
return Math.cos(Math.log10(size) / 1.2 - 4.7) / 2.7 + 0.5;
68+
// System.out.println(" test " + d);
69+
int successCount = 0;
70+
int testCount = Math.max(10, 10_000_000 / size);
71+
for(int seed = 0; seed < testCount; seed++) {
72+
long[] keys = new long[size];
73+
RandomGenerator.createRandomUniqueListFast(keys, seed);
74+
boolean success = testMapping(keys, segmentLengthBits, segmentCount, arrayLength, seed);
75+
if (success) {
76+
successCount++;
77+
}
78+
}
79+
double p = 1.0 * successCount / testCount;
80+
d.p = p;
81+
return d;
7982
}
80-
81-
public static boolean testMapping(long[] keys, double factor, long seed) {
83+
84+
public static boolean testMapping(long[] keys, int segmentLengthBits, int segmentCount, int arrayLength, long seed) {
85+
int segmentLength = 1 << segmentLengthBits;
8286
int size = keys.length;
83-
int arrayLength = getArrayLength(size, factor);
84-
int segmentLength = arrayLength / FUSE_SLOTS;
8587
int m = arrayLength;
8688
long[] reverseOrder = new long[size];
8789
byte[] reverseH = new byte[size];
@@ -91,7 +93,7 @@ public static boolean testMapping(long[] keys, double factor, long seed) {
9193
long[] t2 = new long[m];
9294
for (long k : keys) {
9395
for (int hi = 0; hi < HASHES; hi++) {
94-
int h = getHash(segmentLength, k, seed, hi);
96+
int h = getHash(segmentLengthBits, segmentLength, segmentCount, k, seed, hi);
9597
t2[h] ^= k;
9698
if (t2count[h] > 120) {
9799
// probably something wrong with the hash function
@@ -120,7 +122,7 @@ public static boolean testMapping(long[] keys, double factor, long seed) {
120122
--t2count[i];
121123
long k = t2[i];
122124
for (int hi = 0; hi < HASHES; hi++) {
123-
int h = getHash(segmentLength, k, seed, hi);
125+
int h = getHash(segmentLengthBits, segmentLength, segmentCount, k, seed, hi);
124126
int newCount = --t2count[h];
125127
if (h == i) {
126128
found = hi;
@@ -137,19 +139,30 @@ public static boolean testMapping(long[] keys, double factor, long seed) {
137139
}
138140
return reverseOrderPos == size;
139141
}
140-
141-
private static int getHash(int segmentLength, long key, long seed, int index) {
142+
143+
private static int getHash(int segmentLengthBits, int segmentLength, int segmentCount, long key, long seed, int index) {
142144
long hash = Hash.hash64(key, seed);
143-
int r0 = (int) ((0xBF58476D1CE4E5B9L * hash) >> 32);
144-
int seg = Hash.reduce(r0, FUSE_SEGMENT_COUNT);
145-
int r = (int) Long.rotateLeft(hash, 21 * index);
146-
return (seg + index) * segmentLength + Hash.reduce(r, segmentLength);
145+
int seg = Hash.reduce((int) hash, segmentCount);
146+
long hh = (hash ^ (hash >>> 32));
147+
int h0 = (seg + 0) * segmentLength + (int) ((hh >> (0 * segmentLengthBits)) & (segmentLength - 1));
148+
int h1 = (seg + 1) * segmentLength + (int) ((hh >> (1 * segmentLengthBits)) & (segmentLength - 1));
149+
int h2 = (seg + 2) * segmentLength + (int) ((hh >> (2 * segmentLengthBits)) & (segmentLength - 1));
150+
return index == 0 ? h0 : index == 1 ? h1 : h2;
147151
}
148152

149-
private static int getArrayLength(int size, double factor) {
150-
int capacity = (int) (1.0 / factor * size) + 64;
151-
capacity = (capacity + FUSE_SLOTS - 1) / FUSE_SLOTS * FUSE_SLOTS;
152-
return capacity;
153-
}
153+
static class Data {
154+
int size;
155+
double load;
156+
int segmentLength;
157+
double bitsPerKey;
158+
double p;
159+
160+
public String toString() {
161+
return String.format(Locale.ENGLISH, "size %d load %.2f " +
162+
"segmentLength %d bits/key %.1f p %.2f"
163+
, size, load, segmentLength, bitsPerKey, p);
164+
}
165+
166+
}
154167

155168
}

0 commit comments

Comments
 (0)