55import org .fastfilter .utils .Hash ;
66import org .fastfilter .utils .RandomGenerator ;
77
8+ /**
9+ * Calculates the best segment length for various set sizes, and the probability
10+ * of a successful mapping, for the fuse filter. Of particular interest are
11+ * "small" set sizes between 100 and 1 million.
12+ *
13+ * See also "Dense Peelable Random Uniform Hypergraphs"
14+ */
815public class ProbabilityFuse {
9-
16+
1017 private static final int HASHES = 3 ;
11- private static final int FUSE_ARITY = 3 ;
12- private static final int FUSE_SEGMENT_COUNT = 100 ;
13- private static final int FUSE_SLOTS = FUSE_SEGMENT_COUNT + FUSE_ARITY - 1 ;
18+
19+ // size 10 load 0.40 segmentLength 8 bits/key 20.0 p 0.90
20+ // size 100 load 0.60 segmentLength 32 bits/key 13.3 p 0.93
21+ // size 1000 load 0.70 segmentLength 64 bits/key 11.4 p 0.89
22+ // size 10000 load 0.80 segmentLength 256 bits/key 10.0 p 0.86
23+ // size 100000 load 0.85 segmentLength 1024 bits/key 9.4 p 0.98
1424
1525 public static void main (String ... args ) {
16- for (int size = 10 ; size < 500000 ; size *= 1.1 ) {
17- System .out .print ("size " + size );
18- double start = Math .max (0.1 , Math .min (0.8 , Math .log10 (size / 100 ) /4 ));
19- double change = 0.1 ;
20- int lastDirection = 1 ;
21- double p = 0 ;
22- double factor = start + 0.1 ;
23- for (; factor > 0.0 ;) {
24- int successCount = 0 ;
25- int testCount = Math .max (10 , 1000000 / size );
26- for (int seed = 0 ; seed < testCount ; seed ++) {
27- long [] keys = new long [size ];
28- RandomGenerator .createRandomUniqueListFast (keys , seed );
29- boolean success = testMapping (keys , factor , seed );
30- if (success ) {
31- successCount ++;
32- }
26+ for (int size = 1 ; size < 1_000_000 ; size *= 10 ) {
27+ // for(int size = 1; size < 1_000_000; size = (size < 100) ? (size + 1) : (int) (size * 1.1)) {
28+ Data best = null ;
29+ for (int segmentLengthBits = 3 ; segmentLengthBits < 14 ; segmentLengthBits ++) {
30+ int segmentLength = 1 << segmentLengthBits ;
31+ if (segmentLength > size ) {
32+ break ;
3333 }
34- p = 1.0 * successCount / testCount ;
35- double minP = 0.01 ;
36- if (p < minP && factor > 0.1 ) {
37- factor -= change ;
38- if (lastDirection != -1 ) {
39- lastDirection = -1 ;
40- change = change / 2 ;
41- }
42- } else if (p > minP * 1.1 ) {
43- if (change < 0.0001 ) {
34+ for (double load = 0.85 ; load > 0.3 ; load -= 0.05 ) {
35+ Data d = getProbability (size , segmentLengthBits , load , best );
36+ if (d != null && d .p > 0.85 ) {
37+ if (best == null || d .bitsPerKey < best .bitsPerKey ) {
38+ best = d ;
39+ }
4440 break ;
4541 }
46- if (factor > 0.8 ) {
47- break ;
48- }
49- factor += change ;
50- if (lastDirection != 1 ) {
51- lastDirection = 1 ;
52- change = change / 2 ;
53- }
54- } else {
55- break ;
5642 }
57- // System.out.printf(Locale.ENGLISH, " %2.5f %2.3f %2.20f\n", factor, p, change);
5843 }
59- System .out .printf (Locale .ENGLISH , " %2.5f %2.3f\n " , factor , p );
44+ if (best != null ) {
45+ System .out .println (best );
46+ }
6047 }
6148 }
6249
63- /**
64- * Get the fill rate for a certain size for a 95% probability.
65- *
66- * @param size the size
67- * @return the factor
68- */
69- public static double getFactor ( int size ) {
70- if (size < 100 ) {
71- return 0.13 ;
50+ static Data getProbability ( int size , int segmentLengthBits , double load , Data best ) {
51+ int segmentLength = 1 << segmentLengthBits ;
52+ int arrayLength = ( int ) ( size / load );
53+ if ( arrayLength <= 0 ) {
54+ return null ;
55+ }
56+ int segmentCount = ( arrayLength - 2 * segmentLength ) / segmentLength ;
57+ if (segmentCount <= 0 ) {
58+ return null ;
7259 }
73- if (size > 170000 ) {
74- return 0.879 ;
60+ Data d = new Data ();
61+ d .size = size ;
62+ d .load = load ;
63+ d .segmentLength = segmentLength ;
64+ d .bitsPerKey = (double ) arrayLength * 8 / size ;
65+ if (best != null && d .bitsPerKey > best .bitsPerKey ) {
66+ return null ;
7567 }
76- // this formula is weird, using cosine and log base 10, but it works.
77- // it was found manually trying to fit the curve
78- return Math .cos (Math .log10 (size ) / 1.2 - 4.7 ) / 2.7 + 0.5 ;
68+ // System.out.println(" test " + d);
69+ int successCount = 0 ;
70+ int testCount = Math .max (10 , 10_000_000 / size );
71+ for (int seed = 0 ; seed < testCount ; seed ++) {
72+ long [] keys = new long [size ];
73+ RandomGenerator .createRandomUniqueListFast (keys , seed );
74+ boolean success = testMapping (keys , segmentLengthBits , segmentCount , arrayLength , seed );
75+ if (success ) {
76+ successCount ++;
77+ }
78+ }
79+ double p = 1.0 * successCount / testCount ;
80+ d .p = p ;
81+ return d ;
7982 }
80-
81- public static boolean testMapping (long [] keys , double factor , long seed ) {
83+
84+ public static boolean testMapping (long [] keys , int segmentLengthBits , int segmentCount , int arrayLength , long seed ) {
85+ int segmentLength = 1 << segmentLengthBits ;
8286 int size = keys .length ;
83- int arrayLength = getArrayLength (size , factor );
84- int segmentLength = arrayLength / FUSE_SLOTS ;
8587 int m = arrayLength ;
8688 long [] reverseOrder = new long [size ];
8789 byte [] reverseH = new byte [size ];
@@ -91,7 +93,7 @@ public static boolean testMapping(long[] keys, double factor, long seed) {
9193 long [] t2 = new long [m ];
9294 for (long k : keys ) {
9395 for (int hi = 0 ; hi < HASHES ; hi ++) {
94- int h = getHash (segmentLength , k , seed , hi );
96+ int h = getHash (segmentLengthBits , segmentLength , segmentCount , k , seed , hi );
9597 t2 [h ] ^= k ;
9698 if (t2count [h ] > 120 ) {
9799 // probably something wrong with the hash function
@@ -120,7 +122,7 @@ public static boolean testMapping(long[] keys, double factor, long seed) {
120122 --t2count [i ];
121123 long k = t2 [i ];
122124 for (int hi = 0 ; hi < HASHES ; hi ++) {
123- int h = getHash (segmentLength , k , seed , hi );
125+ int h = getHash (segmentLengthBits , segmentLength , segmentCount , k , seed , hi );
124126 int newCount = --t2count [h ];
125127 if (h == i ) {
126128 found = hi ;
@@ -137,19 +139,30 @@ public static boolean testMapping(long[] keys, double factor, long seed) {
137139 }
138140 return reverseOrderPos == size ;
139141 }
140-
141- private static int getHash (int segmentLength , long key , long seed , int index ) {
142+
143+ private static int getHash (int segmentLengthBits , int segmentLength , int segmentCount , long key , long seed , int index ) {
142144 long hash = Hash .hash64 (key , seed );
143- int r0 = (int ) ((0xBF58476D1CE4E5B9L * hash ) >> 32 );
144- int seg = Hash .reduce (r0 , FUSE_SEGMENT_COUNT );
145- int r = (int ) Long .rotateLeft (hash , 21 * index );
146- return (seg + index ) * segmentLength + Hash .reduce (r , segmentLength );
145+ int seg = Hash .reduce ((int ) hash , segmentCount );
146+ long hh = (hash ^ (hash >>> 32 ));
147+ int h0 = (seg + 0 ) * segmentLength + (int ) ((hh >> (0 * segmentLengthBits )) & (segmentLength - 1 ));
148+ int h1 = (seg + 1 ) * segmentLength + (int ) ((hh >> (1 * segmentLengthBits )) & (segmentLength - 1 ));
149+ int h2 = (seg + 2 ) * segmentLength + (int ) ((hh >> (2 * segmentLengthBits )) & (segmentLength - 1 ));
150+ return index == 0 ? h0 : index == 1 ? h1 : h2 ;
147151 }
148152
149- private static int getArrayLength (int size , double factor ) {
150- int capacity = (int ) (1.0 / factor * size ) + 64 ;
151- capacity = (capacity + FUSE_SLOTS - 1 ) / FUSE_SLOTS * FUSE_SLOTS ;
152- return capacity ;
153- }
153+ static class Data {
154+ int size ;
155+ double load ;
156+ int segmentLength ;
157+ double bitsPerKey ;
158+ double p ;
159+
160+ public String toString () {
161+ return String .format (Locale .ENGLISH , "size %d load %.2f " +
162+ "segmentLength %d bits/key %.1f p %.2f"
163+ , size , load , segmentLength , bitsPerKey , p );
164+ }
165+
166+ }
154167
155168}
0 commit comments