1616// under the License.
1717
1818use super :: BloomFilter ;
19+ use crate :: codec:: family:: Family ;
1920use crate :: hash:: DEFAULT_UPDATE_SEED ;
2021
21- const MIN_NUM_BITS : u64 = 64 ;
22- const MAX_NUM_BITS : u64 = ( 1u64 << 35 ) - 64 ; // ~32 GB - reasonable limit
23-
2422/// Builder for creating [`BloomFilter`] instances.
2523///
2624/// Provides two construction modes:
27- /// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
25+ /// * [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
2826/// (recommended)
29- /// - [`with_size()`](Self::with_size): Specify exact bit count and hash functions (manual)
27+ /// * [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
3028#[ derive( Debug , Clone ) ]
3129pub struct BloomFilterBuilder {
3230 num_bits : u64 ,
@@ -35,19 +33,31 @@ pub struct BloomFilterBuilder {
3533}
3634
3735impl BloomFilterBuilder {
36+ /// Minimum allowed requested Bloom filter size, in bits.
37+ pub const MIN_NUM_BITS : u64 = 1 ;
38+ /// Maximum allowed requested Bloom filter size, in bits.
39+ ///
40+ /// Derived from serialization limits so the encoded sketch length fits in a signed 32-bit size
41+ /// field.
42+ pub const MAX_NUM_BITS : u64 = ( i32:: MAX as u64 - Family :: BLOOMFILTER . max_pre_longs as u64 ) * 64 ;
43+ /// Minimum allowed number of hash functions.
44+ pub const MIN_NUM_HASHES : u16 = 1 ;
45+ /// Maximum allowed number of hash functions.
46+ pub const MAX_NUM_HASHES : u16 = i16:: MAX as u16 ;
47+
3848 /// Creates a builder with optimal parameters for a target accuracy.
3949 ///
4050 /// Automatically calculates the optimal number of bits and hash functions
4151 /// to achieve the desired false positive probability for a given number of items.
4252 ///
4353 /// # Arguments
4454 ///
45- /// - `max_items`: Maximum expected number of distinct items
46- /// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
55+ /// * `max_items`: Maximum expected number of distinct items
56+ /// * `fpp`: Target false positive probability (e.g., 0.01 for 1%)
4757 ///
4858 /// # Panics
4959 ///
50- /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0) .
60+ /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0] .
5161 ///
5262 /// # Examples
5363 ///
@@ -61,8 +71,8 @@ impl BloomFilterBuilder {
6171 pub fn with_accuracy ( max_items : u64 , fpp : f64 ) -> Self {
6272 assert ! ( max_items > 0 , "max_items must be greater than 0" ) ;
6373 assert ! (
64- fpp > 0.0 && fpp < 1.0 ,
65- "fpp must be between 0.0 and 1.0 (exclusive )"
74+ fpp > 0.0 && fpp <= 1.0 ,
75+ "fpp must be between 0.0 and 1.0 (inclusive of 1.0 )"
6676 ) ;
6777
6878 let num_bits = Self :: suggest_num_bits ( max_items, fpp) ;
@@ -77,19 +87,22 @@ impl BloomFilterBuilder {
7787
7888 /// Creates a builder with manual size specification.
7989 ///
80- /// Use this when you want precise control over the filter size,
90+ /// Use this when you want precise control over the requested filter size,
8191 /// or when working with pre-calculated parameters.
8292 ///
93+ /// The underlying storage is word-based, so the actual capacity is rounded
94+ /// up to the next multiple of 64 bits.
95+ ///
8396 /// # Arguments
8497 ///
85- /// - `num_bits`: Total number of bits in the filter
86- /// - `num_hashes`: Number of hash functions to use
98+ /// * `num_bits`: Total number of bits in the filter
99+ /// * `num_hashes`: Number of hash functions to use
87100 ///
88101 /// # Panics
89102 ///
90- /// Panics if:
91- /// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
92- /// - `num_hashes` < 1 or `num_hashes` > 100
103+ /// Panics if any of :
104+ /// * `num_bits` < [`Self:: MIN_NUM_BITS`] or `num_bits` > [`Self:: MAX_NUM_BITS`]
105+ /// * `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MAX_NUM_HASHES`]
93106 ///
94107 /// # Examples
95108 ///
@@ -99,17 +112,19 @@ impl BloomFilterBuilder {
99112 /// ```
100113 pub fn with_size ( num_bits : u64 , num_hashes : u16 ) -> Self {
101114 assert ! (
102- num_bits >= MIN_NUM_BITS ,
103- "num_bits must be at least {}" ,
104- MIN_NUM_BITS
115+ ( Self :: MIN_NUM_BITS ..=Self :: MAX_NUM_BITS ) . contains( & num_bits) ,
116+ "num_bits must be between {} and {}, got {}" ,
117+ Self :: MIN_NUM_BITS ,
118+ Self :: MAX_NUM_BITS ,
119+ num_bits,
105120 ) ;
106121 assert ! (
107- num_bits <= MAX_NUM_BITS ,
108- "num_bits must not exceed {}" ,
109- MAX_NUM_BITS
122+ ( Self :: MIN_NUM_HASHES ..=Self :: MAX_NUM_HASHES ) . contains( & num_hashes) ,
123+ "num_hashes must be between {} and {}, got {}" ,
124+ Self :: MIN_NUM_HASHES ,
125+ Self :: MAX_NUM_HASHES ,
126+ num_hashes
110127 ) ;
111- assert ! ( num_hashes >= 1 , "num_hashes must be at least 1" ) ;
112- assert ! ( num_hashes <= 100 , "num_hashes must not exceed 100" ) ;
113128
114129 BloomFilterBuilder {
115130 num_bits,
@@ -141,16 +156,13 @@ impl BloomFilterBuilder {
141156 ///
142157 /// Panics if neither `with_accuracy()` nor `with_size()` was called.
143158 pub fn build ( self ) -> BloomFilter {
144- let capacity_bits = self . num_bits ;
145159 let num_hashes = self . num_hashes ;
146-
147- let num_words = capacity_bits. div_ceil ( 64 ) as usize ;
148- let bit_array = vec ! [ 0u64 ; num_words] ;
160+ let num_words = self . num_bits . div_ceil ( 64 ) as usize ;
161+ let bit_array = vec ! [ 0u64 ; num_words] . into_boxed_slice ( ) ;
149162
150163 BloomFilter {
151164 seed : self . seed ,
152165 num_hashes,
153- capacity_bits,
154166 num_bits_set : 0 ,
155167 bit_array,
156168 }
@@ -175,10 +187,7 @@ impl BloomFilterBuilder {
175187
176188 let bits = ( -n * p. ln ( ) / ln2_squared) . ceil ( ) as u64 ;
177189
178- // Round up to multiple of 64 for efficiency
179- let bits = bits. div_ceil ( 64 ) * 64 ;
180-
181- bits. clamp ( MIN_NUM_BITS , MAX_NUM_BITS )
190+ bits. clamp ( Self :: MIN_NUM_BITS , Self :: MAX_NUM_BITS )
182191 }
183192
184193 /// Suggests optimal number of hash functions given max items and bit count.
@@ -197,9 +206,12 @@ impl BloomFilterBuilder {
197206 let m = num_bits as f64 ;
198207 let n = max_items as f64 ;
199208
200- let k = ( m / n * std:: f64:: consts:: LN_2 ) . round ( ) ;
201-
202- ( k as u16 ) . clamp ( 1 , 100 ) // Reasonable bounds
209+ // Ceil to avoid selecting too few hashes.
210+ let k = ( m / n * std:: f64:: consts:: LN_2 ) . ceil ( ) ;
211+ k. clamp (
212+ f64:: from ( Self :: MIN_NUM_HASHES ) ,
213+ f64:: from ( Self :: MAX_NUM_HASHES ) ,
214+ ) as u16
203215 }
204216
205217 /// Suggests optimal number of hash functions from target FPP.
@@ -215,7 +227,11 @@ impl BloomFilterBuilder {
215227 /// assert_eq!(hashes, 7); // -log2(0.01) ≈ 6.64
216228 /// ```
217229 pub fn suggest_num_hashes_from_fpp ( fpp : f64 ) -> u16 {
230+ // Ceil to avoid selecting too few hashes.
218231 let k = -fpp. log2 ( ) ;
219- ( k. round ( ) as u16 ) . clamp ( 1 , 100 )
232+ k. ceil ( ) . clamp (
233+ f64:: from ( Self :: MIN_NUM_HASHES ) ,
234+ f64:: from ( Self :: MAX_NUM_HASHES ) ,
235+ ) as u16
220236 }
221237}
0 commit comments