Skip to content

Commit 45730e2

Browse files
authored
Merge branch 'main' into density
2 parents 698c143 + f0d997e commit 45730e2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+21010
-1039
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ All significant changes to this project will be documented in this file.
1313

1414
* `CountMinSketch` with unsigned values now supports `halve` and `decay` operations.
1515
* `CpcSketch` and `CpcUnion` are now available for cardinality estimation.
16-
* `FrequentItemsSketch` now supports serde for `u64` value.
16+
* `CpcWrapper` is now available for reading estimation from a serialized CpcSketch without full deserialization.
17+
* `FrequentItemsSketch` now supports serde for any value type that implements `FrequentItemValue` (with built-in support for `i64`, `u64`, and `String`).
18+
* Expose `codec::SketchBytes`, `codec::SketchSlice`, and `FrequentItemValue` as public API.
1719

1820
## v0.2.0 (2026-01-14)
1921

Cargo.lock

Lines changed: 0 additions & 59 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,6 @@
3535
[actions-badge]: https://github.com/apache/datasketches-rust/workflows/CI/badge.svg
3636
[actions-url]: https://github.com/apache/datasketches-rust/actions?query=workflow%3ACI
3737

38-
> [!WARNING]
39-
>
40-
> This repository is under early development. Use it with caution!
41-
4238
This is the core Rust component of the DataSketches library. It contains a subset of the sketching algorithms and can be accessed directly from user applications.
4339

4440
Note that we have parallel core library components for Java, C++, Python, and Go implementations of many of the same sketch algorithms:

datasketches/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ rustdoc-args = ["--cfg", "docsrs"]
3636

3737
[dev-dependencies]
3838
googletest = { workspace = true }
39-
rand = { workspace = true }
4039
insta = { workspace = true }
4140

4241
[lints]

datasketches/src/bloom/builder.rs

Lines changed: 53 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,15 @@
1616
// under the License.
1717

1818
use super::BloomFilter;
19+
use crate::codec::family::Family;
1920
use crate::hash::DEFAULT_UPDATE_SEED;
2021

21-
const MIN_NUM_BITS: u64 = 64;
22-
const MAX_NUM_BITS: u64 = (1u64 << 35) - 64; // ~32 GB - reasonable limit
23-
2422
/// Builder for creating [`BloomFilter`] instances.
2523
///
2624
/// Provides two construction modes:
27-
/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
25+
/// * [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
2826
/// (recommended)
29-
/// - [`with_size()`](Self::with_size): Specify exact bit count and hash functions (manual)
27+
/// * [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
3028
#[derive(Debug, Clone)]
3129
pub struct BloomFilterBuilder {
3230
num_bits: u64,
@@ -35,19 +33,31 @@ pub struct BloomFilterBuilder {
3533
}
3634

3735
impl BloomFilterBuilder {
36+
/// Minimum allowed requested Bloom filter size, in bits.
37+
pub const MIN_NUM_BITS: u64 = 1;
38+
/// Maximum allowed requested Bloom filter size, in bits.
39+
///
40+
/// Derived from serialization limits so the encoded sketch length fits in a signed 32-bit size
41+
/// field.
42+
pub const MAX_NUM_BITS: u64 = (i32::MAX as u64 - Family::BLOOMFILTER.max_pre_longs as u64) * 64;
43+
/// Minimum allowed number of hash functions.
44+
pub const MIN_NUM_HASHES: u16 = 1;
45+
/// Maximum allowed number of hash functions.
46+
pub const MAX_NUM_HASHES: u16 = i16::MAX as u16;
47+
3848
/// Creates a builder with optimal parameters for a target accuracy.
3949
///
4050
/// Automatically calculates the optimal number of bits and hash functions
4151
/// to achieve the desired false positive probability for a given number of items.
4252
///
4353
/// # Arguments
4454
///
45-
/// - `max_items`: Maximum expected number of distinct items
46-
/// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
55+
/// * `max_items`: Maximum expected number of distinct items
56+
/// * `fpp`: Target false positive probability (e.g., 0.01 for 1%)
4757
///
4858
/// # Panics
4959
///
50-
/// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0).
60+
/// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0].
5161
///
5262
/// # Examples
5363
///
@@ -61,8 +71,8 @@ impl BloomFilterBuilder {
6171
pub fn with_accuracy(max_items: u64, fpp: f64) -> Self {
6272
assert!(max_items > 0, "max_items must be greater than 0");
6373
assert!(
64-
fpp > 0.0 && fpp < 1.0,
65-
"fpp must be between 0.0 and 1.0 (exclusive)"
74+
fpp > 0.0 && fpp <= 1.0,
75+
"fpp must be between 0.0 and 1.0 (inclusive of 1.0)"
6676
);
6777

6878
let num_bits = Self::suggest_num_bits(max_items, fpp);
@@ -77,19 +87,22 @@ impl BloomFilterBuilder {
7787

7888
/// Creates a builder with manual size specification.
7989
///
80-
/// Use this when you want precise control over the filter size,
90+
/// Use this when you want precise control over the requested filter size,
8191
/// or when working with pre-calculated parameters.
8292
///
93+
/// The underlying storage is word-based, so the actual capacity is rounded
94+
/// up to the next multiple of 64 bits.
95+
///
8396
/// # Arguments
8497
///
85-
/// - `num_bits`: Total number of bits in the filter
86-
/// - `num_hashes`: Number of hash functions to use
98+
/// * `num_bits`: Total number of bits in the filter
99+
/// * `num_hashes`: Number of hash functions to use
87100
///
88101
/// # Panics
89102
///
90-
/// Panics if:
91-
/// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
92-
/// - `num_hashes` < 1 or `num_hashes` > 100
103+
/// Panics if any of:
104+
/// * `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`]
105+
/// * `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MAX_NUM_HASHES`]
93106
///
94107
/// # Examples
95108
///
@@ -99,17 +112,19 @@ impl BloomFilterBuilder {
99112
/// ```
100113
pub fn with_size(num_bits: u64, num_hashes: u16) -> Self {
101114
assert!(
102-
num_bits >= MIN_NUM_BITS,
103-
"num_bits must be at least {}",
104-
MIN_NUM_BITS
115+
(Self::MIN_NUM_BITS..=Self::MAX_NUM_BITS).contains(&num_bits),
116+
"num_bits must be between {} and {}, got {}",
117+
Self::MIN_NUM_BITS,
118+
Self::MAX_NUM_BITS,
119+
num_bits,
105120
);
106121
assert!(
107-
num_bits <= MAX_NUM_BITS,
108-
"num_bits must not exceed {}",
109-
MAX_NUM_BITS
122+
(Self::MIN_NUM_HASHES..=Self::MAX_NUM_HASHES).contains(&num_hashes),
123+
"num_hashes must be between {} and {}, got {}",
124+
Self::MIN_NUM_HASHES,
125+
Self::MAX_NUM_HASHES,
126+
num_hashes
110127
);
111-
assert!(num_hashes >= 1, "num_hashes must be at least 1");
112-
assert!(num_hashes <= 100, "num_hashes must not exceed 100");
113128

114129
BloomFilterBuilder {
115130
num_bits,
@@ -141,16 +156,13 @@ impl BloomFilterBuilder {
141156
///
142157
/// Panics if neither `with_accuracy()` nor `with_size()` was called.
143158
pub fn build(self) -> BloomFilter {
144-
let capacity_bits = self.num_bits;
145159
let num_hashes = self.num_hashes;
146-
147-
let num_words = capacity_bits.div_ceil(64) as usize;
148-
let bit_array = vec![0u64; num_words];
160+
let num_words = self.num_bits.div_ceil(64) as usize;
161+
let bit_array = vec![0u64; num_words].into_boxed_slice();
149162

150163
BloomFilter {
151164
seed: self.seed,
152165
num_hashes,
153-
capacity_bits,
154166
num_bits_set: 0,
155167
bit_array,
156168
}
@@ -175,10 +187,7 @@ impl BloomFilterBuilder {
175187

176188
let bits = (-n * p.ln() / ln2_squared).ceil() as u64;
177189

178-
// Round up to multiple of 64 for efficiency
179-
let bits = bits.div_ceil(64) * 64;
180-
181-
bits.clamp(MIN_NUM_BITS, MAX_NUM_BITS)
190+
bits.clamp(Self::MIN_NUM_BITS, Self::MAX_NUM_BITS)
182191
}
183192

184193
/// Suggests optimal number of hash functions given max items and bit count.
@@ -197,9 +206,12 @@ impl BloomFilterBuilder {
197206
let m = num_bits as f64;
198207
let n = max_items as f64;
199208

200-
let k = (m / n * std::f64::consts::LN_2).round();
201-
202-
(k as u16).clamp(1, 100) // Reasonable bounds
209+
// Ceil to avoid selecting too few hashes.
210+
let k = (m / n * std::f64::consts::LN_2).ceil();
211+
k.clamp(
212+
f64::from(Self::MIN_NUM_HASHES),
213+
f64::from(Self::MAX_NUM_HASHES),
214+
) as u16
203215
}
204216

205217
/// Suggests optimal number of hash functions from target FPP.
@@ -215,7 +227,11 @@ impl BloomFilterBuilder {
215227
/// assert_eq!(hashes, 7); // -log2(0.01) ≈ 6.64
216228
/// ```
217229
pub fn suggest_num_hashes_from_fpp(fpp: f64) -> u16 {
230+
// Ceil to avoid selecting too few hashes.
218231
let k = -fpp.log2();
219-
(k.round() as u16).clamp(1, 100)
232+
k.ceil().clamp(
233+
f64::from(Self::MIN_NUM_HASHES),
234+
f64::from(Self::MAX_NUM_HASHES),
235+
) as u16
220236
}
221237
}

datasketches/src/bloom/mod.rs

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@
2323
//!
2424
//! # Properties
2525
//!
26-
//! - **No false negatives**: If an item was inserted, `contains()` will always return `true`
27-
//! - **Possible false positives**: `contains()` may return `true` for items never inserted
28-
//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
29-
//! - **Linear space**: Size is proportional to the expected number of distinct items
26+
//! * **No false negatives**: If an item was inserted, `contains()` will always return `true`
27+
//! * **Possible false positives**: `contains()` may return `true` for items never inserted
28+
//! * **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
29+
//! * **Linear space**: Size is proportional to the expected number of distinct items
3030
//!
3131
//! # Usage
3232
//!
33-
//! ```rust
33+
//! ```
3434
//! use datasketches::bloom::BloomFilter;
3535
//! use datasketches::bloom::BloomFilterBuilder;
3636
//!
@@ -60,7 +60,7 @@
6060
//!
6161
//! Automatically calculates optimal size and hash functions:
6262
//!
63-
//! ```rust
63+
//! ```
6464
//! # use datasketches::bloom::BloomFilterBuilder;
6565
//! let filter = BloomFilterBuilder::with_accuracy(
6666
//! 10_000, // Expected max items
@@ -72,9 +72,9 @@
7272
//!
7373
//! ## By Size (Manual)
7474
//!
75-
//! Specify exact bit count and hash functions:
75+
//! Specify requested bit count and hash functions (rounded up to a multiple of 64 bits):
7676
//!
77-
//! ```rust
77+
//! ```
7878
//! # use datasketches::bloom::BloomFilterBuilder;
7979
//! let filter = BloomFilterBuilder::with_size(
8080
//! 95_851, // Number of bits
@@ -87,7 +87,7 @@
8787
//!
8888
//! Bloom filters support efficient set operations:
8989
//!
90-
//! ```rust
90+
//! ```
9191
//! # use datasketches::bloom::BloomFilterBuilder;
9292
//! let mut filter1 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
9393
//! let mut filter2 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
@@ -109,15 +109,15 @@
109109
//!
110110
//! # Implementation Details
111111
//!
112-
//! - Uses XXHash64 for hashing
113-
//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
114-
//! - Bits packed efficiently in `u64` words
115-
//! - Compatible serialization format (family ID: 21)
112+
//! * Uses XXHash64 for hashing
113+
//! * Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
114+
//! * Bits packed efficiently in `u64` words
115+
//! * Compatible serialization format (family ID: 21)
116116
//!
117117
//! # References
118118
//!
119-
//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
120-
//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
119+
//! * Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
120+
//! * Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
121121
//! Filter"
122122
123123
mod builder;

0 commit comments

Comments
 (0)