Skip to content

Commit 933dcf7

Browse files
authored
Merge branch 'main' into refine_theta_sketch
2 parents 38440dc + f0d997e commit 933dcf7

39 files changed

+1019
-620
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ All significant changes to this project will be documented in this file.
1313

1414
* `CountMinSketch` with unsigned values now supports `halve` and `decay` operations.
1515
* `CpcSketch` and `CpcUnion` are now available for cardinality estimation.
16-
* `FrequentItemsSketch` now supports serde for `u64` value.
16+
* `CpcWrapper` is now available for reading estimates from a serialized `CpcSketch` without full deserialization.
17+
* `FrequentItemsSketch` now supports serde for any value type that implements `FrequentItemValue` (built-in support for `i64`, `u64`, and `String`).
18+
* Expose `codec::SketchBytes`, `codec::SketchSlice`, and `FrequentItemValue` as public API.
1719

1820
## v0.2.0 (2026-01-14)
1921

datasketches/src/bloom/builder.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ use crate::hash::DEFAULT_UPDATE_SEED;
2222
/// Builder for creating [`BloomFilter`] instances.
2323
///
2424
/// Provides two construction modes:
25-
/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
25+
/// * [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
2626
/// (recommended)
27-
/// - [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
27+
/// * [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
2828
#[derive(Debug, Clone)]
2929
pub struct BloomFilterBuilder {
3030
num_bits: u64,
@@ -52,8 +52,8 @@ impl BloomFilterBuilder {
5252
///
5353
/// # Arguments
5454
///
55-
/// - `max_items`: Maximum expected number of distinct items
56-
/// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
55+
/// * `max_items`: Maximum expected number of distinct items
56+
/// * `fpp`: Target false positive probability (e.g., 0.01 for 1%)
5757
///
5858
/// # Panics
5959
///
@@ -95,14 +95,14 @@ impl BloomFilterBuilder {
9595
///
9696
/// # Arguments
9797
///
98-
/// - `num_bits`: Total number of bits in the filter
99-
/// - `num_hashes`: Number of hash functions to use
98+
/// * `num_bits`: Total number of bits in the filter
99+
/// * `num_hashes`: Number of hash functions to use
100100
///
101101
/// # Panics
102102
///
103103
/// Panics if any of:
104-
/// - `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`]
105-
/// - `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MIN_NUM_HASHES`]
104+
/// * `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`]
105+
/// * `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MAX_NUM_HASHES`]
106106
///
107107
/// # Examples
108108
///

datasketches/src/bloom/mod.rs

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@
2323
//!
2424
//! # Properties
2525
//!
26-
//! - **No false negatives**: If an item was inserted, `contains()` will always return `true`
27-
//! - **Possible false positives**: `contains()` may return `true` for items never inserted
28-
//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
29-
//! - **Linear space**: Size is proportional to the expected number of distinct items
26+
//! * **No false negatives**: If an item was inserted, `contains()` will always return `true`
27+
//! * **Possible false positives**: `contains()` may return `true` for items never inserted
28+
//! * **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
29+
//! * **Linear space**: Size is proportional to the expected number of distinct items
3030
//!
3131
//! # Usage
3232
//!
33-
//! ```rust
33+
//! ```
3434
//! use datasketches::bloom::BloomFilter;
3535
//! use datasketches::bloom::BloomFilterBuilder;
3636
//!
@@ -60,7 +60,7 @@
6060
//!
6161
//! Automatically calculates optimal size and hash functions:
6262
//!
63-
//! ```rust
63+
//! ```
6464
//! # use datasketches::bloom::BloomFilterBuilder;
6565
//! let filter = BloomFilterBuilder::with_accuracy(
6666
//! 10_000, // Expected max items
@@ -74,7 +74,7 @@
7474
//!
7575
//! Specify requested bit count and hash functions (rounded up to a multiple of 64 bits):
7676
//!
77-
//! ```rust
77+
//! ```
7878
//! # use datasketches::bloom::BloomFilterBuilder;
7979
//! let filter = BloomFilterBuilder::with_size(
8080
//! 95_851, // Number of bits
@@ -87,7 +87,7 @@
8787
//!
8888
//! Bloom filters support efficient set operations:
8989
//!
90-
//! ```rust
90+
//! ```
9191
//! # use datasketches::bloom::BloomFilterBuilder;
9292
//! let mut filter1 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
9393
//! let mut filter2 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
@@ -109,15 +109,15 @@
109109
//!
110110
//! # Implementation Details
111111
//!
112-
//! - Uses XXHash64 for hashing
113-
//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
114-
//! - Bits packed efficiently in `u64` words
115-
//! - Compatible serialization format (family ID: 21)
112+
//! * Uses XXHash64 for hashing
113+
//! * Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
114+
//! * Bits packed efficiently in `u64` words
115+
//! * Compatible serialization format (family ID: 21)
116116
//!
117117
//! # References
118118
//!
119-
//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
120-
//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
119+
//! * Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
120+
//! * Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
121121
//! Filter"
122122
123123
mod builder;

datasketches/src/bloom/sketch.rs

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@ use std::hash::Hasher;
2020

2121
use crate::codec::SketchBytes;
2222
use crate::codec::SketchSlice;
23+
use crate::codec::assert::ensure_preamble_longs_in_range;
24+
use crate::codec::assert::ensure_serial_version_is;
25+
use crate::codec::assert::insufficient_data;
2326
use crate::codec::family::Family;
24-
use crate::codec::utility::ensure_preamble_longs_in_range;
25-
use crate::codec::utility::ensure_serial_version_is;
2627
use crate::error::Error;
2728
use crate::hash::XxHash64;
2829

@@ -33,9 +34,9 @@ const EMPTY_FLAG_MASK: u8 = 1 << 2;
3334
/// A Bloom filter for probabilistic set membership testing.
3435
///
3536
/// Provides fast membership queries with:
36-
/// - No false negatives (inserted items always return `true`)
37-
/// - Tunable false positive rate
38-
/// - Constant space usage
37+
/// * No false negatives (inserted items always return `true`)
38+
/// * Tunable false positive rate
39+
/// * Constant space usage
3940
///
4041
/// Use [`super::BloomFilterBuilder`] to construct instances.
4142
#[derive(Debug, Clone, PartialEq)]
@@ -54,8 +55,8 @@ impl BloomFilter {
5455
/// Tests whether an item is possibly in the set.
5556
///
5657
/// Returns:
57-
/// - `true`: Item was **possibly** inserted (or false positive)
58-
/// - `false`: Item was **definitely not** inserted
58+
/// * `true`: Item was **possibly** inserted (or false positive)
59+
/// * `false`: Item was **definitely not** inserted
5960
///
6061
/// # Examples
6162
///
@@ -290,8 +291,8 @@ impl BloomFilter {
290291
///
291292
/// Uses the approximation: `load_factor^k`
292293
/// where:
293-
/// - load_factor = fraction of bits set (bits_used / capacity)
294-
/// - k = num_hashes
294+
/// * load_factor = fraction of bits set (bits_used / capacity)
295+
/// * k = num_hashes
295296
///
296297
/// This assumes uniform bit distribution and is more accurate than
297298
/// trying to estimate insertion count from the load factor.
@@ -307,9 +308,9 @@ impl BloomFilter {
307308
/// Checks if two filters are compatible for merging.
308309
///
309310
/// Filters are compatible if they have the same:
310-
/// - Capacity (number of bits)
311-
/// - Number of hash functions
312-
/// - Seed
311+
/// * Capacity (number of bits)
312+
/// * Number of hash functions
313+
/// * Seed
313314
pub fn is_compatible(&self, other: &Self) -> bool {
314315
self.bit_array.len() == other.bit_array.len()
315316
&& self.num_hashes == other.num_hashes
@@ -379,9 +380,9 @@ impl BloomFilter {
379380
/// # Errors
380381
///
381382
/// Returns an error if:
382-
/// - The data is truncated or corrupted
383-
/// - The family ID doesn't match (not a Bloom filter)
384-
/// - The serial version is unsupported
383+
/// * The data is truncated or corrupted
384+
/// * The family ID doesn't match (not a Bloom filter)
385+
/// * The serial version is unsupported
385386
///
386387
/// # Examples
387388
///
@@ -399,18 +400,14 @@ impl BloomFilter {
399400
// Read preamble
400401
let preamble_longs = cursor
401402
.read_u8()
402-
.map_err(|_| Error::insufficient_data("preamble_longs"))?;
403+
.map_err(insufficient_data("preamble_longs"))?;
403404
let serial_version = cursor
404405
.read_u8()
405-
.map_err(|_| Error::insufficient_data("serial_version"))?;
406-
let family_id = cursor
407-
.read_u8()
408-
.map_err(|_| Error::insufficient_data("family_id"))?;
406+
.map_err(insufficient_data("serial_version"))?;
407+
let family_id = cursor.read_u8().map_err(insufficient_data("family_id"))?;
409408

410409
// Byte 3: flags byte (directly after family_id)
411-
let flags = cursor
412-
.read_u8()
413-
.map_err(|_| Error::insufficient_data("flags"))?;
410+
let flags = cursor.read_u8().map_err(insufficient_data("flags"))?;
414411

415412
// Validate
416413
Family::BLOOMFILTER.validate_id(family_id)?;
@@ -425,7 +422,7 @@ impl BloomFilter {
425422
// Bytes 4-5: num_hashes (u16)
426423
let num_hashes = cursor
427424
.read_u16_le()
428-
.map_err(|_| Error::insufficient_data("num_hashes"))?;
425+
.map_err(insufficient_data("num_hashes"))?;
429426
if num_hashes == 0 || num_hashes > i16::MAX as u16 {
430427
return Err(Error::deserial(format!(
431428
"invalid num_hashes: expected [1, {}], got {}",
@@ -436,18 +433,14 @@ impl BloomFilter {
436433
// Bytes 6-7: unused (u16)
437434
let _unused = cursor
438435
.read_u16_le()
439-
.map_err(|_| Error::insufficient_data("unused_header"))?;
440-
let seed = cursor
441-
.read_u64_le()
442-
.map_err(|_| Error::insufficient_data("seed"))?;
436+
.map_err(insufficient_data("unused_header"))?;
437+
let seed = cursor.read_u64_le().map_err(insufficient_data("seed"))?;
443438

444439
// Bit array capacity is stored as number of 64-bit words (int32) + unused padding (uint32).
445440
let num_longs = cursor
446441
.read_i32_le()
447-
.map_err(|_| Error::insufficient_data("num_longs"))?;
448-
let _unused = cursor
449-
.read_u32_le()
450-
.map_err(|_| Error::insufficient_data("unused"))?;
442+
.map_err(insufficient_data("num_longs"))?;
443+
let _unused = cursor.read_u32_le().map_err(insufficient_data("unused"))?;
451444

452445
if num_longs <= 0 {
453446
return Err(Error::deserial(format!(
@@ -465,12 +458,12 @@ impl BloomFilter {
465458
} else {
466459
let raw_num_bits_set = cursor
467460
.read_u64_le()
468-
.map_err(|_| Error::insufficient_data("num_bits_set"))?;
461+
.map_err(insufficient_data("num_bits_set"))?;
469462

470463
for word in &mut bit_array {
471464
*word = cursor
472465
.read_u64_le()
473-
.map_err(|_| Error::insufficient_data("bit_array"))?;
466+
.map_err(insufficient_data("bit_array"))?;
474467
}
475468

476469
// Handle "dirty" state: 0xFFFFFFFFFFFFFFFF indicates bits need recounting
@@ -501,8 +494,8 @@ impl BloomFilter {
501494
/// Computes the two base hash values using XXHash64.
502495
///
503496
/// Uses a two-hash approach:
504-
/// - h0 = XXHash64(item, seed)
505-
/// - h1 = XXHash64(item, h0)
497+
/// * h0 = XXHash64(item, seed)
498+
/// * h1 = XXHash64(item, h0)
506499
fn compute_hash<T: Hash>(&self, item: &T) -> (u64, u64) {
507500
// First hash with the configured seed
508501
let mut hasher = XxHash64::with_seed(self.seed);
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ use std::ops::RangeBounds;
2020

2121
use crate::error::Error;
2222

23+
/// Returns a closure suitable for `Result::map_err` that discards the
/// incoming `std::io::Error` and produces `Error::insufficient_data(tag)`.
///
/// `tag` names the field that was being read when the data ran out
/// (callers pass values such as `"preamble_longs"` or `"seed"`).
pub(crate) fn insufficient_data(tag: &'static str) -> impl FnOnce(std::io::Error) -> Error {
24+
move |_| Error::insufficient_data(tag)
25+
}
26+
2327
pub(crate) fn ensure_serial_version_is(expected: u8, actual: u8) -> Result<(), Error> {
2428
if expected == actual {
2529
Ok(())

datasketches/src/codec/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,5 @@ pub use self::decode::SketchSlice;
2424
pub use self::encode::SketchBytes;
2525

2626
// private to datasketches crate
27+
pub(crate) mod assert;
2728
pub(crate) mod family;
28-
pub(crate) mod utility;

0 commit comments

Comments
 (0)