Skip to content

Commit 6546d66

Browse files
authored
refactor: export FrequentItemValue and improve docs (#98)
Signed-off-by: tison <wander4096@gmail.com>
1 parent 74b2b87 commit 6546d66

25 files changed

+229
-274
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ All significant changes to this project will be documented in this file.
1313

1414
* `CountMinSketch` with unsigned values now supports `halve` and `decay` operations.
1515
* `CpcSketch` and `CpcUnion` are now available for cardinality estimation.
16-
* `FrequentItemsSketch` now supports serde for `u64` value.
16+
* `FrequentItemsSketch` now supports serde for any value implementing `FrequentItemValue` (builtin support for `i64`, `u64`, and `String`).
17+
* Expose `codec::SketchBytes`, `codec::SketchSlice`, and `FrequentItemValue` as public API.
1718

1819
## v0.2.0 (2026-01-14)
1920

datasketches/src/bloom/builder.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ use crate::hash::DEFAULT_UPDATE_SEED;
2222
/// Builder for creating [`BloomFilter`] instances.
2323
///
2424
/// Provides two construction modes:
25-
/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
25+
/// * [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
2626
/// (recommended)
27-
/// - [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
27+
/// * [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
2828
#[derive(Debug, Clone)]
2929
pub struct BloomFilterBuilder {
3030
num_bits: u64,
@@ -52,8 +52,8 @@ impl BloomFilterBuilder {
5252
///
5353
/// # Arguments
5454
///
55-
/// - `max_items`: Maximum expected number of distinct items
56-
/// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
55+
/// * `max_items`: Maximum expected number of distinct items
56+
/// * `fpp`: Target false positive probability (e.g., 0.01 for 1%)
5757
///
5858
/// # Panics
5959
///
@@ -95,14 +95,14 @@ impl BloomFilterBuilder {
9595
///
9696
/// # Arguments
9797
///
98-
/// - `num_bits`: Total number of bits in the filter
99-
/// - `num_hashes`: Number of hash functions to use
98+
/// * `num_bits`: Total number of bits in the filter
99+
/// * `num_hashes`: Number of hash functions to use
100100
///
101101
/// # Panics
102102
///
103103
/// Panics if any of:
104-
/// - `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`]
105-
/// - `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MIN_NUM_HASHES`]
104+
/// * `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`]
105+
/// * `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MAX_NUM_HASHES`]
106106
///
107107
/// # Examples
108108
///

datasketches/src/bloom/mod.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@
2323
//!
2424
//! # Properties
2525
//!
26-
//! - **No false negatives**: If an item was inserted, `contains()` will always return `true`
27-
//! - **Possible false positives**: `contains()` may return `true` for items never inserted
28-
//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
29-
//! - **Linear space**: Size is proportional to the expected number of distinct items
26+
//! * **No false negatives**: If an item was inserted, `contains()` will always return `true`
27+
//! * **Possible false positives**: `contains()` may return `true` for items never inserted
28+
//! * **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
29+
//! * **Linear space**: Size is proportional to the expected number of distinct items
3030
//!
3131
//! # Usage
3232
//!
@@ -109,15 +109,15 @@
109109
//!
110110
//! # Implementation Details
111111
//!
112-
//! - Uses XXHash64 for hashing
113-
//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
114-
//! - Bits packed efficiently in `u64` words
115-
//! - Compatible serialization format (family ID: 21)
112+
//! * Uses XXHash64 for hashing
113+
//! * Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
114+
//! * Bits packed efficiently in `u64` words
115+
//! * Compatible serialization format (family ID: 21)
116116
//!
117117
//! # References
118118
//!
119-
//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
120-
//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
119+
//! * Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
120+
//! * Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
121121
//! Filter"
122122
123123
mod builder;

datasketches/src/bloom/sketch.rs

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ const EMPTY_FLAG_MASK: u8 = 1 << 2;
3333
/// A Bloom filter for probabilistic set membership testing.
3434
///
3535
/// Provides fast membership queries with:
36-
/// - No false negatives (inserted items always return `true`)
37-
/// - Tunable false positive rate
38-
/// - Constant space usage
36+
/// * No false negatives (inserted items always return `true`)
37+
/// * Tunable false positive rate
38+
/// * Constant space usage
3939
///
4040
/// Use [`super::BloomFilterBuilder`] to construct instances.
4141
#[derive(Debug, Clone, PartialEq)]
@@ -54,8 +54,8 @@ impl BloomFilter {
5454
/// Tests whether an item is possibly in the set.
5555
///
5656
/// Returns:
57-
/// - `true`: Item was **possibly** inserted (or false positive)
58-
/// - `false`: Item was **definitely not** inserted
57+
/// * `true`: Item was **possibly** inserted (or false positive)
58+
/// * `false`: Item was **definitely not** inserted
5959
///
6060
/// # Examples
6161
///
@@ -290,8 +290,8 @@ impl BloomFilter {
290290
///
291291
/// Uses the approximation: `load_factor^k`
292292
/// where:
293-
/// - load_factor = fraction of bits set (bits_used / capacity)
294-
/// - k = num_hashes
293+
/// * load_factor = fraction of bits set (bits_used / capacity)
294+
/// * k = num_hashes
295295
///
296296
/// This assumes uniform bit distribution and is more accurate than
297297
/// trying to estimate insertion count from the load factor.
@@ -307,9 +307,9 @@ impl BloomFilter {
307307
/// Checks if two filters are compatible for merging.
308308
///
309309
/// Filters are compatible if they have the same:
310-
/// - Capacity (number of bits)
311-
/// - Number of hash functions
312-
/// - Seed
310+
/// * Capacity (number of bits)
311+
/// * Number of hash functions
312+
/// * Seed
313313
pub fn is_compatible(&self, other: &Self) -> bool {
314314
self.bit_array.len() == other.bit_array.len()
315315
&& self.num_hashes == other.num_hashes
@@ -379,9 +379,9 @@ impl BloomFilter {
379379
/// # Errors
380380
///
381381
/// Returns an error if:
382-
/// - The data is truncated or corrupted
383-
/// - The family ID doesn't match (not a Bloom filter)
384-
/// - The serial version is unsupported
382+
/// * The data is truncated or corrupted
383+
/// * The family ID doesn't match (not a Bloom filter)
384+
/// * The serial version is unsupported
385385
///
386386
/// # Examples
387387
///
@@ -501,8 +501,8 @@ impl BloomFilter {
501501
/// Computes the two base hash values using XXHash64.
502502
///
503503
/// Uses a two-hash approach:
504-
/// - h0 = XXHash64(item, seed)
505-
/// - h1 = XXHash64(item, h0)
504+
/// * h0 = XXHash64(item, seed)
505+
/// * h1 = XXHash64(item, h0)
506506
fn compute_hash<T: Hash>(&self, item: &T) -> (u64, u64) {
507507
// First hash with the configured seed
508508
let mut hasher = XxHash64::with_seed(self.seed);

datasketches/src/common/binomial_bounds.rs

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -274,9 +274,9 @@ static UB_EQUIV_TABLE: [f64; 363] = [
274274
///
275275
/// # Arguments
276276
///
277-
/// * `num_samples` - The number of samples in the sample set.
278-
/// * `theta` - The sampling probability. Must be in the range (0.0, 1.0].
279-
/// * `num_std_dev` - The number of standard deviations for confidence bounds.
277+
/// * `num_samples`: The number of samples in the sample set.
278+
/// * `theta`: The sampling probability. Must be in the range (0.0, 1.0].
279+
/// * `num_std_dev`: The number of standard deviations for confidence bounds.
280280
///
281281
/// # Returns
282282
///
@@ -301,11 +301,11 @@ pub(crate) fn lower_bound(
301301
///
302302
/// # Arguments
303303
///
304-
/// * `num_samples` - The number of samples in the sample set.
305-
/// * `theta` - The sampling probability. Must be in the range `(0.0, 1.0]`.
306-
/// * `num_std_dev` - The number of standard deviations for confidence bounds.
307-
/// * `no_data_seen` - This is normally false. However, in the case where you have zero samples and
308-
/// a theta < 1.0, this flag enables the distinction between a virgin case when no actual data has
304+
/// * `num_samples`: The number of samples in the sample set.
305+
/// * `theta`: The sampling probability. Must be in the range `(0.0, 1.0]`.
306+
/// * `num_std_dev`: The number of standard deviations for confidence bounds.
307+
/// * `no_data_seen`: This is normally false. However, in the case where you have zero samples and a
308+
/// theta < 1.0, this flag enables the distinction between a virgin case when no actual data has
309309
/// been seen and the case where the estimate may be zero but an upper error bound may still
310310
/// exist.
311311
///
@@ -367,16 +367,16 @@ fn cont_classic_ub(num_samples: u64, theta: f64, num_std_devs: f64) -> f64 {
367367
///
368368
/// # Arguments
369369
///
370-
/// * `num_samples` - The number of observed samples (k). Must be >= 1.
371-
/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
372-
/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
370+
/// * `num_samples`: The number of observed samples (k). Must be >= 1.
371+
/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
372+
/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
373373
///
374374
/// # Invariants
375375
///
376-
/// - `num_samples >= 1`
377-
/// - `0.0 < p < 1.0`
378-
/// - `0.0 < delta < 1.0`
379-
/// - `(num_samples / p) < 500.0` (enforced for performance and numerical stability)
376+
/// * `num_samples >= 1`
377+
/// * `0.0 < p < 1.0`
378+
/// * `0.0 < delta < 1.0`
379+
/// * `(num_samples / p) < 500.0` (enforced for performance and numerical stability)
380380
///
381381
/// # Returns
382382
///
@@ -413,15 +413,15 @@ fn special_n_star(num_samples: u64, p: f64, delta: f64) -> Result<u64, Error> {
413413
///
414414
/// # Arguments
415415
///
416-
/// * `num_samples` - The number of observed samples (k). Must be >= 1.
417-
/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
418-
/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
416+
/// * `num_samples`: The number of observed samples (k). Must be >= 1.
417+
/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
418+
/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
419419
///
420420
/// # Invariants
421421
///
422-
/// - `num_samples >= 1`
423-
/// - `0.0 < p < 1.0`
424-
/// - `0.0 < delta < 1.0`
422+
/// * `num_samples >= 1`
423+
/// * `0.0 < p < 1.0`
424+
/// * `0.0 < delta < 1.0`
425425
///
426426
/// # Returns
427427
///
@@ -452,14 +452,14 @@ fn special_n_prime_b(num_samples: u64, p: f64, delta: f64) -> Result<u64, Error>
452452
///
453453
/// # Arguments
454454
///
455-
/// * `num_samples` - The number of observed samples (k). Must be >= 1.
456-
/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
457-
/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
455+
/// * `num_samples`: The number of observed samples (k). Must be >= 1.
456+
/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
457+
/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
458458
///
459459
/// # Invariants
460460
///
461-
/// - `(num_samples / p) < 500.0` (enforced for performance)
462-
/// - A super-small delta could also make it slow.
461+
/// * `(num_samples / p) < 500.0` (enforced for performance)
462+
/// * A super-small delta could also make it slow.
463463
fn special_n_prime_f(num_samples: u64, p: f64, delta: f64) -> Result<u64, Error> {
464464
// Use a different algorithm if the following is true; this one will be too slow, or worse.
465465
if (num_samples as f64 / p) >= 500.0 {

datasketches/src/countmin/sketch.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@ impl<T: CountMinValue> CountMinSketch<T> {
7575
/// # Panics
7676
///
7777
/// Panics if any of:
78-
/// - `num_hashes` is 0
79-
/// - `num_buckets` is less than 3
80-
/// - the total table size exceeds the supported limit
81-
/// - the computed seed hash is zero
78+
/// * `num_hashes` is 0
79+
/// * `num_buckets` is less than 3
80+
/// * the total table size exceeds the supported limit
81+
/// * the computed seed hash is zero
8282
///
8383
/// # Examples
8484
///

datasketches/src/frequencies/mod.rs

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,66 @@
1717

1818
//! Frequency sketches for finding heavy hitters in data streams.
1919
//!
20-
//! This module implements the Frequent Items sketch from Apache DataSketches. It tracks
21-
//! approximate frequencies in a stream and can report heavy hitters with explicit
22-
//! error guarantees (no false negatives or no false positives).
20+
//! # Overview
2321
//!
24-
//! For background, see the Java documentation:
25-
//! <https://apache.github.io/datasketches-java/9.0.0/org/apache/datasketches/frequencies/FrequentItemsSketch.html>
22+
//! This sketch is based on the paper ["A High-Performance Algorithm for Identifying Frequent Items
23+
//! in Data Streams"](https://arxiv.org/abs/1705.07001) by Daniel Anderson, Pryce Bevan, Kevin Lang,
24+
//! Edo Liberty, Lee Rhodes, and Justin Thaler.
2625
//!
27-
//! # Usage
26+
//! This sketch is useful for tracking approximate frequencies of items of type `T` that implements
27+
//! [`FrequentItemValue`], with optional associated counts (`T` item, `u64` count) that are members
28+
//! of a multiset of such items. The true frequency of an item is defined to be the sum of
29+
//! associated counts.
2830
//!
29-
//! ```rust
31+
//! This implementation provides the following capabilities:
32+
//! * Estimate the frequency of an item.
33+
//! * Return upper and lower bounds of any item, such that the true frequency is always between the
34+
//! upper and lower bounds.
35+
//! * Return a global maximum error that holds for all items in the stream.
36+
//! * Return an array of frequent items that qualify either [`ErrorType::NoFalsePositives`] or
37+
//! [`ErrorType::NoFalseNegatives`].
38+
//! * Merge itself with another sketch created from this module.
39+
//! * Serialize to bytes, or deserialize from bytes, for storage or transmission.
40+
//!
41+
//! # Accuracy
42+
//!
43+
//! If fewer than `0.75 * max_map_size` different items are inserted into the sketch, the estimated
44+
//! frequencies returned by the sketch will be exact.
45+
//!
46+
//! The logic of the frequent items sketch is such that the stored counts and true counts are never
47+
//! too different. More specifically, for any item, the sketch can return an estimate of the true
48+
//! frequency of item, along with upper and lower bounds on the frequency (that hold
49+
//! deterministically).
50+
//!
51+
//! For this implementation and for a specific active item, it is guaranteed that the true frequency
52+
//! will be between the Upper Bound (UB) and the Lower Bound (LB) computed for that item.
53+
//! Specifically, `(UB - LB) ≤ W * epsilon`, where `W` denotes the sum of all item counts, and
54+
//! `epsilon = 3.5/M`, where `M` is the `max_map_size`.
55+
//!
56+
//! This is the worst case guarantee that applies to arbitrary inputs. [^1]
57+
//! For inputs typically seen in practice (`UB - LB`) is usually much smaller.
58+
//!
59+
//! [^1]: For speed we do employ some randomization that introduces a small probability that our
60+
//! proof of the worst-case bound might not apply to a given run. However, we have ensured that this
61+
//! probability is extremely small. For example, if the stream causes one table purge (rebuild),
62+
//! our proof of the worst case bound applies with probability at least `1 - 1E-14`. If the stream
63+
//! causes `1E9` purges, our proof applies with probability at least `1 - 1E-5`.
64+
//!
65+
//! # Background
66+
//!
67+
//! This code implements a variant of what is commonly known as the "Misra-Gries algorithm".
68+
//! Variants of it were discovered and rediscovered and redesigned several times over the years:
69+
//! * "Finding repeated elements", Misra, Gries, 1982
70+
//! * "Frequency estimation of Internet packet streams with limited space" Demaine, Lopez-Ortiz,
71+
//! Munro, 2002
72+
//! * "A simple algorithm for finding frequent elements in streams and bags" Karp, Shenker,
73+
//! Papadimitriou, 2003
74+
//! * "Efficient Computation of Frequent and Top-k Elements in Data Streams" Metwally, Agrawal,
75+
//! Abbadi, 2006
76+
//!
77+
//! # Examples
78+
//!
79+
//! ```
3080
//! # use datasketches::frequencies::ErrorType;
3181
//! # use datasketches::frequencies::FrequentItemsSketch;
3282
//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
@@ -38,7 +88,7 @@
3888
//!
3989
//! # Serialization
4090
//!
41-
//! ```rust
91+
//! ```
4292
//! # use datasketches::frequencies::FrequentItemsSketch;
4393
//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
4494
//! sketch.update_with_count(42, 2);
@@ -52,6 +102,7 @@ mod reverse_purge_item_hash_map;
52102
mod serialization;
53103
mod sketch;
54104

105+
pub use self::serialization::FrequentItemValue;
55106
pub use self::sketch::ErrorType;
56107
pub use self::sketch::FrequentItemsSketch;
57108
pub use self::sketch::Row;

datasketches/src/frequencies/reverse_purge_item_hash_map.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
192192
T: Clone,
193193
{
194194
if self.num_active == 0 {
195-
return Vec::new();
195+
return vec![];
196196
}
197197
let mut keys = Vec::with_capacity(self.num_active);
198198
for i in 0..self.keys.len() {
@@ -208,7 +208,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
208208
/// Returns the active values in the map.
209209
pub fn active_values(&self) -> Vec<u64> {
210210
if self.num_active == 0 {
211-
return Vec::new();
211+
return vec![];
212212
}
213213
let mut values = Vec::with_capacity(self.num_active);
214214
for i in 0..self.values.len() {

0 commit comments

Comments
 (0)