Skip to content

Commit 11bcbd9

Browse files
authored
Merge pull request #6 from urschrei/shugel/push-spslktpupltv
Add builder function
2 parents b4b3508 + 3b2a250 commit 11bcbd9

File tree

5 files changed

+343
-3
lines changed

5 files changed

+343
-3
lines changed

README.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,59 @@ cvmcount -t file.txt -e 0.8 -d 0.1 -s 5000
4646

4747
The `--help` option is available.
4848

49+
# Library Usage
50+
51+
The library provides both a simple constructor and a builder pattern for more ergonomic usage:
52+
53+
## Simple Constructor
54+
55+
```rust
56+
use cvmcount::CVM;
57+
58+
let mut cvm = CVM::new(0.05, 0.01, 10_000);
59+
for item in data_stream {
60+
cvm.process_element(item);
61+
}
62+
let estimate = cvm.calculate_final_result();
63+
```
64+
65+
## Builder Pattern (Recommended)
66+
67+
The builder pattern provides better readability and validation:
68+
69+
```rust
70+
use cvmcount::CVM;
71+
72+
// Using defaults (epsilon=0.8, confidence=0.9, size=1000)
73+
let mut cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
74+
75+
// Custom configuration with confidence level
76+
let mut cvm: CVM<i32> = CVM::<i32>::builder()
77+
.epsilon(0.05) // 5 % accuracy
78+
.confidence(0.99) // 99 % confidence
79+
.estimated_size(50_000)
80+
.build()
81+
.unwrap();
82+
83+
// Using delta (failure probability) instead of confidence
84+
let mut cvm: CVM<String> = CVM::<String>::builder()
85+
.epsilon(0.1) // 10 % accuracy
86+
.delta(0.01) // 1 % chance of failure
87+
.estimated_size(1_000)
88+
.build()
89+
.unwrap();
90+
91+
// Process your data
92+
for word in text.split_whitespace() {
93+
cvm.process_element(word.to_string());
94+
}
95+
96+
let estimate = cvm.calculate_final_result();
97+
println!("Estimated unique words: {}", estimate as usize);
98+
```
99+
100+
The builder validates parameters and provides clear error messages for invalid inputs.
101+
49102
## Analysis
50103

51104
![](cvmcount.png)

benches/benchmarks.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::{
88

99
use criterion::Criterion;
1010
use cvmcount::CVM;
11-
use rand::{thread_rng, Rng};
11+
use rand::{Rng, thread_rng};
1212
use regex::Regex;
1313

1414
use std::collections::HashSet;

src/lib.rs

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,164 @@ use crate::treap::Treap;
99
use rand::rngs::StdRng;
1010
use rand::{Rng, SeedableRng};
1111

12+
/// Specification for confidence level in the CVM algorithm
///
/// The same requirement can be written either as a failure probability
/// (`Delta`) or as its complement, a success probability (`Confidence`).
#[derive(Debug, Clone, Copy)]
pub enum ConfidenceSpec {
    /// Specify delta directly (probability of failure)
    Delta(f64),
    /// Specify confidence level (probability of success)
    Confidence(f64),
}

impl ConfidenceSpec {
    /// Convert to delta value for internal use
    ///
    /// `Delta(d)` yields `d` unchanged; `Confidence(c)` yields `1.0 - c`.
    fn to_delta(self) -> f64 {
        match self {
            ConfidenceSpec::Delta(delta) => delta,
            ConfidenceSpec::Confidence(confidence) => 1.0 - confidence,
        }
    }

    /// Validate the confidence specification
    ///
    /// Returns the spec unchanged when its value lies strictly between
    /// 0.0 and 1.0, and an error message naming the offending parameter
    /// otherwise. NaN and infinite values are rejected.
    fn validate(self) -> Result<Self, String> {
        let (value, label) = match self {
            ConfidenceSpec::Delta(delta) => (delta, "Delta"),
            ConfidenceSpec::Confidence(confidence) => (confidence, "Confidence"),
        };

        // Phrased as "is inside the range" rather than "is outside the range":
        // every comparison involving NaN is false, so an outside-the-range test
        // would let NaN through as if it were valid.
        if value > 0.0 && value < 1.0 {
            Ok(self)
        } else {
            Err(format!("{} must be between 0.0 and 1.0 (exclusive)", label))
        }
    }
}
50+
51+
/// Builder for constructing CVM instances with validation and defaults
52+
///
53+
/// # Examples
54+
///
55+
/// ```
56+
/// use cvmcount::CVM;
57+
///
58+
/// // Using defaults (`epsilon=0.8`, `confidence=0.9`, `size=1000`)
59+
/// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
60+
///
61+
/// // Custom parameters
62+
/// let cvm: CVM<i32> = CVM::<i32>::builder()
63+
/// .epsilon(0.05) // 5 % accuracy
64+
/// .confidence(0.99) // 99 % confidence
65+
/// .estimated_size(10_000)
66+
/// .build()
67+
/// .unwrap();
68+
///
69+
/// // Using delta instead of confidence
70+
/// let cvm: CVM<String> = CVM::<String>::builder()
71+
/// .epsilon(0.1)
72+
/// .delta(0.01) // 1 % failure probability
73+
/// .build()
74+
/// .unwrap();
75+
/// ```
76+
#[derive(Debug, Clone, Default)]
77+
pub struct CVMBuilder {
78+
epsilon: Option<f64>,
79+
confidence_spec: Option<ConfidenceSpec>,
80+
stream_size: Option<usize>,
81+
}
82+
83+
impl CVMBuilder {
84+
/// Create a new builder with default values
85+
pub fn new() -> Self {
86+
Self::default()
87+
}
88+
89+
/// Set the epsilon parameter (accuracy requirement)
90+
///
91+
/// `Epsilon` determines how close you want your estimate to be to the true number
92+
/// of distinct elements. A smaller `ε` means you require a more precise estimate.
93+
/// For example, `ε = 0.05` means you want your estimate to be within 5 % of the
94+
/// actual value.
95+
///
96+
/// Must be between 0.0 and 1.0 (exclusive).
97+
pub fn epsilon(mut self, epsilon: f64) -> Self {
98+
self.epsilon = Some(epsilon);
99+
self
100+
}
101+
102+
/// Set the confidence level (probability that the estimate will be accurate)
103+
///
104+
/// Confidence represents how certain you want to be that the algorithm's
105+
/// estimate will fall within the desired accuracy range. For example,
106+
/// `confidence = 0.99` means you're 99 % sure the estimate will be accurate.
107+
///
108+
/// Must be between 0.0 and 1.0 (exclusive).
109+
/// Cannot be used together with [`Self::delta`] – the last one called will be used.
110+
pub fn confidence(mut self, confidence: f64) -> Self {
111+
self.confidence_spec = Some(ConfidenceSpec::Confidence(confidence));
112+
self
113+
}
114+
115+
/// Set the delta parameter (probability of failure)
116+
///
117+
/// Delta represents the probability that the algorithm's estimate will fall
118+
/// outside the desired accuracy range. For example, `delta = 0.01` means there's
119+
/// a 1 % chance the estimate will be inaccurate.
120+
///
121+
/// Must be between 0.0 and 1.0 (exclusive).
122+
/// Cannot be used together with [`Self::confidence()`] – the last one called will be used.
123+
pub fn delta(mut self, delta: f64) -> Self {
124+
self.confidence_spec = Some(ConfidenceSpec::Delta(delta));
125+
self
126+
}
127+
128+
/// Set the estimated stream size
129+
///
130+
/// This is used to determine buffer size and can be a loose approximation.
131+
/// The closer it is to the actual stream size, the more accurate the results
132+
/// will be.
133+
pub fn estimated_size(mut self, size: usize) -> Self {
134+
self.stream_size = Some(size);
135+
self
136+
}
137+
138+
/// Build the CVM instance with validation
139+
///
140+
/// Uses the following defaults if not specified:
141+
/// - `epsilon: 0.8` (good starting point for most applications)
142+
/// - `confidence: 0.9` (90 % confidence, equivalent to delta = 0.1)
143+
/// - `estimated_size: 1000`
144+
///
145+
/// Returns an error if any parameters are invalid.
146+
pub fn build<T: Ord>(self) -> Result<CVM<T>, String> {
147+
// Validate and get epsilon
148+
let epsilon = self.epsilon.unwrap_or(0.8);
149+
if epsilon <= 0.0 || epsilon >= 1.0 {
150+
return Err("Epsilon must be between 0.0 and 1.0 (exclusive)".to_string());
151+
}
152+
153+
// Validate and get delta
154+
let confidence_spec = self
155+
.confidence_spec
156+
.unwrap_or(ConfidenceSpec::Confidence(0.9));
157+
let validated_spec = confidence_spec.validate()?;
158+
let delta = validated_spec.to_delta();
159+
160+
// Validate and get stream size
161+
let stream_size = self.stream_size.unwrap_or(1000);
162+
if stream_size == 0 {
163+
return Err("Stream size must be greater than 0".to_string());
164+
}
165+
166+
Ok(CVM::new(epsilon, delta, stream_size))
167+
}
168+
}
169+
12170
/// A counter implementing the CVM algorithm
13171
///
14172
/// This implementation uses a treap (randomized binary search tree) as the buffer,
@@ -24,6 +182,31 @@ pub struct CVM<T: Ord> {
24182
}
25183

26184
impl<T: Ord> CVM<T> {
185+
/// Create a new builder for constructing CVM instances
186+
///
187+
/// The builder provides a more ergonomic way to construct CVM instances with
188+
/// validation and sensible defaults.
189+
///
190+
/// # Examples
191+
///
192+
/// ```
193+
/// use cvmcount::CVM;
194+
///
195+
/// // Using defaults
196+
/// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
197+
///
198+
/// // Custom configuration
199+
/// let cvm: CVM<i32> = CVM::<i32>::builder()
200+
/// .epsilon(0.05)
201+
/// .confidence(0.99)
202+
/// .estimated_size(10_000)
203+
/// .build()
204+
/// .unwrap();
205+
/// ```
206+
pub fn builder() -> CVMBuilder {
207+
CVMBuilder::new()
208+
}
209+
27210
/// Initialise the algorithm
28211
///
29212
/// `epsilon`: how close you want your estimate to be to the true number of distinct elements.
@@ -90,6 +273,7 @@ mod tests {
90273
path::Path,
91274
};
92275

276+
use super::{CVM, ConfidenceSpec};
93277
use regex::Regex;
94278
use std::collections::HashSet;
95279

@@ -118,4 +302,107 @@ mod tests {
118302
.for_each(|line| line_to_word(&re, &mut hs, &line.unwrap()));
119303
assert_eq!(hs.len(), 9016)
120304
}
305+
306+
#[test]
fn test_builder_defaults() {
    // No setter is called, so every parameter comes from the builder defaults
    let built = CVM::<String>::builder().build::<String>();
    let counter = built.unwrap();

    // Nothing has been processed, so the estimate is expected to be zero
    assert_eq!(counter.calculate_final_result(), 0.0);
}
312+
313+
#[test]
fn test_builder_custom_params() {
    let mut counter: CVM<i32> = CVM::<i32>::builder()
        .epsilon(0.05)
        .confidence(0.99)
        .estimated_size(5000)
        .build()
        .unwrap();

    // Feed 100 distinct values through the counter to show it is usable
    for value in 0..100 {
        counter.process_element(value);
    }

    let estimate = counter.calculate_final_result();
    assert!(estimate > 0.0);
}
330+
331+
#[test]
fn test_builder_delta_vs_confidence() {
    // confidence = 0.9 and delta = 0.1 express the same requirement
    let from_confidence: CVM<i32> = CVM::<i32>::builder().confidence(0.9).build().unwrap();
    let from_delta: CVM<i32> = CVM::<i32>::builder().delta(0.1).build().unwrap();

    // The internal configuration is not exposed, so equality of the two
    // cannot be asserted directly; check instead that both build and report
    // an empty estimate.
    assert_eq!(from_confidence.calculate_final_result(), 0.0);
    assert_eq!(from_delta.calculate_final_result(), 0.0);
}
344+
345+
#[test]
fn test_builder_last_wins() {
    // delta is set after confidence, so it should replace the confidence setting
    let builder = CVM::<i32>::builder().confidence(0.9).delta(0.05);
    let counter: CVM<i32> = builder.build().unwrap();

    assert_eq!(counter.calculate_final_result(), 0.0);
}
356+
357+
#[test]
fn test_builder_validation() {
    // True when building from the given configuration fails
    let is_rejected = |builder: super::CVMBuilder| builder.build::<i32>().is_err();

    // Epsilon: both bounds and a negative value
    assert!(is_rejected(CVM::<i32>::builder().epsilon(0.0)));
    assert!(is_rejected(CVM::<i32>::builder().epsilon(1.0)));
    assert!(is_rejected(CVM::<i32>::builder().epsilon(-0.5)));

    // Confidence: both bounds
    assert!(is_rejected(CVM::<i32>::builder().confidence(0.0)));
    assert!(is_rejected(CVM::<i32>::builder().confidence(1.0)));

    // Delta: both bounds
    assert!(is_rejected(CVM::<i32>::builder().delta(0.0)));
    assert!(is_rejected(CVM::<i32>::builder().delta(1.0)));

    // Stream size: zero
    assert!(is_rejected(CVM::<i32>::builder().estimated_size(0)));
}
387+
388+
#[test]
fn test_builder_method_chaining() {
    // All three setters chained on one builder, then built
    let builder = CVM::<String>::builder()
        .epsilon(0.1)
        .confidence(0.95)
        .estimated_size(2000);

    assert!(builder.build::<String>().is_ok());
}
398+
399+
#[test]
fn test_confidence_spec_conversion() {
    // A confidence level converts to its complement
    let from_confidence = ConfidenceSpec::Confidence(0.9).to_delta();
    assert!((from_confidence - 0.1).abs() < f64::EPSILON);

    // A delta passes through unchanged
    let from_delta = ConfidenceSpec::Delta(0.05).to_delta();
    assert!((from_delta - 0.05).abs() < f64::EPSILON);
}
121408
}

src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use clap::{arg, crate_version, value_parser, Command};
1+
use clap::{Command, arg, crate_version, value_parser};
22
use regex::Regex;
33
use std::fs::File;
44
use std::io::BufRead;

src/treap.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,8 @@ impl<T: Ord> Default for Treap<T> {
255255
#[cfg(test)]
256256
mod tests {
257257
use super::*;
258-
use rand::rngs::StdRng;
259258
use rand::SeedableRng;
259+
use rand::rngs::StdRng;
260260

261261
#[test]
262262
fn test_insert_and_contains() {

0 commit comments

Comments
 (0)