Skip to content

Commit 87c837b

Browse files
committed
Builder function
1 parent b4b3508 commit 87c837b

File tree

5 files changed

+353
-3
lines changed

5 files changed

+353
-3
lines changed

README.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,59 @@ cvmcount -t file.txt -e 0.8 -d 0.1 -s 5000
4646

4747
The `--help` option is available.
4848

49+
# Library Usage
50+
51+
The library provides both a simple constructor and a builder pattern for more ergonomic usage:
52+
53+
## Simple Constructor
54+
55+
```rust
56+
use cvmcount::CVM;
57+
58+
let mut cvm = CVM::new(0.05, 0.01, 10_000);
59+
for item in data_stream {
60+
cvm.process_element(item);
61+
}
62+
let estimate = cvm.calculate_final_result();
63+
```
64+
65+
## Builder Pattern (Recommended)
66+
67+
The builder pattern provides better readability and validation:
68+
69+
```rust
70+
use cvmcount::CVM;
71+
72+
// Using defaults (epsilon=0.8, confidence=0.9, size=1000)
73+
let mut cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
74+
75+
// Custom configuration with confidence level
76+
let mut cvm: CVM<i32> = CVM::<i32>::builder()
77+
.epsilon(0.05) // 5 % accuracy
78+
.confidence(0.99) // 99 % confidence
79+
.estimated_size(50_000)
80+
.build()
81+
.unwrap();
82+
83+
// Using delta (failure probability) instead of confidence
84+
let mut cvm: CVM<String> = CVM::<String>::builder()
85+
.epsilon(0.1) // 10 % accuracy
86+
.delta(0.01) // 1 % chance of failure
87+
.estimated_size(1_000)
88+
.build()
89+
.unwrap();
90+
91+
// Process your data
92+
for word in text.split_whitespace() {
93+
cvm.process_element(word.to_string());
94+
}
95+
96+
let estimate = cvm.calculate_final_result();
97+
println!("Estimated unique words: {}", estimate as usize);
98+
```
99+
100+
The builder validates parameters and provides clear error messages for invalid inputs.
101+
49102
## Analysis
50103

51104
![](cvmcount.png)

benches/benchmarks.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::{
88

99
use criterion::Criterion;
1010
use cvmcount::CVM;
11-
use rand::{thread_rng, Rng};
11+
use rand::{Rng, thread_rng};
1212
use regex::Regex;
1313

1414
use std::collections::HashSet;

src/lib.rs

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,174 @@ use crate::treap::Treap;
99
use rand::rngs::StdRng;
1010
use rand::{Rng, SeedableRng};
1111

12+
/// Specification for the confidence level of the CVM algorithm.
///
/// The same requirement can be phrased either as a failure probability
/// (`Delta`) or as a success probability (`Confidence`); the two are related
/// by `delta = 1.0 - confidence`.
#[derive(Debug, Clone, Copy)]
pub enum ConfidenceSpec {
    /// Specify delta directly (probability of failure)
    Delta(f64),
    /// Specify confidence level (probability of success)
    Confidence(f64),
}

impl ConfidenceSpec {
    /// Convert to the delta value used internally.
    ///
    /// A `Delta` is returned unchanged; a `Confidence` is converted with
    /// `1.0 - confidence`.
    fn to_delta(self) -> f64 {
        match self {
            ConfidenceSpec::Delta(delta) => delta,
            ConfidenceSpec::Confidence(confidence) => 1.0 - confidence,
        }
    }

    /// Validate the confidence specification.
    ///
    /// # Errors
    ///
    /// Returns a message naming the offending variant when the wrapped value
    /// is NaN or does not lie strictly between 0.0 and 1.0.
    fn validate(self) -> Result<Self, String> {
        let (value, label) = match self {
            ConfidenceSpec::Delta(delta) => (delta, "Delta"),
            ConfidenceSpec::Confidence(confidence) => (confidence, "Confidence"),
        };

        // NaN makes every ordered comparison false, so the range test alone
        // would let it through; reject it explicitly.
        if value.is_nan() || value <= 0.0 || value >= 1.0 {
            Err(format!("{label} must be between 0.0 and 1.0 (exclusive)"))
        } else {
            Ok(self)
        }
    }
}
50+
51+
/// Builder for constructing CVM instances with validation and defaults
52+
///
53+
/// # Examples
54+
///
55+
/// ```
56+
/// use cvmcount::CVM;
57+
///
58+
/// // Using defaults (`epsilon=0.8`, `confidence=0.9`, `size=1000`)
59+
/// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
60+
///
61+
/// // Custom parameters
62+
/// let cvm: CVM<i32> = CVM::<i32>::builder()
63+
/// .epsilon(0.05) // 5 % accuracy
64+
/// .confidence(0.99) // 99 % confidence
65+
/// .estimated_size(10_000)
66+
/// .build()
67+
/// .unwrap();
68+
///
69+
/// // Using delta instead of confidence
70+
/// let cvm: CVM<String> = CVM::<String>::builder()
71+
/// .epsilon(0.1)
72+
/// .delta(0.01) // 1 % failure probability
73+
/// .build()
74+
/// .unwrap();
75+
/// ```
76+
#[derive(Debug, Clone)]
77+
pub struct CVMBuilder {
78+
epsilon: Option<f64>,
79+
confidence_spec: Option<ConfidenceSpec>,
80+
stream_size: Option<usize>,
81+
}
82+
83+
impl Default for CVMBuilder {
84+
fn default() -> Self {
85+
Self {
86+
epsilon: None,
87+
confidence_spec: None,
88+
stream_size: None,
89+
}
90+
}
91+
}
92+
93+
impl CVMBuilder {
94+
/// Create a new builder with default values
95+
pub fn new() -> Self {
96+
Self::default()
97+
}
98+
99+
/// Set the epsilon parameter (accuracy requirement)
100+
///
101+
/// `Epsilon` determines how close you want your estimate to be to the true number
102+
/// of distinct elements. A smaller `ε` means you require a more precise estimate.
103+
/// For example, `ε = 0.05` means you want your estimate to be within 5 % of the
104+
/// actual value.
105+
///
106+
/// Must be between 0.0 and 1.0 (exclusive).
107+
pub fn epsilon(mut self, epsilon: f64) -> Self {
108+
self.epsilon = Some(epsilon);
109+
self
110+
}
111+
112+
/// Set the confidence level (probability that the estimate will be accurate)
113+
///
114+
/// Confidence represents how certain you want to be that the algorithm's
115+
/// estimate will fall within the desired accuracy range. For example,
116+
/// `confidence = 0.99` means you're 99 % sure the estimate will be accurate.
117+
///
118+
/// Must be between 0.0 and 1.0 (exclusive).
119+
/// Cannot be used together with [`Self::delta`] – the last one called will be used.
120+
pub fn confidence(mut self, confidence: f64) -> Self {
121+
self.confidence_spec = Some(ConfidenceSpec::Confidence(confidence));
122+
self
123+
}
124+
125+
/// Set the delta parameter (probability of failure)
126+
///
127+
/// Delta represents the probability that the algorithm's estimate will fall
128+
/// outside the desired accuracy range. For example, `delta = 0.01` means there's
129+
/// a 1 % chance the estimate will be inaccurate.
130+
///
131+
/// Must be between 0.0 and 1.0 (exclusive).
132+
/// Cannot be used together with [`Self::confidence()`] – the last one called will be used.
133+
pub fn delta(mut self, delta: f64) -> Self {
134+
self.confidence_spec = Some(ConfidenceSpec::Delta(delta));
135+
self
136+
}
137+
138+
/// Set the estimated stream size
139+
///
140+
/// This is used to determine buffer size and can be a loose approximation.
141+
/// The closer it is to the actual stream size, the more accurate the results
142+
/// will be.
143+
pub fn estimated_size(mut self, size: usize) -> Self {
144+
self.stream_size = Some(size);
145+
self
146+
}
147+
148+
/// Build the CVM instance with validation
149+
///
150+
/// Uses the following defaults if not specified:
151+
/// - `epsilon: 0.8` (good starting point for most applications)
152+
/// - `confidence: 0.9` (90 % confidence, equivalent to delta = 0.1)
153+
/// - `estimated_size: 1000`
154+
///
155+
/// Returns an error if any parameters are invalid.
156+
pub fn build<T: Ord>(self) -> Result<CVM<T>, String> {
157+
// Validate and get epsilon
158+
let epsilon = self.epsilon.unwrap_or(0.8);
159+
if epsilon <= 0.0 || epsilon >= 1.0 {
160+
return Err("Epsilon must be between 0.0 and 1.0 (exclusive)".to_string());
161+
}
162+
163+
// Validate and get delta
164+
let confidence_spec = self
165+
.confidence_spec
166+
.unwrap_or(ConfidenceSpec::Confidence(0.9));
167+
let validated_spec = confidence_spec.validate()?;
168+
let delta = validated_spec.to_delta();
169+
170+
// Validate and get stream size
171+
let stream_size = self.stream_size.unwrap_or(1000);
172+
if stream_size == 0 {
173+
return Err("Stream size must be greater than 0".to_string());
174+
}
175+
176+
Ok(CVM::new(epsilon, delta, stream_size))
177+
}
178+
}
179+
12180
/// A counter implementing the CVM algorithm
13181
///
14182
/// This implementation uses a treap (randomized binary search tree) as the buffer,
@@ -24,6 +192,31 @@ pub struct CVM<T: Ord> {
24192
}
25193

26194
impl<T: Ord> CVM<T> {
195+
/// Create a new builder for constructing CVM instances
196+
///
197+
/// The builder provides a more ergonomic way to construct CVM instances with
198+
/// validation and sensible defaults.
199+
///
200+
/// # Examples
201+
///
202+
/// ```
203+
/// use cvmcount::CVM;
204+
///
205+
/// // Using defaults
206+
/// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
207+
///
208+
/// // Custom configuration
209+
/// let cvm: CVM<i32> = CVM::<i32>::builder()
210+
/// .epsilon(0.05)
211+
/// .confidence(0.99)
212+
/// .estimated_size(10_000)
213+
/// .build()
214+
/// .unwrap();
215+
/// ```
216+
pub fn builder() -> CVMBuilder {
217+
CVMBuilder::new()
218+
}
219+
27220
/// Initialise the algorithm
28221
///
29222
/// `epsilon`: how close you want your estimate to be to the true number of distinct elements.
@@ -90,6 +283,7 @@ mod tests {
90283
path::Path,
91284
};
92285

286+
use super::{CVM, ConfidenceSpec, EstimateDistinct};
93287
use regex::Regex;
94288
use std::collections::HashSet;
95289

@@ -118,4 +312,107 @@ mod tests {
118312
.for_each(|line| line_to_word(&re, &mut hs, &line.unwrap()));
119313
assert_eq!(hs.len(), 9016)
120314
}
315+
316+
#[test]
fn test_builder_defaults() {
    // No setters are called, so `build` has to fall back to its defaults.
    let counter = CVM::<String>::builder().build::<String>().unwrap();

    // Nothing has been processed yet, so the estimate must be zero.
    let estimate = counter.calculate_final_result();
    assert_eq!(estimate, 0.0);
}
322+
323+
#[test]
fn test_builder_custom_params() {
    let builder = CVM::<i32>::builder()
        .epsilon(0.05)
        .confidence(0.99)
        .estimated_size(5000);
    let mut counter: CVM<i32> = builder.build().unwrap();

    // Feed a small stream of distinct values through the counter.
    for value in 0..100 {
        counter.process_element(value);
    }

    // After processing elements the estimate must be positive.
    assert!(counter.calculate_final_result() > 0.0);
}
340+
341+
#[test]
fn test_builder_delta_vs_confidence() {
    // confidence = 0.9 and delta = 0.1 describe the same requirement.
    let from_confidence = CVM::<i32>::builder()
        .confidence(0.9)
        .build::<i32>()
        .unwrap();
    let from_delta = CVM::<i32>::builder().delta(0.1).build::<i32>().unwrap();

    // The internal configuration is not exposed, so this only checks that
    // both spellings yield a working, empty counter.
    let estimate_from_confidence = from_confidence.calculate_final_result();
    let estimate_from_delta = from_delta.calculate_final_result();
    assert_eq!(estimate_from_confidence, 0.0);
    assert_eq!(estimate_from_delta, 0.0);
}
354+
355+
#[test]
fn test_builder_last_wins() {
    // `confidence` and `delta` write the same setting; the later call
    // (`delta` here) is the one that should take effect.
    let builder = CVM::<i32>::builder().confidence(0.9).delta(0.05);
    let counter = builder.build::<i32>().unwrap();

    assert_eq!(counter.calculate_final_result(), 0.0);
}
366+
367+
#[test]
fn test_builder_validation() {
    // Epsilon: both boundaries and a negative value are rejected.
    for bad_epsilon in [0.0, 1.0, -0.5] {
        let outcome = CVM::<i32>::builder().epsilon(bad_epsilon).build::<i32>();
        assert!(outcome.is_err());
    }

    // Confidence: both boundaries are rejected.
    for bad_confidence in [0.0, 1.0] {
        let outcome = CVM::<i32>::builder()
            .confidence(bad_confidence)
            .build::<i32>();
        assert!(outcome.is_err());
    }

    // Delta: both boundaries are rejected.
    for bad_delta in [0.0, 1.0] {
        let outcome = CVM::<i32>::builder().delta(bad_delta).build::<i32>();
        assert!(outcome.is_err());
    }

    // Stream size: zero is rejected.
    let outcome = CVM::<i32>::builder().estimated_size(0).build::<i32>();
    assert!(outcome.is_err());
}
397+
398+
#[test]
fn test_builder_method_chaining() {
    // All three setters chained in one expression, with valid values.
    let builder = CVM::<String>::builder()
        .epsilon(0.1)
        .confidence(0.95)
        .estimated_size(2000);

    assert!(builder.build::<String>().is_ok());
}
408+
409+
#[test]
fn test_confidence_spec_conversion() {
    // A confidence of 0.9 corresponds to a delta of 0.1.
    let delta_from_confidence = ConfidenceSpec::Confidence(0.9).to_delta();
    assert!((delta_from_confidence - 0.1).abs() < f64::EPSILON);

    // A delta of 0.05 converts to 0.05.
    let delta_from_delta = ConfidenceSpec::Delta(0.05).to_delta();
    assert!((delta_from_delta - 0.05).abs() < f64::EPSILON);
}
121418
}

src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use clap::{arg, crate_version, value_parser, Command};
1+
use clap::{Command, arg, crate_version, value_parser};
22
use regex::Regex;
33
use std::fs::File;
44
use std::io::BufRead;

0 commit comments

Comments
 (0)