@@ -9,6 +9,174 @@ use crate::treap::Treap;
99use rand:: rngs:: StdRng ;
1010use rand:: { Rng , SeedableRng } ;
1111
/// Specification for the confidence level in the CVM algorithm.
///
/// The same requirement can be expressed either as a failure probability
/// (`Delta`) or as a success probability (`Confidence`); the two are related by
/// `delta = 1.0 - confidence`.
#[derive(Debug, Clone, Copy)]
pub enum ConfidenceSpec {
    /// Specify delta directly (probability of failure)
    Delta(f64),
    /// Specify confidence level (probability of success)
    Confidence(f64),
}

impl ConfidenceSpec {
    /// Convert to the delta value used internally.
    ///
    /// `Delta(d)` is returned unchanged; `Confidence(c)` becomes `1.0 - c`.
    fn to_delta(self) -> f64 {
        match self {
            ConfidenceSpec::Delta(delta) => delta,
            ConfidenceSpec::Confidence(confidence) => 1.0 - confidence,
        }
    }

    /// Validate the confidence specification.
    ///
    /// # Errors
    ///
    /// Returns a descriptive message when the wrapped value is not strictly
    /// between 0.0 and 1.0. `NaN` is rejected as well.
    fn validate(self) -> Result<Self, String> {
        let (value, message) = match self {
            ConfidenceSpec::Delta(delta) => {
                (delta, "Delta must be between 0.0 and 1.0 (exclusive)")
            }
            ConfidenceSpec::Confidence(confidence) => (
                confidence,
                "Confidence must be between 0.0 and 1.0 (exclusive)",
            ),
        };

        // Test for membership in the open interval rather than for being outside
        // it: every comparison involving NaN is false, so an "out of range" test
        // written with `<=` / `>=` would silently let NaN through.
        if value > 0.0 && value < 1.0 {
            Ok(self)
        } else {
            Err(message.to_string())
        }
    }
}
50+
51+ /// Builder for constructing CVM instances with validation and defaults
52+ ///
53+ /// # Examples
54+ ///
55+ /// ```
56+ /// use cvmcount::CVM;
57+ ///
58+ /// // Using defaults (`epsilon=0.8`, `confidence=0.9`, `size=1000`)
59+ /// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
60+ ///
61+ /// // Custom parameters
62+ /// let cvm: CVM<i32> = CVM::<i32>::builder()
63+ /// .epsilon(0.05) // 5 % accuracy
64+ /// .confidence(0.99) // 99 % confidence
65+ /// .estimated_size(10_000)
66+ /// .build()
67+ /// .unwrap();
68+ ///
69+ /// // Using delta instead of confidence
70+ /// let cvm: CVM<String> = CVM::<String>::builder()
71+ /// .epsilon(0.1)
72+ /// .delta(0.01) // 1 % failure probability
73+ /// .build()
74+ /// .unwrap();
75+ /// ```
76+ #[ derive( Debug , Clone ) ]
77+ pub struct CVMBuilder {
78+ epsilon : Option < f64 > ,
79+ confidence_spec : Option < ConfidenceSpec > ,
80+ stream_size : Option < usize > ,
81+ }
82+
83+ impl Default for CVMBuilder {
84+ fn default ( ) -> Self {
85+ Self {
86+ epsilon : None ,
87+ confidence_spec : None ,
88+ stream_size : None ,
89+ }
90+ }
91+ }
92+
93+ impl CVMBuilder {
94+ /// Create a new builder with default values
95+ pub fn new ( ) -> Self {
96+ Self :: default ( )
97+ }
98+
99+ /// Set the epsilon parameter (accuracy requirement)
100+ ///
101+ /// `Epsilon` determines how close you want your estimate to be to the true number
102+ /// of distinct elements. A smaller `ε` means you require a more precise estimate.
103+ /// For example, `ε = 0.05` means you want your estimate to be within 5 % of the
104+ /// actual value.
105+ ///
106+ /// Must be between 0.0 and 1.0 (exclusive).
107+ pub fn epsilon ( mut self , epsilon : f64 ) -> Self {
108+ self . epsilon = Some ( epsilon) ;
109+ self
110+ }
111+
112+ /// Set the confidence level (probability that the estimate will be accurate)
113+ ///
114+ /// Confidence represents how certain you want to be that the algorithm's
115+ /// estimate will fall within the desired accuracy range. For example,
116+ /// `confidence = 0.99` means you're 99 % sure the estimate will be accurate.
117+ ///
118+ /// Must be between 0.0 and 1.0 (exclusive).
119+ /// Cannot be used together with [`Self::delta`] – the last one called will be used.
120+ pub fn confidence ( mut self , confidence : f64 ) -> Self {
121+ self . confidence_spec = Some ( ConfidenceSpec :: Confidence ( confidence) ) ;
122+ self
123+ }
124+
125+ /// Set the delta parameter (probability of failure)
126+ ///
127+ /// Delta represents the probability that the algorithm's estimate will fall
128+ /// outside the desired accuracy range. For example, `delta = 0.01` means there's
129+ /// a 1 % chance the estimate will be inaccurate.
130+ ///
131+ /// Must be between 0.0 and 1.0 (exclusive).
132+ /// Cannot be used together with [`Self::confidence()`] – the last one called will be used.
133+ pub fn delta ( mut self , delta : f64 ) -> Self {
134+ self . confidence_spec = Some ( ConfidenceSpec :: Delta ( delta) ) ;
135+ self
136+ }
137+
138+ /// Set the estimated stream size
139+ ///
140+ /// This is used to determine buffer size and can be a loose approximation.
141+ /// The closer it is to the actual stream size, the more accurate the results
142+ /// will be.
143+ pub fn estimated_size ( mut self , size : usize ) -> Self {
144+ self . stream_size = Some ( size) ;
145+ self
146+ }
147+
148+ /// Build the CVM instance with validation
149+ ///
150+ /// Uses the following defaults if not specified:
151+ /// - `epsilon: 0.8` (good starting point for most applications)
152+ /// - `confidence: 0.9` (90 % confidence, equivalent to delta = 0.1)
153+ /// - `estimated_size: 1000`
154+ ///
155+ /// Returns an error if any parameters are invalid.
156+ pub fn build < T : Ord > ( self ) -> Result < CVM < T > , String > {
157+ // Validate and get epsilon
158+ let epsilon = self . epsilon . unwrap_or ( 0.8 ) ;
159+ if epsilon <= 0.0 || epsilon >= 1.0 {
160+ return Err ( "Epsilon must be between 0.0 and 1.0 (exclusive)" . to_string ( ) ) ;
161+ }
162+
163+ // Validate and get delta
164+ let confidence_spec = self
165+ . confidence_spec
166+ . unwrap_or ( ConfidenceSpec :: Confidence ( 0.9 ) ) ;
167+ let validated_spec = confidence_spec. validate ( ) ?;
168+ let delta = validated_spec. to_delta ( ) ;
169+
170+ // Validate and get stream size
171+ let stream_size = self . stream_size . unwrap_or ( 1000 ) ;
172+ if stream_size == 0 {
173+ return Err ( "Stream size must be greater than 0" . to_string ( ) ) ;
174+ }
175+
176+ Ok ( CVM :: new ( epsilon, delta, stream_size) )
177+ }
178+ }
179+
12180/// A counter implementing the CVM algorithm
13181///
14182/// This implementation uses a treap (randomized binary search tree) as the buffer,
@@ -24,6 +192,31 @@ pub struct CVM<T: Ord> {
24192}
25193
26194impl < T : Ord > CVM < T > {
195+ /// Create a new builder for constructing CVM instances
196+ ///
197+ /// The builder provides a more ergonomic way to construct CVM instances with
198+ /// validation and sensible defaults.
199+ ///
200+ /// # Examples
201+ ///
202+ /// ```
203+ /// use cvmcount::CVM;
204+ ///
205+ /// // Using defaults
206+ /// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
207+ ///
208+ /// // Custom configuration
209+ /// let cvm: CVM<i32> = CVM::<i32>::builder()
210+ /// .epsilon(0.05)
211+ /// .confidence(0.99)
212+ /// .estimated_size(10_000)
213+ /// .build()
214+ /// .unwrap();
215+ /// ```
216+ pub fn builder ( ) -> CVMBuilder {
217+ CVMBuilder :: new ( )
218+ }
219+
27220 /// Initialise the algorithm
28221 ///
29222 /// `epsilon`: how close you want your estimate to be to the true number of distinct elements.
@@ -90,6 +283,7 @@ mod tests {
90283 path:: Path ,
91284 } ;
92285
286+ use super :: { CVM , ConfidenceSpec , EstimateDistinct } ;
93287 use regex:: Regex ;
94288 use std:: collections:: HashSet ;
95289
@@ -118,4 +312,107 @@ mod tests {
118312 . for_each ( |line| line_to_word ( & re, & mut hs, & line. unwrap ( ) ) ) ;
119313 assert_eq ! ( hs. len( ) , 9016 )
120314 }
315+
#[test]
fn test_builder_defaults() {
    // No setter is called, so every parameter comes from the builder defaults.
    let built = CVM::<String>::builder().build::<String>();
    let cvm: CVM<String> = built.unwrap();

    // Nothing has been processed yet, so the estimate is expected to be zero.
    assert_eq!(cvm.calculate_final_result(), 0.0);
}
322+
#[test]
fn test_builder_custom_params() {
    let mut counter: CVM<i32> = CVM::<i32>::builder()
        .epsilon(0.05)
        .confidence(0.99)
        .estimated_size(5000)
        .build()
        .unwrap();

    // Feed a handful of distinct values to check the instance is usable.
    (0..100).for_each(|value| {
        counter.process_element(value);
    });

    let estimate = counter.calculate_final_result();
    assert!(estimate > 0.0);
}
340+
#[test]
fn test_builder_delta_vs_confidence() {
    // confidence = 0.9 and delta = 0.1 describe the same requirement.
    let from_confidence: CVM<i32> = CVM::<i32>::builder().confidence(0.9).build().unwrap();
    let from_delta: CVM<i32> = CVM::<i32>::builder().delta(0.1).build().unwrap();

    // The internal configuration is not exposed, so the test only checks that
    // both instances were built and report an empty estimate.
    for cvm in [from_confidence, from_delta] {
        assert_eq!(cvm.calculate_final_result(), 0.0);
    }
}
354+
#[test]
fn test_builder_last_wins() {
    // `delta` is called after `confidence`, so it should replace the earlier setting.
    let built = CVM::<i32>::builder()
        .confidence(0.9)
        .delta(0.05)
        .build::<i32>();
    let cvm: CVM<i32> = built.unwrap();

    assert_eq!(cvm.calculate_final_result(), 0.0);
}
366+
#[test]
fn test_builder_validation() {
    // Epsilon: both boundaries and a negative value are rejected.
    for epsilon in [0.0, 1.0, -0.5] {
        assert!(CVM::<i32>::builder().epsilon(epsilon).build::<i32>().is_err());
    }

    // Confidence: both boundaries are rejected.
    for confidence in [0.0, 1.0] {
        assert!(CVM::<i32>::builder()
            .confidence(confidence)
            .build::<i32>()
            .is_err());
    }

    // Delta: both boundaries are rejected.
    for delta in [0.0, 1.0] {
        assert!(CVM::<i32>::builder().delta(delta).build::<i32>().is_err());
    }

    // Stream size: zero is rejected.
    assert!(CVM::<i32>::builder()
        .estimated_size(0)
        .build::<i32>()
        .is_err());
}
397+
#[test]
fn test_builder_method_chaining() {
    // All three setters chained in one expression with valid values.
    let builder = CVM::<String>::builder()
        .epsilon(0.1)
        .confidence(0.95)
        .estimated_size(2000);

    assert!(builder.build::<String>().is_ok());
}
408+
#[test]
fn test_confidence_spec_conversion() {
    // Each specification paired with the delta it should convert to.
    let cases = [
        (ConfidenceSpec::Confidence(0.9), 0.1),
        (ConfidenceSpec::Delta(0.05), 0.05),
    ];

    for (spec, expected_delta) in cases {
        assert!((spec.to_delta() - expected_delta).abs() < f64::EPSILON);
    }
}
121418}
0 commit comments