@@ -9,6 +9,164 @@ use crate::treap::Treap;
99use rand:: rngs:: StdRng ;
1010use rand:: { Rng , SeedableRng } ;
1111
/// Specification for the confidence level of the CVM algorithm.
///
/// The requirement can be stated either as a probability of failure (`Delta`)
/// or as a probability of success (`Confidence`). Internally the value is
/// always converted to a delta, with `delta = 1.0 - confidence`.
///
/// `PartialEq` is derived so that specifications can be compared directly
/// (for example with `assert_eq!`). Comparison follows `f64` semantics, so a
/// variant holding `NaN` is never equal to anything, including itself.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ConfidenceSpec {
    /// Specify delta directly (probability of failure)
    Delta(f64),
    /// Specify confidence level (probability of success)
    Confidence(f64),
}
20+
21+ impl ConfidenceSpec {
22+ /// Convert to delta value for internal use
23+ fn to_delta ( self ) -> f64 {
24+ match self {
25+ ConfidenceSpec :: Delta ( delta) => delta,
26+ ConfidenceSpec :: Confidence ( confidence) => 1.0 - confidence,
27+ }
28+ }
29+
30+ /// Validate the confidence specification
31+ fn validate ( self ) -> Result < Self , String > {
32+ match self {
33+ ConfidenceSpec :: Delta ( delta) => {
34+ if delta <= 0.0 || delta >= 1.0 {
35+ Err ( "Delta must be between 0.0 and 1.0 (exclusive)" . to_string ( ) )
36+ } else {
37+ Ok ( self )
38+ }
39+ }
40+ ConfidenceSpec :: Confidence ( confidence) => {
41+ if confidence <= 0.0 || confidence >= 1.0 {
42+ Err ( "Confidence must be between 0.0 and 1.0 (exclusive)" . to_string ( ) )
43+ } else {
44+ Ok ( self )
45+ }
46+ }
47+ }
48+ }
49+ }
50+
51+ /// Builder for constructing CVM instances with validation and defaults
52+ ///
53+ /// # Examples
54+ ///
55+ /// ```
56+ /// use cvmcount::CVM;
57+ ///
58+ /// // Using defaults (`epsilon=0.8`, `confidence=0.9`, `size=1000`)
59+ /// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
60+ ///
61+ /// // Custom parameters
62+ /// let cvm: CVM<i32> = CVM::<i32>::builder()
63+ /// .epsilon(0.05) // 5 % accuracy
64+ /// .confidence(0.99) // 99 % confidence
65+ /// .estimated_size(10_000)
66+ /// .build()
67+ /// .unwrap();
68+ ///
69+ /// // Using delta instead of confidence
70+ /// let cvm: CVM<String> = CVM::<String>::builder()
71+ /// .epsilon(0.1)
72+ /// .delta(0.01) // 1 % failure probability
73+ /// .build()
74+ /// .unwrap();
75+ /// ```
76+ #[ derive( Debug , Clone , Default ) ]
77+ pub struct CVMBuilder {
78+ epsilon : Option < f64 > ,
79+ confidence_spec : Option < ConfidenceSpec > ,
80+ stream_size : Option < usize > ,
81+ }
82+
83+ impl CVMBuilder {
84+ /// Create a new builder with default values
85+ pub fn new ( ) -> Self {
86+ Self :: default ( )
87+ }
88+
89+ /// Set the epsilon parameter (accuracy requirement)
90+ ///
91+ /// `Epsilon` determines how close you want your estimate to be to the true number
92+ /// of distinct elements. A smaller `ε` means you require a more precise estimate.
93+ /// For example, `ε = 0.05` means you want your estimate to be within 5 % of the
94+ /// actual value.
95+ ///
96+ /// Must be between 0.0 and 1.0 (exclusive).
97+ pub fn epsilon ( mut self , epsilon : f64 ) -> Self {
98+ self . epsilon = Some ( epsilon) ;
99+ self
100+ }
101+
102+ /// Set the confidence level (probability that the estimate will be accurate)
103+ ///
104+ /// Confidence represents how certain you want to be that the algorithm's
105+ /// estimate will fall within the desired accuracy range. For example,
106+ /// `confidence = 0.99` means you're 99 % sure the estimate will be accurate.
107+ ///
108+ /// Must be between 0.0 and 1.0 (exclusive).
109+ /// Cannot be used together with [`Self::delta`] – the last one called will be used.
110+ pub fn confidence ( mut self , confidence : f64 ) -> Self {
111+ self . confidence_spec = Some ( ConfidenceSpec :: Confidence ( confidence) ) ;
112+ self
113+ }
114+
115+ /// Set the delta parameter (probability of failure)
116+ ///
117+ /// Delta represents the probability that the algorithm's estimate will fall
118+ /// outside the desired accuracy range. For example, `delta = 0.01` means there's
119+ /// a 1 % chance the estimate will be inaccurate.
120+ ///
121+ /// Must be between 0.0 and 1.0 (exclusive).
122+ /// Cannot be used together with [`Self::confidence()`] – the last one called will be used.
123+ pub fn delta ( mut self , delta : f64 ) -> Self {
124+ self . confidence_spec = Some ( ConfidenceSpec :: Delta ( delta) ) ;
125+ self
126+ }
127+
128+ /// Set the estimated stream size
129+ ///
130+ /// This is used to determine buffer size and can be a loose approximation.
131+ /// The closer it is to the actual stream size, the more accurate the results
132+ /// will be.
133+ pub fn estimated_size ( mut self , size : usize ) -> Self {
134+ self . stream_size = Some ( size) ;
135+ self
136+ }
137+
138+ /// Build the CVM instance with validation
139+ ///
140+ /// Uses the following defaults if not specified:
141+ /// - `epsilon: 0.8` (good starting point for most applications)
142+ /// - `confidence: 0.9` (90 % confidence, equivalent to delta = 0.1)
143+ /// - `estimated_size: 1000`
144+ ///
145+ /// Returns an error if any parameters are invalid.
146+ pub fn build < T : Ord > ( self ) -> Result < CVM < T > , String > {
147+ // Validate and get epsilon
148+ let epsilon = self . epsilon . unwrap_or ( 0.8 ) ;
149+ if epsilon <= 0.0 || epsilon >= 1.0 {
150+ return Err ( "Epsilon must be between 0.0 and 1.0 (exclusive)" . to_string ( ) ) ;
151+ }
152+
153+ // Validate and get delta
154+ let confidence_spec = self
155+ . confidence_spec
156+ . unwrap_or ( ConfidenceSpec :: Confidence ( 0.9 ) ) ;
157+ let validated_spec = confidence_spec. validate ( ) ?;
158+ let delta = validated_spec. to_delta ( ) ;
159+
160+ // Validate and get stream size
161+ let stream_size = self . stream_size . unwrap_or ( 1000 ) ;
162+ if stream_size == 0 {
163+ return Err ( "Stream size must be greater than 0" . to_string ( ) ) ;
164+ }
165+
166+ Ok ( CVM :: new ( epsilon, delta, stream_size) )
167+ }
168+ }
169+
12170/// A counter implementing the CVM algorithm
13171///
14172/// This implementation uses a treap (randomized binary search tree) as the buffer,
@@ -24,6 +182,31 @@ pub struct CVM<T: Ord> {
24182}
25183
26184impl < T : Ord > CVM < T > {
185+ /// Create a new builder for constructing CVM instances
186+ ///
187+ /// The builder provides a more ergonomic way to construct CVM instances with
188+ /// validation and sensible defaults.
189+ ///
190+ /// # Examples
191+ ///
192+ /// ```
193+ /// use cvmcount::CVM;
194+ ///
195+ /// // Using defaults
196+ /// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
197+ ///
198+ /// // Custom configuration
199+ /// let cvm: CVM<i32> = CVM::<i32>::builder()
200+ /// .epsilon(0.05)
201+ /// .confidence(0.99)
202+ /// .estimated_size(10_000)
203+ /// .build()
204+ /// .unwrap();
205+ /// ```
206+ pub fn builder ( ) -> CVMBuilder {
207+ CVMBuilder :: new ( )
208+ }
209+
27210 /// Initialise the algorithm
28211 ///
29212 /// `epsilon`: how close you want your estimate to be to the true number of distinct elements.
@@ -90,6 +273,7 @@ mod tests {
90273 path:: Path ,
91274 } ;
92275
276+ use super :: { CVM , ConfidenceSpec } ;
93277 use regex:: Regex ;
94278 use std:: collections:: HashSet ;
95279
@@ -118,4 +302,107 @@ mod tests {
118302 . for_each ( |line| line_to_word ( & re, & mut hs, & line. unwrap ( ) ) ) ;
119303 assert_eq ! ( hs. len( ) , 9016 )
120304 }
305+
#[test]
fn test_builder_defaults() {
    // No setter is called, so `build` has to fall back to its defaults.
    let counter = CVM::<String>::builder().build::<String>().unwrap();

    // Nothing has been processed yet, so the buffer is empty.
    assert_eq!(counter.calculate_final_result(), 0.0);
}
312+
#[test]
fn test_builder_custom_params() {
    let mut counter = CVM::<i32>::builder()
        .epsilon(0.05)
        .confidence(0.99)
        .estimated_size(5000)
        .build::<i32>()
        .unwrap();

    // A counter built from custom parameters must be usable: feed it some
    // elements and expect a positive estimate.
    (0..100).for_each(|value| counter.process_element(value));

    let estimate = counter.calculate_final_result();
    assert!(estimate > 0.0);
}
330+
#[test]
fn test_builder_delta_vs_confidence() {
    // `confidence = 0.9` and `delta = 0.1` express the same requirement.
    let from_confidence = CVM::<i32>::builder()
        .confidence(0.9)
        .build::<i32>()
        .unwrap();
    let from_delta = CVM::<i32>::builder().delta(0.1).build::<i32>().unwrap();

    // The internal configuration is not exposed, so the test can only check
    // that both variants build and report an empty result.
    assert_eq!(from_confidence.calculate_final_result(), 0.0);
    assert_eq!(from_delta.calculate_final_result(), 0.0);
}
344+
#[test]
fn test_builder_last_wins() {
    // `delta` is called after `confidence`, so it replaces the earlier
    // setting; the builder must still produce a valid instance.
    let builder = CVM::<i32>::builder().confidence(0.9).delta(0.05);
    let counter = builder.build::<i32>().unwrap();

    assert_eq!(counter.calculate_final_result(), 0.0);
}
356+
#[test]
fn test_builder_validation() {
    // One small constructor per parameter keeps each case to a single line.
    let with_epsilon = |epsilon: f64| CVM::<i32>::builder().epsilon(epsilon).build::<i32>();
    let with_confidence =
        |confidence: f64| CVM::<i32>::builder().confidence(confidence).build::<i32>();
    let with_delta = |delta: f64| CVM::<i32>::builder().delta(delta).build::<i32>();
    let with_size = |size: usize| CVM::<i32>::builder().estimated_size(size).build::<i32>();

    // Epsilon: both bounds are excluded and negative values are rejected.
    assert!(with_epsilon(0.0).is_err());
    assert!(with_epsilon(1.0).is_err());
    assert!(with_epsilon(-0.5).is_err());

    // Confidence: both bounds are excluded.
    assert!(with_confidence(0.0).is_err());
    assert!(with_confidence(1.0).is_err());

    // Delta: both bounds are excluded.
    assert!(with_delta(0.0).is_err());
    assert!(with_delta(1.0).is_err());

    // Stream size: zero is rejected.
    assert!(with_size(0).is_err());
}
387+
#[test]
fn test_builder_method_chaining() {
    // All setters can be chained in one expression and yield a valid instance.
    let configured = CVM::<String>::builder()
        .epsilon(0.1)
        .confidence(0.95)
        .estimated_size(2000);

    assert!(configured.build::<String>().is_ok());
}
398+
#[test]
fn test_confidence_spec_conversion() {
    // A confidence is converted to its complement.
    let from_confidence = ConfidenceSpec::Confidence(0.9).to_delta();
    assert!((from_confidence - 0.1).abs() < f64::EPSILON);

    // A delta is passed through unchanged.
    let from_delta = ConfidenceSpec::Delta(0.05).to_delta();
    assert!((from_delta - 0.05).abs() < f64::EPSILON);
}
121408}
0 commit comments