@@ -9,7 +9,7 @@ use vortex_array::variants::{ExtensionArrayTrait, PrimitiveArrayTrait, StructArr
99use vortex_array:: { Array , ArrayRef , Canonical } ;
1010use vortex_dtype:: datetime:: TemporalMetadata ;
1111use vortex_dtype:: { DType , Nullability } ;
12- use vortex_error:: { VortexExpect , VortexResult } ;
12+ use vortex_error:: { VortexExpect , VortexResult , VortexUnwrap } ;
1313
1414pub use crate :: float:: FloatCompressor ;
1515pub use crate :: integer:: IntCompressor ;
@@ -39,6 +39,8 @@ impl Default for GenerateStatsOptions {
3939 }
4040}
4141
42+ const SAMPLE_SIZE : u32 = 64 ;
43+
4244/// Stats for the compressor.
4345pub trait CompressorStats : Debug + Clone {
4446 type ArrayType : Array ;
@@ -52,11 +54,11 @@ pub trait CompressorStats: Debug + Clone {
5254
5355 fn source ( & self ) -> & Self :: ArrayType ;
5456
55- fn sample ( & self , sample_size : u16 , sample_count : u16 ) -> Self {
57+ fn sample ( & self , sample_size : u32 , sample_count : u32 ) -> Self {
5658 self . sample_opts ( sample_size, sample_count, GenerateStatsOptions :: default ( ) )
5759 }
5860
59- fn sample_opts ( & self , sample_size : u16 , sample_count : u16 , opts : GenerateStatsOptions ) -> Self ;
61+ fn sample_opts ( & self , sample_size : u32 , sample_count : u32 , opts : GenerateStatsOptions ) -> Self ;
6062}
6163
6264/// Top-level compression scheme trait.
@@ -127,7 +129,16 @@ pub fn estimate_compression_ratio_with_sampling<T: Scheme + ?Sized>(
127129 let sample = if is_sample {
128130 stats. clone ( )
129131 } else {
130- stats. sample ( 64 , 10 )
132+ // Number of values in the source array that is about to be sampled.
133+ let source_len = stats. source ( ) . len ( ) ;
134+
135+ // Sample about 1% of the data, but never fewer than 10 chunks of SAMPLE_SIZE (64) values, i.e. at least 640 values.
136+ let sample_count = usize:: max (
137+ ( source_len / 100 ) / usize:: try_from ( SAMPLE_SIZE ) . vortex_unwrap ( ) ,
138+ 10 ,
139+ ) ;
140+
141+ stats. sample ( SAMPLE_SIZE , sample_count. try_into ( ) . vortex_unwrap ( ) )
131142 } ;
132143
133144 let after = compressor
0 commit comments