11use vortex_array:: aliases:: hash_set:: HashSet ;
2- use vortex_array:: arrays:: { ConstantArray , VarBinViewArray } ;
3- use vortex_array:: stats:: Precision ;
4- use vortex_array:: stats:: Stat :: IsConstant ;
5- use vortex_array:: { Array , ArrayRef , ArrayStatistics , ToCanonical } ;
2+ use vortex_array:: arrays:: VarBinViewArray ;
3+ use vortex_array:: { Array , ArrayRef , ToCanonical } ;
64use vortex_dict:: DictArray ;
75use vortex_dict:: builders:: dict_encode;
8- use vortex_error:: { VortexExpect , VortexResult , vortex_bail } ;
6+ use vortex_error:: { VortexExpect , VortexResult } ;
97use vortex_fsst:: { fsst_compress, fsst_train_compressor} ;
108
119use crate :: downscale:: downscale_integer_array;
@@ -19,35 +17,28 @@ use crate::{
1917#[ derive( Clone , Debug ) ]
2018pub struct StringStats {
2119 src : VarBinViewArray ,
22- exact_distinct_count : Option < u32 > ,
2320 estimated_distinct_count : u32 ,
2421 value_count : u32 ,
25- null_count : u32 ,
22+ // null_count: u32,
2623}
2724
2825/// Estimate the number of distinct strings in the var bin view array.
29- /// If all are inlined then is count is exact and true is returned.
3026#[ allow( clippy:: cast_possible_truncation) ]
31- fn estimate_distinct_count ( strings : & VarBinViewArray ) -> ( u32 , bool ) {
27+ fn estimate_distinct_count ( strings : & VarBinViewArray ) -> u32 {
3228 let views = strings. views ( ) ;
3329 // Iterate the views. Two strings which are equal must have the same first 8-bytes.
3430 // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
3531 // share a 4-byte prefix and have the same length.
3632 let mut distinct = HashSet :: with_capacity ( views. len ( ) / 2 ) ;
37- let mut all_inlined = true ;
3833 views. iter ( ) . for_each ( |& view| {
3934 let len_and_prefix = view. as_u128 ( ) as u64 ;
4035 distinct. insert ( len_and_prefix) ;
41- all_inlined &= view. is_inlined ( ) ;
4236 } ) ;
4337
44- (
45- distinct
46- . len ( )
47- . try_into ( )
48- . vortex_expect ( "distinct count must fit in u32" ) ,
49- all_inlined,
50- )
38+ distinct
39+ . len ( )
40+ . try_into ( )
41+ . vortex_expect ( "distinct count must fit in u32" )
5142}
5243
5344impl CompressorStats for StringStats {
@@ -59,22 +50,17 @@ impl CompressorStats for StringStats {
5950 . compute_null_count ( )
6051 . vortex_expect ( "null count" ) ;
6152 let value_count = input. len ( ) - null_count;
62- let ( estimated_distinct, is_exact ) = if opts. count_distinct_values {
53+ let estimated_distinct = if opts. count_distinct_values {
6354 estimate_distinct_count ( input)
6455 } else {
65- ( u32:: MAX , false )
56+ u32:: MAX
6657 } ;
6758
68- if is_exact && estimated_distinct == 1 && null_count == 0 {
69- input. statistics ( ) . set ( IsConstant , Precision :: exact ( true ) ) ;
70- }
71-
7259 Self {
7360 src : input. clone ( ) ,
7461 value_count : value_count. try_into ( ) . vortex_expect ( "value_count" ) ,
75- null_count : null_count. try_into ( ) . vortex_expect ( "null_count" ) ,
62+ // null_count: null_count.try_into().vortex_expect("null_count"),
7663 estimated_distinct_count : estimated_distinct,
77- exact_distinct_count : is_exact. then_some ( estimated_distinct) ,
7864 }
7965 }
8066
@@ -99,12 +85,7 @@ impl Compressor for StringCompressor {
9985 type StatsType = StringStats ;
10086
10187 fn schemes ( ) -> & ' static [ & ' static Self :: SchemeType ] {
102- & [
103- & UncompressedScheme ,
104- & ConstantScheme ,
105- & DictScheme ,
106- & FSSTScheme ,
107- ]
88+ & [ & UncompressedScheme , & DictScheme , & FSSTScheme ]
10889 }
10990
11091 fn default_scheme ( ) -> & ' static Self :: SchemeType {
@@ -123,9 +104,6 @@ impl<T> StringScheme for T where T: Scheme<StatsType = StringStats, CodeType = S
123104#[ derive( Debug , Copy , Clone ) ]
124105pub struct UncompressedScheme ;
125106
126- #[ derive( Debug , Copy , Clone ) ]
127- pub struct ConstantScheme ;
128-
129107#[ derive( Debug , Copy , Clone ) ]
130108pub struct DictScheme ;
131109
@@ -138,7 +116,6 @@ pub struct StringCode(u8);
138116const UNCOMPRESSED_SCHEME : StringCode = StringCode ( 0 ) ;
139117const DICT_SCHEME : StringCode = StringCode ( 1 ) ;
140118const FSST_SCHEME : StringCode = StringCode ( 2 ) ;
141- const CONSTANT_SCHEME : StringCode = StringCode ( 3 ) ;
142119
143120impl Scheme for UncompressedScheme {
144121 type StatsType = StringStats ;
@@ -169,62 +146,6 @@ impl Scheme for UncompressedScheme {
169146 }
170147}
171148
172- impl Scheme for ConstantScheme {
173- type StatsType = StringStats ;
174- type CodeType = StringCode ;
175-
176- fn code ( & self ) -> Self :: CodeType {
177- CONSTANT_SCHEME
178- }
179-
180- fn is_constant ( & self ) -> bool {
181- true
182- }
183-
184- fn expected_compression_ratio (
185- & self ,
186- stats : & StringStats ,
187- is_sample : bool ,
188- _allowed_cascading : usize ,
189- _excludes : & [ Self :: CodeType ] ,
190- ) -> VortexResult < f64 > {
191- // Never yield ConstantScheme for a sample, it could be a false-positive.
192- if is_sample {
193- return Ok ( 0.0 ) ;
194- }
195-
196- // Only arrays with one distinct values can be constant compressed.
197- if stats. exact_distinct_count . unwrap_or ( stats. value_count ) != 1 {
198- return Ok ( 0.0 ) ;
199- }
200-
201- // Cannot have mix of nulls and non-nulls
202- if stats. null_count > 0 && stats. value_count > 0 {
203- return Ok ( 0.0 ) ;
204- }
205-
206- Ok ( stats. value_count as f64 )
207- }
208-
209- fn compress (
210- & self ,
211- stats : & Self :: StatsType ,
212- _is_sample : bool ,
213- _allowed_cascading : usize ,
214- _excludes : & [ Self :: CodeType ] ,
215- ) -> VortexResult < ArrayRef > {
216- if stats. source ( ) . statistics ( ) . get_as :: < bool > ( IsConstant ) != Some ( Precision :: exact ( true ) ) {
217- vortex_bail ! ( "array was thought to be constant, but it is not" ) ;
218- }
219- let scalar = stats
220- . source ( )
221- . as_constant ( )
222- . vortex_expect ( "constant array expected" ) ;
223-
224- Ok ( ConstantArray :: new ( scalar, stats. src . len ( ) ) . into_array ( ) )
225- }
226- }
227-
228149impl Scheme for DictScheme {
229150 type StatsType = StringStats ;
230151 type CodeType = StringCode ;
0 commit comments