11use std:: cmp:: min;
2- use std :: ops :: Not ;
2+
33use std:: sync:: Arc ;
44use std:: time:: { SystemTime , UNIX_EPOCH } ;
55use std:: { collections:: HashMap , ops:: AddAssign } ;
66
77use delta_kernel:: expressions:: Scalar ;
88use indexmap:: IndexMap ;
99use itertools:: Itertools ;
10- use parquet :: basic :: Type ;
10+
1111use parquet:: file:: metadata:: ParquetMetaData ;
1212use parquet:: format:: FileMetaData ;
1313use parquet:: schema:: types:: { ColumnDescriptor , SchemaDescriptor } ;
@@ -16,7 +16,6 @@ use parquet::{
1616 file:: { metadata:: RowGroupMetaData , statistics:: Statistics } ,
1717 format:: TimeUnit ,
1818} ;
19- use tracing:: warn;
2019
2120use super :: * ;
2221use crate :: kernel:: { scalars:: ScalarExt , Add } ;
@@ -190,19 +189,10 @@ fn stats_from_metadata(
190189 let maybe_stats: Option < AggregatedStats > = row_group_metadata
191190 . iter ( )
192191 . flat_map ( |g| {
193- g. column ( idx) . statistics ( ) . into_iter ( ) . filter_map ( |s| {
194- let is_binary = matches ! ( & column_descr. physical_type( ) , Type :: BYTE_ARRAY )
195- && matches ! ( column_descr. logical_type( ) , Some ( LogicalType :: String ) ) . not ( ) ;
196- if is_binary {
197- warn ! (
198- "Skipping column {} because it's a binary field." ,
199- & column_descr. name( ) . to_string( )
200- ) ;
201- None
202- } else {
203- Some ( AggregatedStats :: from ( ( s, & column_descr. logical_type ( ) ) ) )
204- }
205- } )
192+ g. column ( idx)
193+ . statistics ( )
194+ . into_iter ( )
195+ . map ( |s| AggregatedStats :: from ( ( s, & column_descr. logical_type ( ) ) ) )
206196 } )
207197 . reduce ( |mut left, right| {
208198 left += right;
@@ -316,7 +306,6 @@ impl StatsScalar {
316306 }
317307 . unwrap_or_default ( ) ;
318308 match logical_type {
319- None => Ok ( Self :: Bytes ( bytes. to_vec ( ) ) ) ,
320309 Some ( LogicalType :: String ) => {
321310 Ok ( Self :: String ( String :: from_utf8 ( bytes. to_vec ( ) ) . map_err (
322311 |_| DeltaWriterError :: StatsParsingFailed {
@@ -325,10 +314,7 @@ impl StatsScalar {
325314 } ,
326315 ) ?) )
327316 }
328- _ => Err ( DeltaWriterError :: StatsParsingFailed {
329- debug_value : format ! ( "{bytes:?}" ) ,
330- logical_type : logical_type. clone ( ) ,
331- } ) ,
317+ _ => Ok ( Self :: Bytes ( bytes. to_vec ( ) ) ) ,
332318 }
333319 }
334320 ( Statistics :: FixedLenByteArray ( v) , Some ( LogicalType :: Decimal { scale, precision } ) ) => {
@@ -384,6 +370,17 @@ impl StatsScalar {
384370 let val = uuid:: Uuid :: from_bytes ( bytes) ;
385371 Ok ( Self :: Uuid ( val) )
386372 }
373+ ( Statistics :: FixedLenByteArray ( v) , None ) => {
374+ let bytes = if use_min {
375+ v. min_bytes_opt ( )
376+ } else {
377+ v. max_bytes_opt ( )
378+ }
379+ . unwrap_or_default ( ) ;
380+
381+ Ok ( Self :: Bytes ( bytes. to_vec ( ) ) )
382+ }
383+ // TODO other fixed binary column types
387384 ( stats, _) => Err ( DeltaWriterError :: StatsParsingFailed {
388385 debug_value : format ! ( "{stats:?}" ) ,
389386 logical_type : logical_type. clone ( ) ,
@@ -798,6 +795,22 @@ mod tests {
798795 Some ( LogicalType :: Uuid ) ,
799796 Value :: from ( "c2e8c7f7-d1f9-4b49-a5d9-4bfe75c317e2" ) ,
800797 ) ,
798+ (
799+ simple_parquet_stat ! (
800+ Statistics :: ByteArray ,
801+ ByteArray :: from( b"\x00 \x00 \x01 \x02 " . to_vec( ) )
802+ ) ,
803+ None ,
804+ Value :: from ( "\\ x00\\ x00\\ x01\\ x02" ) ,
805+ ) ,
806+ (
807+ simple_parquet_stat ! (
808+ Statistics :: FixedLenByteArray ,
809+ FixedLenByteArray :: from( b"\x00 \x00 \x01 \x02 " . to_vec( ) )
810+ ) ,
811+ None ,
812+ Value :: from ( "\\ x00\\ x00\\ x01\\ x02" ) ,
813+ ) ,
801814 ] ;
802815
803816 for ( stats, logical_type, expected) in cases {
@@ -880,6 +893,12 @@ mod tests {
880893 ( "uuid" , ColumnValueStat :: Value ( v) ) => {
881894 assert_eq ! ( "176c770d-92af-4a21-bf76-5d8c5261d659" , v. as_str( ) . unwrap( ) )
882895 }
896+ ( "binary" , ColumnValueStat :: Value ( v) ) => {
897+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x04" , v. as_str( ) . unwrap( ) )
898+ }
899+ ( "fixed_binary" , ColumnValueStat :: Value ( v) ) => {
900+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03" , v. as_str( ) . unwrap( ) )
901+ }
883902 k => panic ! ( "Key {k:?} should not be present in min_values" ) ,
884903 }
885904 }
@@ -911,6 +930,12 @@ mod tests {
911930 ( "uuid" , ColumnValueStat :: Value ( v) ) => {
912931 assert_eq ! ( "a98bea04-d119-4f21-8edc-eb218b5849af" , v. as_str( ) . unwrap( ) )
913932 }
933+ ( "binary" , ColumnValueStat :: Value ( v) ) => {
934+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x05" , v. as_str( ) . unwrap( ) )
935+ }
936+ ( "fixed_binary" , ColumnValueStat :: Value ( v) ) => {
937+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x04" , v. as_str( ) . unwrap( ) )
938+ }
914939 k => panic ! ( "Key {k:?} should not be present in max_values" ) ,
915940 }
916941 }
@@ -938,6 +963,8 @@ mod tests {
938963 ( "some_nested_list" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 100 , * v) ,
939964 ( "date" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 0 , * v) ,
940965 ( "uuid" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 0 , * v) ,
966+ ( "binary" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 100 , * v) ,
967+ ( "fixed_binary" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 100 , * v) ,
941968 k => panic ! ( "Key {k:?} should not be present in null_count" ) ,
942969 }
943970 }
@@ -1089,6 +1116,8 @@ mod tests {
10891116 "some_nested_list" : [ [ 42 ] , [ 84 ] ] ,
10901117 "date" : "2021-06-22" ,
10911118 "uuid" : "176c770d-92af-4a21-bf76-5d8c5261d659" ,
1119+ "binary" : "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x04" ,
1120+ "fixed_binary" : "\\ x00\\ x00\\ x01\\ x02\\ x03" ,
10921121 } ) ,
10931122 100 ,
10941123 )
@@ -1111,6 +1140,8 @@ mod tests {
11111140 "some_nested_list" : [ [ 42 ] , [ 84 ] ] ,
11121141 "date" : "2021-06-22" ,
11131142 "uuid" : "54f3e867-3f7b-4122-a452-9d74fb4fe1ba" ,
1143+ "binary" : "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x05" ,
1144+ "fixed_binary" : "\\ x00\\ x00\\ x01\\ x02\\ x04" ,
11141145 } ) ,
11151146 100 ,
11161147 ) )
0 commit comments