11use  std:: cmp:: min; 
2- use  std :: ops :: Not ; 
2+ 
33use  std:: sync:: Arc ; 
44use  std:: time:: { SystemTime ,  UNIX_EPOCH } ; 
55use  std:: { collections:: HashMap ,  ops:: AddAssign } ; 
66
77use  delta_kernel:: expressions:: Scalar ; 
88use  indexmap:: IndexMap ; 
99use  itertools:: Itertools ; 
10- use  parquet :: basic :: Type ; 
10+ 
1111use  parquet:: file:: metadata:: ParquetMetaData ; 
1212use  parquet:: format:: FileMetaData ; 
1313use  parquet:: schema:: types:: { ColumnDescriptor ,  SchemaDescriptor } ; 
@@ -16,7 +16,6 @@ use parquet::{
1616    file:: { metadata:: RowGroupMetaData ,  statistics:: Statistics } , 
1717    format:: TimeUnit , 
1818} ; 
19- use  tracing:: warn; 
2019
2120use  super :: * ; 
2221use  crate :: kernel:: { scalars:: ScalarExt ,  Add } ; 
@@ -190,19 +189,10 @@ fn stats_from_metadata(
190189        let  maybe_stats:  Option < AggregatedStats >  = row_group_metadata
191190            . iter ( ) 
192191            . flat_map ( |g| { 
193-                 g. column ( idx) . statistics ( ) . into_iter ( ) . filter_map ( |s| { 
194-                     let  is_binary = matches ! ( & column_descr. physical_type( ) ,  Type :: BYTE_ARRAY ) 
195-                         && matches ! ( column_descr. logical_type( ) ,  Some ( LogicalType :: String ) ) . not ( ) ; 
196-                     if  is_binary { 
197-                         warn ! ( 
198-                             "Skipping column {} because it's a binary field." , 
199-                             & column_descr. name( ) . to_string( ) 
200-                         ) ; 
201-                         None 
202-                     }  else  { 
203-                         Some ( AggregatedStats :: from ( ( s,  & column_descr. logical_type ( ) ) ) ) 
204-                     } 
205-                 } ) 
192+                 g. column ( idx) 
193+                     . statistics ( ) 
194+                     . into_iter ( ) 
195+                     . map ( |s| AggregatedStats :: from ( ( s,  & column_descr. logical_type ( ) ) ) ) 
206196            } ) 
207197            . reduce ( |mut  left,  right| { 
208198                left += right; 
@@ -316,7 +306,6 @@ impl StatsScalar {
316306                } 
317307                . unwrap_or_default ( ) ; 
318308                match  logical_type { 
319-                     None  => Ok ( Self :: Bytes ( bytes. to_vec ( ) ) ) , 
320309                    Some ( LogicalType :: String )  => { 
321310                        Ok ( Self :: String ( String :: from_utf8 ( bytes. to_vec ( ) ) . map_err ( 
322311                            |_| DeltaWriterError :: StatsParsingFailed  { 
@@ -325,10 +314,7 @@ impl StatsScalar {
325314                            } , 
326315                        ) ?) ) 
327316                    } 
328-                     _ => Err ( DeltaWriterError :: StatsParsingFailed  { 
329-                         debug_value :  format ! ( "{bytes:?}" ) , 
330-                         logical_type :  logical_type. clone ( ) , 
331-                     } ) , 
317+                     _ => Ok ( Self :: Bytes ( bytes. to_vec ( ) ) ) , 
332318                } 
333319            } 
334320            ( Statistics :: FixedLenByteArray ( v) ,  Some ( LogicalType :: Decimal  {  scale,  precision } ) )  => { 
@@ -384,6 +370,17 @@ impl StatsScalar {
384370                let  val = uuid:: Uuid :: from_bytes ( bytes) ; 
385371                Ok ( Self :: Uuid ( val) ) 
386372            } 
373+             ( Statistics :: FixedLenByteArray ( v) ,  None )  => { 
374+                 let  bytes = if  use_min { 
375+                     v. min_bytes_opt ( ) 
376+                 }  else  { 
377+                     v. max_bytes_opt ( ) 
378+                 } 
379+                 . unwrap_or_default ( ) ; 
380+ 
381+                 Ok ( Self :: Bytes ( bytes. to_vec ( ) ) ) 
382+             } 
383+             // TODO other fixed binary column types 
387384            ( stats,  _)  => Err ( DeltaWriterError :: StatsParsingFailed  { 
388385                debug_value :  format ! ( "{stats:?}" ) , 
389386                logical_type :  logical_type. clone ( ) , 
@@ -798,6 +795,22 @@ mod tests {
798795                Some ( LogicalType :: Uuid ) , 
799796                Value :: from ( "c2e8c7f7-d1f9-4b49-a5d9-4bfe75c317e2" ) , 
800797            ) , 
798+             ( 
799+                 simple_parquet_stat ! ( 
800+                     Statistics :: ByteArray , 
801+                     ByteArray :: from( b"\x00 \x00 \x01 \x02 " . to_vec( ) ) 
802+                 ) , 
803+                 None , 
804+                 Value :: from ( "\\ x00\\ x00\\ x01\\ x02" ) , 
805+             ) , 
806+             ( 
807+                 simple_parquet_stat ! ( 
808+                     Statistics :: FixedLenByteArray , 
809+                     FixedLenByteArray :: from( b"\x00 \x00 \x01 \x02 " . to_vec( ) ) 
810+                 ) , 
811+                 None , 
812+                 Value :: from ( "\\ x00\\ x00\\ x01\\ x02" ) , 
813+             ) , 
801814        ] ; 
802815
803816        for  ( stats,  logical_type,  expected)  in  cases { 
@@ -880,6 +893,12 @@ mod tests {
880893                ( "uuid" ,  ColumnValueStat :: Value ( v) )  => { 
881894                    assert_eq ! ( "176c770d-92af-4a21-bf76-5d8c5261d659" ,  v. as_str( ) . unwrap( ) ) 
882895                } 
896+                 ( "binary" ,  ColumnValueStat :: Value ( v) )  => { 
897+                     assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x04" ,  v. as_str( ) . unwrap( ) ) 
898+                 } 
899+                 ( "fixed_binary" ,  ColumnValueStat :: Value ( v) )  => { 
900+                     assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03" ,  v. as_str( ) . unwrap( ) ) 
901+                 } 
883902                k => panic ! ( "Key {k:?} should not be present in min_values" ) , 
884903            } 
885904        } 
@@ -911,6 +930,12 @@ mod tests {
911930                ( "uuid" ,  ColumnValueStat :: Value ( v) )  => { 
912931                    assert_eq ! ( "a98bea04-d119-4f21-8edc-eb218b5849af" ,  v. as_str( ) . unwrap( ) ) 
913932                } 
933+                 ( "binary" ,  ColumnValueStat :: Value ( v) )  => { 
934+                     assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x05" ,  v. as_str( ) . unwrap( ) ) 
935+                 } 
936+                 ( "fixed_binary" ,  ColumnValueStat :: Value ( v) )  => { 
937+                     assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x04" ,  v. as_str( ) . unwrap( ) ) 
938+                 } 
914939                k => panic ! ( "Key {k:?} should not be present in max_values" ) , 
915940            } 
916941        } 
@@ -938,6 +963,8 @@ mod tests {
938963                ( "some_nested_list" ,  ColumnCountStat :: Value ( v) )  => assert_eq ! ( 100 ,  * v) , 
939964                ( "date" ,  ColumnCountStat :: Value ( v) )  => assert_eq ! ( 0 ,  * v) , 
940965                ( "uuid" ,  ColumnCountStat :: Value ( v) )  => assert_eq ! ( 0 ,  * v) , 
966+                 ( "binary" ,  ColumnCountStat :: Value ( v) )  => assert_eq ! ( 100 ,  * v) , 
967+                 ( "fixed_binary" ,  ColumnCountStat :: Value ( v) )  => assert_eq ! ( 100 ,  * v) , 
941968                k => panic ! ( "Key {k:?} should not be present in null_count" ) , 
942969            } 
943970        } 
@@ -1089,6 +1116,8 @@ mod tests {
10891116                "some_nested_list" :  [ [ 42 ] ,  [ 84 ] ] , 
10901117                "date" :  "2021-06-22" , 
10911118                "uuid" :  "176c770d-92af-4a21-bf76-5d8c5261d659" , 
1119+                 "binary" :  "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x04" , 
1120+                 "fixed_binary" :  "\\ x00\\ x00\\ x01\\ x02\\ x03" , 
10921121            } ) , 
10931122            100 , 
10941123        ) 
@@ -1111,6 +1140,8 @@ mod tests {
11111140                "some_nested_list" :  [ [ 42 ] ,  [ 84 ] ] , 
11121141                "date" :  "2021-06-22" , 
11131142                "uuid" :  "54f3e867-3f7b-4122-a452-9d74fb4fe1ba" , 
1143+                 "binary" :  "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x05" , 
1144+                 "fixed_binary" :  "\\ x00\\ x00\\ x01\\ x02\\ x04" , 
11141145            } ) , 
11151146            100 , 
11161147        ) ) 
0 commit comments