11use std:: any:: Any ;
2- use std:: sync:: Arc ;
2+ use std:: sync:: { Arc , RwLock } ;
33
44use arrow_schema:: { Schema , SchemaRef } ;
55use async_trait:: async_trait;
@@ -9,18 +9,25 @@ use datafusion::datasource::physical_plan::{FileScanConfig, FileSinkConfig};
99use datafusion:: execution:: SessionState ;
1010use datafusion_common:: parsers:: CompressionTypeVariant ;
1111use datafusion_common:: stats:: Precision ;
12- use datafusion_common:: { not_impl_err, DataFusionError , Result as DFResult , Statistics } ;
12+ use datafusion_common:: {
13+ not_impl_err, ColumnStatistics , DataFusionError , Result as DFResult , Statistics ,
14+ } ;
1315use datafusion_expr:: Expr ;
1416use datafusion_physical_expr:: { LexRequirement , PhysicalExpr } ;
1517use datafusion_physical_plan:: metrics:: ExecutionPlanMetricsSet ;
1618use datafusion_physical_plan:: ExecutionPlan ;
1719use object_store:: { ObjectMeta , ObjectStore } ;
1820use vortex_array:: arrow:: infer_schema;
1921use vortex_array:: Context ;
20- use vortex_file:: { read_initial_bytes, VORTEX_FILE_EXTENSION } ;
21- use vortex_io:: ObjectStoreReadAt ;
22+ use vortex_file:: metadata:: MetadataFetcher ;
23+ use vortex_file:: {
24+ read_initial_bytes, LayoutContext , LayoutDeserializer , LayoutMessageCache , RelativeLayoutCache ,
25+ Scan , VORTEX_FILE_EXTENSION ,
26+ } ;
27+ use vortex_io:: { IoDispatcher , ObjectStoreReadAt } ;
2228
2329use super :: execution:: VortexExec ;
30+ use super :: statistics:: array_to_col_statistics;
2431use crate :: can_be_pushed_down;
2532
2633#[ derive( Debug , Default ) ]
@@ -86,13 +93,48 @@ impl FileFormat for VortexFormat {
8693 let os_read_at = ObjectStoreReadAt :: new ( store. clone ( ) , object. location . clone ( ) ) ;
8794 let initial_read = read_initial_bytes ( & os_read_at, object. size as u64 ) . await ?;
8895 let layout = initial_read. fb_layout ( ) ?;
96+ let dtype = initial_read. lazy_dtype ( ) . map_err ( |e| {
97+ DataFusionError :: External ( Box :: new (
98+ e. with_context ( "Failed to fetch dtype from initial read" ) ,
99+ ) )
100+ } ) ?;
89101 let row_count = layout. row_count ( ) ;
90102
91- let stats = Statistics {
92- num_rows : Precision :: Exact ( row_count as usize ) ,
93- total_byte_size : Precision :: Absent ,
94- column_statistics : Statistics :: unknown_column ( & table_schema) ,
95- } ;
103+ let layout_deserializer =
104+ LayoutDeserializer :: new ( Context :: default ( ) . into ( ) , LayoutContext :: default ( ) . into ( ) ) ;
105+ let layout_message_cache = Arc :: new ( RwLock :: new ( LayoutMessageCache :: new ( ) ) ) ;
106+ let relative_message_cache =
107+ RelativeLayoutCache :: new ( layout_message_cache. clone ( ) , dtype. into ( ) ) ;
108+
109+ let root_layout = vortex_file:: read_layout_from_initial (
110+ & initial_read,
111+ & layout_deserializer,
112+ Scan :: empty ( ) ,
113+ relative_message_cache,
114+ ) ?;
115+
116+ let io = IoDispatcher :: default ( ) ;
117+ let mut stats = Statistics :: new_unknown ( & table_schema) ;
118+ stats. num_rows = Precision :: Exact ( row_count as usize ) ;
119+
120+ let metadata_table =
121+ MetadataFetcher :: fetch ( os_read_at, io. into ( ) , root_layout, layout_message_cache)
122+ . await ?;
123+
124+ if let Some ( metadata) = metadata_table {
125+ let mut column_statistics = Vec :: with_capacity ( table_schema. fields ( ) . len ( ) ) ;
126+
127+ for col_stats in metadata. into_iter ( ) {
128+ let col_stats = match col_stats {
129+ Some ( array) => array_to_col_statistics ( array. try_into ( ) ?) ?,
130+ None => ColumnStatistics :: new_unknown ( ) ,
131+ } ;
132+
133+ column_statistics. push ( col_stats) ;
134+ }
135+
136+ stats. column_statistics = column_statistics;
137+ }
96138
97139 Ok ( stats)
98140 }
0 commit comments