@@ -31,9 +31,7 @@ use arrow::{
3131 buffer:: Buffer ,
3232 datatypes:: { ArrowNativeType , DataType , Field , Schema , SchemaRef , UInt16Type } ,
3333} ;
34- use datafusion_common:: {
35- exec_err, stats:: Precision , ColumnStatistics , Constraints , Result , Statistics ,
36- } ;
34+ use datafusion_common:: { exec_err, ColumnStatistics , Constraints , Result , Statistics } ;
3735use datafusion_common:: { DataFusionError , ScalarValue } ;
3836use datafusion_execution:: {
3937 object_store:: ObjectStoreUrl , SendableRecordBatchStream , TaskContext ,
@@ -86,20 +84,22 @@ use crate::{
8684/// # Field::new("c4", DataType::Int32, false),
8785/// # ]));
8886/// # // Note: crate mock ParquetSource, as ParquetSource is not in the datasource crate
89- /// # struct ParquetSource {};
87+ /// # struct ParquetSource {
88+ /// # projected_statistics: Option<Statistics>
89+ /// # };
9090/// # impl FileSource for ParquetSource {
9191/// # fn create_file_opener(&self, _: Arc<dyn ObjectStore>, _: &FileScanConfig, _: usize) -> Arc<dyn FileOpener> { unimplemented!() }
9292/// # fn as_any(&self) -> &dyn Any { self }
9393/// # fn with_batch_size(&self, _: usize) -> Arc<dyn FileSource> { unimplemented!() }
9494/// # fn with_schema(&self, _: SchemaRef) -> Arc<dyn FileSource> { unimplemented!() }
9595/// # fn with_projection(&self, _: &FileScanConfig) -> Arc<dyn FileSource> { unimplemented!() }
96- /// # fn with_statistics(&self, _ : Statistics) -> Arc<dyn FileSource> { Arc::new(Self::new() ) }
96+ /// # fn with_statistics(&self, statistics : Statistics) -> Arc<dyn FileSource> { Arc::new(Self {projected_statistics: Some(statistics)} ) }
9797/// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() }
98- /// # fn statistics(&self) -> datafusion_common::Result<Statistics> { unimplemented!( ) }
98+ /// # fn statistics(&self) -> datafusion_common::Result<Statistics> { Ok(self.projected_statistics.clone().expect("projected_statistics should be set") ) }
9999/// # fn file_type(&self) -> &str { "parquet" }
100100/// # }
101101/// # impl ParquetSource {
102- /// # fn new() -> Self { Self{ } }
102+ /// # fn new() -> Self { Self {projected_statistics: None } }
103103/// # }
104104/// // create FileScan config for reading parquet files from file://
105105/// let object_store_url = ObjectStoreUrl::local_filesystem();
@@ -244,7 +244,7 @@ impl DataSource for FileScanConfig {
244244 }
245245
246246 fn statistics ( & self ) -> Result < Statistics > {
247- self . file_source . statistics ( )
247+ Ok ( self . projected_stats ( ) )
248248 }
249249
250250 fn with_fetch ( & self , limit : Option < usize > ) -> Option < Arc < dyn DataSource > > {
@@ -324,13 +324,7 @@ impl FileScanConfig {
324324
325325 /// Set the file source
326326 pub fn with_source ( mut self , file_source : Arc < dyn FileSource > ) -> Self {
327- let (
328- _projected_schema,
329- _constraints,
330- projected_statistics,
331- _projected_output_ordering,
332- ) = self . project ( ) ;
333- self . file_source = file_source. with_statistics ( projected_statistics) ;
327+ self . file_source = file_source. with_statistics ( self . statistics . clone ( ) ) ;
334328 self
335329 }
336330
@@ -342,10 +336,75 @@ impl FileScanConfig {
342336
343337 /// Set the statistics of the files
344338 pub fn with_statistics ( mut self , statistics : Statistics ) -> Self {
345- self . statistics = statistics;
339+ self . statistics = statistics. clone ( ) ;
340+ self . file_source = self . file_source . with_statistics ( statistics) ;
346341 self
347342 }
348343
344+ fn projection_indices ( & self ) -> Vec < usize > {
345+ match & self . projection {
346+ Some ( proj) => proj. clone ( ) ,
347+ None => ( 0 ..self . file_schema . fields ( ) . len ( )
348+ + self . table_partition_cols . len ( ) )
349+ . collect ( ) ,
350+ }
351+ }
352+
353+ fn projected_stats ( & self ) -> Statistics {
354+ let statistics = self
355+ . file_source
356+ . statistics ( )
357+ . unwrap_or ( self . statistics . clone ( ) ) ;
358+
359+ let table_cols_stats = self
360+ . projection_indices ( )
361+ . into_iter ( )
362+ . map ( |idx| {
363+ if idx < self . file_schema . fields ( ) . len ( ) {
364+ statistics. column_statistics [ idx] . clone ( )
365+ } else {
366+ // TODO provide accurate stat for partition column (#1186)
367+ ColumnStatistics :: new_unknown ( )
368+ }
369+ } )
370+ . collect ( ) ;
371+
372+ Statistics {
373+ num_rows : statistics. num_rows ,
374+ // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
375+ total_byte_size : statistics. total_byte_size ,
376+ column_statistics : table_cols_stats,
377+ }
378+ }
379+
380+ fn projected_schema ( & self ) -> Arc < Schema > {
381+ let table_fields: Vec < _ > = self
382+ . projection_indices ( )
383+ . into_iter ( )
384+ . map ( |idx| {
385+ if idx < self . file_schema . fields ( ) . len ( ) {
386+ self . file_schema . field ( idx) . clone ( )
387+ } else {
388+ let partition_idx = idx - self . file_schema . fields ( ) . len ( ) ;
389+ self . table_partition_cols [ partition_idx] . clone ( )
390+ }
391+ } )
392+ . collect ( ) ;
393+
394+ Arc :: new ( Schema :: new_with_metadata (
395+ table_fields,
396+ self . file_schema . metadata ( ) . clone ( ) ,
397+ ) )
398+ }
399+
400+ fn projected_constraints ( & self ) -> Constraints {
401+ let indexes = self . projection_indices ( ) ;
402+
403+ self . constraints
404+ . project ( & indexes)
405+ . unwrap_or_else ( Constraints :: empty)
406+ }
407+
349408 /// Set the projection of the files
350409 pub fn with_projection ( mut self , projection : Option < Vec < usize > > ) -> Self {
351410 self . projection = projection;
@@ -433,54 +492,13 @@ impl FileScanConfig {
433492 ) ;
434493 }
435494
436- let proj_indices = if let Some ( proj) = & self . projection {
437- proj
438- } else {
439- let len = self . file_schema . fields ( ) . len ( ) + self . table_partition_cols . len ( ) ;
440- & ( 0 ..len) . collect :: < Vec < _ > > ( )
441- } ;
442-
443- let mut table_fields = vec ! [ ] ;
444- let mut table_cols_stats = vec ! [ ] ;
445- for idx in proj_indices {
446- if * idx < self . file_schema . fields ( ) . len ( ) {
447- let field = self . file_schema . field ( * idx) ;
448- table_fields. push ( field. clone ( ) ) ;
449- table_cols_stats. push ( self . statistics . column_statistics [ * idx] . clone ( ) )
450- } else {
451- let partition_idx = idx - self . file_schema . fields ( ) . len ( ) ;
452- table_fields. push ( self . table_partition_cols [ partition_idx] . to_owned ( ) ) ;
453- // TODO provide accurate stat for partition column (#1186)
454- table_cols_stats. push ( ColumnStatistics :: new_unknown ( ) )
455- }
456- }
457-
458- let table_stats = Statistics {
459- num_rows : self . statistics . num_rows ,
460- // TODO correct byte size?
461- total_byte_size : Precision :: Absent ,
462- column_statistics : table_cols_stats,
463- } ;
464-
465- let projected_schema = Arc :: new ( Schema :: new_with_metadata (
466- table_fields,
467- self . file_schema . metadata ( ) . clone ( ) ,
468- ) ) ;
495+ let schema = self . projected_schema ( ) ;
496+ let constraints = self . projected_constraints ( ) ;
497+ let stats = self . projected_stats ( ) ;
469498
470- let projected_constraints = self
471- . constraints
472- . project ( proj_indices)
473- . unwrap_or_else ( Constraints :: empty) ;
499+ let output_ordering = get_projected_output_ordering ( self , & schema) ;
474500
475- let projected_output_ordering =
476- get_projected_output_ordering ( self , & projected_schema) ;
477-
478- (
479- projected_schema,
480- projected_constraints,
481- table_stats,
482- projected_output_ordering,
483- )
501+ ( schema, constraints, stats, output_ordering)
484502 }
485503
486504 #[ cfg_attr( not( feature = "avro" ) , allow( unused) ) ] // Only used by avro
@@ -1048,6 +1066,7 @@ mod tests {
10481066 compute:: SortOptions ,
10491067 } ;
10501068
1069+ use datafusion_common:: stats:: Precision ;
10511070 use datafusion_common:: { assert_batches_eq, DFSchema } ;
10521071 use datafusion_expr:: { execution_props:: ExecutionProps , SortExpr } ;
10531072 use datafusion_physical_expr:: create_physical_expr;
@@ -1203,6 +1222,12 @@ mod tests {
12031222 ) ,
12041223 ] ;
12051224 // create a projected schema
1225+ let statistics = Statistics {
1226+ num_rows : Precision :: Inexact ( 3 ) ,
1227+ total_byte_size : Precision :: Absent ,
1228+ column_statistics : Statistics :: unknown_column ( & file_batch. schema ( ) ) ,
1229+ } ;
1230+
12061231 let conf = config_for_projection (
12071232 file_batch. schema ( ) ,
12081233 // keep all cols from file and 2 from partitioning
@@ -1213,9 +1238,23 @@ mod tests {
12131238 file_batch. schema( ) . fields( ) . len( ) ,
12141239 file_batch. schema( ) . fields( ) . len( ) + 2 ,
12151240 ] ) ,
1216- Statistics :: new_unknown ( & file_batch . schema ( ) ) ,
1241+ statistics . clone ( ) ,
12171242 to_partition_cols ( partition_cols. clone ( ) ) ,
12181243 ) ;
1244+
1245+ let source_statistics = conf. file_source . statistics ( ) . unwrap ( ) ;
1246+ let conf_stats = conf. statistics ( ) . unwrap ( ) ;
1247+
1248+ // projection should be reflected in the file source statistics
1249+ assert_eq ! ( conf_stats. num_rows, Precision :: Inexact ( 3 ) ) ;
1250+
1251+ // 3 original statistics + 2 partition statistics
1252+ assert_eq ! ( conf_stats. column_statistics. len( ) , 5 ) ;
1253+
1254+ // file statics should not be modified
1255+ assert_eq ! ( source_statistics, statistics) ;
1256+ assert_eq ! ( source_statistics. column_statistics. len( ) , 3 ) ;
1257+
12191258 let ( proj_schema, ..) = conf. project ( ) ;
12201259 // created a projector for that projected schema
12211260 let mut proj = PartitionColumnProjector :: new (
0 commit comments