Add row_groups_fully_matched_statistics metric

xudong963 · xudong963 · commit 53ebed796c06 · 2025-11-17T14:40:29.000+08:00
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
@@ -150,6 +150,11 @@ impl TestOutput {
         self.metric_value("row_groups_matched_statistics")
     }
 
+    /// The number of row groups fully matched by statistics (looked up by metric name)
+    fn row_groups_fully_matched_statistics(&self) -> Option<usize> {
+        self.metric_value("row_groups_fully_matched_statistics")
+    }
+
     /// The number of row_groups pruned by statistics
     fn row_groups_pruned_statistics(&self) -> Option<usize> {
         self.metric_value("row_groups_pruned_statistics")
diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -34,6 +34,7 @@ struct RowGroupPruningTest {
     query: String,
     expected_errors: Option<usize>,
     expected_row_group_matched_by_statistics: Option<usize>,
+    expected_row_group_fully_matched_by_statistics: Option<usize>,
     expected_row_group_pruned_by_statistics: Option<usize>,
     expected_files_pruned_by_statistics: Option<usize>,
     expected_row_group_matched_by_bloom_filter: Option<usize>,
@@ -50,6 +51,7 @@ impl RowGroupPruningTest {
             expected_errors: None,
             expected_row_group_matched_by_statistics: None,
             expected_row_group_pruned_by_statistics: None,
+            expected_row_group_fully_matched_by_statistics: None,
             expected_files_pruned_by_statistics: None,
             expected_row_group_matched_by_bloom_filter: None,
             expected_row_group_pruned_by_bloom_filter: None,
@@ -82,6 +84,15 @@ impl RowGroupPruningTest {
         self
     }
 
+    // Set the expected count of row groups fully matched by statistics; returns `self` for chaining
+    fn with_fully_matched_by_stats(
+        mut self,
+        fully_matched_by_stats: Option<usize>,
+    ) -> Self {
+        self.expected_row_group_fully_matched_by_statistics = fully_matched_by_stats;
+        self
+    }
+
     // Set the expected pruned row groups by statistics
     fn with_pruned_by_stats(mut self, pruned_by_stats: Option<usize>) -> Self {
         self.expected_row_group_pruned_by_statistics = pruned_by_stats;
@@ -197,6 +208,11 @@ impl RowGroupPruningTest {
             self.expected_row_group_matched_by_statistics,
             "mismatched row_groups_matched_statistics",
         );
+        assert_eq!(
+            output.row_groups_fully_matched_statistics(),
+            self.expected_row_group_fully_matched_by_statistics,
+            "mismatched row_groups_fully_matched_statistics",
+        );
         assert_eq!(
             output.row_groups_pruned_statistics(),
             self.expected_row_group_pruned_by_statistics,
@@ -1719,8 +1735,24 @@ fn make_i32_batch(
     RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from)
 }
 
+// Helper: builds a batch of two non-nullable Int32 columns; build errors become DataFusionError
+fn make_two_col_i32_batch(
+    name_a: &str,
+    name_b: &str,
+    values_a: Vec<i32>,
+    values_b: Vec<i32>,
+) -> datafusion_common::error::Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(name_a, DataType::Int32, false),
+        Field::new(name_b, DataType::Int32, false),
+    ]));
+    let array_a: ArrayRef = Arc::new(Int32Array::from(values_a));
+    let array_b: ArrayRef = Arc::new(Int32Array::from(values_b));
+    RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from)
+}
+
 #[tokio::test]
-async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
+async fn test_limit_pruning_basic() -> datafusion_common::error::Result<()> {
     // Scenario: Simple integer column, multiple row groups
     // Query: SELECT c1 FROM  t WHERE c1 = 0 LIMIT 2
     // We expect 2 rows in total.
@@ -1754,6 +1786,7 @@ async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
         .with_expected_rows(2)
         .with_pruned_files(Some(0))
         .with_matched_by_stats(Some(4))
+        .with_fully_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
         .with_limit_pruned_row_groups(Some(3))
         .test_row_group_prune_with_custom_data(schema, batches, 2)
@@ -1762,22 +1795,6 @@ async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
     Ok(())
 }
 
-// Helper function to create a batch with two Int32 columns
-fn make_two_col_i32_batch(
-    name_a: &str,
-    name_b: &str,
-    values_a: Vec<i32>,
-    values_b: Vec<i32>,
-) -> datafusion_common::error::Result<RecordBatch> {
-    let schema = Arc::new(Schema::new(vec![
-        Field::new(name_a, DataType::Int32, false),
-        Field::new(name_b, DataType::Int32, false),
-    ]));
-    let array_a: ArrayRef = Arc::new(Int32Array::from(values_a));
-    let array_b: ArrayRef = Arc::new(Int32Array::from(values_b));
-    RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from)
-}
-
 #[tokio::test]
 async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result<()> {
     // Test Case 1: Complex filter with two columns (a = 1 AND b > 1 AND b < 4)
@@ -1815,6 +1832,7 @@ async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result
         .with_expected_rows(5)
         .with_pruned_files(Some(0))
         .with_matched_by_stats(Some(4)) // RG0,1,2,3 are matched
+        .with_fully_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(2)) // RG4,5 are pruned
         .with_limit_pruned_row_groups(Some(2)) // RG0, RG3 is pruned by limit
         .test_row_group_prune_with_custom_data(schema, batches, 3)
@@ -1855,6 +1873,7 @@ async fn test_limit_pruning_multiple_fully_matched(
         .with_expected_rows(8)
         .with_pruned_files(Some(0))
         .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(4))
         .with_pruned_by_stats(Some(1)) // RG4 pruned
         .with_limit_pruned_row_groups(Some(2)) // RG2,3 pruned by limit
         .test_row_group_prune_with_custom_data(schema, batches, 4)
@@ -1894,6 +1913,7 @@ async fn test_limit_pruning_no_fully_matched() -> datafusion_common::error::Resu
         .with_expected_rows(3)
         .with_pruned_files(Some(0))
         .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(0))
         .with_pruned_by_stats(Some(1)) // RG4 pruned
         .with_limit_pruned_row_groups(Some(0)) // RG3 pruned by limit
         .test_row_group_prune_with_custom_data(schema, batches, 3)
@@ -1934,6 +1954,7 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error:
         .with_expected_rows(10) // Total: 1 + 3 + 4 + 1 = 9 (less than limit)
         .with_pruned_files(Some(0))
         .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1)) // RG4 pruned
         .with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs
         .test_row_group_prune_with_custom_data(schema, batches, 4)
diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs
@@ -50,6 +50,8 @@ pub struct ParquetFileMetrics {
     pub row_groups_pruned_bloom_filter: Count,
     /// Number of row groups pruned due to limit pruning.
     pub limit_pruned_row_groups: Count,
+    /// Number of row groups where statistics show every row satisfies the predicate
+    pub row_groups_fully_matched_statistics: Count,
     /// Number of row groups whose statistics were checked and matched (not pruned)
     pub row_groups_matched_statistics: Count,
     /// Number of row groups pruned by statistics
@@ -98,6 +100,10 @@ impl ParquetFileMetrics {
         let limit_pruned_row_groups = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
             .counter("limit_pruned_row_groups", partition);
+
+        let row_groups_fully_matched_statistics = MetricBuilder::new(metrics)
+            .with_new_label("filename", filename.to_string())
+            .counter("row_groups_fully_matched_statistics", partition);
 
         let row_groups_matched_statistics = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
@@ -151,6 +157,7 @@ impl ParquetFileMetrics {
             predicate_evaluation_errors,
             row_groups_matched_bloom_filter,
             row_groups_pruned_bloom_filter,
+            row_groups_fully_matched_statistics,
             row_groups_matched_statistics,
             row_groups_pruned_statistics,
             limit_pruned_row_groups,
diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -166,8 +166,6 @@ impl RowGroupAccessPlanFilter {
 
         assert_eq!(groups.len(), self.access_plan.len());
         // Indexes of row groups still to scan
-        let row_group_indexes_to_consider = self.access_plan.row_group_indexes();
-        // Indexes of row groups still to scan
         let row_group_indexes = self.access_plan.row_group_indexes();
         let row_group_metadatas = row_group_indexes
             .iter()
@@ -190,7 +188,7 @@ impl RowGroupAccessPlanFilter {
                     values.iter().enumerate()
                 {
                     let original_row_group_idx =
-                        row_group_indexes_to_consider[idx_in_pruning_stats_result];
+                        row_group_indexes[idx_in_pruning_stats_result];
                     if !pruning_result {
                         new_access_plan.skip(original_row_group_idx);
                         metrics.row_groups_pruned_statistics.add(1);
@@ -201,6 +199,8 @@ impl RowGroupAccessPlanFilter {
                     }
                 }
 
+                // Note: this part of code shouldn't be expensive with a limited number of row groups
+                // If we do find it's expensive, we can consider optimizing it further.
                 if !fully_contained_candidates_original_idx.is_empty() {
                     // Use NotExpr to create the inverted predicate
                     let inverted_expr =
@@ -232,6 +232,7 @@ impl RowGroupAccessPlanFilter {
                                 // it implies that *all* rows in this group satisfy the original predicate.
                                 if !inverted_values[i] {
                                     self.is_fully_matched[original_row_group_idx] = true;
+                                    metrics.row_groups_fully_matched_statistics.add(1);
                                 }
                             }
                         }

Original file line number	Diff line number	Diff line change
`@@ -166,8 +166,6 @@ impl RowGroupAccessPlanFilter {`
`166`	`166`
`167`	`167`	`assert_eq!(groups.len(), self.access_plan.len());`
`168`	`168`	`// Indexes of row groups still to scan`
`169`		`- let row_group_indexes_to_consider = self.access_plan.row_group_indexes();`
`170`		`- // Indexes of row groups still to scan`
`171`	`169`	`let row_group_indexes = self.access_plan.row_group_indexes();`
`172`	`170`	`let row_group_metadatas = row_group_indexes`
`173`	`171`	`.iter()`
`@@ -190,7 +188,7 @@ impl RowGroupAccessPlanFilter {`
`190`	`188`	`values.iter().enumerate()`
`191`	`189`	`{`
`192`	`190`	`let original_row_group_idx =`
`193`		`- row_group_indexes_to_consider[idx_in_pruning_stats_result];`
	`191`	`+ row_group_indexes[idx_in_pruning_stats_result];`
`194`	`192`	`if !pruning_result {`
`195`	`193`	`new_access_plan.skip(original_row_group_idx);`
`196`	`194`	`metrics.row_groups_pruned_statistics.add(1);`
`@@ -201,6 +199,8 @@ impl RowGroupAccessPlanFilter {`
`201`	`199`	`}`
`202`	`200`	`}`
`203`	`201`
	`202`	`+ // Note: this part of code shouldn't be expensive with a limited number of row groups`
	`203`	`+ // If we do find it's expensive, we can consider optimizing it further.`
`204`	`204`	`if !fully_contained_candidates_original_idx.is_empty() {`
`205`	`205`	`// Use NotExpr to create the inverted predicate`
`206`	`206`	`let inverted_expr =`
`@@ -232,6 +232,7 @@ impl RowGroupAccessPlanFilter {`
`232`	`232`	`// it implies that all rows in this group satisfy the original predicate.`
`233`	`233`	`if !inverted_values[i] {`
`234`	`234`	`self.is_fully_matched[original_row_group_idx] = true;`
	`235`	`+ metrics.row_groups_fully_matched_statistics.add(1);`
`235`	`236`	`}`
`236`	`237`	`}`
`237`	`238`	`}`