Extend NOT pruning to support not(expr), not only not(column)

xudong963 · xudong963 · commit c73e9f9eb597 · 2025-11-12T15:32:05.000+08:00
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
@@ -37,6 +37,7 @@ use datafusion::{
     prelude::{ParquetReadOptions, SessionConfig, SessionContext},
 };
 use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder};
+use datafusion_physical_plan::execute_stream;
 use parquet::arrow::ArrowWriter;
 use parquet::file::properties::{EnabledStatistics, WriterProperties};
 use std::sync::Arc;
@@ -225,6 +226,7 @@ impl ContextWithParquet {
     ) -> Self {
         // Use a single partition for deterministic results no matter how many CPUs the host has
         config = config.with_target_partitions(1);
+        config.options_mut().execution.parquet.pushdown_filters = true;
         let file = match unit {
             Unit::RowGroup(row_per_group) => {
                 config = config.with_parquet_bloom_filter_pruning(true);
@@ -308,6 +310,15 @@ impl ContextWithParquet {
             .await
             .expect("creating physical plan");
 
+        /*
+        use arrow::util::pretty::print_batches;
+        use futures::TryStreamExt;
+        let res =
+            execute_stream(physical_plan.clone(), self.ctx.task_ctx().clone()).unwrap();
+        let batches = res.try_collect::<Vec<_>>().await.unwrap();
+        print_batches(&batches).unwrap();
+        */
+
         let task_ctx = state.task_ctx();
         let results = datafusion::physical_plan::collect(physical_plan.clone(), task_ctx)
             .await
diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -174,10 +174,11 @@ impl RowGroupPruningTest {
         self,
         schema: Arc<Schema>,
         batches: Vec<RecordBatch>,
+        max_row_per_row_group: usize,
     ) {
         let output = ContextWithParquet::with_custom_data(
             self.scenario,
-            RowGroup(2),
+            RowGroup(max_row_per_row_group),
             schema,
             batches,
         )
@@ -1745,7 +1746,7 @@ async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
     // So 3 row groups are effectively pruned due to limit pruning.
 
     let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
-    let query = "explain verbose SELECT c1 FROM t WHERE c1 > 0 LIMIT 2";
+    let query = "SELECT c1 FROM t WHERE c1 > 0 LIMIT 2";
 
     let batches = vec![
         make_i32_batch("c1", vec![1, 2])?, // RG0: Fully matched, 2 rows
@@ -1764,8 +1765,8 @@ async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_stats(Some(3)) // RG0, RG1, RG2 are matched by stats (c1 > 0)
         .with_pruned_by_stats(Some(1)) // RG3 is pruned by stats (c1 = [-1, 0] does not satisfy c1 > 0)
-        // .with_limit_pruned_row_groups(Some(2)) // RG1, RG2 are pruned by limit. (RG3 is already pruned by stats)
-        .test_row_group_prune_with_custom_data(schema, batches)
+        .with_limit_pruned_row_groups(Some(2)) // RG1, RG2 are pruned by limit. (RG3 is already pruned by stats)
+        .test_row_group_prune_with_custom_data(schema, batches, 2)
         .await;
 
     Ok(())
diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -183,7 +183,8 @@ impl RowGroupAccessPlanFilter {
         match predicate.prune(&pruning_stats) {
             Ok(values) => {
                 let mut new_access_plan = ParquetAccessPlan::new_all(groups.len());
-                let mut fully_contained_candidates_original_idx: Vec<usize> = Vec::new();
+                let mut fully_contained_candidates_original_idxes: Vec<usize> =
+                    Vec::new();
 
                 for (idx_in_pruning_stats_result, &pruning_result) in
                     values.iter().enumerate()
@@ -194,13 +195,13 @@ impl RowGroupAccessPlanFilter {
                         new_access_plan.skip(original_row_group_idx);
                         metrics.row_groups_pruned_statistics.add(1);
                     } else {
-                        fully_contained_candidates_original_idx
+                        fully_contained_candidates_original_idxes
                             .push(original_row_group_idx);
                         metrics.row_groups_matched_statistics.add(1);
                     }
                 }
 
-                if !fully_contained_candidates_original_idx.is_empty() {
+                if !fully_contained_candidates_original_idxes.is_empty() {
                     // Use NotExpr to create the inverted predicate
                     let inverted_expr =
                         Arc::new(NotExpr::new(predicate.orig_expr().clone()));
@@ -210,18 +211,20 @@ impl RowGroupAccessPlanFilter {
                     ) {
                         let inverted_pruning_stats = RowGroupPruningStatistics {
                             parquet_schema,
-                            row_group_metadatas: fully_contained_candidates_original_idx
-                                .iter()
-                                .map(|&i| &groups[i])
-                                .collect::<Vec<_>>(),
+                            row_group_metadatas:
+                                fully_contained_candidates_original_idxes
+                                    .iter()
+                                    .map(|&i| &groups[i])
+                                    .collect::<Vec<_>>(),
                             arrow_schema,
                         };
-
                         if let Ok(inverted_values) =
                             inverted_predicate.prune(&inverted_pruning_stats)
                         {
                             for (i, &original_row_group_idx) in
-                                fully_contained_candidates_original_idx.iter().enumerate()
+                                fully_contained_candidates_original_idxes
+                                    .iter()
+                                    .enumerate()
                             {
                                 // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false),
                                 // it implies that *all* rows in this group satisfy the original predicate.
diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs
@@ -1415,14 +1415,39 @@ fn build_predicate_expression(
             .unwrap_or_else(|| unhandled_hook.handle(expr));
     }
     if let Some(not) = expr_any.downcast_ref::<phys_expr::NotExpr>() {
-        // match !col (don't do so recursively)
         if let Some(col) = not.arg().as_any().downcast_ref::<phys_expr::Column>() {
             return build_single_column_expr(col, schema, required_columns, true)
                 .unwrap_or_else(|| unhandled_hook.handle(expr));
-        } else {
+        }
+
+        let inner_expr = build_predicate_expression(
+            not.arg(),
+            schema,
+            required_columns,
+            unhandled_hook,
+        );
+
+        // A literal `true` from the inner rewrite can mean "could not be analyzed"
+        // rather than "always true", so it must never be negated into `false`
+        if is_always_true(&inner_expr) {
+            // Conservative approach: if inner returns true (possibly unhandled),
+            // then NOT should also return true (unhandled) to be safe
             return unhandled_hook.handle(expr);
         }
+
+        // Handle other boolean literals
+        if let Some(literal) = inner_expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            if let ScalarValue::Boolean(Some(val)) = literal.value() {
+                return Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(
+                    !val,
+                ))));
+            }
+        }
+
+        // Apply NOT. NOTE(review): NOT of a "may match" test can over-prune (min <= 5 < max); confirm
+        return Arc::new(phys_expr::NotExpr::new(inner_expr));
     }
+
     if let Some(in_list) = expr_any.downcast_ref::<phys_expr::InListExpr>() {
         if !in_list.list().is_empty()
             && in_list.list().len() <= MAX_LIST_VALUE_SIZE_REWRITE
@@ -1868,7 +1893,7 @@ mod tests {
 
     use super::*;
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_expr::{and, col, lit, or};
+    use datafusion_expr::{and, col, lit, not, or};
     use insta::assert_snapshot;
 
     use arrow::array::Decimal128Array;
@@ -4422,7 +4447,7 @@ mod tests {
             true,
             // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
             true,
-            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> no row match. (min, max) maybe truncate 
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> no row match. (min, max) maybe truncate
             // original (min, max) maybe ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}")
             true,
         ];
@@ -5175,4 +5200,66 @@ mod tests {
             "c1_null_count@2 != row_count@3 AND c1_min@0 <= a AND a <= c1_max@1";
         assert_eq!(res.to_string(), expected);
     }
+
+    #[test]
+    fn test_not_expression_unhandled_inner_true() -> Result<()> {
+        // NOT applied directly to a column is handled by the `!col` branch; when that
+        // branch cannot rewrite the column, the result falls back to the unhandled hook
+        let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
+
+        // NOT(c1) on an Int32 column is expected to come back as `true`: presumably
+        // build_single_column_expr only rewrites boolean columns (confirm), so the hook decides
+        let expr = not(col("c1"));
+        let predicate_expr =
+            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
+        assert_eq!(predicate_expr.to_string(), "true");
+        Ok(())
+    }
+
+    #[test]
+    fn test_not_expression_boolean_literal_handling() -> Result<()> {
+        let schema = Schema::empty();
+
+        // NOT(false) -> true
+        let expr = not(lit(false));
+        let predicate_expr =
+            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
+        assert_eq!(predicate_expr.to_string(), "true");
+
+        // NOT(true) -> true: an inner literal `true` is treated as unhandled, not negated
+        let expr = not(lit(true));
+        let predicate_expr =
+            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
+        assert_eq!(predicate_expr.to_string(), "true");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_not_expression_wraps_complex_expressions() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
+
+        let expr = not(col("c1").gt(lit(5)));
+        let predicate_expr =
+            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
+
+        let result_str = predicate_expr.to_string();
+        assert_eq!(
+            result_str,
+            "NOT c1_null_count@1 != row_count@2 AND c1_max@0 > 5"
+        );
+
+        // NOT(c1 = 10): the NOT wraps the entire rewritten conjunction
+        let expr = not(col("c1").eq(lit(10)));
+        let predicate_expr =
+            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
+
+        let result_str = predicate_expr.to_string();
+        assert_eq!(
+            result_str,
+            "NOT c1_null_count@2 != row_count@3 AND c1_min@0 <= 10 AND 10 <= c1_max@1"
+        );
+
+        Ok(())
+    }
 }