Skip to content

Commit fd542c6

Browse files
fwojciec and claude
authored and committed
Fix FilterExec converting Absent column stats to Exact(NULL) (apache#20391)
## Which issue does this PR close?

- Closes apache#20388.

## Rationale for this change

`collect_new_statistics` in `FilterExec` wraps NULL interval bounds in `Precision::Exact`, converting what should be `Precision::Absent` column statistics into `Precision::Exact(ScalarValue::Int32(None))`. Downstream, `estimate_disjoint_inputs` treats these as real bounds and incorrectly concludes join inputs are disjoint, forcing Partitioned join mode and disabling dynamic filter pushdown for Parquet row group pruning.

## What changes are included in this PR?

Single change to `collect_new_statistics` in `filter.rs`: check `is_null()` on interval bounds before wrapping in `Precision`, mapping NULL bounds back to `Absent`.

## Are these changes tested?

Yes — includes a regression test (`test_filter_statistics_absent_columns_stay_absent`) that fails on current main and passes with the fix.

## Are there any user-facing changes?

No API changes. Corrects statistics propagation for tables/views with absent column statistics.

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7698fdc commit fd542c6

File tree

1 file changed

+54
-5
lines changed

1 file changed

+54
-5
lines changed

datafusion/physical-plan/src/filter.rs

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,21 @@ impl EmbeddedProjection for FilterExec {
752752
}
753753
}
754754

755+
/// Converts an interval bound to a [`Precision`] value. NULL bounds (which
756+
/// represent "unbounded" in the interval type) map to [`Precision::Absent`].
757+
fn interval_bound_to_precision(
758+
bound: ScalarValue,
759+
is_exact: bool,
760+
) -> Precision<ScalarValue> {
761+
if bound.is_null() {
762+
Precision::Absent
763+
} else if is_exact {
764+
Precision::Exact(bound)
765+
} else {
766+
Precision::Inexact(bound)
767+
}
768+
}
769+
755770
/// This function ensures that all bounds in the `ExprBoundaries` vector are
756771
/// converted to closed bounds. If a lower/upper bound is initially open, it
757772
/// is adjusted by using the next/previous value for its data type to convert
@@ -784,11 +799,9 @@ fn collect_new_statistics(
784799
};
785800
};
786801
let (lower, upper) = interval.into_bounds();
787-
let (min_value, max_value) = if lower.eq(&upper) {
788-
(Precision::Exact(lower), Precision::Exact(upper))
789-
} else {
790-
(Precision::Inexact(lower), Precision::Inexact(upper))
791-
};
802+
let is_exact = !lower.is_null() && !upper.is_null() && lower == upper;
803+
let min_value = interval_bound_to_precision(lower, is_exact);
804+
let max_value = interval_bound_to_precision(upper, is_exact);
792805
ColumnStatistics {
793806
null_count: input_column_stats[idx].null_count.to_inexact(),
794807
max_value,
@@ -2066,4 +2079,40 @@ mod tests {
20662079

20672080
Ok(())
20682081
}
2082+
2083+
/// Columns with Absent min/max statistics should remain Absent after
2084+
/// FilterExec.
2085+
#[tokio::test]
2086+
async fn test_filter_statistics_absent_columns_stay_absent() -> Result<()> {
2087+
let schema = Schema::new(vec![
2088+
Field::new("a", DataType::Int32, false),
2089+
Field::new("b", DataType::Int32, false),
2090+
]);
2091+
let input = Arc::new(StatisticsExec::new(
2092+
Statistics {
2093+
num_rows: Precision::Inexact(1000),
2094+
total_byte_size: Precision::Absent,
2095+
column_statistics: vec![
2096+
ColumnStatistics::default(),
2097+
ColumnStatistics::default(),
2098+
],
2099+
},
2100+
schema.clone(),
2101+
));
2102+
2103+
let predicate = Arc::new(BinaryExpr::new(
2104+
Arc::new(Column::new("a", 0)),
2105+
Operator::Eq,
2106+
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))),
2107+
));
2108+
let filter: Arc<dyn ExecutionPlan> =
2109+
Arc::new(FilterExec::try_new(predicate, input)?);
2110+
2111+
let statistics = filter.partition_statistics(None)?;
2112+
let col_b_stats = &statistics.column_statistics[1];
2113+
assert_eq!(col_b_stats.min_value, Precision::Absent);
2114+
assert_eq!(col_b_stats.max_value, Precision::Absent);
2115+
2116+
Ok(())
2117+
}
20692118
}

0 commit comments

Comments
 (0)