Skip to content

Commit 8959b3d

Browse files
feat: output statistics for constant columns in projections (#19419)
## Which issue does this PR close?

- Closes #19416

## Rationale for this change

Currently, when projecting literal values (constants) in projection operations, the statistics are set to unknown (`ColumnStatistics::new_unknown()`). This prevents the optimizer from using constant-column information for various optimizations, such as:

- Sort elimination when sorting by constant columns

## What changes are included in this PR?

1. **Updated `project_statistics` method** in `ProjectionExprs`:
   - Added detection and statistics calculation for `Literal` expressions (non-null constants)
   - Statistics calculated: `min_value = max_value = literal value`, `distinct_count = 1`, `null_count = 0`, `byte_size = data_type_width × num_rows` (for primitive types)
2. **Added test**:
   - `test_project_statistics_with_literal`: Tests non-null literal statistics

## Are these changes tested?

Yes

## Are there any user-facing changes?

No

## Example

**Before this change:** `SELECT 42 AS status, name FROM users ORDER BY status`

- Statistics for `status` were unknown
- Optimizer couldn't eliminate the sort operation

**After this change:** `SELECT 42 AS status, name FROM users ORDER BY status`

- Statistics show `status` has `min_value = max_value = 42`, `distinct_count = 1`
- Optimizer can eliminate the sort since the column is already sorted (all values are the same)
1 parent fd26321 commit 8959b3d

File tree

1 file changed

+264
-3
lines changed

1 file changed

+264
-3
lines changed

datafusion/physical-expr/src/projection.rs

Lines changed: 264 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ use std::ops::Deref;
2121
use std::sync::Arc;
2222

2323
use crate::PhysicalExpr;
24-
use crate::expressions::Column;
24+
use crate::expressions::{Column, Literal};
2525
use crate::utils::collect_columns;
2626

2727
use arrow::array::{RecordBatch, RecordBatchOptions};
2828
use arrow::datatypes::{Field, Schema, SchemaRef};
29-
use datafusion_common::stats::ColumnStatistics;
29+
use datafusion_common::stats::{ColumnStatistics, Precision};
3030
use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
3131
use datafusion_common::{
32-
Result, assert_or_internal_err, internal_datafusion_err, plan_err,
32+
Result, ScalarValue, assert_or_internal_err, internal_datafusion_err, plan_err,
3333
};
3434

3535
use datafusion_physical_expr_common::metrics::ExecutionPlanMetricsSet;
@@ -611,6 +611,54 @@ impl ProjectionExprs {
611611
let expr = &proj_expr.expr;
612612
let col_stats = if let Some(col) = expr.as_any().downcast_ref::<Column>() {
613613
std::mem::take(&mut stats.column_statistics[col.index()])
614+
} else if let Some(literal) = expr.as_any().downcast_ref::<Literal>() {
615+
// Handle literal expressions (constants) by calculating proper statistics
616+
let data_type = expr.data_type(output_schema)?;
617+
618+
if literal.value().is_null() {
619+
let null_count = match stats.num_rows {
620+
Precision::Exact(num_rows) => Precision::Exact(num_rows),
621+
_ => Precision::Absent,
622+
};
623+
624+
ColumnStatistics {
625+
min_value: Precision::Exact(literal.value().clone()),
626+
max_value: Precision::Exact(literal.value().clone()),
627+
distinct_count: Precision::Exact(1),
628+
null_count,
629+
sum_value: Precision::Exact(literal.value().clone()),
630+
byte_size: Precision::Exact(0),
631+
}
632+
} else {
633+
let value = literal.value();
634+
let distinct_count = Precision::Exact(1);
635+
let null_count = Precision::Exact(0);
636+
637+
let byte_size = if let Some(byte_width) = data_type.primitive_width()
638+
{
639+
stats.num_rows.multiply(&Precision::Exact(byte_width))
640+
} else {
641+
// Complex types depend on array encoding, so set to Absent
642+
Precision::Absent
643+
};
644+
645+
let sum_value = Precision::<ScalarValue>::from(stats.num_rows)
646+
.cast_to(&value.data_type())
647+
.ok()
648+
.map(|row_count| {
649+
Precision::Exact(value.clone()).multiply(&row_count)
650+
})
651+
.unwrap_or(Precision::Absent);
652+
653+
ColumnStatistics {
654+
min_value: Precision::Exact(value.clone()),
655+
max_value: Precision::Exact(value.clone()),
656+
distinct_count,
657+
null_count,
658+
sum_value,
659+
byte_size,
660+
}
661+
}
614662
} else {
615663
// TODO stats: estimate more statistics from expressions
616664
// (expressions should compute their statistics themselves)
@@ -2639,4 +2687,217 @@ pub(crate) mod tests {
26392687

26402688
Ok(())
26412689
}
2690+
2691+
// Verifies that projecting a non-null numeric literal (`Int64(42)`) produces
// exact constant-column statistics: min = max = the literal, one distinct
// value, zero nulls, and a byte size and sum derived from the row count.
2692+
#[test]
2693+
fn test_project_statistics_with_literal() -> Result<()> {
2694+
let input_stats = get_stats();
2695+
let input_schema = get_schema();
2696+
2697+
// Projection under test: SELECT 42 AS constant, col0 AS num
2698+
let projection = ProjectionExprs::new(vec![
2699+
ProjectionExpr {
2700+
expr: Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
2701+
alias: "constant".to_string(),
2702+
},
2703+
ProjectionExpr {
2704+
expr: Arc::new(Column::new("col0", 0)),
2705+
alias: "num".to_string(),
2706+
},
2707+
]);
2708+
2709+
let output_stats = projection.project_statistics(
2710+
input_stats,
2711+
&projection.project_schema(&input_schema)?,
2712+
)?;
2713+
2714+
// The projection must not change the row count (5 rows expected from the fixture)
2715+
assert_eq!(output_stats.num_rows, Precision::Exact(5));
2716+
2717+
// One statistics entry per projected expression
2718+
assert_eq!(output_stats.column_statistics.len(), 2);
2719+
2720+
// First column (literal 42): min and max both equal the literal itself
2721+
assert_eq!(
2722+
output_stats.column_statistics[0].min_value,
2723+
Precision::Exact(ScalarValue::Int64(Some(42)))
2724+
);
2725+
assert_eq!(
2726+
output_stats.column_statistics[0].max_value,
2727+
Precision::Exact(ScalarValue::Int64(Some(42)))
2728+
);
2729+
assert_eq!(
2730+
output_stats.column_statistics[0].distinct_count,
2731+
Precision::Exact(1)
2732+
);
2733+
assert_eq!(
2734+
output_stats.column_statistics[0].null_count,
2735+
Precision::Exact(0)
2736+
);
2737+
// byte_size = primitive width of Int64 (8 bytes) * 5 rows = 40 bytes
2738+
assert_eq!(
2739+
output_stats.column_statistics[0].byte_size,
2740+
Precision::Exact(40)
2741+
);
2742+
// For a constant column, sum_value = value * num_rows = 42 * 5 = 210
2743+
assert_eq!(
2744+
output_stats.column_statistics[0].sum_value,
2745+
Precision::Exact(ScalarValue::Int64(Some(210)))
2746+
);
2747+
2748+
// Second column is a plain reference to input column 0, so its input
// statistics are carried over (expected values come from `get_stats()`)
2749+
assert_eq!(
2750+
output_stats.column_statistics[1].distinct_count,
2751+
Precision::Exact(5)
2752+
);
2753+
assert_eq!(
2754+
output_stats.column_statistics[1].max_value,
2755+
Precision::Exact(ScalarValue::Int64(Some(21)))
2756+
);
2757+
2758+
Ok(())
2759+
}
2760+
2761+
// Verifies the statistics reported for a projected NULL literal
// (`Int64(None)`): min, max and sum are the NULL scalar itself, the null
// count equals the exact row count, and the byte size is reported as 0.
2762+
#[test]
2763+
fn test_project_statistics_with_null_literal() -> Result<()> {
2764+
let input_stats = get_stats();
2765+
let input_schema = get_schema();
2766+
2767+
// Projection under test: SELECT NULL AS null_col, col0 AS num
2768+
let projection = ProjectionExprs::new(vec![
2769+
ProjectionExpr {
2770+
expr: Arc::new(Literal::new(ScalarValue::Int64(None))),
2771+
alias: "null_col".to_string(),
2772+
},
2773+
ProjectionExpr {
2774+
expr: Arc::new(Column::new("col0", 0)),
2775+
alias: "num".to_string(),
2776+
},
2777+
]);
2778+
2779+
let output_stats = projection.project_statistics(
2780+
input_stats,
2781+
&projection.project_schema(&input_schema)?,
2782+
)?;
2783+
2784+
// The projection must not change the row count (5 rows expected from the fixture)
2785+
assert_eq!(output_stats.num_rows, Precision::Exact(5));
2786+
2787+
// One statistics entry per projected expression
2788+
assert_eq!(output_stats.column_statistics.len(), 2);
2789+
2790+
// First column (NULL literal): min and max are reported as the typed NULL scalar
2791+
assert_eq!(
2792+
output_stats.column_statistics[0].min_value,
2793+
Precision::Exact(ScalarValue::Int64(None))
2794+
);
2795+
assert_eq!(
2796+
output_stats.column_statistics[0].max_value,
2797+
Precision::Exact(ScalarValue::Int64(None))
2798+
);
2799+
assert_eq!(
2800+
output_stats.column_statistics[0].distinct_count,
2801+
Precision::Exact(1) // the all-NULL column is reported as one distinct value
2802+
);
2803+
assert_eq!(
2804+
output_stats.column_statistics[0].null_count,
2805+
Precision::Exact(5) // every row is NULL, so null_count equals the exact row count
2806+
);
2807+
assert_eq!(
2808+
output_stats.column_statistics[0].byte_size,
2809+
Precision::Exact(0)
2810+
);
2811+
assert_eq!(
2812+
output_stats.column_statistics[0].sum_value,
2813+
Precision::Exact(ScalarValue::Int64(None))
2814+
);
2815+
2816+
// Second column is a plain reference to input column 0, so its input
// statistics are carried over (expected values come from `get_stats()`)
2817+
assert_eq!(
2818+
output_stats.column_statistics[1].distinct_count,
2819+
Precision::Exact(5)
2820+
);
2821+
assert_eq!(
2822+
output_stats.column_statistics[1].max_value,
2823+
Precision::Exact(ScalarValue::Int64(Some(21)))
2824+
);
2825+
2826+
Ok(())
2827+
}
2828+
2829+
// Verifies the statistics reported for a projected non-primitive literal
// (`Utf8("hello")`): min, max, distinct count and null count are exact,
// while byte_size and sum_value are expected to be `Precision::Absent`.
2830+
#[test]
2831+
fn test_project_statistics_with_complex_type_literal() -> Result<()> {
2832+
let input_stats = get_stats();
2833+
let input_schema = get_schema();
2834+
2835+
// Projection under test: SELECT 'hello' AS text, col0 AS num
2836+
let projection = ProjectionExprs::new(vec![
2837+
ProjectionExpr {
2838+
expr: Arc::new(Literal::new(ScalarValue::Utf8(Some(
2839+
"hello".to_string(),
2840+
)))),
2841+
alias: "text".to_string(),
2842+
},
2843+
ProjectionExpr {
2844+
expr: Arc::new(Column::new("col0", 0)),
2845+
alias: "num".to_string(),
2846+
},
2847+
]);
2848+
2849+
let output_stats = projection.project_statistics(
2850+
input_stats,
2851+
&projection.project_schema(&input_schema)?,
2852+
)?;
2853+
2854+
// The projection must not change the row count (5 rows expected from the fixture)
2855+
assert_eq!(output_stats.num_rows, Precision::Exact(5));
2856+
2857+
// One statistics entry per projected expression
2858+
assert_eq!(output_stats.column_statistics.len(), 2);
2859+
2860+
// First column (Utf8 literal 'hello'): min and max equal the literal,
2861+
// with exactly one distinct value and no nulls
2862+
assert_eq!(
2863+
output_stats.column_statistics[0].min_value,
2864+
Precision::Exact(ScalarValue::Utf8(Some("hello".to_string())))
2865+
);
2866+
assert_eq!(
2867+
output_stats.column_statistics[0].max_value,
2868+
Precision::Exact(ScalarValue::Utf8(Some("hello".to_string())))
2869+
);
2870+
assert_eq!(
2871+
output_stats.column_statistics[0].distinct_count,
2872+
Precision::Exact(1)
2873+
);
2874+
assert_eq!(
2875+
output_stats.column_statistics[0].null_count,
2876+
Precision::Exact(0)
2877+
);
2878+
// byte_size is only computed when the data type has a fixed primitive width;
2879+
// the Utf8 literal is expected to have none, so byte_size stays Absent
2880+
assert_eq!(
2881+
output_stats.column_statistics[0].byte_size,
2882+
Precision::Absent
2883+
);
2884+
// No sum is expected for the non-numeric Utf8 literal,
2885+
// so sum_value is Absent
2886+
assert_eq!(
2887+
output_stats.column_statistics[0].sum_value,
2888+
Precision::Absent
2889+
);
2890+
2891+
// Second column is a plain reference to input column 0, so its input
// statistics are carried over (expected values come from `get_stats()`)
2892+
assert_eq!(
2893+
output_stats.column_statistics[1].distinct_count,
2894+
Precision::Exact(5)
2895+
);
2896+
assert_eq!(
2897+
output_stats.column_statistics[1].max_value,
2898+
Precision::Exact(ScalarValue::Int64(Some(21)))
2899+
);
2900+
2901+
Ok(())
2902+
}
26422903
}

0 commit comments

Comments
 (0)