@@ -21,15 +21,15 @@ use std::ops::Deref;
2121use std:: sync:: Arc ;
2222
2323use crate :: PhysicalExpr ;
24- use crate :: expressions:: Column ;
24+ use crate :: expressions:: { Column , Literal } ;
2525use crate :: utils:: collect_columns;
2626
2727use arrow:: array:: { RecordBatch , RecordBatchOptions } ;
2828use arrow:: datatypes:: { Field , Schema , SchemaRef } ;
29- use datafusion_common:: stats:: ColumnStatistics ;
29+ use datafusion_common:: stats:: { ColumnStatistics , Precision } ;
3030use datafusion_common:: tree_node:: { Transformed , TransformedResult , TreeNode } ;
3131use datafusion_common:: {
32- Result , assert_or_internal_err, internal_datafusion_err, plan_err,
32+ Result , ScalarValue , assert_or_internal_err, internal_datafusion_err, plan_err,
3333} ;
3434
3535use datafusion_physical_expr_common:: metrics:: ExecutionPlanMetricsSet ;
@@ -611,6 +611,54 @@ impl ProjectionExprs {
611611 let expr = & proj_expr. expr ;
612612 let col_stats = if let Some ( col) = expr. as_any ( ) . downcast_ref :: < Column > ( ) {
613613 std:: mem:: take ( & mut stats. column_statistics [ col. index ( ) ] )
614+ } else if let Some ( literal) = expr. as_any ( ) . downcast_ref :: < Literal > ( ) {
615+ // Handle literal expressions (constants) by calculating proper statistics
616+ let data_type = expr. data_type ( output_schema) ?;
617+
618+ if literal. value ( ) . is_null ( ) {
619+ let null_count = match stats. num_rows {
620+ Precision :: Exact ( num_rows) => Precision :: Exact ( num_rows) ,
621+ _ => Precision :: Absent ,
622+ } ;
623+
624+ ColumnStatistics {
625+ min_value : Precision :: Exact ( literal. value ( ) . clone ( ) ) ,
626+ max_value : Precision :: Exact ( literal. value ( ) . clone ( ) ) ,
627+ distinct_count : Precision :: Exact ( 1 ) ,
628+ null_count,
629+ sum_value : Precision :: Exact ( literal. value ( ) . clone ( ) ) ,
630+ byte_size : Precision :: Exact ( 0 ) ,
631+ }
632+ } else {
633+ let value = literal. value ( ) ;
634+ let distinct_count = Precision :: Exact ( 1 ) ;
635+ let null_count = Precision :: Exact ( 0 ) ;
636+
637+ let byte_size = if let Some ( byte_width) = data_type. primitive_width ( )
638+ {
639+ stats. num_rows . multiply ( & Precision :: Exact ( byte_width) )
640+ } else {
641+ // Complex types depend on array encoding, so set to Absent
642+ Precision :: Absent
643+ } ;
644+
645+ let sum_value = Precision :: < ScalarValue > :: from ( stats. num_rows )
646+ . cast_to ( & value. data_type ( ) )
647+ . ok ( )
648+ . map ( |row_count| {
649+ Precision :: Exact ( value. clone ( ) ) . multiply ( & row_count)
650+ } )
651+ . unwrap_or ( Precision :: Absent ) ;
652+
653+ ColumnStatistics {
654+ min_value : Precision :: Exact ( value. clone ( ) ) ,
655+ max_value : Precision :: Exact ( value. clone ( ) ) ,
656+ distinct_count,
657+ null_count,
658+ sum_value,
659+ byte_size,
660+ }
661+ }
614662 } else {
615663 // TODO stats: estimate more statistics from expressions
616664 // (expressions should compute their statistics themselves)
@@ -2639,4 +2687,217 @@ pub(crate) mod tests {
26392687
26402688 Ok ( ( ) )
26412689 }
2690+
// A non-NULL numeric literal must project to exact constant-column statistics.
//
// Projects `SELECT 42 AS constant, col0 AS num` over the shared fixture and
// checks both the synthesized literal statistics and the pass-through column.
#[test]
fn test_project_statistics_with_literal() -> Result<()> {
    let input_stats = get_stats();
    let input_schema = get_schema();

    let forty_two = ScalarValue::Int64(Some(42));

    let exprs = vec![
        ProjectionExpr {
            expr: Arc::new(Literal::new(forty_two.clone())),
            alias: "constant".to_string(),
        },
        ProjectionExpr {
            expr: Arc::new(Column::new("col0", 0)),
            alias: "num".to_string(),
        },
    ];
    let projection = ProjectionExprs::new(exprs);

    let output_schema = projection.project_schema(&input_schema)?;
    let output_stats = projection.project_statistics(input_stats, &output_schema)?;

    // The projection neither adds nor drops rows.
    assert_eq!(output_stats.num_rows, Precision::Exact(5));

    // One statistics entry per projected expression.
    assert_eq!(output_stats.column_statistics.len(), 2);

    let constant_stats = &output_stats.column_statistics[0];
    let passthrough_stats = &output_stats.column_statistics[1];

    // Literal column: min and max are the literal itself, a single distinct
    // value, and no NULLs.
    assert_eq!(constant_stats.min_value, Precision::Exact(forty_two.clone()));
    assert_eq!(constant_stats.max_value, Precision::Exact(forty_two.clone()));
    assert_eq!(constant_stats.distinct_count, Precision::Exact(1));
    assert_eq!(constant_stats.null_count, Precision::Exact(0));

    // 8 bytes per Int64 value * 5 rows = 40 bytes.
    assert_eq!(constant_stats.byte_size, Precision::Exact(40));

    // Sum of a constant column is value * num_rows: 42 * 5 = 210.
    assert_eq!(
        constant_stats.sum_value,
        Precision::Exact(ScalarValue::Int64(Some(210)))
    );

    // Plain column reference: input statistics are carried over as-is.
    assert_eq!(passthrough_stats.distinct_count, Precision::Exact(5));
    assert_eq!(
        passthrough_stats.max_value,
        Precision::Exact(ScalarValue::Int64(Some(21)))
    );

    Ok(())
}
2760+
// A NULL literal must project to statistics describing an all-NULL column.
//
// Projects `SELECT NULL AS null_col, col0 AS num` over the shared fixture and
// checks both the synthesized literal statistics and the pass-through column.
#[test]
fn test_project_statistics_with_null_literal() -> Result<()> {
    let input_stats = get_stats();
    let input_schema = get_schema();

    // A typed NULL: Int64 with no value.
    let typed_null = ScalarValue::Int64(None);

    let exprs = vec![
        ProjectionExpr {
            expr: Arc::new(Literal::new(typed_null.clone())),
            alias: "null_col".to_string(),
        },
        ProjectionExpr {
            expr: Arc::new(Column::new("col0", 0)),
            alias: "num".to_string(),
        },
    ];
    let projection = ProjectionExprs::new(exprs);

    let output_schema = projection.project_schema(&input_schema)?;
    let output_stats = projection.project_statistics(input_stats, &output_schema)?;

    // The projection neither adds nor drops rows.
    assert_eq!(output_stats.num_rows, Precision::Exact(5));

    // One statistics entry per projected expression.
    assert_eq!(output_stats.column_statistics.len(), 2);

    let null_stats = &output_stats.column_statistics[0];
    let passthrough_stats = &output_stats.column_statistics[1];

    // NULL literal column: min, max and sum are all the typed NULL.
    assert_eq!(null_stats.min_value, Precision::Exact(typed_null.clone()));
    assert_eq!(null_stats.max_value, Precision::Exact(typed_null.clone()));
    // All NULLs are counted as one distinct value.
    assert_eq!(null_stats.distinct_count, Precision::Exact(1));
    // Every one of the 5 rows is NULL.
    assert_eq!(null_stats.null_count, Precision::Exact(5));
    assert_eq!(null_stats.byte_size, Precision::Exact(0));
    assert_eq!(null_stats.sum_value, Precision::Exact(typed_null.clone()));

    // Plain column reference: input statistics are carried over as-is.
    assert_eq!(passthrough_stats.distinct_count, Precision::Exact(5));
    assert_eq!(
        passthrough_stats.max_value,
        Precision::Exact(ScalarValue::Int64(Some(21)))
    );

    Ok(())
}
2828+
// A non-primitive literal (here a Utf8 string) still gets exact min/max,
// distinct and NULL counts, but its byte size and sum are reported as Absent.
//
// Projects `SELECT 'hello' AS text, col0 AS num` over the shared fixture.
#[test]
fn test_project_statistics_with_complex_type_literal() -> Result<()> {
    let input_stats = get_stats();
    let input_schema = get_schema();

    let hello = ScalarValue::Utf8(Some("hello".to_string()));

    let exprs = vec![
        ProjectionExpr {
            expr: Arc::new(Literal::new(hello.clone())),
            alias: "text".to_string(),
        },
        ProjectionExpr {
            expr: Arc::new(Column::new("col0", 0)),
            alias: "num".to_string(),
        },
    ];
    let projection = ProjectionExprs::new(exprs);

    let output_schema = projection.project_schema(&input_schema)?;
    let output_stats = projection.project_statistics(input_stats, &output_schema)?;

    // The projection neither adds nor drops rows.
    assert_eq!(output_stats.num_rows, Precision::Exact(5));

    // One statistics entry per projected expression.
    assert_eq!(output_stats.column_statistics.len(), 2);

    let text_stats = &output_stats.column_statistics[0];
    let passthrough_stats = &output_stats.column_statistics[1];

    // Constant string column: min and max are the literal itself, a single
    // distinct value, and no NULLs.
    assert_eq!(text_stats.min_value, Precision::Exact(hello.clone()));
    assert_eq!(text_stats.max_value, Precision::Exact(hello.clone()));
    assert_eq!(text_stats.distinct_count, Precision::Exact(1));
    assert_eq!(text_stats.null_count, Precision::Exact(0));

    // Utf8 is not a primitive type, so no byte size is reported.
    assert_eq!(text_stats.byte_size, Precision::Absent);

    // A sum is only meaningful for numeric types, so none is reported for Utf8.
    assert_eq!(text_stats.sum_value, Precision::Absent);

    // Plain column reference: input statistics are carried over as-is.
    assert_eq!(passthrough_stats.distinct_count, Precision::Exact(5));
    assert_eq!(
        passthrough_stats.max_value,
        Precision::Exact(ScalarValue::Int64(Some(21)))
    );

    Ok(())
}
26422903}
0 commit comments