@@ -2,7 +2,7 @@ use arrow_schema::Schema;
22use datafusion:: catalog:: Session ;
33use datafusion:: common:: { Result , ScalarValue } ;
44use datafusion:: logical_expr:: { ExprSchemable , LogicalPlan , LogicalPlanBuilder , col, when} ;
5- use datafusion:: prelude:: lit;
5+ use datafusion:: prelude:: { cast , lit} ;
66use datafusion:: { execution:: SessionState , prelude:: DataFrame } ;
77use delta_kernel:: engine:: arrow_conversion:: TryIntoArrow as _;
88use delta_kernel:: table_features:: TableFeature ;
@@ -49,12 +49,40 @@ pub fn with_generated_columns(
4949 }
5050
5151 debug ! ( "Adding missing generated column {}." , name) ;
52- let mut expr =
53- parse_predicate_expression ( plan. schema ( ) , & generated_col. generation_expr , session) ?
54- . alias ( name) ;
55- if let Ok ( field) = table_schema. field_with_name ( name) {
56- expr = expr. cast_to ( field. data_type ( ) , plan. schema ( ) ) ?;
57- }
52+ // Try to resolve the generation expression against the current plan schema.
53+ // When SchemaMode::Merge is used, the input batch may omit nullable columns
54+ // that the expression references. In that case, parse_predicate_expression
55+ // will fail because the column doesn't exist yet (schema evolution hasn't
56+ // run). We fall back to a typed NULL placeholder so the pipeline can
57+ // continue; schema evolution will later add the missing base columns as NULL,
58+ // and DataValidationExec will see NULL IS NOT DISTINCT FROM NULL = true.
59+ let expr = match parse_predicate_expression (
60+ plan. schema ( ) ,
61+ & generated_col. generation_expr ,
62+ session,
63+ ) {
64+ Ok ( resolved) => {
65+ let mut e = resolved. alias ( name) ;
66+ if let Ok ( field) = table_schema. field_with_name ( name) {
67+ e = e. cast_to ( field. data_type ( ) , plan. schema ( ) ) ?;
68+ }
69+ e
70+ }
71+ Err ( _) => {
72+ debug ! (
73+ "Could not resolve generation expression for column {}, \
74+ inserting NULL placeholder (will be resolved after schema evolution).",
75+ name
76+ ) ;
77+ // Use the target data type from the table schema if available,
78+ // otherwise fall back to a bare NULL.
79+ if let Ok ( field) = table_schema. field_with_name ( name) {
80+ cast ( lit ( ScalarValue :: Null ) , field. data_type ( ) . clone ( ) ) . alias ( name)
81+ } else {
82+ lit ( ScalarValue :: Null ) . alias ( name)
83+ }
84+ }
85+ } ;
5886 projection. push ( expr) ;
5987 }
6088
@@ -358,4 +386,61 @@ mod tests {
358386 . is_err( )
359387 ) ;
360388 }
389+
390+ /// Test that a generated column referencing a column not in the input batch
391+ /// does not fail, but instead produces a NULL placeholder.
392+ /// This is the core fix for #4169.
393+ #[ test]
394+ fn test_generated_column_referencing_missing_column_uses_null_placeholder ( ) {
395+ let session = create_test_session ( ) ;
396+ // Plan only has "id" — missing "user" column
397+ let schema = Arc :: new ( Schema :: new ( vec ! [ ArrowField :: new(
398+ "id" ,
399+ ArrowDataType :: Int32 ,
400+ false ,
401+ ) ] ) ) ;
402+ let batch = RecordBatch :: try_new (
403+ schema,
404+ vec ! [ Arc :: new( Int32Array :: from( vec![ 1 , 2 , 3 ] ) ) ] ,
405+ )
406+ . unwrap ( ) ;
407+ let source = provider_as_source ( Arc :: new (
408+ MemTable :: try_new ( batch. schema ( ) , vec ! [ vec![ batch] ] ) . unwrap ( ) ,
409+ ) ) ;
410+ let plan = LogicalPlanBuilder :: scan ( "test" , source, None )
411+ . unwrap ( )
412+ . build ( )
413+ . unwrap ( ) ;
414+
415+ // Table schema has id, user (nullable), and computed = user
416+ let table_schema = Schema :: new ( vec ! [
417+ ArrowField :: new( "id" , ArrowDataType :: Int32 , false ) ,
418+ ArrowField :: new( "user" , ArrowDataType :: Utf8 , true ) ,
419+ ArrowField :: new( "computed" , ArrowDataType :: Utf8 , true ) ,
420+ ] ) ;
421+
422+ // "computed" references "user", which is NOT in the input plan
423+ let generated_cols = vec ! [ GeneratedColumn :: new(
424+ "computed" ,
425+ "\" user\" " ,
426+ & KernelDataType :: STRING ,
427+ ) ] ;
428+
429+ // Previously this would fail with "column user not found"
430+ let result = with_generated_columns ( & session, plan, & table_schema, & generated_cols) ;
431+ assert ! (
432+ result. is_ok( ) ,
433+ "should not fail when generated column references a missing column: {:?}" ,
434+ result. err( )
435+ ) ;
436+
437+ let result_plan = result. unwrap ( ) ;
438+ assert_eq ! ( result_plan. schema( ) . fields( ) . len( ) , 2 ) ; // id + computed
439+ assert ! (
440+ result_plan
441+ . schema( )
442+ . field_with_unqualified_name( "computed" )
443+ . is_ok( )
444+ ) ;
445+ }
361446}
0 commit comments