Skip to content

Commit ac64b4c

Browse files
veeceey authored and rtyler committed
fix: handle generated column expr referencing missing column in SchemaMode::Merge
When using SchemaMode::Merge, the input batch may omit nullable columns that a generated column expression references. Previously, with_generated_columns() called parse_predicate_expression() against the pre-evolution plan schema, which failed because the referenced column did not exist yet: schema evolution had not yet run to add it as NULL.

Now, when expression resolution fails, we fall back to a typed NULL placeholder. Schema evolution later adds the missing base columns, and DataValidationExec then evaluates `NULL IS NOT DISTINCT FROM NULL` to true, which correctly passes validation.

Closes #4169

Signed-off-by: Varun Chawla <varun_6april@hotmail.com>
1 parent ce6709b commit ac64b4c

File tree

1 file changed

+92
-7
lines changed

1 file changed

+92
-7
lines changed

crates/core/src/operations/write/generated_columns.rs

Lines changed: 92 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ use arrow_schema::Schema;
22
use datafusion::catalog::Session;
33
use datafusion::common::{Result, ScalarValue};
44
use datafusion::logical_expr::{ExprSchemable, LogicalPlan, LogicalPlanBuilder, col, when};
5-
use datafusion::prelude::lit;
5+
use datafusion::prelude::{cast, lit};
66
use datafusion::{execution::SessionState, prelude::DataFrame};
77
use delta_kernel::engine::arrow_conversion::TryIntoArrow as _;
88
use delta_kernel::table_features::TableFeature;
@@ -49,12 +49,40 @@ pub fn with_generated_columns(
4949
}
5050

5151
debug!("Adding missing generated column {}.", name);
52-
let mut expr =
53-
parse_predicate_expression(plan.schema(), &generated_col.generation_expr, session)?
54-
.alias(name);
55-
if let Ok(field) = table_schema.field_with_name(name) {
56-
expr = expr.cast_to(field.data_type(), plan.schema())?;
57-
}
52+
// Try to resolve the generation expression against the current plan schema.
53+
// When SchemaMode::Merge is used, the input batch may omit nullable columns
54+
// that the expression references. In that case, parse_predicate_expression
55+
// will fail because the column doesn't exist yet (schema evolution hasn't
56+
// run). We fall back to a typed NULL placeholder so the pipeline can
57+
// continue; schema evolution will later add the missing base columns as NULL,
58+
// and DataValidationExec will see NULL IS NOT DISTINCT FROM NULL = true.
59+
let expr = match parse_predicate_expression(
60+
plan.schema(),
61+
&generated_col.generation_expr,
62+
session,
63+
) {
64+
Ok(resolved) => {
65+
let mut e = resolved.alias(name);
66+
if let Ok(field) = table_schema.field_with_name(name) {
67+
e = e.cast_to(field.data_type(), plan.schema())?;
68+
}
69+
e
70+
}
71+
Err(_) => {
72+
debug!(
73+
"Could not resolve generation expression for column {}, \
74+
inserting NULL placeholder (will be resolved after schema evolution).",
75+
name
76+
);
77+
// Use the target data type from the table schema if available,
78+
// otherwise fall back to a bare NULL.
79+
if let Ok(field) = table_schema.field_with_name(name) {
80+
cast(lit(ScalarValue::Null), field.data_type().clone()).alias(name)
81+
} else {
82+
lit(ScalarValue::Null).alias(name)
83+
}
84+
}
85+
};
5886
projection.push(expr);
5987
}
6088

@@ -358,4 +386,61 @@ mod tests {
358386
.is_err()
359387
);
360388
}
389+
390+
/// Test that a generated column referencing a column not in the input batch
391+
/// does not fail, but instead produces a NULL placeholder.
392+
/// This is the core fix for #4169.
393+
#[test]
394+
fn test_generated_column_referencing_missing_column_uses_null_placeholder() {
395+
let session = create_test_session();
396+
// Plan only has "id" — missing "user" column
397+
let schema = Arc::new(Schema::new(vec![ArrowField::new(
398+
"id",
399+
ArrowDataType::Int32,
400+
false,
401+
)]));
402+
let batch = RecordBatch::try_new(
403+
schema,
404+
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
405+
)
406+
.unwrap();
407+
let source = provider_as_source(Arc::new(
408+
MemTable::try_new(batch.schema(), vec![vec![batch]]).unwrap(),
409+
));
410+
let plan = LogicalPlanBuilder::scan("test", source, None)
411+
.unwrap()
412+
.build()
413+
.unwrap();
414+
415+
// Table schema has id, user (nullable), and computed = user
416+
let table_schema = Schema::new(vec![
417+
ArrowField::new("id", ArrowDataType::Int32, false),
418+
ArrowField::new("user", ArrowDataType::Utf8, true),
419+
ArrowField::new("computed", ArrowDataType::Utf8, true),
420+
]);
421+
422+
// "computed" references "user", which is NOT in the input plan
423+
let generated_cols = vec![GeneratedColumn::new(
424+
"computed",
425+
"\"user\"",
426+
&KernelDataType::STRING,
427+
)];
428+
429+
// Previously this would fail with "column user not found"
430+
let result = with_generated_columns(&session, plan, &table_schema, &generated_cols);
431+
assert!(
432+
result.is_ok(),
433+
"should not fail when generated column references a missing column: {:?}",
434+
result.err()
435+
);
436+
437+
let result_plan = result.unwrap();
438+
assert_eq!(result_plan.schema().fields().len(), 2); // id + computed
439+
assert!(
440+
result_plan
441+
.schema()
442+
.field_with_unqualified_name("computed")
443+
.is_ok()
444+
);
445+
}
361446
}

0 commit comments

Comments
 (0)