Commit 9da11c5

fix: default values for native_datafusion scan (apache#1756)
1 parent e25be9e commit 9da11c5

File tree: 9 files changed (+184 -29 lines)


native/core/src/execution/planner.rs

Lines changed: 41 additions & 1 deletion
@@ -1115,6 +1115,42 @@ impl PhysicalPlanner {
             .map(|expr| self.create_expr(expr, Arc::clone(&required_schema)))
             .collect();
 
+        let default_values: Option<HashMap<usize, ScalarValue>> = if !scan
+            .default_values
+            .is_empty()
+        {
+            // We have default values. Extract the two lists (same length) of values and
+            // indexes in the schema, and then create a HashMap to use in the SchemaMapper.
+            let default_values: Result<Vec<ScalarValue>, DataFusionError> = scan
+                .default_values
+                .iter()
+                .map(|expr| {
+                    let literal = self.create_expr(expr, Arc::clone(&required_schema))?;
+                    let df_literal = literal
+                        .as_any()
+                        .downcast_ref::<DataFusionLiteral>()
+                        .ok_or_else(|| {
+                            GeneralError("Expected literal of default value.".to_string())
+                        })?;
+                    Ok(df_literal.value().clone())
+                })
+                .collect();
+            let default_values = default_values?;
+            let default_values_indexes: Vec<usize> = scan
+                .default_values_indexes
+                .iter()
+                .map(|offset| *offset as usize)
+                .collect();
+            Some(
+                default_values_indexes
+                    .into_iter()
+                    .zip(default_values)
+                    .collect(),
+            )
+        } else {
+            None
+        };
+
         // Get one file from the list of files
         let one_file = scan
             .file_partitions
@@ -1152,6 +1188,7 @@ impl PhysicalPlanner {
             file_groups,
             Some(projection_vector),
             Some(data_filters?),
+            default_values,
             scan.session_timezone.as_str(),
         )?;
         Ok((
@@ -3164,7 +3201,10 @@ mod tests {
 
         let source = Arc::new(
             ParquetSource::default().with_schema_adapter_factory(Arc::new(
-                SparkSchemaAdapterFactory::new(SparkParquetOptions::new(EvalMode::Ansi, "", false)),
+                SparkSchemaAdapterFactory::new(
+                    SparkParquetOptions::new(EvalMode::Ansi, "", false),
+                    None,
+                ),
             )),
         );

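For readers unfamiliar with the planner internals: the hunk above decodes each serialized default-value expression into a DataFusion literal and pairs it with its index in the required schema. A minimal standalone sketch of that decode-and-zip step follows; the string default and the index 1 are illustrative only and not taken from the commit.

use std::collections::HashMap;
use std::sync::Arc;

use datafusion::physical_expr::expressions::Literal;
use datafusion::physical_expr::PhysicalExpr;
use datafusion::scalar::ScalarValue;

fn main() {
    // A default value as the planner would receive it after `create_expr`:
    // a physical expression that is expected to be a literal.
    let expr: Arc<dyn PhysicalExpr> =
        Arc::new(Literal::new(ScalarValue::Utf8(Some("hello".to_string()))));

    // Same downcast the planner performs; anything that is not a literal
    // is reported as an error in the real code.
    let scalar = expr
        .as_any()
        .downcast_ref::<Literal>()
        .map(|lit| lit.value().clone())
        .expect("default value should be a literal");

    // Zip the schema indexes with the decoded values into the map handed
    // to the schema adapter.
    let default_values: HashMap<usize, ScalarValue> =
        vec![1usize].into_iter().zip(vec![scalar]).collect();
    assert!(default_values.contains_key(&1));
}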
native/core/src/parquet/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -715,6 +715,7 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat
         file_groups,
         None,
         data_filters,
+        None,
         session_timezone.as_str(),
     )?;

native/core/src/parquet/parquet_exec.rs

Lines changed: 7 additions & 3 deletions
@@ -28,8 +28,10 @@ use datafusion::datasource::source::DataSourceExec;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::physical_expr::expressions::BinaryExpr;
 use datafusion::physical_expr::PhysicalExpr;
+use datafusion::scalar::ScalarValue;
 use datafusion_comet_spark_expr::EvalMode;
 use itertools::Itertools;
+use std::collections::HashMap;
 use std::sync::Arc;
 
 /// Initializes a DataSourceExec plan with a ParquetSource. This may be used by either the
@@ -61,12 +63,14 @@ pub(crate) fn init_datasource_exec(
     file_groups: Vec<Vec<PartitionedFile>>,
     projection_vector: Option<Vec<usize>>,
     data_filters: Option<Vec<Arc<dyn PhysicalExpr>>>,
+    default_values: Option<HashMap<usize, ScalarValue>>,
     session_timezone: &str,
 ) -> Result<Arc<DataSourceExec>, ExecutionError> {
     let (table_parquet_options, spark_parquet_options) = get_options(session_timezone);
-    let mut parquet_source = ParquetSource::new(table_parquet_options).with_schema_adapter_factory(
-        Arc::new(SparkSchemaAdapterFactory::new(spark_parquet_options)),
-    );
+    let mut parquet_source =
+        ParquetSource::new(table_parquet_options).with_schema_adapter_factory(Arc::new(
+            SparkSchemaAdapterFactory::new(spark_parquet_options, default_values),
+        ));
     // Create a conjunctive form of the vector because ParquetExecBuilder takes
     // a single expression
     if let Some(data_filters) = data_filters {

native/core/src/parquet/parquet_support.rs

Lines changed: 0 additions & 5 deletions
@@ -62,9 +62,6 @@ pub struct SparkParquetOptions {
     pub allow_incompat: bool,
     /// Support casting unsigned ints to signed ints (used by Parquet SchemaAdapter)
     pub allow_cast_unsigned_ints: bool,
-    /// We also use the cast logic for adapting Parquet schemas, so this flag is used
-    /// for that use case
-    pub is_adapting_schema: bool,
     /// Whether to always represent decimals using 128 bits. If false, the native reader may represent decimals using 32 or 64 bits, depending on the precision.
     pub use_decimal_128: bool,
     /// Whether to read dates/timestamps that were written in the legacy hybrid Julian + Gregorian calendar as it is. If false, throw exceptions instead. If the spark type is TimestampNTZ, this should be true.
@@ -80,7 +77,6 @@ impl SparkParquetOptions {
             timezone: timezone.to_string(),
             allow_incompat,
             allow_cast_unsigned_ints: false,
-            is_adapting_schema: false,
             use_decimal_128: false,
             use_legacy_date_timestamp_or_ntz: false,
             case_sensitive: false,
@@ -93,7 +89,6 @@ impl SparkParquetOptions {
             timezone: "".to_string(),
             allow_incompat,
             allow_cast_unsigned_ints: false,
-            is_adapting_schema: false,
             use_decimal_128: false,
             use_legacy_date_timestamp_or_ntz: false,
             case_sensitive: false,

native/core/src/parquet/schema_adapter.rs

Lines changed: 46 additions & 16 deletions
@@ -18,11 +18,13 @@
 //! Custom schema adapter that uses Spark-compatible conversions
 
 use crate::parquet::parquet_support::{spark_parquet_convert, SparkParquetOptions};
-use arrow::array::{new_null_array, RecordBatch, RecordBatchOptions};
+use arrow::array::{RecordBatch, RecordBatchOptions};
 use arrow::datatypes::{Schema, SchemaRef};
 use datafusion::common::ColumnStatistics;
 use datafusion::datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper};
 use datafusion::physical_plan::ColumnarValue;
+use datafusion::scalar::ScalarValue;
+use std::collections::HashMap;
 use std::sync::Arc;
 
 /// An implementation of DataFusion's `SchemaAdapterFactory` that uses a Spark-compatible
@@ -31,12 +33,17 @@ use std::sync::Arc;
 pub struct SparkSchemaAdapterFactory {
     /// Spark cast options
     parquet_options: SparkParquetOptions,
+    default_values: Option<HashMap<usize, ScalarValue>>,
 }
 
 impl SparkSchemaAdapterFactory {
-    pub fn new(options: SparkParquetOptions) -> Self {
+    pub fn new(
+        options: SparkParquetOptions,
+        default_values: Option<HashMap<usize, ScalarValue>>,
+    ) -> Self {
         Self {
             parquet_options: options,
+            default_values,
         }
     }
 }
@@ -56,6 +63,7 @@ impl SchemaAdapterFactory for SparkSchemaAdapterFactory {
         Box::new(SparkSchemaAdapter {
             required_schema,
             parquet_options: self.parquet_options.clone(),
+            default_values: self.default_values.clone(),
         })
     }
 }
@@ -69,6 +77,7 @@ pub struct SparkSchemaAdapter {
     required_schema: SchemaRef,
     /// Spark cast options
     parquet_options: SparkParquetOptions,
+    default_values: Option<HashMap<usize, ScalarValue>>,
 }
 
 impl SchemaAdapter for SparkSchemaAdapter {
@@ -134,6 +143,7 @@ impl SchemaAdapter for SparkSchemaAdapter {
                 required_schema: Arc::<Schema>::clone(&self.required_schema),
                 field_mappings,
                 parquet_options: self.parquet_options.clone(),
+                default_values: self.default_values.clone(),
             }),
             projection,
         ))
@@ -158,16 +168,7 @@
 /// out of the execution of this query. Thus `map_batch` uses
 /// `projected_table_schema` as it can only operate on the projected fields.
 ///
-/// [`map_partial_batch`] is used to create a RecordBatch with a schema that
-/// can be used for Parquet predicate pushdown, meaning that it may contain
-/// fields which are not in the projected schema (as the fields that parquet
-/// pushdown filters operate can be completely distinct from the fields that are
-/// projected (output) out of the ParquetExec). `map_partial_batch` thus uses
-/// `table_schema` to create the resulting RecordBatch (as it could be operating
-/// on any fields in the schema).
-///
 /// [`map_batch`]: Self::map_batch
-/// [`map_partial_batch`]: Self::map_partial_batch
 #[derive(Debug)]
 pub struct SchemaMapping {
     /// The schema of the table. This is the expected schema after conversion
@@ -181,6 +182,7 @@ pub struct SchemaMapping {
     field_mappings: Vec<Option<usize>>,
     /// Spark cast options
     parquet_options: SparkParquetOptions,
+    default_values: Option<HashMap<usize, ScalarValue>>,
 }
 
 impl SchemaMapper for SchemaMapping {
@@ -197,15 +199,43 @@ impl SchemaMapper for SchemaMapping {
             // go through each field in the projected schema
             .fields()
             .iter()
+            .enumerate()
             // and zip it with the index that maps fields from the projected table schema to the
             // projected file schema in `batch`
             .zip(&self.field_mappings)
             // and for each one...
-            .map(|(field, file_idx)| {
+            .map(|((field_idx, field), file_idx)| {
                 file_idx.map_or_else(
-                    // If this field only exists in the table, and not in the file, then we know
-                    // that it's null, so just return that.
-                    || Ok(new_null_array(field.data_type(), batch_rows)),
+                    // If this field only exists in the table, and not in the file, then we need to
+                    // populate a default value for it.
+                    || {
+                        if self.default_values.is_some() {
+                            // We have a map of default values, see if this field is in there.
+                            if let Some(value) =
+                                self.default_values.as_ref().unwrap().get(&field_idx)
+                            // Default value exists, construct a column from it.
+                            {
+                                let cv = if field.data_type() == &value.data_type() {
+                                    ColumnarValue::Scalar(value.clone())
+                                } else {
+                                    // Data types don't match. This can happen when default values
+                                    // are stored by Spark in a format different than the column's
+                                    // type (e.g., INT32 when the column is DATE32)
+                                    spark_parquet_convert(
+                                        ColumnarValue::Scalar(value.clone()),
+                                        field.data_type(),
+                                        &self.parquet_options,
+                                    )?
+                                };
+                                return cv.into_array(batch_rows);
+                            }
+                        }
+                        // Construct an entire column of nulls. We use the Scalar representation
+                        // for better performance.
+                        let cv =
+                            ColumnarValue::Scalar(ScalarValue::try_new_null(field.data_type())?);
+                        cv.into_array(batch_rows)
+                    },
                     // However, if it does exist in both, then try to cast it to the correct output
                     // type
                     |batch_idx| {
@@ -316,7 +346,7 @@ mod test {
 
         let parquet_source = Arc::new(
             ParquetSource::new(TableParquetOptions::new()).with_schema_adapter_factory(Arc::new(
-                SparkSchemaAdapterFactory::new(spark_parquet_options),
+                SparkSchemaAdapterFactory::new(spark_parquet_options, None),
            )),
         );

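The core of this change sits in SchemaMapping::map_batch above: when a field exists in the required schema but not in the Parquet file, the column is now materialized from a single ScalarValue default instead of always being filled with nulls, with spark_parquet_convert handling defaults stored in a different physical type. Below is a standalone sketch of just the materialization step, using the same public arrow and datafusion APIs as the diff; the helper name, the string default, and the row count are illustrative only, and the type-conversion branch is intentionally omitted.

use arrow::array::ArrayRef;
use arrow::datatypes::DataType;
use datafusion::common::Result;
use datafusion::physical_plan::ColumnarValue;
use datafusion::scalar::ScalarValue;

// Build a column of `batch_rows` values for a field that is absent from the
// Parquet file: use the default value when one exists, otherwise nulls.
// (The real adapter also converts the scalar when its type differs from the
// column's Arrow type; that step is left out of this sketch.)
fn missing_column(
    default: Option<&ScalarValue>,
    data_type: &DataType,
    batch_rows: usize,
) -> Result<ArrayRef> {
    let scalar = match default {
        Some(value) => value.clone(),
        None => ScalarValue::try_new_null(data_type)?,
    };
    // A single scalar is expanded into an array of `batch_rows` rows.
    ColumnarValue::Scalar(scalar).into_array(batch_rows)
}

fn main() -> Result<()> {
    // Illustrative values: a string column with default 'hello', 3 rows.
    let col = missing_column(
        Some(&ScalarValue::Utf8(Some("hello".to_string()))),
        &DataType::Utf8,
        3,
    )?;
    assert_eq!(col.len(), 3);
    Ok(())
}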
native/proto/src/proto/operator.proto

Lines changed: 2 additions & 0 deletions
@@ -91,6 +91,8 @@ message NativeScan {
   repeated SparkFilePartition file_partitions = 7;
   repeated int64 projection_vector = 8;
   string session_timezone = 9;
+  repeated spark.spark_expression.Expr default_values = 10;
+  repeated int64 default_values_indexes = 11;
 }
 
 message Projection {

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 23 additions & 4 deletions
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, Normalize
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils
+import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns.getExistenceDefaultValues
 import org.apache.spark.sql.comet._
 import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
 import org.apache.spark.sql.execution
@@ -2307,6 +2308,24 @@ object QueryPlanSerde extends Logging with CometExprShim {
       nativeScanBuilder.addAllDataFilters(dataFilters.asJava)
     }
 
+    val possibleDefaultValues = getExistenceDefaultValues(scan.requiredSchema)
+    if (possibleDefaultValues.exists(_ != null)) {
+      // Our schema has default values. Serialize two lists, one with the default values
+      // and another with the indexes in the schema so the native side can map missing
+      // columns to these default values.
+      val (defaultValues, indexes) = possibleDefaultValues.zipWithIndex
+        .filter { case (expr, _) => expr != null }
+        .map { case (expr, index) =>
+          // ResolveDefaultColumnsUtil.getExistenceDefaultValues has evaluated these
+          // expressions and they should now just be literals.
+          (Literal(expr), index.toLong.asInstanceOf[java.lang.Long])
+        }
+        .unzip
+      nativeScanBuilder.addAllDefaultValues(
+        defaultValues.flatMap(exprToProto(_, scan.output)).toIterable.asJava)
+      nativeScanBuilder.addAllDefaultValuesIndexes(indexes.toIterable.asJava)
+    }
+
     // TODO: modify CometNativeScan to generate the file partitions without instantiating RDD.
     scan.inputRDD match {
       case rdd: DataSourceRDD =>
@@ -2331,18 +2350,18 @@
     val requiredSchema = schema2Proto(scan.requiredSchema.fields)
     val dataSchema = schema2Proto(scan.relation.dataSchema.fields)
 
-    val data_schema_idxs = scan.requiredSchema.fields.map(field => {
+    val dataSchemaIndexes = scan.requiredSchema.fields.map(field => {
       scan.relation.dataSchema.fieldIndex(field.name)
     })
-    val partition_schema_idxs = Array
+    val partitionSchemaIndexes = Array
      .range(
        scan.relation.dataSchema.fields.length,
        scan.relation.dataSchema.length + scan.relation.partitionSchema.fields.length)
 
-    val projection_vector = (data_schema_idxs ++ partition_schema_idxs).map(idx =>
+    val projectionVector = (dataSchemaIndexes ++ partitionSchemaIndexes).map(idx =>
      idx.toLong.asInstanceOf[java.lang.Long])
 
-    nativeScanBuilder.addAllProjectionVector(projection_vector.toIterable.asJava)
+    nativeScanBuilder.addAllProjectionVector(projectionVector.toIterable.asJava)
 
     // In `CometScanRule`, we ensure partitionSchema is supported.
     assert(partitionSchema.length == scan.relation.partitionSchema.fields.length)

spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala

Lines changed: 9 additions & 0 deletions
@@ -57,6 +57,15 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("parquet default values") {
+    withTable("t1") {
+      sql("create table t1(col1 boolean) using parquet")
+      sql("insert into t1 values(true)")
+      sql("alter table t1 add column col2 string default 'hello'")
+      checkSparkAnswerAndOperator("select * from t1")
+    }
+  }
+
   test("coalesce should return correct datatype") {
     Seq(true, false).foreach { dictionaryEnabled =>
       withTempDir { dir =>
