
Commit fdcd140

szehon-ho authored and cloud-fan committed
[SPARK-53629][SQL] Implement type widening for MERGE INTO WITH SCHEMA EVOLUTION
### What changes were proposed in this pull request?

MERGE INTO WITH SCHEMA EVOLUTION already supports adding new columns, as well as some type widening (when structs are missing fields). It should also support type widening for primitive data types. Spark will call the V2DataSource TableCatalog to alter the schema, so the V2DataSource can decide whether the change is acceptable or not. This change also fixes InMemoryDataSource to support this case.

### Why are the changes needed?

Support more use cases for MERGE INTO WITH SCHEMA EVOLUTION.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added a unit test.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52377 from szehon-ho/merge_type_evolution.

Authored-by: Szehon Ho <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
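To illustrate the use case this enables, here is a minimal sketch (not part of this patch; the catalog, table, view, and column names are made up). The target's `amount` column is INT while the source supplies BIGINT values, so a merge with schema evolution should now ask the target's V2 catalog to widen the column instead of failing:

```scala
// Hypothetical names: cat.db.target, source_view, id, amount are illustrative only.
spark.sql("CREATE TABLE cat.db.target (id INT, amount INT)")
spark.sql("CREATE TEMPORARY VIEW source_view AS SELECT 1 AS id, CAST(100 AS BIGINT) AS amount")

// With schema evolution, the differing atomic types no longer abort the merge:
// Spark first requests that the catalog widen `amount` from INT to BIGINT.
spark.sql("""
  MERGE WITH SCHEMA EVOLUTION INTO cat.db.target t
  USING source_view s
  ON t.id = s.id
  WHEN MATCHED THEN UPDATE SET *
  WHEN NOT MATCHED THEN INSERT *
""")
```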
1 parent cf30da2 commit fdcd140

File tree

3 files changed: +440 -9 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala

Lines changed: 5 additions & 2 deletions

@@ -41,7 +41,7 @@ import org.apache.spark.sql.errors.DataTypeErrors.toSQLType
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{ArrayType, BooleanType, DataType, IntegerType, MapType, MetadataBuilder, StringType, StructField, StructType}
+import org.apache.spark.sql.types.{ArrayType, AtomicType, BooleanType, DataType, IntegerType, MapType, MetadataBuilder, StringType, StructField, StructType}
 import org.apache.spark.util.ArrayImplicits._
 import org.apache.spark.util.Utils

@@ -967,12 +967,15 @@ object MergeIntoTable {
         schemaChanges(currentElementType, updateElementType,
           originalTarget, originalSource, fieldPath ++ Seq("value"))

+      case (currentType: AtomicType, newType: AtomicType) if currentType != newType =>
+        Array(TableChange.updateColumnType(fieldPath, newType))
+
       case (currentType, newType) if currentType == newType =>
         // No change needed
         Array.empty[TableChange]

       case _ =>
-        // For now do not support type widening
+        // Do not support change between atomic and complex types for now
         throw QueryExecutionErrors.failedToMergeIncompatibleSchemasError(
           originalTarget, originalSource, null)
     }
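To illustrate the new AtomicType branch above (a sketch, not part of the diff; the field name and types are assumed): when a target field `amount` is IntegerType and its source counterpart is LongType, `schemaChanges` now emits a single column-type change that Spark forwards to the V2 catalog's alterTable, so the data source can accept or reject the widening:

```scala
import org.apache.spark.sql.connector.catalog.TableChange
import org.apache.spark.sql.types.LongType

// Hypothetical field path; equivalent to what the new case produces when the
// target's `amount` column is IntegerType and the source's is LongType.
val change: TableChange = TableChange.updateColumnType(Array("amount"), LongType)
```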

sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala

Lines changed: 18 additions & 6 deletions

@@ -28,11 +28,12 @@ import scala.collection.mutable.ListBuffer
 import scala.jdk.CollectionConverters._

 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, JoinedRow, MetadataStructFieldWithLogicalName}
+import org.apache.spark.sql.catalyst.expressions.{Cast, EvalMode, GenericInternalRow, JoinedRow, Literal, MetadataStructFieldWithLogicalName}
 import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, CaseInsensitiveMap, CharVarcharUtils, DateTimeUtils, GenericArrayData, MapData, ResolveDefaultColumns}
 import org.apache.spark.sql.connector.catalog.constraints.Constraint
 import org.apache.spark.sql.connector.distributions.{Distribution, Distributions}
 import org.apache.spark.sql.connector.expressions._
+import org.apache.spark.sql.connector.expressions.{Literal => V2Literal}
 import org.apache.spark.sql.connector.metric.{CustomMetric, CustomSumMetric, CustomTaskMetric}
 import org.apache.spark.sql.connector.read._
 import org.apache.spark.sql.connector.read.colstats.{ColumnStatistics, Histogram, HistogramBin}

@@ -146,7 +147,7 @@ abstract class InMemoryBaseTable(
       case _: BucketTransform =>
       case _: SortedBucketTransform =>
       case _: ClusterByTransform =>
-      case NamedTransform("truncate", Seq(_: NamedReference, _: Literal[_])) =>
+      case NamedTransform("truncate", Seq(_: NamedReference, _: V2Literal[_])) =>
       case t if !allowUnsupportedTransforms =>
         throw new IllegalArgumentException(s"Transform $t is not a supported transform")
     }

@@ -244,7 +245,7 @@
        var dataTypeHashCode = 0
        valueTypePairs.foreach(dataTypeHashCode += _._2.hashCode())
        ((valueHashCode + 31 * dataTypeHashCode) & Integer.MAX_VALUE) % numBuckets
-      case NamedTransform("truncate", Seq(ref: NamedReference, length: Literal[_])) =>
+      case NamedTransform("truncate", Seq(ref: NamedReference, length: V2Literal[_])) =>
        extractor(ref.fieldNames, cleanedSchema, row) match {
          case (str: UTF8String, StringType) =>
            str.substring(0, length.value.asInstanceOf[Int])

@@ -910,7 +911,7 @@ private class BufferedRowsReader(
      arrayData: ArrayData,
      readType: DataType,
      writeType: DataType): ArrayData = {
-    val elements = arrayData.toArray[Any](readType)
+    val elements = arrayData.toArray[Any](writeType)
     val convertedElements = extractCollection(elements, readType, writeType)
     new GenericArrayData(convertedElements)
   }

@@ -921,8 +922,8 @@
      readValueType: DataType,
      writeKeyType: DataType,
      writeValueType: DataType): MapData = {
-    val keys = mapData.keyArray().toArray[Any](readKeyType)
-    val values = mapData.valueArray().toArray[Any](readValueType)
+    val keys = mapData.keyArray().toArray[Any](writeKeyType)
+    val values = mapData.valueArray().toArray[Any](writeValueType)

     val convertedKeys = extractCollection(keys, readKeyType, writeKeyType)
     val convertedValues = extractCollection(values, readValueType, writeValueType)

@@ -962,9 +963,20 @@
            wKeyType, wValueType)
        }
      }
+      case (readType: AtomicType, writeType: AtomicType) if readType != writeType =>
+        elements.map { elem =>
+          if (elem == null) {
+            null
+          } else {
+            castElement(elem, readType, writeType)
+          }
+        }
      case (_, _) => elements
    }
  }
+
+  private def castElement(elem: Any, toType: DataType, fromType: DataType): Any =
+    Cast(Literal(elem, fromType), toType, None, EvalMode.TRY).eval(null)
 }

 private class BufferedRowsWriterFactory(schema: StructType)
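As a rough illustration of what the new castElement helper does (a sketch with made-up values, not code from the commit): a value buffered under the old, narrower type is converted to the widened read type via a TRY-mode cast, which yields null rather than throwing if the conversion fails.

```scala
import org.apache.spark.sql.catalyst.expressions.{Cast, EvalMode, Literal}
import org.apache.spark.sql.types.{IntegerType, LongType}

// Mirrors castElement: cast a value stored as IntegerType to the widened LongType.
// EvalMode.TRY turns an unconvertible value into null instead of an error.
val widened: Any = Cast(Literal(42, IntegerType), LongType, None, EvalMode.TRY).eval(null)
// widened == 42L
```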
