Commit bdd26fe

[Kernel] Update column mapping and schema evolution code to support usage with replace table (delta-io#4520)
#### Which Delta project/connector is this regarding?

- [ ] Spark
- [ ] Standalone
- [ ] Flink
- [X] Kernel
- [ ] Other (fill in here)

## Description

Update SchemaUtils and ColumnMapping, with unit tests, to support REPLACE TABLE with column mapping and fieldId re-use in PR #2. Specifically, this PR makes the following changes (not necessarily related, but combined here):

1) When a connector pre-populates its own column mapping info in the schema, we require that it is complete (i.e. both fieldId AND physicalName must be present).
2) We add an argument `allowNewRequiredFields` to our schema validation checks. This is useful when we can be sure the table state has been completely cleared, so new non-null fields are valid (as with REPLACE).
3) We no longer allow adding a new column with a fieldId less than or equal to the current maxColId. For now we do this proactively for safety; for something like RESTORE we will likely need a config to bypass this check in the future.

## How was this patch tested?

Updated unit tests. In addition, all the changes in this PR are used by delta-io#4520, which adds many more E2E tests covering multiple schema scenarios.

## Does this PR introduce _any_ user-facing changes?

No.
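To make change 2) concrete, here is a minimal sketch, not code from this commit, of how a regular metadata update differs from a hypothetical REPLACE TABLE path. The four-argument `validateUpdatedSchema` signature comes from the SchemaUtils.java diff below; the wrapper methods, variable names, and import paths for the internal `Metadata` class are assumptions.

```java
// Sketch only: the four-argument validateUpdatedSchema signature comes from the
// SchemaUtils.java diff in this commit; the wrapper methods below are hypothetical,
// and the import path for Metadata is assumed.
import java.util.Set;

import io.delta.kernel.internal.actions.Metadata;
import io.delta.kernel.internal.util.SchemaUtils;

final class ReplaceTableValidationSketch {

  // Regular metadata update (mirrors the updated TransactionBuilderImpl call site):
  // existing table data is kept, so newly added fields must stay nullable.
  static void validateMetadataUpdate(
      Metadata oldMetadata, Metadata newMetadata, Set<String> clusteringColPhysicalNames) {
    SchemaUtils.validateUpdatedSchema(
        oldMetadata, newMetadata, clusteringColPhysicalNames, false /* allowNewRequiredFields */);
  }

  // Hypothetical REPLACE TABLE path: the table state is cleared first, so new
  // non-nullable (required) fields can safely be allowed.
  static void validateReplaceTable(
      Metadata replacedMetadata, Metadata newMetadata, Set<String> clusteringColPhysicalNames) {
    SchemaUtils.validateUpdatedSchema(
        replacedMetadata, newMetadata, clusteringColPhysicalNames, true /* allowNewRequiredFields */);
  }
}
```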
1 parent 842a4bf · 6 files changed (+196, -53 lines)


kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionBuilderImpl.java

Lines changed: 3 additions & 4 deletions
```diff
@@ -608,11 +608,10 @@ private void validateMetadataChange(
             .collect(toSet());

       SchemaUtils.validateUpdatedSchema(
-          oldMetadata.getSchema(),
-          newMetadata.getSchema(),
-          oldMetadata.getPartitionColNames(),
+          oldMetadata,
+          newMetadata,
           clusteringColumnPhysicalNames,
-          newMetadata);
+          false /* allowNewRequiredFields*/);
     }
   }

```

kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/ColumnMapping.java

Lines changed: 9 additions & 0 deletions
```diff
@@ -468,6 +468,15 @@ private static StructField assignColumnIdAndPhysicalNameToField(
       AtomicInteger maxColumnId,
       boolean isNewTable,
       boolean useColumnIdForPhysicalName) {
+    if (hasColumnId(field) ^ hasPhysicalName(field)) {
+      // If a connector is providing column mapping metadata in the given schema we require it to be
+      // complete
+      throw new IllegalArgumentException(
+          String.format(
+              "Both columnId and physicalName must be present if one is present. "
+                  + "Found this field with incomplete column mapping metadata: %s",
+              field));
+    }
     if (!hasColumnId(field)) {
       field =
           field.withNewMetadata(
```
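For change 1), the Scala test added in ColumnMappingSuite below exercises the failing case. As a rough Java counterpart (a sketch, not code from this commit), a connector that pre-populates column mapping metadata must supply both keys on every field. Import locations and the visibility of the `ColumnMapping` key constants outside their own package are assumptions; the field construction mirrors the test below.

```java
// Sketch of connector-provided column mapping metadata. Field construction mirrors
// the ColumnMappingSuite test added in this commit; import locations and constant
// visibility are assumptions.
import io.delta.kernel.internal.util.ColumnMapping;
import io.delta.kernel.types.FieldMetadata;
import io.delta.kernel.types.StringType;
import io.delta.kernel.types.StructField;
import io.delta.kernel.types.StructType;

final class ConnectorProvidedColumnMappingSketch {

  // Rejected after this commit: only the column id is present, so
  // assignColumnIdAndPhysicalNameToField throws IllegalArgumentException.
  static StructType incompleteSchema() {
    return new StructType()
        .add(
            new StructField(
                "col1",
                StringType.STRING,
                true /* nullable */,
                FieldMetadata.builder()
                    .putLong(ColumnMapping.COLUMN_MAPPING_ID_KEY, 1L)
                    .build()));
  }

  // Accepted: both the column id and the physical name are present, so the
  // connector-provided metadata is used as-is.
  static StructType completeSchema() {
    return new StructType()
        .add(
            new StructField(
                "col1",
                StringType.STRING,
                true /* nullable */,
                FieldMetadata.builder()
                    .putLong(ColumnMapping.COLUMN_MAPPING_ID_KEY, 1L)
                    .putString(ColumnMapping.COLUMN_MAPPING_PHYSICAL_NAME_KEY, "col-1a2b3c")
                    .build()));
  }
}
```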

kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/SchemaUtils.java

Lines changed: 43 additions & 16 deletions
```diff
@@ -101,18 +101,27 @@ public static void validateSchema(StructType schema, boolean isColumnMappingEnab
    * </ul>
    */
   public static void validateUpdatedSchema(
-      StructType currentSchema,
-      StructType newSchema,
-      Set<String> currentPartitionColumns,
+      Metadata currentMetadata,
+      Metadata newMetadata,
       Set<String> clusteringColumnPhysicalNames,
-      Metadata newMetadata) {
+      boolean allowNewRequiredFields) {
     checkArgument(
         isColumnMappingModeEnabled(
             ColumnMapping.getColumnMappingMode(newMetadata.getConfiguration())),
         "Cannot validate updated schema when column mapping is disabled");
-    validateSchema(newSchema, true /*columnMappingEnabled*/);
-    validatePartitionColumns(newSchema, new ArrayList<>(currentPartitionColumns));
-    validateSchemaEvolution(currentSchema, newSchema, newMetadata, clusteringColumnPhysicalNames);
+    validateSchema(newMetadata.getSchema(), true /*columnMappingEnabled*/);
+    validatePartitionColumns(
+        newMetadata.getSchema(), new ArrayList<>(newMetadata.getPartitionColNames()));
+    int currentMaxFieldId =
+        Integer.parseInt(
+            currentMetadata.getConfiguration().getOrDefault(COLUMN_MAPPING_MAX_COLUMN_ID_KEY, "0"));
+    validateSchemaEvolution(
+        currentMetadata.getSchema(),
+        newMetadata.getSchema(),
+        ColumnMapping.getColumnMappingMode(newMetadata.getConfiguration()),
+        clusteringColumnPhysicalNames,
+        currentMaxFieldId,
+        allowNewRequiredFields);
   }

   /**
@@ -426,14 +435,19 @@ private static void validatePhysicalNameConsistency(
   private static void validateSchemaEvolution(
       StructType currentSchema,
       StructType newSchema,
-      Metadata metadata,
-      Set<String> clusteringColumnPhysicalNames) {
-    ColumnMappingMode columnMappingMode =
-        ColumnMapping.getColumnMappingMode(metadata.getConfiguration());
+      ColumnMappingMode columnMappingMode,
+      Set<String> clusteringColumnPhysicalNames,
+      int currentMaxFieldId,
+      boolean allowNewRequiredFields) {
     switch (columnMappingMode) {
       case ID:
       case NAME:
-        validateSchemaEvolutionById(currentSchema, newSchema, clusteringColumnPhysicalNames);
+        validateSchemaEvolutionById(
+            currentSchema,
+            newSchema,
+            clusteringColumnPhysicalNames,
+            currentMaxFieldId,
+            allowNewRequiredFields);
         return;
       case NONE:
         throw new UnsupportedOperationException(
@@ -449,14 +463,18 @@ private static void validateSchemaEvolution(
    * fields
    */
   private static void validateSchemaEvolutionById(
-      StructType currentSchema, StructType newSchema, Set<String> clusteringColumnPhysicalNames) {
+      StructType currentSchema,
+      StructType newSchema,
+      Set<String> clusteringColumnPhysicalNames,
+      int oldMaxFieldId,
+      boolean allowNewRequiredFields) {
     Map<Integer, StructField> currentFieldsById = fieldsById(currentSchema);
     Map<Integer, StructField> updatedFieldsById = fieldsById(newSchema);
     SchemaChanges schemaChanges = computeSchemaChangesById(currentFieldsById, updatedFieldsById);
     validatePhysicalNameConsistency(schemaChanges.updatedFields());
     // Validates that the updated schema does not contain breaking changes in terms of types and
     // nullability
-    validateUpdatedSchemaCompatibility(schemaChanges);
+    validateUpdatedSchemaCompatibility(schemaChanges, oldMaxFieldId, allowNewRequiredFields);
     validateClusteringColumnsNotDropped(
         schemaChanges.removedFields(), clusteringColumnPhysicalNames);
     // ToDo Potentially validate IcebergCompatV2 nested IDs
@@ -480,12 +498,21 @@ private static void validateClusteringColumnsNotDropped(
    *
    * <p>ToDo: Prevent moving fields outside of their containing struct
    */
-  private static void validateUpdatedSchemaCompatibility(SchemaChanges schemaChanges) {
+  private static void validateUpdatedSchemaCompatibility(
+      SchemaChanges schemaChanges, int oldMaxFieldId, boolean allowNewRequiredFields) {
     for (StructField addedField : schemaChanges.addedFields()) {
-      if (!addedField.isNullable()) {
+      if (!allowNewRequiredFields && !addedField.isNullable()) {
         throw new KernelException(
             String.format("Cannot add non-nullable field %s", addedField.getName()));
       }
+      int colId = getColumnId(addedField);
+      if (colId <= oldMaxFieldId) {
+        throw new IllegalArgumentException(
+            String.format(
+                "Cannot add a new column with a fieldId <= maxFieldId. Found field: %s with"
+                    + "fieldId=%s. Current maxFieldId in the table is: %s",
+                addedField, colId, oldMaxFieldId));
+      }
     }

     for (Tuple2<StructField, StructField> updatedFields : schemaChanges.updatedFields()) {
```
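Change 3) shows up in the last hunk above: an added field whose column mapping id is less than or equal to the table's recorded maxColumnId is rejected, since that id may have belonged to a previous (possibly dropped) column. Below is a rough sketch of the rejected scenario, not code from this commit; the class, the literal key `"delta.columnMapping.id"` (standing in for `ColumnMapping.COLUMN_MAPPING_ID_KEY`), and the concrete numbers are assumptions.

```java
// Sketch of the fieldId re-use scenario rejected by the new check in
// validateUpdatedSchemaCompatibility. The literal metadata key and the numbers
// below are illustrative assumptions.
import io.delta.kernel.types.FieldMetadata;
import io.delta.kernel.types.LongType;
import io.delta.kernel.types.StructField;

final class FieldIdReuseSketch {
  public static void main(String[] args) {
    // Value parsed from the old table's "delta.columnMapping.maxColumnId" property.
    int currentMaxFieldId = 7;

    // A newly added column that re-uses id 3. Because 3 <= 7, the updated
    // validation now throws IllegalArgumentException for this field.
    long reusedId = 3L;
    StructField added =
        new StructField(
            "new_col",
            LongType.LONG,
            true /* nullable */,
            FieldMetadata.builder().putLong("delta.columnMapping.id", reusedId).build());

    System.out.printf("field %s rejected: %b%n", added.getName(), reusedId <= currentMaxFieldId);
  }
}
```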

kernel/kernel-api/src/test/scala/io/delta/kernel/internal/util/ColumnMappingSuite.scala

Lines changed: 29 additions & 0 deletions
```diff
@@ -626,6 +626,35 @@ class ColumnMappingSuite extends AnyFunSuite with ColumnMappingSuiteBase {
     }
   }

+  test("both id and physical name must be provided if one is provided") {
+    val schemaWithoutPhysicalName = new StructType()
+      .add(
+        new StructField(
+          "col1",
+          StringType.STRING,
+          true,
+          FieldMetadata.builder()
+            .putLong(ColumnMapping.COLUMN_MAPPING_ID_KEY, 0)
+            .build()))
+    val schemaWithoutId = new StructType()
+      .add(
+        new StructField(
+          "col1",
+          StringType.STRING,
+          true,
+          FieldMetadata.builder()
+            .putString(ColumnMapping.COLUMN_MAPPING_PHYSICAL_NAME_KEY, "physical-name-col1")
+            .build()))
+
+    Seq(schemaWithoutId, schemaWithoutPhysicalName).foreach { schema =>
+      val e = intercept[IllegalArgumentException] {
+        updateColumnMappingMetadataIfNeeded(testMetadata(schema).withColumnMappingEnabled(), true)
+      }
+      assert(e.getMessage.contains(
+        "Both columnId and physicalName must be present if one is present"))
+    }
+  }
+
   /**
    * A struct type with all necessary CM info won't cause metadata change by
    * [[updateColumnMappingMetadataIfNeeded]]
```
