Commit b5b5f6f

[Kernel][Clustering #4] add withClusteringColumn api (delta-io#4327)
#### Which Delta project/connector is this regarding?

- [ ] Spark
- [ ] Standalone
- [ ] Flink
- [x] Kernel
- [ ] Other (fill in here)

## Description

Split out of the main PR delta-io#4265 for faster review.

This PR implements the `withClusteringColumns` API in Kernel to support creating a table as a clustered table. It takes the steps below:

1. `withClusteringColumns` takes logical columns as input.
2. Validate the columns (each must exist in the schema, and clustering columns cannot be present together with partition columns).
3. Update the protocol to include the `clustering` writer feature.
4. Convert the logical column names to physical column names and use them to create a metadata domain.
5. Add the domain metadata to `domainMetadatasAdded`.
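A minimal sketch of the new API in use (illustrative only: the table path, engine-info string, and schema are placeholders, and `engine` is assumed to be an already-constructed `Engine`):

```java
import io.delta.kernel.*;
import io.delta.kernel.engine.Engine;
import io.delta.kernel.expressions.Column;
import io.delta.kernel.types.*;
import java.util.Arrays;

public class CreateClusteredTableExample {
  // Creates a new table clustered by a top-level and a nested column.
  public static Transaction createClusteredTable(Engine engine, String tablePath) {
    StructType schema =
        new StructType()
            .add("id", IntegerType.INTEGER)
            .add("user", new StructType().add("country", StringType.STRING));

    return Table.forPath(engine, tablePath)
        .createTransactionBuilder(engine, "exampleEngine", Operation.CREATE_TABLE)
        .withSchema(engine, schema)
        // Both top-level and nested columns may be clustered; clustering and
        // partition columns cannot be set on the same table.
        .withClusteringColumns(
            engine,
            Arrays.asList(new Column("id"), new Column(new String[] {"user", "country"})))
        .build(engine); // adds the `clustering` writer feature to the protocol
  }
}
```

Note that `withClusteringColumns` takes `Column` (not `String`) precisely so nested columns can be addressed by path.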
1 parent 591624a commit b5b5f6f

File tree: 9 files changed, +492 -22 lines changed

kernel/kernel-api/src/main/java/io/delta/kernel/TransactionBuilder.java

Lines changed: 14 additions & 1 deletion
@@ -21,6 +21,7 @@
 import io.delta.kernel.exceptions.DomainDoesNotExistException;
 import io.delta.kernel.exceptions.InvalidConfigurationValueException;
 import io.delta.kernel.exceptions.UnknownConfigurationException;
+import io.delta.kernel.expressions.Column;
 import io.delta.kernel.internal.TableConfig;
 import io.delta.kernel.types.StructType;
 import java.util.List;
@@ -65,11 +66,23 @@ public interface TransactionBuilder {
    *
    * @param engine {@link Engine} instance to use.
    * @param partitionColumns The partition columns of the table. These should be a subset of the
-   *     columns in the schema.
+   *     columns in the schema. Only top-level columns are allowed to be partitioned. Note:
+   *     Clustering columns and partition columns cannot coexist in a table.
    * @return updated {@link TransactionBuilder} instance.
    */
   TransactionBuilder withPartitionColumns(Engine engine, List<String> partitionColumns);

+  /**
+   * Set the list of clustering columns when create a new clustered table.
+   *
+   * @param engine {@link Engine} instance to use.
+   * @param clusteringColumns The clustering columns of the table. These should be a subset of the
+   *     columns in the schema. Both top-level and nested columns are allowed to be clustered. Note:
+   *     Clustering columns and partition columns cannot coexist in a table.
+   * @return updated {@link TransactionBuilder} instance.
+   */
+  TransactionBuilder withClusteringColumns(Engine engine, List<Column> clusteringColumns);
+
   /**
    * Set the transaction identifier for idempotent writes. Incremental processing systems (e.g.,
    * streaming systems) that track progress using their own application-specific versions need to

kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionBuilderImpl.java

Lines changed: 50 additions & 4 deletions
@@ -24,14 +24,17 @@
 import static io.delta.kernel.internal.util.SchemaUtils.casePreservingPartitionColNames;
 import static io.delta.kernel.internal.util.VectorUtils.buildArrayValue;
 import static io.delta.kernel.internal.util.VectorUtils.stringStringMapValue;
+import static java.lang.String.format;
 import static java.util.Objects.requireNonNull;
 import static java.util.stream.Collectors.toSet;

 import io.delta.kernel.*;
 import io.delta.kernel.engine.Engine;
 import io.delta.kernel.exceptions.KernelException;
 import io.delta.kernel.exceptions.TableNotFoundException;
+import io.delta.kernel.expressions.Column;
 import io.delta.kernel.internal.actions.*;
+import io.delta.kernel.internal.clustering.ClusteringUtils;
 import io.delta.kernel.internal.fs.Path;
 import io.delta.kernel.internal.icebergcompat.IcebergCompatV2MetadataValidatorAndUpdater;
 import io.delta.kernel.internal.icebergcompat.IcebergWriterCompatV1MetadataValidatorAndUpdater;
@@ -61,6 +64,7 @@ public class TransactionBuilderImpl implements TransactionBuilder {
   private final Operation operation;
   private Optional<StructType> schema = Optional.empty();
   private Optional<List<String>> partitionColumns = Optional.empty();
+  private Optional<List<Column>> clusteringColumns = Optional.empty();
   private Optional<SetTransaction> setTxnOpt = Optional.empty();
   private Optional<Map<String, String>> tableProperties = Optional.empty();
   private boolean needDomainMetadataSupport = false;
@@ -92,6 +96,14 @@ public TransactionBuilder withPartitionColumns(Engine engine, List<String> parti
     return this;
   }

+  @Override
+  public TransactionBuilder withClusteringColumns(Engine engine, List<Column> clusteringColumns) {
+    if (!clusteringColumns.isEmpty()) {
+      this.clusteringColumns = Optional.of(clusteringColumns);
+    }
+    return this;
+  }
+
   @Override
   public TransactionBuilder withTransactionId(
       Engine engine, String applicationId, long transactionVersion) {
@@ -174,6 +186,9 @@ public Transaction build(Engine engine) {
     if (needDomainMetadataSupport) {
       manuallyEnabledFeatures.add(TableFeatures.DOMAIN_METADATA_W_FEATURE);
     }
+    if (clusteringColumns.isPresent()) {
+      manuallyEnabledFeatures.add(TableFeatures.CLUSTERING_W_FEATURE);
+    }

     Tuple2<Set<TableFeature>, Optional<Metadata>> newFeaturesAndMetadata =
         TableFeatures.extractFeaturePropertyOverrides(newMetadata.orElse(snapshotMetadata));
@@ -234,8 +249,15 @@ public Transaction build(Engine engine) {

     /* ----- 5: Validate the metadata change ----- */
     // Now that all the config and schema changes have been made validate the old vs new metadata
-    newMetadata.ifPresent(
-        metadata -> validateMetadataChange(snapshotMetadata, metadata, isNewTable));
+    if (newMetadata.isPresent()) {
+      validateMetadataChange(snapshot, snapshotMetadata, newMetadata.get(), isNewTable);
+    }
+
+    /* ----- 6: Additional validation and adjustment ----- */
+    List<Column> casePreservingClusteringColumns =
+        SchemaUtils.casePreservingEligibleClusterColumns(
+            newMetadata.orElse(snapshotMetadata).getSchema(),
+            clusteringColumns.orElse(Collections.emptyList()));

     return new TransactionImpl(
         isNewTable,
@@ -247,6 +269,7 @@ public Transaction build(Engine engine) {
         newProtocol.orElse(snapshotProtocol),
         newMetadata.orElse(snapshotMetadata),
         setTxnOpt,
+        casePreservingClusteringColumns,
         newMetadata.isPresent() /* shouldUpdateMetadata */,
         newProtocol.isPresent() /* shouldUpdateProtocol */,
         maxRetries,
@@ -259,7 +282,8 @@
    * <ul>
    *   <li>Ensures that the table, as defined by the protocol and metadata of its latest version, is
    *       writable by Kernel
-   *   <li>Partition columns are not specified for an existing table
+   *   <li>Partition columns and clustering columns are not specified for an existing table
+   *   <li>Partition columns and clustering columns cannot be set together
    *   <li>The provided schema is valid (e.g. no duplicate columns, valid names)
    *   <li>Partition columns provided are valid (e.g. they exist, valid data types)
    *   <li>Concurrent txn has not already committed to the table with same txnId
@@ -278,7 +302,19 @@ private void validateTransactionInputs(Engine engine, SnapshotImpl snapshot, boo
             "Table already exists, but provided new partition columns. "
                 + "Partition columns can only be set on a new table.");
       }
+      if (clusteringColumns.isPresent()) {
+        throw tableAlreadyExists(
+            tablePath,
+            format(
+                "Table already exists, but provided new clustering columns %s. "
+                    + "Clustering columns can only be set on a new table for now.",
+                clusteringColumns.get()));
+      }
     } else {
+      checkArgument(
+          !(partitionColumns.isPresent() && clusteringColumns.isPresent()),
+          "Partition Columns and Clustering Columns cannot be set at the same time");
+
       // New table verify the given schema and partition columns
       ColumnMappingMode mappingMode =
           ColumnMapping.getColumnMappingMode(tableProperties.orElse(Collections.emptyMap()));
@@ -310,7 +346,7 @@
    * </ul>
    */
   private void validateMetadataChange(
-      Metadata oldMetadata, Metadata newMetadata, boolean isNewTable) {
+      SnapshotImpl snapshot, Metadata oldMetadata, Metadata newMetadata, boolean isNewTable) {
     ColumnMapping.verifyColumnMappingChange(
         oldMetadata.getConfiguration(), newMetadata.getConfiguration(), isNewTable);
     IcebergWriterCompatV1MetadataValidatorAndUpdater.validateIcebergWriterCompatV1Change(
@@ -330,6 +366,16 @@ private void validateMetadataChange(
       throw new KernelException("Cannot update schema for table when column mapping is disabled");
     }

+    // TODO: revisit this once we want to support schema evolution with clustering columns
+    Optional<List<Column>> clusteringColumns =
+        ClusteringUtils.getClusteringColumnsOptional(snapshot);
+    if (clusteringColumns.isPresent() && !clusteringColumns.get().isEmpty()) {
+      throw new KernelException(
+          format(
+              "Update schema for table with clustering columns %s is not yet supported",
+              clusteringColumns.get()));
+    }
+
     SchemaUtils.validateUpdatedSchema(
         oldMetadata.getSchema(),
         newMetadata.getSchema(),
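To make the new validation paths concrete, a hypothetical sketch of the two failure modes added above (paths, engine info, and schema are placeholders; the quoted messages are the ones introduced in this diff):

```java
import io.delta.kernel.*;
import io.delta.kernel.engine.Engine;
import io.delta.kernel.exceptions.TableAlreadyExistsException;
import io.delta.kernel.expressions.Column;
import io.delta.kernel.types.StructType;
import java.util.Arrays;

public class ClusteringValidationExample {
  static void demonstrate(Engine engine, StructType schema) {
    try {
      // New table: partition columns and clustering columns are mutually exclusive.
      Table.forPath(engine, "/tmp/newTable")
          .createTransactionBuilder(engine, "exampleEngine", Operation.CREATE_TABLE)
          .withSchema(engine, schema)
          .withPartitionColumns(engine, Arrays.asList("id"))
          .withClusteringColumns(engine, Arrays.asList(new Column("id")))
          .build(engine);
    } catch (IllegalArgumentException e) {
      // "Partition Columns and Clustering Columns cannot be set at the same time"
    }

    try {
      // Existing table: clustering columns can only be set at creation time for now.
      Table.forPath(engine, "/tmp/existingTable")
          .createTransactionBuilder(engine, "exampleEngine", Operation.WRITE)
          .withClusteringColumns(engine, Arrays.asList(new Column("id")))
          .build(engine);
    } catch (TableAlreadyExistsException e) {
      // "Clustering columns can only be set on a new table for now."
    }
  }
}
```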

kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionImpl.java

Lines changed: 17 additions & 1 deletion
@@ -35,6 +35,7 @@
 import io.delta.kernel.internal.actions.*;
 import io.delta.kernel.internal.annotation.VisibleForTesting;
 import io.delta.kernel.internal.checksum.CRCInfo;
+import io.delta.kernel.internal.clustering.ClusteringUtils;
 import io.delta.kernel.internal.data.TransactionStateRow;
 import io.delta.kernel.internal.fs.Path;
 import io.delta.kernel.internal.hook.CheckpointHook;
@@ -79,6 +80,7 @@ public class TransactionImpl implements Transaction {
   private final Protocol protocol;
   private final SnapshotImpl readSnapshot;
   private final Optional<SetTransaction> setTxnOpt;
+  private final List<Column> clusteringColumns;
   private final boolean shouldUpdateProtocol;
   private final Clock clock;
   private final Map<String, DomainMetadata> domainMetadatasAdded = new HashMap<>();
@@ -100,6 +102,7 @@ public TransactionImpl(
       Protocol protocol,
       Metadata metadata,
       Optional<SetTransaction> setTxnOpt,
+      List<Column> clusteringColumns,
       boolean shouldUpdateMetadata,
       boolean shouldUpdateProtocol,
       int maxRetries,
@@ -113,6 +116,7 @@
     this.protocol = protocol;
     this.metadata = metadata;
     this.setTxnOpt = setTxnOpt;
+    this.clusteringColumns = clusteringColumns;
     this.shouldUpdateMetadata = shouldUpdateMetadata;
     this.shouldUpdateProtocol = shouldUpdateProtocol;
     this.maxRetries = maxRetries;
@@ -198,7 +202,7 @@ public List<DomainMetadata> getDomainMetadatas() {
     if (domainMetadatas.isPresent()) {
       return domainMetadatas.get();
     }
-
+    generateClusteringDomainMetadataIfNeeded();
     if (domainMetadatasAdded.isEmpty() && domainMetadatasRemoved.isEmpty()) {
       // If no domain metadatas are added or removed, return an empty list. This is to avoid
       // unnecessary loading of the domain metadatas from the snapshot (which is an expensive
@@ -585,6 +589,18 @@ private Optional<CRCInfo> buildPostCommitCrcInfoIfCurrentCrcAvailable(
             Optional.of(txnId.toString())));
   }

+  /**
+   * Generate the domain metadata for the clustering columns if they are present in the transaction.
+   */
+  private void generateClusteringDomainMetadataIfNeeded() {
+    if (TableFeatures.isClusteringTableFeatureSupported(protocol) && !clusteringColumns.isEmpty()) {
+      DomainMetadata clusteringDomainMetadata =
+          ClusteringUtils.getClusteringDomainMetadata(clusteringColumns);
+      addDomainMetadataInternal(
+          clusteringDomainMetadata.getDomain(), clusteringDomainMetadata.getConfiguration());
+    }
+  }
+
   /**
    * Get the part of the schema of the table that needs the statistics to be collected per file.
    *

kernel/kernel-api/src/main/java/io/delta/kernel/internal/clustering/ClusteringUtils.java

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.kernel.internal.clustering;
+
+import io.delta.kernel.expressions.Column;
+import io.delta.kernel.internal.SnapshotImpl;
+import io.delta.kernel.internal.actions.DomainMetadata;
+import java.util.List;
+import java.util.Optional;
+
+public class ClusteringUtils {
+
+  private ClusteringUtils() {
+    // Empty private constructor to prevent instantiation
+  }
+
+  /**
+   * Get the domain metadata for the clustering columns. If column mapping is enabled, pass the list
+   * of physical names assigned; otherwise, use the logical column names.
+   */
+  public static DomainMetadata getClusteringDomainMetadata(List<Column> clusteringColumns) {
+    ClusteringMetadataDomain clusteringMetadataDomain =
+        ClusteringMetadataDomain.fromClusteringColumns(clusteringColumns);
+    return clusteringMetadataDomain.toDomainMetadata();
+  }
+
+  /**
+   * Extract ClusteringColumns from a given snapshot. Return None if the clustering domain metadata
+   * is missing.
+   */
+  public static Optional<List<Column>> getClusteringColumnsOptional(SnapshotImpl snapshot) {
+    return ClusteringMetadataDomain.fromSnapshot(snapshot)
+        .map(ClusteringMetadataDomain::getClusteringColumns);
+  }
+}
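For reference, a sketch of how these helpers could be exercised (column names are made up; when column mapping is enabled the physical names appear instead of the logical ones). The `delta.clustering` domain name and `clusteringColumns` configuration key follow the clustered-table portion of the Delta protocol:

```java
import io.delta.kernel.expressions.Column;
import io.delta.kernel.internal.actions.DomainMetadata;
import io.delta.kernel.internal.clustering.ClusteringUtils;
import java.util.Arrays;

public class ClusteringDomainExample {
  static void inspect() {
    DomainMetadata dm =
        ClusteringUtils.getClusteringDomainMetadata(
            Arrays.asList(new Column("id"), new Column(new String[] {"user", "country"})));

    // Domain name defined by the clustered-table feature:
    System.out.println(dm.getDomain()); // delta.clustering
    // Configuration is a JSON string, roughly:
    // {"clusteringColumns":[["id"],["user","country"]]}
    System.out.println(dm.getConfiguration());
  }
}
```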

kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java

Lines changed: 9 additions & 9 deletions
@@ -51,6 +51,15 @@ public static boolean isSkippingEligibleLiteral(Literal literal) {
     return isSkippingEligibleDataType(literal.getDataType());
   }

+  /** Returns true if the given data type is eligible for MIN/MAX data skipping. */
+  public static boolean isSkippingEligibleDataType(DataType dataType) {
+    return SKIPPING_ELIGIBLE_TYPE_NAMES.contains(dataType.toString())
+        ||
+        // DecimalType is eligible but since its string includes scale + precision it needs to
+        // be matched separately
+        dataType instanceof DecimalType;
+  }
+
   /**
    * Returns the expected statistics schema given a table schema.
    *
@@ -222,15 +231,6 @@ public boolean isSkippingEligibleNullCountColumn(Column column) {
         }
       };

-  /** Returns true if the given data type is eligible for MIN/MAX data skipping. */
-  private static boolean isSkippingEligibleDataType(DataType dataType) {
-    return SKIPPING_ELIGIBLE_TYPE_NAMES.contains(dataType.toString())
-        ||
-        // DecimalType is eligible but since its string includes scale + precision it needs to
-        // be matched separately
-        dataType instanceof DecimalType;
-  }
-
   /**
    * Given a data schema returns the expected schema for a min or max statistics column. This means
    * 1) replace logical names with physical names 2) set nullable=true 3) only keep stats eligible
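The method body is unchanged; it only moves and becomes `public` so that `SchemaUtils` (below) can reuse the same MIN/MAX eligibility rule for clustering columns. Roughly, with illustrative inputs:

```java
import io.delta.kernel.internal.skipping.StatsSchemaHelper;
import io.delta.kernel.types.*;

public class EligibilityExample {
  static void examples() {
    // Primitive types with MIN/MAX stats are eligible.
    boolean ints = StatsSchemaHelper.isSkippingEligibleDataType(IntegerType.INTEGER); // true
    // Decimals are matched separately because their string form carries precision/scale.
    boolean decimals = StatsSchemaHelper.isSkippingEligibleDataType(new DecimalType(10, 2)); // true
    // Complex types carry no MIN/MAX stats.
    boolean arrays =
        StatsSchemaHelper.isSkippingEligibleDataType(new ArrayType(IntegerType.INTEGER, true)); // false
  }
}
```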

kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/SchemaUtils.java

Lines changed: 33 additions & 0 deletions
@@ -18,12 +18,14 @@
 import static io.delta.kernel.internal.DeltaErrors.*;
 import static io.delta.kernel.internal.util.ColumnMapping.*;
 import static io.delta.kernel.internal.util.Preconditions.checkArgument;
+import static java.lang.String.format;

 import io.delta.kernel.exceptions.KernelException;
 import io.delta.kernel.expressions.Column;
 import io.delta.kernel.expressions.Literal;
 import io.delta.kernel.internal.DeltaErrors;
 import io.delta.kernel.internal.actions.Metadata;
+import io.delta.kernel.internal.skipping.StatsSchemaHelper;
 import io.delta.kernel.types.*;
 import java.util.*;
 import java.util.function.Function;
@@ -196,6 +198,37 @@ public static Map<String, Literal> casePreservingPartitionColNames(
                 Map.Entry::getValue));
   }

+  /**
+   * Verify the clustering columns exists in the table schema.
+   *
+   * @param schema The schema of the table
+   * @param clusteringCols List of clustering columns
+   */
+  public static List<Column> casePreservingEligibleClusterColumns(
+      StructType schema, List<Column> clusteringCols) {
+
+    List<Tuple2<Column, DataType>> physicalColumnsWithTypes =
+        clusteringCols.stream()
+            .map(col -> ColumnMapping.getPhysicalColumnNameAndDataType(schema, col))
+            .collect(Collectors.toList());
+
+    List<String> nonSkippingEligibleColumns =
+        physicalColumnsWithTypes.stream()
+            .filter(tuple -> !StatsSchemaHelper.isSkippingEligibleDataType(tuple._2))
+            .map(tuple -> tuple._1.toString() + " : " + tuple._2)
+            .collect(Collectors.toList());
+
+    if (!nonSkippingEligibleColumns.isEmpty()) {
+      throw new KernelException(
+          format(
+              "Clustering is not supported because the following column(s): %s "
+                  + "don't support data skipping",
+              nonSkippingEligibleColumns));
+    }
+
+    return physicalColumnsWithTypes.stream().map(tuple -> tuple._1).collect(Collectors.toList());
+  }
+
   /**
    * Search (case-insensitive) for the given {@code colName} in the {@code schema} and return its
    * position in the {@code schema}.
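A sketch of the new helper's contract (hypothetical schema and columns; with column mapping enabled the returned columns carry physical names):

```java
import io.delta.kernel.exceptions.KernelException;
import io.delta.kernel.expressions.Column;
import io.delta.kernel.internal.util.SchemaUtils;
import io.delta.kernel.types.*;
import java.util.Arrays;
import java.util.List;

public class ClusterColumnValidationExample {
  static void examples() {
    StructType schema =
        new StructType()
            .add("Id", IntegerType.INTEGER)
            .add("tags", new ArrayType(StringType.STRING, true));

    // Eligible columns come back resolved against the schema (physical form
    // when column mapping is enabled).
    List<Column> physical =
        SchemaUtils.casePreservingEligibleClusterColumns(schema, Arrays.asList(new Column("Id")));

    try {
      // Array columns carry no MIN/MAX stats, so they are rejected.
      SchemaUtils.casePreservingEligibleClusterColumns(schema, Arrays.asList(new Column("tags")));
    } catch (KernelException e) {
      // "Clustering is not supported because the following column(s): ... don't support data skipping"
    }
  }
}
```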
