Skip to content

Commit a787f52

Browse files
committed
[core] Enable clustering before write phase for incremental clustering table
1 parent 917ec03 commit a787f52

File tree

5 files changed

+58
-8
lines changed

5 files changed

+58
-8
lines changed

docs/layouts/shortcodes/generated/core_configuration.html

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@
5656
<td>Boolean</td>
5757
<td>Write blob field using blob descriptor rather than blob bytes.</td>
5858
</tr>
59+
<tr>
60+
<td><h5>blob-descriptor-field</h5></td>
61+
<td style="word-wrap: break-word;">(none)</td>
62+
<td>String</td>
63+
<td>Comma-separated BLOB field names to store as serialized BlobDescriptor bytes inline in data files.</td>
64+
</tr>
5965
<tr>
6066
<td><h5>blob-field</h5></td>
6167
<td style="word-wrap: break-word;">(none)</td>
@@ -68,12 +74,6 @@
6874
<td>Boolean</td>
6975
<td>Whether to consider blob file size as a factor when performing scan splitting.</td>
7076
</tr>
71-
<tr>
72-
<td><h5>blob-descriptor-field</h5></td>
73-
<td style="word-wrap: break-word;">(none)</td>
74-
<td>String</td>
75-
<td>Comma-separated BLOB field names to store as serialized BlobDescriptor bytes inline in data files.</td>
76-
</tr>
7777
<tr>
7878
<td><h5>blob.target-file-size</h5></td>
7979
<td style="word-wrap: break-word;">(none)</td>
@@ -200,6 +200,12 @@
200200
<td>Boolean</td>
201201
<td>Whether to enable incremental clustering.</td>
202202
</tr>
203+
<tr>
204+
<td><h5>clustering.pre-write.enabled</h5></td>
205+
<td style="word-wrap: break-word;">false</td>
206+
<td>Boolean</td>
207+
<td>Whether to perform clustering before the write phase when incremental clustering is enabled.</td>
208+
</tr>
203209
<tr>
204210
<td><h5>clustering.strategy</h5></td>
205211
<td style="word-wrap: break-word;">"auto"</td>

paimon-api/src/main/java/org/apache/paimon/CoreOptions.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2071,6 +2071,13 @@ public InlineElement getDescription() {
20712071
"The duration after which a partition without new updates is considered a historical partition. "
20722072
+ "Historical partitions will be automatically fully clustered during the cluster operation.");
20732073

2074+
public static final ConfigOption<Boolean> CLUSTERING_PRE_WRITE_ENABLED =
2075+
key("clustering.pre-write.enabled")
2076+
.booleanType()
2077+
.defaultValue(false)
2078+
.withDescription(
2079+
"Whether to perform clustering before the write phase when incremental clustering is enabled.");
2080+
20742081
@Immutable
20752082
public static final ConfigOption<Boolean> ROW_TRACKING_ENABLED =
20762083
key("row-tracking.enabled")
@@ -3402,6 +3409,10 @@ public boolean clusteringIncrementalEnabled() {
34023409
return options.get(CLUSTERING_INCREMENTAL);
34033410
}
34043411

3412+
public boolean preClusteringEnabled() {
3413+
return options.get(CLUSTERING_PRE_WRITE_ENABLED);
3414+
}
3415+
34053416
public boolean bucketClusterEnabled() {
34063417
return !bucketAppendOrdered()
34073418
&& !deletionVectorsEnabled()

paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkTableSinkBase.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import static org.apache.paimon.CoreOptions.CHANGELOG_PRODUCER;
4242
import static org.apache.paimon.CoreOptions.CLUSTERING_COLUMNS;
4343
import static org.apache.paimon.CoreOptions.CLUSTERING_INCREMENTAL;
44+
import static org.apache.paimon.CoreOptions.CLUSTERING_PRE_WRITE_ENABLED;
4445
import static org.apache.paimon.CoreOptions.CLUSTERING_STRATEGY;
4546
import static org.apache.paimon.CoreOptions.MERGE_ENGINE;
4647
import static org.apache.paimon.flink.FlinkConnectorOptions.CLUSTERING_SAMPLE_FACTOR;
@@ -122,7 +123,8 @@ public SinkRuntimeProvider getSinkRuntimeProvider(Context context) {
122123
new DataStream<>(
123124
dataStream.getExecutionEnvironment(),
124125
dataStream.getTransformation()));
125-
if (!conf.get(CLUSTERING_INCREMENTAL)) {
126+
if (!conf.get(CLUSTERING_INCREMENTAL)
127+
|| conf.get(CLUSTERING_PRE_WRITE_ENABLED)) {
126128
builder.clusteringIfPossible(
127129
conf.get(CLUSTERING_COLUMNS),
128130
conf.get(CLUSTERING_STRATEGY),

paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/RangePartitionAndSortForAppendTableITCase.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,34 @@ public void testRangePartitionAndSortWithHilbertStrategy() throws Exception {
247247
Assertions.assertThat(files.size()).isGreaterThan(filesFilter.size());
248248
}
249249

250+
@Test
251+
public void testClusteringPreWriteEnabled() throws Exception {
252+
List<Row> inputRows = generateSinkRows();
253+
String id = TestValuesTableFactory.registerData(inputRows);
254+
batchSql(
255+
"CREATE TEMPORARY TABLE test_source (col1 INT, col2 INT, col3 INT, col4 INT) WITH "
256+
+ "('connector'='values', 'bounded'='true', 'data-id'='%s')",
257+
id);
258+
batchSql(
259+
"INSERT INTO test_table /*+ OPTIONS('sink.clustering.by-columns' = 'col1', "
260+
+ "'sink.parallelism' = '10', 'sink.clustering.strategy' = 'zorder', "
261+
+ "'clustering.incremental' = 'true', 'clustering.pre-write.enabled' = 'true') */ "
262+
+ "SELECT * FROM test_source");
263+
List<Row> sinkRows = batchSql("SELECT * FROM test_table");
264+
assertThat(sinkRows.size()).isEqualTo(SINK_ROW_NUMBER);
265+
FileStoreTable testStoreTable = paimonTable("test_table");
266+
PredicateBuilder predicateBuilder = new PredicateBuilder(testStoreTable.rowType());
267+
Predicate predicate = predicateBuilder.between(0, 100, 200);
268+
List<ManifestEntry> files = testStoreTable.store().newScan().plan().files();
269+
assertThat(files.size()).isEqualTo(10);
270+
List<ManifestEntry> filesFilter =
271+
((AppendOnlyFileStoreScan) testStoreTable.store().newScan())
272+
.withFilter(predicate)
273+
.plan()
274+
.files();
275+
Assertions.assertThat(files.size()).isGreaterThan(filesFilter.size());
276+
}
277+
250278
private List<Row> generateSinkRows() {
251279
List<Row> sinkRows = new ArrayList<>();
252280
Random random = new Random();

paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/PaimonSparkWriter.scala

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,10 @@ case class PaimonSparkWriter(
303303
}
304304
}
305305
val clusteringColumns = coreOptions.clusteringColumns()
306-
if ((!coreOptions.clusteringIncrementalEnabled()) && (!clusteringColumns.isEmpty)) {
306+
if (
307+
(!coreOptions.clusteringIncrementalEnabled() || coreOptions
308+
.preClusteringEnabled()) && (!clusteringColumns.isEmpty)
309+
) {
307310
val strategy = coreOptions.clusteringStrategy(tableSchema.fields().size())
308311
val sorter = TableSorter.getSorter(table, strategy, clusteringColumns)
309312
input = sorter.sort(data)

0 commit comments

Comments
 (0)