Skip to content

Commit 1472db1

Browse files
Writing field id when writing iceberg's data file (#14328)
Fixes nvbugs-5894334. ### Description When generating an iceberg data file, we should write the field id into it so that manifest metrics can be correctly produced. ### Checklists - [x] This PR has added documentation for new or modified features or behaviors. - [x] This PR has added new tests or modified existing tests to cover new code paths. (Please explain in the PR description how the new code paths are tested, such as names of the new/existing tests that cover them.) - [ ] Performance testing has been performed and its results are added in the PR description. Or, an issue has been filed with a link in the PR description. --------- Signed-off-by: Ray Liu <liurenjie2008@gmail.com>
1 parent 7a49f8d commit 1472db1

File tree

2 files changed

+82
-3
lines changed

2 files changed

+82
-3
lines changed

iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkWrite.scala

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -25,11 +25,12 @@ import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableSeq
2525
import com.nvidia.spark.rapids.SpillPriorities.ACTIVE_ON_DECK_PRIORITY
2626
import com.nvidia.spark.rapids.fileio.iceberg.IcebergFileIO
2727
import com.nvidia.spark.rapids.iceberg.GpuIcebergSpecPartitioner
28+
import com.nvidia.spark.rapids.shims.parquet.ParquetFieldIdShims
2829
import org.apache.hadoop.mapreduce.Job
2930
import org.apache.hadoop.shaded.org.apache.commons.lang3.reflect.{FieldUtils, MethodUtils}
3031
import org.apache.iceberg._
3132
import org.apache.iceberg.io._
32-
import org.apache.iceberg.spark.{Spark3Util, SparkSchemaUtil}
33+
import org.apache.iceberg.spark.{GpuTypeToSparkType, Spark3Util, SparkSchemaUtil}
3334
import org.apache.iceberg.spark.functions.{GpuFieldTransform, GpuTransform}
3435
import org.apache.iceberg.spark.source.GpuWriteContext.positionDeleteSparkType
3536
import org.apache.iceberg.spark.source.SparkWrite.TaskCommit
@@ -42,8 +43,10 @@ import org.apache.spark.sql.connector.distributions.Distribution
4243
import org.apache.spark.sql.connector.expressions.SortOrder
4344
import org.apache.spark.sql.connector.write.{DataWriter, _}
4445
import org.apache.spark.sql.connector.write.streaming.StreamingWrite
46+
import org.apache.spark.sql.execution.SparkPlan
4547
import org.apache.spark.sql.execution.datasources.v2.{AtomicCreateTableAsSelectExec, AtomicReplaceTableAsSelectExec}
4648
import org.apache.spark.sql.rapids.GpuWriteJobStatsTracker
49+
import org.apache.spark.sql.rapids.shims.SparkSessionUtils
4750
import org.apache.spark.sql.types.StructType
4851
import org.apache.spark.sql.vectorized.ColumnarBatch
4952
import org.apache.spark.util.SerializableConfiguration
@@ -104,7 +107,12 @@ class GpuSparkWrite(cpu: SparkWrite) extends GpuWrite with RequiresDistributionA
104107
val outputSpecId = FieldUtils.readField(cpu, "outputSpecId", true).asInstanceOf[Int]
105108
val targetFileSize = FieldUtils.readField(cpu, "targetFileSize", true).asInstanceOf[Long]
106109
val writeSchema = FieldUtils.readField(cpu, "writeSchema", true).asInstanceOf[Schema]
107-
val dsSchema = FieldUtils.readField(cpu, "dsSchema", true).asInstanceOf[StructType]
110+
// Convert writeSchema to Spark StructType with Iceberg field IDs (PARQUET:field_id).
111+
// The CPU path uses Iceberg's own Parquet writer which natively embeds field IDs, but
112+
// the GPU path uses Spark's Parquet infrastructure which requires field IDs in the
113+
// StructType metadata. Without them, Iceberg's ParquetMetrics cannot extract file-level
114+
// statistics, causing StrictMetricsEvaluator to fail during overwrite validation.
115+
val dsSchema = GpuTypeToSparkType.toSparkType(writeSchema)
108116
val useFanout = FieldUtils.readField(cpu, "useFanoutWriter", true).asInstanceOf[Boolean]
109117
val writeProps = FieldUtils.readField(cpu, "writeProperties", true)
110118
.asInstanceOf[java.util.Map[String, String]]
@@ -115,6 +123,7 @@ class GpuSparkWrite(cpu: SparkWrite) extends GpuWrite with RequiresDistributionA
115123
}
116124

117125
val hadoopConf = sparkContext.hadoopConfiguration
126+
118127
val job = {
119128
val tmpJob = Job.getInstance(hadoopConf)
120129
tmpJob.setOutputKeyClass(classOf[Void])
@@ -180,6 +189,16 @@ object GpuSparkWrite {
180189
partitionSpec: PartitionSpec,
181190
meta: SparkPlanMeta[_]): Unit = {
182191

192+
// Iceberg requires Parquet field IDs for correct file-level metrics. Without them,
193+
// StrictMetricsEvaluator fails during overwrite validation.
194+
val spark = SparkSessionUtils.sessionFromPlan(meta.wrapped.asInstanceOf[SparkPlan])
195+
val hadoopConf = spark.sparkContext.hadoopConfiguration
196+
val sqlConf = spark.sessionState.conf
197+
if (!ParquetFieldIdShims.getParquetIdWriteEnabled(hadoopConf, sqlConf)) {
198+
meta.willNotWorkOnGpu("Iceberg requires Parquet field IDs to be written for correct " +
199+
"file-level metrics. Set spark.sql.parquet.fieldId.write.enabled=true")
200+
}
201+
183202
// Check file format support
184203
if (dataFormat.exists(!_.equals(FileFormat.PARQUET))) {
185204
meta.willNotWorkOnGpu(s"GpuSparkWrite only supports Parquet, but got: ${dataFormat.get}")
@@ -292,6 +311,7 @@ object GpuSparkWrite {
292311
def convert(cpuWrite: Write): GpuSparkWrite = {
293312
new GpuSparkWrite(cpuWrite.asInstanceOf[SparkWrite])
294313
}
314+
295315
}
296316

297317
class GpuWriterFactory(val tableBroadcast: Broadcast[Table],

integration_tests/src/main/python/iceberg/iceberg_overwrite_static_test.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from typing import Callable, Any
1515

1616
import pytest
17+
from pyspark.sql import functions as F
1718

1819
from asserts import assert_equal_with_local_sort, assert_gpu_fallback_collect
1920
from conftest import is_iceberg_remote_catalog
@@ -404,3 +405,61 @@ def overwrite_data(spark, table_name):
404405
cpu_data = with_cpu_session(lambda spark: spark.table(cpu_table_name).collect())
405406
gpu_data = with_cpu_session(lambda spark: spark.table(gpu_table_name).collect())
406407
assert_equal_with_local_sort(cpu_data, gpu_data)
408+
409+
410+
@iceberg
411+
@ignore_order(local=True)
412+
@allow_non_gpu('ShuffleExchangeExec')
413+
@pytest.mark.skipif(is_iceberg_remote_catalog(), reason="Skip for remote catalog to reduce test time")
414+
def test_insert_overwrite_static_df_api_truncate_string(spark_tmp_table_factory):
415+
"""Test static overwrite via DataFrame writeTo().overwrite() API with truncate(5, string_col)
416+
partitioning. Verifies GPU writes produce Parquet files with correct Iceberg field IDs
417+
so that file-level statistics are available for overwrite validation.
418+
"""
419+
truncate_width = 5
420+
str_length = truncate_width - 2
421+
prefix = "T" * str_length
422+
partition_col_sql = f"truncate({truncate_width}, _c6)"
423+
partition_filter = f"_c6 >= '{prefix}10' AND _c6 < '{prefix}20'"
424+
425+
table_prop = {"format-version": "2",
426+
"write.format.default": "parquet"}
427+
428+
conf = copy_and_update(iceberg_static_overwrite_conf, {
429+
"spark.sql.adaptive.enabled": "true",
430+
"spark.sql.adaptive.coalescePartitions.enabled": "true",
431+
})
432+
433+
# Use standard iceberg schema but override _c6 with a constrained string gen
434+
# to produce predictable truncate partitions for the range filter
435+
gen_list = list(zip(iceberg_base_table_cols, iceberg_gens_list))
436+
gen_list[6] = ('_c6', StringGen(pattern=f'{prefix}[1-9][0-9][A-Z]{{3}}'))
437+
438+
base_table_name = get_full_table_name(spark_tmp_table_factory)
439+
cpu_table_name = f"{base_table_name}_cpu"
440+
gpu_table_name = f"{base_table_name}_gpu"
441+
442+
def create_table_with_ctas(spark, table_name):
443+
df = gen_df(spark, gen_list, seed=INITIAL_INSERT_SEED)
444+
view_name = spark_tmp_table_factory.get()
445+
df.createOrReplaceTempView(view_name)
446+
props_sql = ", ".join([f"'{k}' = '{v}'" for k, v in table_prop.items()])
447+
spark.sql(f"CREATE TABLE {table_name} USING ICEBERG "
448+
f"PARTITIONED BY ({partition_col_sql}) "
449+
f"TBLPROPERTIES ({props_sql}) "
450+
f"AS SELECT * FROM {view_name}")
451+
452+
with_cpu_session(lambda spark: create_table_with_ctas(spark, cpu_table_name), conf=conf)
453+
with_gpu_session(lambda spark: create_table_with_ctas(spark, gpu_table_name), conf=conf)
454+
455+
def overwrite_data(spark, table_name):
456+
df = gen_df(spark, gen_list, seed=INITIAL_INSERT_SEED + 1)
457+
filtered_df = df.filter(partition_filter)
458+
filtered_df.writeTo(table_name).overwrite(F.expr(partition_filter))
459+
460+
with_cpu_session(lambda spark: overwrite_data(spark, cpu_table_name), conf=conf)
461+
with_gpu_session(lambda spark: overwrite_data(spark, gpu_table_name), conf=conf)
462+
463+
cpu_data = with_cpu_session(lambda spark: spark.table(cpu_table_name).collect())
464+
gpu_data = with_cpu_session(lambda spark: spark.table(gpu_table_name).collect())
465+
assert_equal_with_local_sort(cpu_data, gpu_data)

0 commit comments

Comments (0)