Skip to content

Commit c6ffd1f

Browse files
committed
Merge branch-25.06 into merge-branch-25.06-to-main
2 parents 9dbb8eb + da0fab8 commit c6ffd1f

File tree

20 files changed

+503
-826
lines changed

20 files changed

+503
-826
lines changed

CHANGELOG.md

Lines changed: 218 additions & 181 deletions
Large diffs are not rendered by default.

delta-lake/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ and directory contains the corresponding support code.
1616
| 2.2.x | Spark 3.3.x | `delta-22x` |
1717
| 2.3.x | Spark 3.3.x | `delta-23x` |
1818
| 2.4.x | Spark 3.4.x | `delta-24x` |
19-
| 3.3.x | Spark 3.5.[3-] | `delta-33x` |
2019
| Databricks 12.2 | Databricks 12.2 | `delta-spark332db` |
2120
| Databricks 13.3 | Databricks 13.3 | `delta-spark341db` |
2221
| Databricks 14.3 | Databricks 14.3 | `delta-spark350db143` |

delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ trait GpuDeltaParquetFileFormat extends GpuReadParquetFileFormat {
3636
val columnMappingMode: DeltaColumnMappingMode
3737
val referenceSchema: StructType
3838

39+
/**
40+
* prepareSchema must only be used for parquet read.
41+
* It removes "PARQUET_FIELD_ID_METADATA_KEY" for name mapping mode, which addresses columns by
42+
* physical name instead of id.
43+
*/
3944
def prepareSchema(inputSchema: StructType): StructType = {
4045
val schema = DeltaColumnMapping.createPhysicalSchema(
4146
inputSchema, referenceSchema, columnMappingMode)

delta-lake/delta-33x/src/main/scala/com/nvidia/spark/rapids/delta/delta33x/GpuDelta33xParquetFileFormat.scala

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,10 @@ case class GpuDelta33xParquetFileFormat(
7272
}
7373

7474
/**
75-
* prepareSchemaForRead must only be used for parquet read.
76-
* It removes "PARQUET_FIELD_ID_METADATA_KEY" for name mapping mode which address columns by
77-
* physical name instead of id.
75+
* This function is overridden as Delta 3.3 has an extra `PARQUET_FIELD_NESTED_IDS_METADATA_KEY`
76+
* key to remove from the metadata, which does not exist in earlier versions.
7877
*/
79-
def prepareSchemaForRead(inputSchema: StructType): StructType = {
78+
override def prepareSchema(inputSchema: StructType): StructType = {
8079
val schema = DeltaColumnMapping.createPhysicalSchema(
8180
inputSchema, referenceSchema, columnMappingMode)
8281
if (columnMappingMode == NameMapping) {
@@ -150,9 +149,9 @@ case class GpuDelta33xParquetFileFormat(
150149

151150
val dataReader = super.buildReaderWithPartitionValuesAndMetrics(
152151
sparkSession,
153-
prepareSchemaForRead(dataSchema),
154-
prepareSchemaForRead(partitionSchema),
155-
prepareSchemaForRead(requiredSchema),
152+
dataSchema,
153+
partitionSchema,
154+
requiredSchema,
156155
prepareFiltersForRead(filters),
157156
options,
158157
hadoopConf,

docs/archives/CHANGELOG_25.02.md

Lines changed: 182 additions & 0 deletions
Large diffs are not rendered by default.

docs/dev/mem_debug.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ be set to either `STDERR` or `STDOUT` to see everything that is happening with t
136136
```
137137

138138
The format of this is not really documented anywhere, but it uses the
139-
[logging_resource_adaptor](https://github.com/rapidsai/rmm/blob/main/include/rmm/mr/device/logging_resource_adaptor.hpp)
139+
[logging_resource_adaptor](https://github.com/rapidsai/rmm/blob/main/cpp/include/rmm/mr/device/logging_resource_adaptor.hpp)
140140
to log when an allocation succeeded or failed and when memory was freed. The current format
141141
appears to be.
142142

iceberg/src/main/scala/org/apache/iceberg/spark/source/GpuReaderFactory.scala

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import com.nvidia.spark.rapids.iceberg.parquet.{MultiFile, MultiThread, SingleFi
2323
import org.apache.iceberg.{FileFormat, ScanTask, ScanTaskGroup}
2424
import scala.collection.JavaConverters._
2525

26+
import org.apache.spark.internal.Logging
2627
import org.apache.spark.sql.catalyst.InternalRow
2728
import org.apache.spark.sql.connector.read.InputPartition
2829
import org.apache.spark.sql.connector.read.PartitionReader
@@ -31,7 +32,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch
3132

3233
class GpuReaderFactory(private val metrics: Map[String, GpuMetric],
3334
rapidsConf: RapidsConf,
34-
queryUsesInputFile: Boolean) extends PartitionReaderFactory {
35+
queryUsesInputFile: Boolean) extends PartitionReaderFactory with Logging {
3536

3637
private val allCloudSchemes = rapidsConf.getCloudSchemes.toSet
3738
private val isParquetPerFileReadEnabled = rapidsConf.isParquetPerFileReadEnabled
@@ -71,8 +72,13 @@ class GpuReaderFactory(private val metrics: Map[String, GpuMetric],
7172
val allParquet = scans.forall(_.file.format == FileFormat.PARQUET)
7273

7374
if (allParquet) {
74-
if (isParquetPerFileReadEnabled) {
75-
// If per-file read is enabled, we can only use single threaded reading.
75+
// If per-file read is enabled, we can only use single threaded reading.
76+
// We also disable the multi-threaded reader when deletions exist, as a quick workaround for
77+
// https://github.com/NVIDIA/spark-rapids/issues/12885
78+
if (isParquetPerFileReadEnabled || !hasNoDeletes) {
79+
if (!hasNoDeletes) {
80+
logWarning("Multithread iceberg parquet reader disabled with deletions")
81+
}
7682
return SingleFile
7783
}
7884

integration_tests/src/main/python/date_time_test.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -468,16 +468,27 @@ def test_to_utc_timestamp_fixed_offset(time_zone):
468468
assert_gpu_and_cpu_are_equal_collect(
469469
lambda spark: unary_op_df(spark, tz_timestamp_gen).selectExpr(f'to_utc_timestamp(a, "{time_zone}")'))
470470

471-
471+
# test from_utc_timestamp
472+
# If `end_timestamp` is 2200 year, then generated timestamps are < 2200 year, will use GPU to compute both DST and non-DST timezones.
473+
# If any generated timestamp is > year 2200 and the timezone is DST, then `fallback` to CPU.
474+
# Here `fallback` means the GPU operator invokes the CPU to compute; it does not truly fall back to the CPU.
472475
@pytest.mark.parametrize('time_zone', all_timezones, ids=idfn)
473-
def test_comprehensive_from_utc_timestamp(time_zone):
474-
tz_timestamp_gen = TimestampGen(tzinfo=timezone.utc)
476+
@pytest.mark.parametrize('end_timestamp', [last_supported_tz_time, None], ids=idfn)
477+
def test_comprehensive_from_utc_timestamp(time_zone, end_timestamp):
478+
# if end = None, will use the default value
479+
tz_timestamp_gen = TimestampGen(end = end_timestamp, tzinfo=timezone.utc)
475480
assert_gpu_and_cpu_are_equal_collect(
476481
lambda spark: unary_op_df(spark, tz_timestamp_gen).selectExpr(f'from_utc_timestamp(a, "{time_zone}")'))
477-
482+
483+
# test to_utc_timestamp
484+
# If `end_timestamp` is 2200 year, then generated timestamps are < 2200 year, will use GPU to compute both DST and non-DST timezones.
485+
# If any generated timestamp is > year 2200 and the timezone is DST, then `fallback` to CPU.
486+
# Here `fallback` means the GPU operator invokes the CPU to compute; it does not truly fall back to the CPU.
478487
@pytest.mark.parametrize('time_zone', all_timezones, ids=idfn)
479-
def test_comprehensive_to_utc_timestamp(time_zone):
480-
tz_timestamp_gen = TimestampGen(end=last_supported_tz_time, tzinfo=tz.gettz(time_zone))
488+
@pytest.mark.parametrize('end_timestamp', [last_supported_tz_time, None], ids=idfn)
489+
def test_comprehensive_to_utc_timestamp(time_zone, end_timestamp):
490+
# if end = None, will use the default value
491+
tz_timestamp_gen = TimestampGen(end=end_timestamp, tzinfo=tz.gettz(time_zone))
481492
assert_gpu_and_cpu_are_equal_collect(
482493
lambda spark: unary_op_df(spark, tz_timestamp_gen).selectExpr(f'to_utc_timestamp(a, "{time_zone}")'))
483494

integration_tests/src/main/python/iceberg/iceberg_merge_on_read_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def test_iceberg_v2_position_delete_with_url_encoded_path(spark_tmp_table_factor
8484
@iceberg
8585
@ignore_order(local=True)
8686
@pytest.mark.parametrize('reader_type', rapids_reader_types)
87+
@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/12885')
8788
def test_iceberg_v2_mixed_deletes(spark_tmp_table_factory, spark_tmp_path, reader_type):
8889
# We use a fixed seed here to ensure that data deletion vector has been generated
8990
table_name = setup_base_iceberg_table(spark_tmp_table_factory,

0 commit comments

Comments
 (0)