
Commit b08e8ba

[UniForm] Convert stats for TIMESTAMP data type in converting iceberg metadata to delta (delta-io#4339)
## Description

While converting Iceberg metadata to Delta metadata, also convert stats for the TIMESTAMP data type in the fast path.

## How was this patch tested?

UTs.
1 parent 203f869 commit b08e8ba
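The conversion exercised here runs when an Iceberg table's metadata is turned into Delta metadata, for example when cloning an Iceberg table into Delta as the CloneIcebergSuite tests below do. A hedged usage sketch follows; the table name, path, and column name are placeholders and not part of this commit:

```scala
// Hypothetical sketch: clone an Iceberg table into Delta. During the clone,
// Iceberg per-file min/max metrics (now including TIMESTAMP columns) are
// converted into Delta stats, so predicates on timestamp columns can skip files.
import org.apache.spark.sql.SparkSession

object CloneIcebergSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("clone-iceberg-sketch").getOrCreate()

    // Placeholder target table and Iceberg table path.
    spark.sql(
      """CREATE TABLE delta_clone
        |SHALLOW CLONE iceberg.`/tmp/iceberg/events`""".stripMargin)

    // A timestamp predicate that can now be pruned using the converted stats.
    spark.sql("SELECT * FROM delta_clone WHERE event_ts > TIMESTAMP'2024-01-01 00:00:00'").show()
  }
}
```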

File tree

3 files changed: +192 −16 lines changed

- iceberg/src/main/scala/org/apache/spark/sql/delta/IcebergStatsUtils.scala
- iceberg/src/test/scala/org/apache/spark/sql/delta/CloneIcebergSuite.scala
- iceberg/src/test/scala/org/apache/spark/sql/delta/commands/convert/IcebergStatsUtilsSuite.scala

iceberg/src/main/scala/org/apache/spark/sql/delta/IcebergStatsUtils.scala

Lines changed: 17 additions & 2 deletions
```diff
@@ -35,7 +35,8 @@ import org.apache.iceberg.types.Types.{
   MapType => IcebergMapType,
   NestedField,
   StringType => IcebergStringType,
-  StructType => IcebergStructType
+  StructType => IcebergStructType,
+  TimestampType => IcebergTimestampType
 }
 import org.apache.iceberg.util.DateTimeUtil

@@ -56,7 +57,7 @@ object IcebergStatsUtils extends DeltaLogging {
     TypeID.DOUBLE,
     TypeID.DATE,
     // TypeID.TIME,
-    // TypeID.TIMESTAMP,
+    TypeID.TIMESTAMP,
     // TypeID.TIMESTAMP_NANO,
     TypeID.STRING,
     // TypeID.UUID,

@@ -182,6 +183,9 @@ object IcebergStatsUtils extends DeltaLogging {
       case (_: IcebergDateType, bb: ByteBuffer) =>
         val daysFromEpoch = Conversions.fromByteBuffer(ftype, bb).asInstanceOf[Int]
         DateTimeUtil.dateFromDays(daysFromEpoch).toString
+      case (tsType: IcebergTimestampType, bb: ByteBuffer) =>
+        val microts = Conversions.fromByteBuffer(tsType, bb).asInstanceOf[JLong]
+        microTimestampToString(microts, tsType)
       case (_, bb: ByteBuffer) =>
         Conversions.fromByteBuffer(ftype, bb)
       case _ => throw new IllegalArgumentException("unable to deserialize unknown values")

@@ -221,4 +225,15 @@ object IcebergStatsUtils extends DeltaLogging {
       )
     )
   }
+
+  private def microTimestampToString(
+      microTS: JLong, tsType: IcebergTimestampType): String = {
+    // iceberg timestamptz will have shouldAdjustToUTC() as true
+    if (tsType.shouldAdjustToUTC()) {
+      DateTimeUtil.microsToIsoTimestamptz(microTS)
+    } else {
+      // iceberg timestamp doesn't need to adjust to UTC
+      DateTimeUtil.microsToIsoTimestamp(microTS)
+    }
+  }
 }
```
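The new case delegates rendering to Iceberg's DateTimeUtil. Below is a self-contained sketch, using only java.time, of the behavior the branch is going for: microseconds since epoch become an ISO-8601 string, with an explicit UTC offset only for timestamptz (shouldAdjustToUTC == true). It is an illustration, not the committed code, and DateTimeUtil's exact offset/fraction formatting may differ slightly:

```scala
import java.time.{Instant, LocalDateTime, ZoneOffset}
import java.time.format.DateTimeFormatter

object MicroTimestampSketch {
  // Formatter that always prints a numeric offset (e.g. +00:00) rather than "Z".
  private val offsetFmt = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSSxxx")

  def microsToIsoString(micros: Long, adjustToUTC: Boolean): String = {
    val instant = Instant.EPOCH.plusNanos(micros * 1000L)
    if (adjustToUTC) {
      // timestamptz: keep the UTC offset in the rendered stat value.
      instant.atOffset(ZoneOffset.UTC).format(offsetFmt)
    } else {
      // timestamp without zone: render the wall-clock value, no offset suffix.
      LocalDateTime.ofInstant(instant, ZoneOffset.UTC).toString
    }
  }

  def main(args: Array[String]): Unit = {
    val micros = 1734394979000000L // some instant in microseconds since epoch
    println(microsToIsoString(micros, adjustToUTC = true))  // ends in +00:00
    println(microsToIsoString(micros, adjustToUTC = false)) // no offset suffix
  }
}
```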

iceberg/src/test/scala/org/apache/spark/sql/delta/CloneIcebergSuite.scala

Lines changed: 163 additions & 10 deletions
```diff
@@ -18,8 +18,11 @@ package org.apache.spark.sql.delta

 // scalastyle:off import.ordering.noEmptyLine
 import java.sql.Date
-import java.time.LocalDate
+import java.sql.Timestamp
+import java.time.LocalDateTime
 import java.time.LocalTime
+import java.time.format.DateTimeFormatter
+import java.util.TimeZone

 import scala.collection.JavaConverters._
 import scala.util.Try

@@ -38,8 +41,9 @@ import org.apache.iceberg.types.Types.NestedField
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.util.DateTimeUtils.{stringToDate, toJavaDate}
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.{getZoneId, microsToLocalDateTime, stringToDate, stringToTimestamp, stringToTimestampWithoutTimeZone, toJavaDate, toJavaTimestamp}
 import org.apache.spark.sql.functions.{col, expr, from_json, lit, struct, substring}
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{Decimal, DecimalType, LongType, StringType, StructField, StructType, TimestampType}
 import org.apache.spark.unsafe.types.UTF8String
 // scalastyle:on import.ordering.noEmptyLine

@@ -554,12 +558,12 @@ trait CloneIcebergSuiteBase extends QueryTest
     val filesRead =
       getFilesRead(spark, deltaLog, predicate, checkEmptyUnusedFilters = false)
     try {
-      assert(filesRead.size == expectedFilesReadNum)
-      assert(filesRead.map(_.partitionValues.head._2).toSet ==
-        expectedFilesReadIndices.map(_.toString))
       checkAnswer(
         spark.sql(s"select * from $cloneTable where $predicate"), df.where(predicate)
       )
+      assert(filesRead.size == expectedFilesReadNum)
+      assert(filesRead.map(_.partitionValues.head._2).toSet ==
+        expectedFilesReadIndices.map(_.toString))
     } catch {
       case e: Throwable =>
         throw new RuntimeException(

@@ -597,7 +601,7 @@ trait CloneIcebergSuiteBase extends QueryTest
          expectedFilesReadIndices = Set(2)
        )
      ),
-     mode
+     mode = mode
    )
  }

@@ -620,7 +624,7 @@ trait CloneIcebergSuiteBase extends QueryTest
          expectedFilesReadIndices = Set(1)
        )
      ),
-     mode
+     mode = mode
    )
  }

@@ -643,7 +647,7 @@ trait CloneIcebergSuiteBase extends QueryTest
          expectedFilesReadIndices = Set(1)
        )
      ),
-     mode
+     mode = mode
    )
  }

@@ -680,7 +684,7 @@ trait CloneIcebergSuiteBase extends QueryTest
          expectedFilesReadIndices = Set()
        )
      ),
-     mode
+     mode = mode
    )
  }

@@ -725,8 +729,157 @@ trait CloneIcebergSuiteBase extends QueryTest
          expectedFilesReadIndices = Set(4, 5)
        )
      ),
-     mode
+     mode = mode
+   )
+ }
+
+ // Exactly on minutes
+ testClone("Convert Iceberg timestamptz type - 1") { mode =>
+   testStatsConversionAndDataSkipping(
+     icebergDataType = "timestamp", // spark timestamp => iceberg timestamptz
+     tableData = Seq(
+       toTimestamp("1908-03-15 10:1:17")
+     ),
+     extractFunc = row => {
+       timestamptzExtracter(row, pattern = "yyyy-MM-dd'T'HH:mm:ssXXX")
+     },
+     expectedStats = Seq("1908-03-15T10:01:17+00:00"),
+     dataSkippingTestParams = Seq(
+       DataSkippingTestParam(
+         predicate = "col2 > TIMESTAMP'1908-03-15T10:01:18+00:00'",
+         expectedFilesReadNum = 0,
+         expectedFilesReadIndices = Set()
+       ),
+       DataSkippingTestParam(
+         predicate = "col2 <= TIMESTAMP'1908-03-15T10:01:17+00:00'",
+         expectedFilesReadNum = 1,
+         expectedFilesReadIndices = Set(1)
+       )
+     ),
+     mode = mode
+   )
+ }
+
+ // Fractional time
+ testClone("Convert Iceberg timestamptz type - 2") { mode =>
+   testStatsConversionAndDataSkipping(
+     icebergDataType = "timestamp", // spark timestamp => iceberg timestamptz
+     tableData = Seq(
+       toTimestamp("1997-12-11 5:40:19.23349")
+     ),
+     extractFunc = row => {
+       timestamptzExtracter(row, pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSSSXXX")
+     },
+     expectedStats = Seq("1997-12-11T05:40:19.23349+00:00"),
+     dataSkippingTestParams = Seq(
+       DataSkippingTestParam(
+         predicate = "col2 > TIMESTAMP'1997-12-11T05:40:19.233+00:00'",
+         expectedFilesReadNum = 1,
+         expectedFilesReadIndices = Set(1)
+       ),
+       DataSkippingTestParam(
+         predicate = "col2 <= TIMESTAMP'1997-12-11T05:40:19.10+00:00'",
+         expectedFilesReadNum = 0,
+         expectedFilesReadIndices = Set()
+       )
+     ),
+     mode = mode
+   )
+ }
+
+ // Customized timezone
+ testClone("Convert Iceberg timestamptz type - 3") { mode =>
+   testStatsConversionAndDataSkipping(
+     icebergDataType = "timestamp", // spark timestamp => iceberg timestamptz
+     tableData = Seq(
+       toTimestamp("2077-11-11 3:23:11.23456+02:15")
+     ),
+     extractFunc = row => {
+       timestamptzExtracter(row, pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSSSXXX")
+     },
+     expectedStats = Seq("2077-11-11T01:08:11.23456+00:00"),
+     dataSkippingTestParams = Seq(
+       DataSkippingTestParam(
+         predicate = "col2 > TIMESTAMP'2077-11-11T03:23:11.23456+02:16'",
+         expectedFilesReadNum = 1,
+         expectedFilesReadIndices = Set(1)
+       ),
+       DataSkippingTestParam(
+         predicate = "col2 < TIMESTAMP'2077-11-11T03:23:11.23456+02:16'",
+         expectedFilesReadNum = 0,
+         expectedFilesReadIndices = Set()
+       )
+     ),
+     mode = mode
+   )
+ }
+
+ // Exactly on minutes
+ testClone("Convert Iceberg timestamp type - 1") { mode =>
+   testStatsConversionAndDataSkipping(
+     icebergDataType = "timestamp_ntz", // spark timestamp_ntz => iceberg timestamp
+     tableData = Seq(
+       toTimestampNTZ("2024-01-02T02:04:05.123456")
+     ),
+     extractFunc = row => {
+       row.get(0).asInstanceOf[LocalDateTime].toString
+     },
+     expectedStats = Seq("2024-01-02T02:04:05.123456"),
+     dataSkippingTestParams = Seq(
+       DataSkippingTestParam(
+         predicate = "col2 > TIMESTAMP'2024-01-02T02:04:04.123456'",
+         expectedFilesReadNum = 1,
+         expectedFilesReadIndices = Set(1)
+       )
+     ),
+     mode = mode
+   )
+ }
+
+ // Fractional time
+ testClone("Convert Iceberg timestamp type - 2") { mode =>
+   testStatsConversionAndDataSkipping(
+     icebergDataType = "timestamp_ntz", // spark timestamp_ntz => iceberg timestamp
+     tableData = Seq(
+       toTimestampNTZ("1712-4-29T06:23:49.12")
+     ),
+     extractFunc = row => {
+       row.get(0).asInstanceOf[LocalDateTime].toString
+         .replaceAll("0+$", "") // remove trailing zeros
+     },
+     expectedStats = Seq("1712-04-29T06:23:49.12"),
+     dataSkippingTestParams = Seq(
+       DataSkippingTestParam(
+         predicate = "col2 > TIMESTAMP'1712-04-29T06:23:49.11'",
+         expectedFilesReadNum = 1,
+         expectedFilesReadIndices = Set(1)
+       )
+     ),
+     mode = mode
+   )
+ }
+
+ private def toTimestamp(timestamp: String): Timestamp = {
+   toJavaTimestamp(stringToTimestamp(UTF8String.fromString(timestamp),
+     getZoneId(SQLConf.get.sessionLocalTimeZone)).get)
+ }
+
+ private def toTimestampNTZ(timestampNTZ: String): LocalDateTime = {
+   microsToLocalDateTime(
+     stringToTimestampWithoutTimeZone(
+       UTF8String.fromString(timestampNTZ)
+     ).get
+   )
+ }
+
+ private def timestamptzExtracter(row: Row, pattern: String): String = {
+   val ts = row.getTimestamp(0).toLocalDateTime.atZone(
+     getZoneId(TimeZone.getDefault.getID)
    )
+   ts.withZoneSameInstant(getZoneId(SQLConf.get.sessionLocalTimeZone))
+     .format(DateTimeFormatter.ofPattern(pattern))
+     .replace("UTC", "+00:00")
+     .replace("Z", "+00:00")
  }
 }
```
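The new private helpers build Spark timestamps from strings and turn a row's Timestamp back into an ISO string with an explicit offset so it can be compared against the converted stats. A standalone, Spark-free sketch of what timestamptzExtracter is doing; the zone and sample value below are illustrative assumptions, not taken from the suite:

```scala
import java.sql.Timestamp
import java.time.ZoneId
import java.time.format.DateTimeFormatter
import java.util.TimeZone

object TimestamptzExtractSketch {
  // Interpret a java.sql.Timestamp in the JVM default zone, shift it to a target
  // "session" zone, and render it with an explicit offset (normalizing "Z" to "+00:00").
  def extract(ts: Timestamp, sessionZone: String, pattern: String): String = {
    ts.toLocalDateTime
      .atZone(TimeZone.getDefault.toZoneId)
      .withZoneSameInstant(ZoneId.of(sessionZone))
      .format(DateTimeFormatter.ofPattern(pattern))
      .replace("Z", "+00:00")
  }

  def main(args: Array[String]): Unit = {
    // Sample wall-clock value interpreted in the JVM default zone.
    val ts = Timestamp.valueOf("1908-03-15 10:01:17")
    println(extract(ts, "UTC", "yyyy-MM-dd'T'HH:mm:ssXXX"))
  }
}
```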

iceberg/src/test/scala/org/apache/spark/sql/delta/commands/convert/IcebergStatsUtilsSuite.scala

Lines changed: 12 additions & 4 deletions
```diff
@@ -117,7 +117,7 @@ class IcebergStatsUtilsSuite extends SparkFunSuite {
     assertResult(expectedStatsObj)(actualStatsObj)
   }

-  test("stats conversion from timestamp 64 is disabled") {
+  test("stats conversion for decimal and timestamp") {
     val icebergSchema = new Schema(10, Seq[NestedField](
       NestedField.required(1, "col_ts", TimestampType.withZone),
       NestedField.required(2, "col_tsnz", TimestampType.withoutZone),

@@ -152,9 +152,17 @@ class IcebergStatsUtilsSuite extends SparkFunSuite {
     assertResult(
       JsonUtils.fromJson[StatsObject](
         """{"numRecords":1251,
-          |"maxValues":{"col_decimal":9.99999},
-          |"minValues":{"col_decimal":3.44141},
-          |"nullCount":{"col_decimal":31}}""".stripMargin))(
+          |"maxValues":{
+          | "col_ts":"2024-12-17T00:22:59+00:00",
+          | "col_tsnz":"2024-12-17T00:22:59",
+          | "col_decimal":9.99999
+          | },
+          |"minValues":{
+          | "col_ts":"2024-12-16T23:32:59+00:00",
+          | "col_tsnz":"2024-12-16T23:32:59",
+          | "col_decimal":3.44141
+          | },
+          |"nullCount":{"col_ts":20,"col_tsnz":10,"col_decimal":31}}""".stripMargin))(
       JsonUtils.fromJson[StatsObject](deltaStats))
   }
```
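As the updated expectation shows, converted timestamptz stats (col_ts) carry an explicit +00:00 offset while zone-less timestamp stats (col_tsnz) do not, and nullCount is tracked per column. A minimal sketch that just captures that expected JSON shape, copied from the assertion above:

```scala
object ExpectedTimestampStatsShape {
  // Per-file Delta stats expected after converting the Iceberg metrics in this test:
  // col_ts is timestamptz (offset kept), col_tsnz is timestamp without zone (no offset).
  val expectedStats: String =
    """{"numRecords":1251,
      |"maxValues":{"col_ts":"2024-12-17T00:22:59+00:00","col_tsnz":"2024-12-17T00:22:59","col_decimal":9.99999},
      |"minValues":{"col_ts":"2024-12-16T23:32:59+00:00","col_tsnz":"2024-12-16T23:32:59","col_decimal":3.44141},
      |"nullCount":{"col_ts":20,"col_tsnz":10,"col_decimal":31}}""".stripMargin

  def main(args: Array[String]): Unit = println(expectedStats)
}
```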
