Commit ef2d4a7
Avoid slow stats conversion fallback for iceberg clone (delta-io#4366)
## Description

This PR proposes to:

* Avoid the slow stats conversion fallback for Iceberg clone by default
* Allow partial stats conversion for Iceberg clone by default

More specifically:

* When stats conversion from Iceberg is off, the fallback to slow stats conversion is enabled.
* When stats conversion from Iceberg is on, the fallback to slow stats conversion does not happen if partial stats conversion is enabled. It only happens if partial stats conversion is disabled and the Iceberg source has partial stats, i.e. either minValues or maxValues is missing.

## How was this patch tested?

UTs

## Does this PR introduce _any_ user-facing changes?

**Current**: Delta tables cloned from an Iceberg source with only partial stats collect stats from Parquet footers. Here, partial stats means any of (maxValues, minValues, nullCounts) is missing.

**Future**: Delta tables cloned from an Iceberg source with only partial stats convert all available stats from the Iceberg source and do not fall back to collecting stats from Parquet footers. Here, partial stats means any of (maxValues, minValues, nullCounts) is missing.
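The decision flow above can be summarized as a small illustrative sketch. This is not code from this PR; the function and parameter names below are made up for exposition only:

```scala
// Illustrative sketch of the new fallback decision, not the actual clone code path.
// `statsConversionEnabled` and `allowPartialStats` stand in for the corresponding
// Delta SQL confs; `sourceHasPartialStats` mirrors IcebergStatsUtils.hasPartialStats.
def shouldFallBackToSlowStatsConversion(
    statsConversionEnabled: Boolean,
    allowPartialStats: Boolean,
    sourceHasPartialStats: Boolean): Boolean = {
  if (!statsConversionEnabled) {
    // Stats conversion from Iceberg is off: collect stats from Parquet footers.
    true
  } else {
    // Stats conversion is on: only fall back when partial stats conversion is
    // disabled and the Iceberg source is missing minValues or maxValues.
    !allowPartialStats && sourceHasPartialStats
  }
}
```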
1 parent ce8bee4 commit ef2d4a7

File tree

4 files changed: +138 −58 lines changed

iceberg/src/main/scala/org/apache/spark/sql/delta/IcebergFileManifest.scala

Lines changed: 13 additions & 1 deletion

@@ -59,6 +59,10 @@ class IcebergFileManifest(
     spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_CONVERT_ICEBERG_PARTITION_EVOLUTION_ENABLED)

   private val statsAllowTypes: Set[TypeID] = IcebergStatsUtils.typesAllowStatsConversion(spark)
+  private val allowPartialStatsConverted: Boolean =
+    spark.sessionState.conf.getConf(
+      DeltaSQLConf.DELTA_CLONE_ICEBERG_ALLOW_PARTIAL_STATS
+    )

   val basePath = table.location()

@@ -128,6 +132,7 @@ class IcebergFileManifest(
     }

     val shouldConvertStats = convertStats
+    val partialStatsConvertedEnabled = allowPartialStatsConverted
     val statsAllowTypesSet = statsAllowTypes

     val shouldCheckPartitionEvolution = !partitionEvolutionEnabled

@@ -163,7 +168,14 @@ class IcebergFileManifest(
           Some(convertPartition.toDelta(dataFile.partition()))
         } else None,
         stats = if (shouldConvertStats) {
-          IcebergStatsUtils.icebergStatsToDelta(localTable.schema, dataFile, statsAllowTypesSet)
+          IcebergStatsUtils.icebergStatsToDelta(
+            localTable.schema,
+            dataFile,
+            statsAllowTypesSet,
+            shouldSkipForFile = (df: DataFile) => {
+              !partialStatsConvertedEnabled && IcebergStatsUtils.hasPartialStats(df)
+            }
+          )
         } else None
       )
     }
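Read together, the manifest now skips per-file Iceberg stats conversion only when partial stats conversion is disabled and that file is missing either bound. A minimal illustrative restatement of the predicate (not additional code in this PR):

```scala
// Illustrative only: the per-file skip predicate passed to icebergStatsToDelta.
// Returning true yields None, which leaves stats collection to the slow
// Parquet-footer path when that fallback is enabled.
val skipForFile: DataFile => Boolean = (df: DataFile) =>
  !partialStatsConvertedEnabled && IcebergStatsUtils.hasPartialStats(df)

// partialStatsConvertedEnabled = true  -> never skip; convert whatever stats exist
// partialStatsConvertedEnabled = false -> skip files missing minValues or maxValues
```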

iceberg/src/main/scala/org/apache/spark/sql/delta/IcebergStatsUtils.scala

Lines changed: 38 additions & 22 deletions

@@ -18,6 +18,7 @@ package org.apache.spark.sql.delta.commands.convert

 import java.lang.{Integer => JInt, Long => JLong}
 import java.nio.ByteBuffer
+import java.util.{Map => JMap}

 import scala.collection.JavaConverters._
 import scala.util.control.NonFatal

@@ -93,38 +94,50 @@ object IcebergStatsUtils extends DeltaLogging {
    *
    * @param icebergSchema Iceberg table schema
    * @param dataFile Iceberg DataFile that contains stats info
+   * @param statsAllowTypes Iceberg types that are allowed to convert stats
+   * @param shouldSkipForFile Function => true if a data file should be skipped
    * @return None if stats is missing on the DataFile or error occurs during conversion
    */
   def icebergStatsToDelta(
       icebergSchema: Schema,
       dataFile: DataFile,
-      statsAllowTypes: Set[TypeID]): Option[String] = {
+      statsAllowTypes: Set[TypeID],
+      shouldSkipForFile: DataFile => Boolean): Option[String] = {
+    if (shouldSkipForFile(dataFile)) {
+      return None
+    }
     try {
-      // Any empty or null fields means Iceberg has disabled column stats
-      if (dataFile.upperBounds == null ||
-          dataFile.upperBounds.isEmpty ||
-          dataFile.lowerBounds == null ||
-          dataFile.lowerBounds.isEmpty ||
-          dataFile.nullValueCounts == null ||
-          dataFile.nullValueCounts.isEmpty
-      ) {
-        return None
-      }
       Some(icebergStatsToDelta(
         icebergSchema,
         dataFile.recordCount,
-        dataFile.upperBounds.asScala.toMap,
-        dataFile.lowerBounds.asScala.toMap,
-        dataFile.nullValueCounts.asScala.toMap,
+        Option(dataFile.upperBounds).map(_.asScala.toMap).filter(_.nonEmpty),
+        Option(dataFile.lowerBounds).map(_.asScala.toMap).filter(_.nonEmpty),
+        Option(dataFile.nullValueCounts).map(_.asScala.toMap).filter(_.nonEmpty),
         statsAllowTypes
       ))
     } catch {
       case NonFatal(e) =>
-        logError("Exception while converting Iceberg stats to Delta format", e)
+        logInfo("[Iceberg-Stats-Conversion] " +
+          "Exception while converting Iceberg stats to Delta format", e)
         None
     }
   }

+  def hasPartialStats(dataFile: DataFile): Boolean = {
+    def nonEmptyMap[K, V](m: JMap[K, V]): Boolean = {
+      m != null && !m.isEmpty
+    }
+    // nullValueCounts is less common, so we ignore it
+    val hasPartialStats =
+      !nonEmptyMap(dataFile.upperBounds()) ||
+      !nonEmptyMap(dataFile.lowerBounds())
+    if (hasPartialStats) {
+      logInfo(s"[Iceberg-Stats-Conversion] $dataFile only has partial stats:" +
+        s"upperBounds=${dataFile.upperBounds}, lowerBounds = ${dataFile.lowerBounds()}")
+    }
+    hasPartialStats
+  }
+
   /**
    * Convert Iceberg DataFile stats into Delta stats.
    *

@@ -176,9 +189,9 @@ object IcebergStatsUtils extends DeltaLogging {
   private[convert] def icebergStatsToDelta(
       icebergSchema: Schema,
       numRecords: Long,
-      maxMap: Map[JInt, ByteBuffer],
-      minMap: Map[JInt, ByteBuffer],
-      nullCountMap: Map[JInt, JLong],
+      maxMap: Option[Map[JInt, ByteBuffer]],
+      minMap: Option[Map[JInt, ByteBuffer]],
+      nullCountMap: Option[Map[JInt, JLong]],
       statsAllowTypes: Set[TypeID]): String = {

     def deserialize(ftype: IcebergType, value: Any): Any = {

@@ -222,11 +235,14 @@ object IcebergStatsUtils extends DeltaLogging {

     JsonUtils.toJson(
       Map(
-        NUM_RECORDS -> numRecords,
-        MAX -> collectStats(icebergSchema.columns, maxMap, deserialize, statsAllowTypes),
-        MIN -> collectStats(icebergSchema.columns, minMap, deserialize, statsAllowTypes),
+        NUM_RECORDS -> numRecords
+      ) ++ maxMap.map(
+        MAX -> collectStats(icebergSchema.columns, _, deserialize, statsAllowTypes)
+      ) ++ minMap.map(
+        MIN -> collectStats(icebergSchema.columns, _, deserialize, statsAllowTypes)
+      ) ++ nullCountMap.map(
         NULL_COUNT -> collectStats(
-          icebergSchema.columns, nullCountMap, (_: IcebergType, v: Any) => v, statsAllowTypes
+          icebergSchema.columns, _, (_: IcebergType, v: Any) => v, statsAllowTypes
        )
      )
    )
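The switch to Option-wrapped maps means the final stats JSON simply omits any section that is absent. Below is a minimal, self-contained sketch of that assembly pattern, using plain Scala maps and a hypothetical `buildStatsMap` with string keys in place of the real MAX/MIN/NULL_COUNT constants:

```scala
// Minimal sketch of the "include a stats section only when it exists" pattern.
// Stand-ins: plain Long values instead of deserialized Iceberg bounds.
def buildStatsMap(
    numRecords: Long,
    maxValues: Option[Map[String, Long]],
    minValues: Option[Map[String, Long]],
    nullCount: Option[Map[String, Long]]): Map[String, Any] = {
  Map[String, Any]("numRecords" -> numRecords) ++
    maxValues.map("maxValues" -> _) ++
    minValues.map("minValues" -> _) ++
    nullCount.map("nullCount" -> _)
}

// With maxValues missing, the resulting stats simply omit that section:
// buildStatsMap(0L, None, Some(Map("col_int" -> 3L)), Some(Map("col_int" -> 2L)))
//   == Map("numRecords" -> 0L,
//          "minValues" -> Map("col_int" -> 3L),
//          "nullCount" -> Map("col_int" -> 2L))
```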

iceberg/src/test/scala/org/apache/spark/sql/delta/commands/convert/IcebergStatsUtilsSuite.scala

Lines changed: 70 additions & 35 deletions

@@ -19,7 +19,7 @@ package org.apache.spark.sql.delta.commands.convert
 import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Integer => JInt, Long => JLong}
 import java.math.BigDecimal
 import java.nio.ByteBuffer
-import java.util.{List => JList, Map => JMap}
+import java.util.{HashMap => JHashMap, List => JList, Map => JMap}

 import scala.collection.JavaConverters._

@@ -101,9 +101,9 @@ class IcebergStatsUtilsSuite extends SparkFunSuite with SharedSparkSession {
     val deltaStats = IcebergStatsUtils.icebergStatsToDelta(
       icebergSchema,
       1251,
-      minMap,
-      maxMap,
-      nullCountMap,
+      Some(minMap),
+      Some(maxMap),
+      Some(nullCountMap),
       statsAllowTypes = StatsAllowTypes
     )

@@ -131,27 +131,27 @@ class IcebergStatsUtilsSuite extends SparkFunSuite with SharedSparkSession {
     val deltaStats = IcebergStatsUtils.icebergStatsToDelta(
       icebergSchema,
       1251,
-      minMap = Map(
+      minMap = Some(Map(
         Integer.valueOf(1) ->
           Conversions.toByteBuffer(TimestampType.withZone, JLong.valueOf(1734391979000000L)),
         Integer.valueOf(2) ->
           Conversions.toByteBuffer(TimestampType.withoutZone, JLong.valueOf(1734391979000000L)),
         Integer.valueOf(3) ->
           Conversions.toByteBuffer(DecimalType.of(10, 5), new BigDecimal("3.44141"))
-      ),
-      maxMap = Map(
+      )),
+      maxMap = Some(Map(
         Integer.valueOf(1) ->
           Conversions.toByteBuffer(TimestampType.withZone, JLong.valueOf(1734394979000000L)),
         Integer.valueOf(2) ->
           Conversions.toByteBuffer(TimestampType.withoutZone, JLong.valueOf(1734394979000000L)),
         Integer.valueOf(3) ->
           Conversions.toByteBuffer(DecimalType.of(10, 5), new BigDecimal("9.99999"))
-      ),
-      nullCountMap = Map(
+      )),
+      nullCountMap = Some(Map(
         Integer.valueOf(1) -> JLong.valueOf(20),
         Integer.valueOf(2) -> JLong.valueOf(10),
         Integer.valueOf(3) -> JLong.valueOf(31)
-      ),
+      )),
       statsAllowTypes = StatsAllowTypes
     )
     assertResult(

@@ -180,23 +180,23 @@ class IcebergStatsUtilsSuite extends SparkFunSuite with SharedSparkSession {
     val deltaStats = IcebergStatsUtils.icebergStatsToDelta(
       icebergSchema,
       1251,
-      minMap = Map(
+      minMap = Some(Map(
         Integer.valueOf(1) -> Conversions.toByteBuffer(IntegerType.get, JInt.valueOf(-5)),
         Integer.valueOf(2) -> Conversions.toByteBuffer(LongType.get, null),
         Integer.valueOf(3) -> null
-      ),
-      maxMap = Map(
+      )),
+      maxMap = Some(Map(
         Integer.valueOf(1) -> Conversions.toByteBuffer(IntegerType.get, JInt.valueOf(5)),
         // stats for value 2 is missing
         Integer.valueOf(3) -> Conversions.toByteBuffer(StringType.get, "maxval"),
         Integer.valueOf(5) -> Conversions.toByteBuffer(StringType.get, "maxval")
-      ),
-      nullCountMap = Map(
+      )),
+      nullCountMap = Some(Map(
         Integer.valueOf(1) -> JLong.valueOf(0),
         Integer.valueOf(2) -> null,
         Integer.valueOf(3) -> JLong.valueOf(2),
         Integer.valueOf(5) -> JLong.valueOf(3)
-      ),
+      )),
       statsAllowTypes = StatsAllowTypes
     )
     assertResult(

@@ -209,33 +209,68 @@ class IcebergStatsUtilsSuite extends SparkFunSuite with SharedSparkSession {
       JsonUtils.fromJson[StatsObject](deltaStats))
   }

+  private def testStatsConversion(
+      expectedStatsJson: String, dataFile: DataFile, icebergSchema: Schema): Unit = {
+    val expectedStats = JsonUtils.fromJson[StatsObject](expectedStatsJson)
+    val actualStats =
+      IcebergStatsUtils.icebergStatsToDelta(
+        icebergSchema, dataFile, StatsAllowTypes, shouldSkipForFile = _ => false
+      )
+      .map(JsonUtils.fromJson[StatsObject](_))
+      .get
+    assertResult(expectedStats)(actualStats)
+  }
+
   test("stats conversion while DataFile misses the stats fields") {
     val icebergSchema = new Schema(10, Seq[NestedField](
       NestedField.required(1, "col_int", IntegerType.get),
       NestedField.required(2, "col_long", LongType.get),
       NestedField.required(3, "col_st", StringType.get)
     ).asJava)
-    val expectedStats = JsonUtils.fromJson[StatsObject](
+    val expectedStatsJson =
       """{"numRecords":0,"maxValues":{"col_int":100992003},
         |"minValues":{"col_int":100992003},"nullCount":{"col_int":2}}"""
-        .stripMargin)
-    val actualStats =
-      IcebergStatsUtils.icebergStatsToDelta(icebergSchema, DummyDataFile(), StatsAllowTypes)
-        .map(JsonUtils.fromJson[StatsObject](_))
-        .get
-    assertResult(expectedStats)(actualStats)
-    assertResult(None)(IcebergStatsUtils.icebergStatsToDelta(
-      icebergSchema,
-      DummyDataFile(upperBounds = null),
-      statsAllowTypes = StatsAllowTypes))
-    assertResult(None)(IcebergStatsUtils.icebergStatsToDelta(
-      icebergSchema,
-      DummyDataFile(lowerBounds = null),
-      statsAllowTypes = StatsAllowTypes))
-    assertResult(None)(IcebergStatsUtils.icebergStatsToDelta(
-      icebergSchema,
-      DummyDataFile(nullValueCounts = null),
-      statsAllowTypes = StatsAllowTypes))
+        .stripMargin
+    testStatsConversion(expectedStatsJson, DummyDataFile(), icebergSchema)
+
+    val expectedStatsWithoutUpperBound =
+      """{"numRecords":0,"minValues":{"col_int":100992003},
+        |"nullCount":{"col_int":2}}"""
+        .stripMargin
+    testStatsConversion(
+      expectedStatsWithoutUpperBound, DummyDataFile(upperBounds = null), icebergSchema
+    )
+    testStatsConversion(
+      expectedStatsWithoutUpperBound,
+      DummyDataFile(upperBounds = new JHashMap[Integer, ByteBuffer]()),
+      icebergSchema
+    )
+
+    val expectedStatsWithoutLowerBound =
+      """{"numRecords":0,"maxValues":{"col_int":100992003},
+        |"nullCount":{"col_int":2}}"""
+        .stripMargin
+    testStatsConversion(
+      expectedStatsWithoutLowerBound, DummyDataFile(lowerBounds = null), icebergSchema
+    )
+    testStatsConversion(
+      expectedStatsWithoutLowerBound,
+      DummyDataFile(lowerBounds = new JHashMap[Integer, ByteBuffer]()),
+      icebergSchema
+    )
+
+    val expectedStatsWithoutNullCounts =
+      """{"numRecords":0,"maxValues":{"col_int":100992003},
+        |"minValues":{"col_int":100992003}}"""
+        .stripMargin
+    testStatsConversion(
+      expectedStatsWithoutNullCounts, DummyDataFile(nullValueCounts = null), icebergSchema
+    )
+    testStatsConversion(
+      expectedStatsWithoutNullCounts,
+      DummyDataFile(nullValueCounts = new JHashMap[Integer, JLong]()),
+      icebergSchema
+    )
   }
 }
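The helper above pins `shouldSkipForFile = _ => false`; the skip path itself could be exercised separately, as in this hedged sketch that reuses the suite's fixtures (not part of this PR's test changes):

```scala
// Illustrative only: the new hook short-circuits conversion before any stats
// are read, so a predicate that always skips must yield None.
assertResult(None)(IcebergStatsUtils.icebergStatsToDelta(
  icebergSchema,
  DummyDataFile(),
  StatsAllowTypes,
  shouldSkipForFile = _ => true
))
```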

spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala

Lines changed: 17 additions & 0 deletions

@@ -2367,6 +2367,23 @@ trait DeltaSQLConfBase {
       .booleanConf
       .createWithDefault(true)

+  /**
+   * For iceberg clone,
+   * When stats conversion from iceberg off, fallback to slow stats conversion enabled
+   * When stats conversion from iceberg on,
+   *   fallback to slow stats conversion will not happen if partial stats conversion enabled
+   *   fallback only happens if partial stats conversion disabled and iceberg has partial stats
+   *     - either minValues or maxValues is missing
+   */
+  val DELTA_CLONE_ICEBERG_ALLOW_PARTIAL_STATS =
+    buildConf("clone.iceberg.allowPartialStats")
+      .internal()
+      .doc("If true, allow converting partial stats from iceberg stats " +
+        "to delta stats during clone."
+      )
+      .booleanConf
+      .createWithDefault(true)
+
   /////////////////////
   // Optimized Write
   /////////////////////
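A hedged usage sketch of the new flag follows; it assumes the conventional `spark.databricks.delta.` prefix that `DeltaSQLConf` applies to its keys, and the clone statement below is only an example:

```scala
// Illustrative only: toggling the new flag around an Iceberg clone.
// The default is true, so partial Iceberg stats are converted and the slow
// Parquet-footer fallback is avoided.
spark.conf.set("spark.databricks.delta.clone.iceberg.allowPartialStats", "false")

// With the flag off, files missing minValues or maxValues fall back to the
// slow stats conversion (when that fallback itself is enabled).
spark.sql(
  """CREATE TABLE delta_target
    |SHALLOW CLONE iceberg.`/path/to/iceberg/table`""".stripMargin)
```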
