
Commit 2665b52

feat: Support Type widening: byte → short/int/long, short → int/long (#1770)
* Support type widening for Spark 4.0
* formatting
* add one more test
* formatting
1 parent 38beead commit 2665b52
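
As a quick illustration of the behavior this commit enables, here is a minimal, hedged sketch (not code from the commit; the object name, path, and session setup are illustrative, and it assumes Comet is on the classpath with CometConf importable from org.apache.comet): it writes a Parquet file with a SHORT column and reads it back with a wider LONG schema, using the same schema-evolution flag the new tests turn on.

import org.apache.comet.CometConf
import org.apache.spark.sql.SparkSession

object TypeWideningSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[1]")
      // Same flag the new tests enable via withSQLConf.
      .config(CometConf.COMET_SCHEMA_EVOLUTION_ENABLED.key, "true")
      .getOrCreate()
    import spark.implicits._

    // Write a Parquet file whose single column is a SHORT.
    val path = "/tmp/comet-widening-short" // illustrative path
    (1 to 10).map(_.toShort).toDF("col1").write.mode("overwrite").parquet(path)

    // Read the same file back with a wider LONG schema (short -> long widening).
    val widened = spark.read.schema("col1 LONG").parquet(path)
    widened.show()

    spark.stop()
  }
}

The new tests added below exercise the same pattern through the CometTestBase helpers (withSQLConf, withTempPath, checkAnswer).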

2 files changed, +96 -0 lines changed


native/core/src/parquet/read/column.rs

Lines changed: 14 additions & 0 deletions
@@ -154,13 +154,27 @@ impl ColumnReader {
                         )
                     }
                 }
+                // promote byte to short
+                PhysicalType::INT32 if promotion_info.bit_width == 16 => {
+                    typed_reader!(Int16ColumnReader, Int16)
+                }
+                // promote byte to int
+                PhysicalType::INT32 if promotion_info.bit_width == 32 => {
+                    typed_reader!(Int32ColumnReader, Int32)
+                }
+                // promote byte to long
+                PhysicalType::INT64 => typed_reader!(Int32To64ColumnReader, Int64),
                 _ => typed_reader!(Int8ColumnReader, Int8),
             },
             (8, false) => typed_reader!(UInt8ColumnReader, Int16),
             (16, true) => match promotion_info.physical_type {
                 PhysicalType::DOUBLE => {
                     typed_reader!(Int16ToDoubleColumnReader, Float64)
                 }
+                // promote short to long
+                PhysicalType::INT64 => {
+                    typed_reader!(Int32To64ColumnReader, Int64)
+                }
                 PhysicalType::INT32 if promotion_info.bit_width == 32 => {
                     typed_reader!(Int32ColumnReader, Int32)
                 }
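
The added arms extend the dispatch for signed 8-bit and 16-bit columns: promotion_info describes the type requested by the Spark read schema, so a byte column can now be decoded as short (Int16ColumnReader), int (Int32ColumnReader), or long (Int32To64ColumnReader), and a short column as long. As a hedged sketch of which reads would hit each new byte-widening arm when the Comet native scan serves the file (column name, path, and session setup are illustrative, not from the commit):

import org.apache.comet.CometConf
import org.apache.spark.sql.SparkSession

object ByteWideningArmsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[1]")
      .config(CometConf.COMET_SCHEMA_EVOLUTION_ENABLED.key, "true")
      .getOrCreate()
    import spark.implicits._

    // A single-column Parquet file written with BYTE values.
    val path = "/tmp/comet-widening-byte" // illustrative path
    (1 to 10).map(_.toByte).toDF("col1").write.mode("overwrite").parquet(path)

    // Each wider read schema corresponds to one of the new reader arms above:
    //   col1 SHORT -> physical type INT32, bit_width == 16 -> Int16ColumnReader
    //   col1 INT   -> physical type INT32, bit_width == 32 -> Int32ColumnReader
    //   col1 LONG  -> physical type INT64                  -> Int32To64ColumnReader
    Seq("col1 SHORT", "col1 INT", "col1 LONG").foreach { ddl =>
      spark.read.schema(ddl).parquet(path).show()
    }

    spark.stop()
  }
}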

spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala

Lines changed: 82 additions & 0 deletions
@@ -1246,6 +1246,88 @@ abstract class ParquetReadSuite extends CometTestBase {
     }
   }
 
+  test("type widening: byte → short/int/long, short → int/long, int → long") {
+    withSQLConf(CometConf.COMET_SCHEMA_EVOLUTION_ENABLED.key -> "true") {
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+        val values = 1 to 10
+        val options: Map[String, String] = Map.empty[String, String]
+
+        // Input types and corresponding DataFrames
+        val inputDFs = Seq(
+          "byte" -> values.map(_.toByte).toDF("col1"),
+          "short" -> values.map(_.toShort).toDF("col1"),
+          "int" -> values.map(_.toInt).toDF("col1"))
+
+        // Target Spark read schemas for widening
+        val widenTargets = Seq(
+          "short" -> values.map(_.toShort).toDF("col1"),
+          "int" -> values.map(_.toInt).toDF("col1"),
+          "long" -> values.map(_.toLong).toDF("col1"))
+
+        for ((inputType, inputDF) <- inputDFs) {
+          val writePath = s"$path/$inputType"
+          inputDF.write.format("parquet").options(options).save(writePath)
+
+          for ((targetType, targetDF) <- widenTargets) {
+            // Only test valid widenings (e.g., don't test int → short)
+            val wideningValid = (inputType, targetType) match {
+              case ("byte", "short" | "int" | "long") => true
+              case ("short", "int" | "long") => true
+              case ("int", "long") => true
+              case _ => false
+            }
+
+            if (wideningValid) {
+              val reader = spark.read
+                .schema(s"col1 $targetType")
+                .format("parquet")
+                .options(options)
+                .load(writePath)
+
+              checkAnswer(reader, targetDF)
+            }
+          }
+        }
+      }
+    }
+  }
+
+  test("read byte, int, short, long together") {
+    withSQLConf(CometConf.COMET_SCHEMA_EVOLUTION_ENABLED.key -> "true") {
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+
+        val byteDF = (Byte.MaxValue - 2 to Byte.MaxValue).map(_.toByte).toDF("col1")
+        val shortDF = (Short.MaxValue - 2 to Short.MaxValue).map(_.toShort).toDF("col1")
+        val intDF = (Int.MaxValue - 2 to Int.MaxValue).toDF("col1")
+        val longDF = (Long.MaxValue - 2 to Long.MaxValue).toDF("col1")
+        val unionDF = byteDF.union(shortDF).union(intDF).union(longDF)
+
+        val byteDir = s"$path${File.separator}part=byte"
+        val shortDir = s"$path${File.separator}part=short"
+        val intDir = s"$path${File.separator}part=int"
+        val longDir = s"$path${File.separator}part=long"
+
+        val options: Map[String, String] = Map.empty[String, String]
+
+        byteDF.write.format("parquet").options(options).save(byteDir)
+        shortDF.write.format("parquet").options(options).save(shortDir)
+        intDF.write.format("parquet").options(options).save(intDir)
+        longDF.write.format("parquet").options(options).save(longDir)
+
+        val df = spark.read
+          .schema(unionDF.schema)
+          .format("parquet")
+          .options(options)
+          .load(path)
+          .select("col1")
+
+        checkAnswer(df, unionDF)
+      }
+    }
+  }
+
   test("scan metrics") {
     // https://github.com/apache/datafusion-comet/issues/1441
     assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_ICEBERG_COMPAT)
