Commit 6361bcd

fix: correctly handle schemas with nested array of struct (native_iceberg_compat) (apache#1883)
* fix: correctly handle schemas with nested array of struct
1 parent: 17a36bc
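For context, the schema shape this commit fixes is a struct column that contains an array of structs. A minimal reproduction sketch for a spark-shell session (column names and the output path are illustrative, not from this commit; enabling Comet's native_iceberg_compat scan is assumed to be configured separately):

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// Schema shape covered by this fix: a struct column holding an array of structs.
val schema = StructType(Seq(
  StructField(
    "s",
    StructType(Seq(
      StructField("name", StringType),
      StructField(
        "items",
        ArrayType(StructType(Seq(
          StructField("a", StringType),
          StructField("b", StringType))))))))))

val data = Seq(Row(Row("x", Seq(Row("y", "z")))), Row(null))
val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)

// Round-trip through Parquet; the read path is where the field-id-aware
// schema mapping previously mishandled the nested list group.
df.write.mode("overwrite").parquet("/tmp/nested_array_of_struct")
spark.read.parquet("/tmp/nested_array_of_struct").show(truncate = false)
```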

2 files changed: +81 -3 lines

common/src/main/java/org/apache/comet/parquet/NativeBatchReader.java

Lines changed: 9 additions & 3 deletions
@@ -533,13 +533,20 @@ private StructType getSparkSchemaByFieldId(
     return newSchema;
   }
 
+  private static boolean isPrimitiveCatalystType(DataType dataType) {
+    return !(dataType instanceof ArrayType)
+        && !(dataType instanceof MapType)
+        && !(dataType instanceof StructType);
+  }
+
   private DataType getSparkTypeByFieldId(
       DataType dataType, Type parquetType, boolean caseSensitive) {
     DataType newDataType;
     if (dataType instanceof StructType) {
       newDataType =
           getSparkSchemaByFieldId((StructType) dataType, parquetType.asGroupType(), caseSensitive);
-    } else if (dataType instanceof ArrayType) {
+    } else if (dataType instanceof ArrayType
+        && !isPrimitiveCatalystType(((ArrayType) dataType).elementType())) {
 
       newDataType =
           getSparkArrayTypeByFieldId(
@@ -575,11 +582,10 @@ private DataType getSparkTypeByFieldId(
   }
 
   private DataType getSparkArrayTypeByFieldId(
-      ArrayType arrayType, GroupType parquetType, boolean caseSensitive) {
+      ArrayType arrayType, GroupType parquetList, boolean caseSensitive) {
     DataType newDataType;
     DataType elementType = arrayType.elementType();
     DataType newElementType;
-    Type parquetList = parquetType.getFields().get(0);
     Type parquetElementType;
     if (parquetList.getLogicalTypeAnnotation() == null
         && parquetList.isRepetition(Type.Repetition.REPEATED)) {
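For reference, Spark writes an ArrayType of StructType using Parquet's three-level LIST layout. After this change, getSparkArrayTypeByFieldId receives that LIST group directly as parquetList instead of unwrapping the first field of a parent group, and the new isPrimitiveCatalystType guard keeps arrays of primitive elements from entering this path at all. A sketch of the layout (hypothetical schema text, parsed with parquet-mr's MessageTypeParser; the `= n` suffixes are field ids):

```scala
import org.apache.parquet.schema.MessageTypeParser

// Three-level LIST layout for an array-of-struct column like the test's "_2":
// LIST group -> repeated "list" group -> "element" struct.
val message = MessageTypeParser.parseMessageType(
  """message spark_schema {
    |  optional group _2 (LIST) {
    |    repeated group list {
    |      optional group element {
    |        optional binary _1 (UTF8) = 2;
    |        optional binary _2 (UTF8) = 3;
    |      }
    |    }
    |  }
    |}""".stripMargin)

// The fixed method now receives this LIST group itself as `parquetList`,
// rather than a parent group whose first child it had to peel off.
val listGroup = message.getType("_2").asGroupType()
```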

spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala

Lines changed: 72 additions & 0 deletions
@@ -41,6 +41,7 @@ import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.comet.{CometBatchScanExec, CometNativeScanExec, CometScanExec}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -1745,6 +1746,77 @@ abstract class ParquetReadSuite extends CometTestBase {
     }
   }
+
+  private def withId(id: Int) =
+    new MetadataBuilder().putLong(ParquetUtils.FIELD_ID_METADATA_KEY, id).build()
+
+  // Based on Spark ParquetIOSuite.test("vectorized reader: array of nested struct")
+  test("array of nested struct with and without field id") {
+    val nestedSchema = StructType(
+      Seq(StructField(
+        "_1",
+        StructType(Seq(
+          StructField("_1", StringType, nullable = true, withId(1)), // Field ID 1
+          StructField(
+            "_2",
+            ArrayType(StructType(Seq(
+              StructField("_1", StringType, nullable = true, withId(2)), // Field ID 2
+              StructField("_2", StringType, nullable = true, withId(3)) // Field ID 3
+            ))),
+            nullable = true))),
+        nullable = true)))
+    val nestedSchemaNoId = StructType(
+      Seq(StructField(
+        "_1",
+        StructType(Seq(
+          StructField("_1", StringType, nullable = true),
+          StructField(
+            "_2",
+            ArrayType(StructType(Seq(
+              StructField("_1", StringType, nullable = true),
+              StructField("_2", StringType, nullable = true)))),
+            nullable = true))),
+        nullable = true)))
+    // data matching the schema
+    val data = Seq(
+      Row(Row("a", null)),
+      Row(Row("b", Seq(Row("c", "d")))),
+      Row(null),
+      Row(Row("e", Seq(Row("f", null), Row(null, "g")))),
+      Row(Row(null, null)),
+      Row(Row(null, Seq(null))),
+      Row(Row(null, Seq(Row(null, null), Row("h", null), null))),
+      Row(Row("i", Seq())),
+      Row(null))
+    val answer =
+      Row(Row("a", null)) ::
+        Row(Row("b", Seq(Row("c", "d")))) ::
+        Row(null) ::
+        Row(Row("e", Seq(Row("f", null), Row(null, "g")))) ::
+        Row(Row(null, null)) ::
+        Row(Row(null, Seq(null))) ::
+        Row(Row(null, Seq(Row(null, null), Row("h", null), null))) ::
+        Row(Row("i", Seq())) ::
+        Row(null) ::
+        Nil
+
+    withSQLConf(SQLConf.PARQUET_FIELD_ID_READ_ENABLED.key -> "true") {
+      val df = spark.createDataFrame(spark.sparkContext.parallelize(data), nestedSchema)
+      withTempPath { path =>
+        df.write.parquet(path.getCanonicalPath)
+        readParquetFile(path.getCanonicalPath) { df =>
+          checkAnswer(df, answer)
+        }
+      }
+      val df2 = spark.createDataFrame(spark.sparkContext.parallelize(data), nestedSchemaNoId)
+      withTempPath { path =>
+        df2.write.parquet(path.getCanonicalPath)
+        readParquetFile(path.getCanonicalPath) { df =>
+          checkAnswer(df, answer)
+        }
+      }
+    }
+  }
 }
 
 class ParquetReadV1Suite extends ParquetReadSuite with AdaptiveSparkPlanHelper {
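The withId helper above stores the field id under Spark's ParquetUtils.FIELD_ID_METADATA_KEY ("parquet.field.id") in the column metadata; Spark's Parquet writer propagates that id into the file schema (when field-id writing is enabled, its default), and SQLConf.PARQUET_FIELD_ID_READ_ENABLED turns on id-based field matching at read time. A standalone sketch of what the helper builds:

```scala
import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
import org.apache.spark.sql.types.{MetadataBuilder, StringType, StructField}

// Equivalent of the suite's withId(1): column metadata carrying a field id.
val meta = new MetadataBuilder()
  .putLong(ParquetUtils.FIELD_ID_METADATA_KEY, 1L)
  .build()

val field = StructField("_1", StringType, nullable = true, meta)
println(field.metadata.json) // {"parquet.field.id":1}
```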
