Commit 59fae94

feat: add read array support (#1456)
* feat: add read array support
1 parent 4ed00cf · commit 59fae94

File tree

8 files changed: +469, -151 lines

native/Cargo.lock

Lines changed: 255 additions & 142 deletions
(Generated file; the diff is not rendered by default.)

native/core/Cargo.toml

Lines changed: 1 addition & 0 deletions

@@ -77,6 +77,7 @@ jni = { version = "0.21", features = ["invocation"] }
 lazy_static = "1.4"
 assertables = "7"
 hex = "0.4.3"
+datafusion-functions-nested = "46.0.0"
 
 [features]
 default = []

native/core/src/execution/planner.rs

Lines changed: 129 additions & 1 deletion

@@ -2686,12 +2686,14 @@ mod tests {
 
     use arrow::array::{DictionaryArray, Int32Array, StringArray};
    use arrow::datatypes::DataType;
-    use datafusion::{physical_plan::common::collect, prelude::SessionContext};
+    use datafusion::logical_expr::ScalarUDF;
+    use datafusion::{assert_batches_eq, physical_plan::common::collect, prelude::SessionContext};
     use tokio::sync::mpsc;
 
     use crate::execution::{operators::InputBatch, planner::PhysicalPlanner};
 
     use crate::execution::operators::ExecutionError;
+    use datafusion_comet_proto::spark_expression::expr::ExprStruct;
     use datafusion_comet_proto::{
         spark_expression::expr::ExprStruct::*,
         spark_expression::Expr,
@@ -3004,4 +3006,130 @@
             type_info: None,
         }
     }
+
+    #[test]
+    fn test_create_array() {
+        let session_ctx = SessionContext::new();
+        session_ctx.register_udf(ScalarUDF::from(
+            datafusion_functions_nested::make_array::MakeArray::new(),
+        ));
+        let task_ctx = session_ctx.task_ctx();
+        let planner = PhysicalPlanner::new(Arc::from(session_ctx));
+
+        // Create a plan for
+        // ProjectionExec: expr=[make_array(col_0@0) as col_0]
+        //   ScanExec: source=[CometScan parquet (unknown)], schema=[col_0: Int32]
+        let op_scan = Operator {
+            plan_id: 0,
+            children: vec![],
+            op_struct: Some(OpStruct::Scan(spark_operator::Scan {
+                fields: vec![
+                    spark_expression::DataType {
+                        type_id: 3, // Int32
+                        type_info: None,
+                    },
+                    spark_expression::DataType {
+                        type_id: 3, // Int32
+                        type_info: None,
+                    },
+                    spark_expression::DataType {
+                        type_id: 3, // Int32
+                        type_info: None,
+                    },
+                ],
+                source: "".to_string(),
+            })),
+        };
+
+        let array_col = spark_expression::Expr {
+            expr_struct: Some(Bound(spark_expression::BoundReference {
+                index: 0,
+                datatype: Some(spark_expression::DataType {
+                    type_id: 3,
+                    type_info: None,
+                }),
+            })),
+        };
+
+        let array_col_1 = spark_expression::Expr {
+            expr_struct: Some(Bound(spark_expression::BoundReference {
+                index: 1,
+                datatype: Some(spark_expression::DataType {
+                    type_id: 3,
+                    type_info: None,
+                }),
+            })),
+        };
+
+        let projection = Operator {
+            children: vec![op_scan],
+            plan_id: 0,
+            op_struct: Some(OpStruct::Projection(spark_operator::Projection {
+                project_list: vec![spark_expression::Expr {
+                    expr_struct: Some(ExprStruct::ScalarFunc(spark_expression::ScalarFunc {
+                        func: "make_array".to_string(),
+                        args: vec![array_col, array_col_1],
+                        return_type: None,
+                    })),
+                }],
+            })),
+        };
+
+        let a = Int32Array::from(vec![0, 3]);
+        let b = Int32Array::from(vec![1, 4]);
+        let c = Int32Array::from(vec![2, 5]);
+        let input_batch = InputBatch::Batch(vec![Arc::new(a), Arc::new(b), Arc::new(c)], 2);
+
+        let (mut scans, datafusion_plan) =
+            planner.create_plan(&projection, &mut vec![], 1).unwrap();
+        scans[0].set_input_batch(input_batch);
+
+        let mut stream = datafusion_plan.native_plan.execute(0, task_ctx).unwrap();
+
+        let runtime = tokio::runtime::Runtime::new().unwrap();
+        let (tx, mut rx) = mpsc::channel(1);
+
+        // Separate task to send the input batch and then the EOF signal
+        runtime.spawn(async move {
+            // Re-create the same two-row input batch and follow it with EOF.
+            let a = Int32Array::from(vec![0, 3]);
+            let b = Int32Array::from(vec![1, 4]);
+            let c = Int32Array::from(vec![2, 5]);
+            let input_batch1 = InputBatch::Batch(vec![Arc::new(a), Arc::new(b), Arc::new(c)], 2);
+            let input_batch2 = InputBatch::EOF;
+
+            let batches = vec![input_batch1, input_batch2];
+
+            for batch in batches.into_iter() {
+                tx.send(batch).await.unwrap();
+            }
+        });
+
+        runtime.block_on(async move {
+            loop {
+                let batch = rx.recv().await.unwrap();
+                scans[0].set_input_batch(batch);
+                match poll!(stream.next()) {
+                    Poll::Ready(Some(batch)) => {
+                        assert!(batch.is_ok(), "got error {}", batch.unwrap_err());
+                        let batch = batch.unwrap();
+                        assert_eq!(batch.num_rows(), 2);
+                        let expected = [
+                            "+--------+",
+                            "| col_0  |",
+                            "+--------+",
+                            "| [0, 1] |",
+                            "| [3, 4] |",
+                            "+--------+",
+                        ];
+                        assert_batches_eq!(expected, &[batch]);
+                    }
+                    Poll::Ready(None) => {
+                        break;
+                    }
+                    _ => {}
+                }
+            }
+        });
+    }
 }
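For reference, the hand-built plan above projects make_array over two Int32 columns per row and expects the arrays [0, 1] and [3, 4]. A rough Spark-level analogue of the same computation (illustrative only; the test drives the native planner directly through the protobuf operators rather than through Spark):

  import org.apache.spark.sql.SparkSession

  // Two Int32 columns per row, combined into a single array column, mirroring
  // the ProjectionExec built in test_create_array above.
  val spark = SparkSession.builder().master("local[1]").getOrCreate()
  spark
    .sql("select array(col_0, col_1) as col_0 from values (0, 1), (3, 4) as tbl(col_0, col_1)")
    .show()
  // +------+
  // | col_0|
  // +------+
  // |[0, 1]|
  // |[3, 4]|
  // +------+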

native/core/src/execution/shuffle/row.rs

Lines changed: 1 addition & 0 deletions

@@ -3197,6 +3197,7 @@ fn make_builders(
             // Disable dictionary encoding for array element
             let value_builder =
                 make_builders(field.data_type(), NESTED_TYPE_BUILDER_CAPACITY, 1.0)?;
+
             match field.data_type() {
                 DataType::Boolean => {
                     let builder = downcast_builder!(BooleanBuilder, value_builder);

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 11 additions & 5 deletions

@@ -61,13 +61,16 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim
     logWarning(s"Comet native execution is disabled due to: $reason")
   }
 
-  def supportedDataType(dt: DataType, allowStruct: Boolean = false): Boolean = dt match {
+  def supportedDataType(dt: DataType, allowComplex: Boolean = false): Boolean = dt match {
     case _: ByteType | _: ShortType | _: IntegerType | _: LongType | _: FloatType |
         _: DoubleType | _: StringType | _: BinaryType | _: TimestampType | _: TimestampNTZType |
         _: DecimalType | _: DateType | _: BooleanType | _: NullType =>
       true
-    case s: StructType if allowStruct =>
-      s.fields.map(_.dataType).forall(supportedDataType(_, allowStruct))
+    case s: StructType if allowComplex =>
+      s.fields.map(_.dataType).forall(supportedDataType(_, allowComplex))
+    // TODO: Add nested array and iceberg compat support
+    // case a: ArrayType if allowComplex =>
+    //   supportedDataType(a.elementType)
     case dt =>
       emitWarning(s"unsupported Spark data type: $dt")
       false
@@ -763,7 +766,8 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim
          binding,
          (builder, binaryExpr) => builder.setLtEq(binaryExpr))
 
-      case Literal(value, dataType) if supportedDataType(dataType, allowStruct = value == null) =>
+      case Literal(value, dataType)
+          if supportedDataType(dataType, allowComplex = value == null) =>
        val exprBuilder = ExprOuterClass.Literal.newBuilder()
 
        if (value == null) {
@@ -2716,7 +2720,9 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim
        withInfo(join, "SortMergeJoin is not enabled")
        None
 
-      case op if isCometSink(op) && op.output.forall(a => supportedDataType(a.dataType, true)) =>
+      case op
+          if isCometSink(op) && op.output.forall(a =>
+            supportedDataType(a.dataType, allowComplex = true)) =>
        // These operators are source of Comet native execution chain
        val scanBuilder = OperatorOuterClass.Scan.newBuilder()
        val source = op.simpleStringWithNodeId()
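The rename from allowStruct to allowComplex reflects that the flag now gates complex types in general rather than structs only (the ArrayType case stays commented out pending nested-array and Iceberg-compat support). A minimal sketch of how the flag behaves, assuming QueryPlanSerde.supportedDataType is reachable from the calling code:

  import org.apache.spark.sql.types._
  import org.apache.comet.serde.QueryPlanSerde

  val personalInfo = StructType(
    Seq(StructField("firstName", StringType), StructField("age", IntegerType)))

  // Complex types are rejected unless the caller opts in, for example for
  // Comet sink operators or null literals.
  QueryPlanSerde.supportedDataType(personalInfo) // false
  QueryPlanSerde.supportedDataType(personalInfo, allowComplex = true) // true: every field is primitive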

spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala

Lines changed: 3 additions & 2 deletions

@@ -182,7 +182,7 @@ object CometNativeScanExec extends DataTypeSupport {
      case null => null
    }
 
-    val newArgs = mapProductIterator(scanExec, transform(_))
+    val newArgs = mapProductIterator(scanExec, transform)
    val wrapped = scanExec.makeCopy(newArgs).asInstanceOf[FileSourceScanExec]
    val batchScanExec = CometNativeScanExec(
      nativeOp,
@@ -202,9 +202,10 @@ object CometNativeScanExec extends DataTypeSupport {
  }
 
  override def isAdditionallySupported(dt: DataType): Boolean = {
-    // TODO add array and map
+    // TODO add map
    dt match {
      case s: StructType => s.fields.map(_.dataType).forall(isTypeSupported)
+      case a: ArrayType => isTypeSupported(a.elementType)
      case _ => false
    }
  }
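With the new ArrayType case, the native_datafusion scan reports an array column as supported whenever its element type is. A rough illustration, assuming isAdditionallySupported is accessible from calling code and that isTypeSupported resolves primitive element types as usual in DataTypeSupport:

  import org.apache.spark.sql.types._
  import org.apache.spark.sql.comet.CometNativeScanExec

  // An array of a supported primitive element type is now readable natively.
  CometNativeScanExec.isAdditionallySupported(ArrayType(IntegerType)) // expected: true
  // Maps (and therefore arrays of maps) remain on the TODO list.
  CometNativeScanExec.isAdditionallySupported(ArrayType(MapType(StringType, LongType))) // expected: false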

spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala

Lines changed: 3 additions & 1 deletion

@@ -487,9 +487,11 @@ object CometScanExec extends DataTypeSupport {
 
  override def isAdditionallySupported(dt: DataType): Boolean = {
    if (CometConf.COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_ICEBERG_COMPAT) {
-      // TODO add array and map
+      // TODO add map
      dt match {
        case s: StructType => s.fields.map(_.dataType).forall(isTypeSupported)
+        // TODO: Add nested array and iceberg compat support
+        // case a: ArrayType => isTypeSupported(a.elementType)
        case _ => false
      }
    } else {
spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala (new file)

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.exec
+
+import org.scalactic.source.Position
+import org.scalatest.Tag
+
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.internal.SQLConf
+
+import org.apache.comet.CometConf
+
+class CometNativeReaderSuite extends CometTestBase with AdaptiveSparkPlanHelper {
+  override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit
+      pos: Position): Unit = {
+    // TODO: Enable Iceberg compat tests
+    Seq(CometConf.SCAN_NATIVE_DATAFUSION /*, CometConf.SCAN_NATIVE_ICEBERG_COMPAT*/ ).foreach(
+      scan =>
+        super.test(s"$testName - $scan", testTags: _*) {
+          withSQLConf(
+            CometConf.COMET_EXEC_ENABLED.key -> "true",
+            SQLConf.USE_V1_SOURCE_LIST.key -> "parquet",
+            CometConf.COMET_ENABLED.key -> "true",
+            CometConf.COMET_EXPLAIN_FALLBACK_ENABLED.key -> "false",
+            CometConf.COMET_NATIVE_SCAN_IMPL.key -> scan) {
+            testFun
+          }
+        })
+  }
+
+  test("native reader - read simple STRUCT fields") {
+    testSingleLineQuery(
+      """
+        |select named_struct('firstName', 'John', 'lastName', 'Doe', 'age', 35) as personal_info union all
+        |select named_struct('firstName', 'Jane', 'lastName', 'Doe', 'age', 40) as personal_info
+        |""".stripMargin,
+      "select personal_info.* from tbl")
+  }
+
+  test("native reader - read simple ARRAY fields") {
+    testSingleLineQuery(
+      """
+        |select array(1, 2, 3) as arr union all
+        |select array(2, 3, 4) as arr
+        |""".stripMargin,
+      "select arr from tbl")
+  }
+}
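The suite runs each test once per enabled scan implementation (currently only SCAN_NATIVE_DATAFUSION) with Comet execution forced on. A rough hand-run equivalent of the new ARRAY test, assuming testSingleLineQuery materializes the first query as a Parquet-backed table named tbl and then runs the second query against it:

  import org.apache.spark.sql.SparkSession
  import org.apache.comet.CometConf

  val spark = SparkSession.builder().getOrCreate()
  spark.conf.set(CometConf.COMET_ENABLED.key, "true")
  spark.conf.set(CometConf.COMET_EXEC_ENABLED.key, "true")
  spark.conf.set(CometConf.COMET_NATIVE_SCAN_IMPL.key, CometConf.SCAN_NATIVE_DATAFUSION)

  // Hypothetical temporary path; the real harness manages its own location.
  spark
    .sql("select array(1, 2, 3) as arr union all select array(2, 3, 4) as arr")
    .write
    .parquet("/tmp/comet_array_tbl")
  spark.read.parquet("/tmp/comet_array_tbl").createOrReplaceTempView("tbl")
  spark.sql("select arr from tbl").show()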
