Commit 44170d7

andygrove and viirya authored
fix: Improve testing for array_remove and fallback to Spark for unsupported types (apache#1308)
* Fallback to Spark for unsupported types for ArrayRemove
* save progress
* improve tests
* revert debug logging
* prepare for review
* fix
* format
* remove test failure
* fix
* more testing
* refactor
* update readme
* fix
* format
* fix
* fix
* fix
* fix inverted options
* Update QueryRunner.scala

Co-authored-by: Liang-Chi Hsieh <[email protected]>

* Add descriptions to CLI options

---------

Co-authored-by: Liang-Chi Hsieh <[email protected]>

1 parent f09f8af · commit 44170d7

12 files changed: +421 −112 lines changed

fuzz-testing/README.md

Lines changed: 7 additions & 5 deletions

````diff
@@ -44,7 +44,9 @@ Planned areas of improvement:
 
 ## Usage
 
-Build the jar file first.
+From the root of the project, run `mvn install -DskipTests` to install Comet.
+
+Then build the fuzz testing jar.
 
 ```shell
 mvn package
@@ -59,8 +61,8 @@ Set appropriate values for `SPARK_HOME`, `SPARK_MASTER`, and `COMET_JAR` environ
 $SPARK_HOME/bin/spark-submit \
     --master $SPARK_MASTER \
     --class org.apache.comet.fuzz.Main \
-    target/comet-fuzz-spark3.4_2.12-0.5.0-SNAPSHOT-jar-with-dependencies.jar \
-    data --num-files=2 --num-rows=200 --num-columns=100
+    target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
+    data --num-files=2 --num-rows=200 --exclude-negative-zero --generate-arrays --generate-structs --generate-maps
 ```
 
 There is an optional `--exclude-negative-zero` flag for excluding `-0.0` from the generated data, which is
@@ -75,7 +77,7 @@ Generate random queries that are based on the available test files.
 $SPARK_HOME/bin/spark-submit \
     --master $SPARK_MASTER \
     --class org.apache.comet.fuzz.Main \
-    target/comet-fuzz-spark3.4_2.12-0.5.0-SNAPSHOT-jar-with-dependencies.jar \
+    target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
     queries --num-files=2 --num-queries=500
 ```
 
@@ -97,7 +99,7 @@ $SPARK_HOME/bin/spark-submit \
     --conf spark.driver.extraClassPath=$COMET_JAR \
     --conf spark.executor.extraClassPath=$COMET_JAR \
     --class org.apache.comet.fuzz.Main \
-    target/comet-fuzz-spark3.4_2.12-0.5.0-SNAPSHOT-jar-with-dependencies.jar \
+    target/comet-fuzz-spark3.4_2.12-0.6.0-SNAPSHOT-jar-with-dependencies.jar \
     run --num-files=2 --filename=queries.sql
 ```
 
````
fuzz-testing/pom.xml

Lines changed: 5 additions & 0 deletions

```diff
@@ -51,6 +51,11 @@ under the License.
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.datafusion</groupId>
+      <artifactId>comet-spark-spark${spark.version.short}_${scala.binary.version}</artifactId>
+      <version>0.6.0-SNAPSHOT</version>
+    </dependency>
     <dependency>
       <groupId>org.rogach</groupId>
       <artifactId>scallop_${scala.binary.version}</artifactId>
```

fuzz-testing/src/main/scala/org/apache/comet/fuzz/Main.scala

Lines changed: 35 additions & 20 deletions

```diff
@@ -26,23 +26,35 @@ import org.rogach.scallop.ScallopOption
 
 import org.apache.spark.sql.SparkSession
 
+import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}
+
 class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
   object generateData extends Subcommand("data") {
-    val numFiles: ScallopOption[Int] = opt[Int](required = true)
-    val numRows: ScallopOption[Int] = opt[Int](required = true)
-    val numColumns: ScallopOption[Int] = opt[Int](required = true)
-    val excludeNegativeZero: ScallopOption[Boolean] = opt[Boolean](required = false)
+    val numFiles: ScallopOption[Int] =
+      opt[Int](required = true, descr = "Number of files to generate")
+    val numRows: ScallopOption[Int] = opt[Int](required = true, descr = "Number of rows per file")
+    val generateArrays: ScallopOption[Boolean] =
+      opt[Boolean](required = false, descr = "Whether to generate arrays")
+    val generateStructs: ScallopOption[Boolean] =
+      opt[Boolean](required = false, descr = "Whether to generate structs")
+    val generateMaps: ScallopOption[Boolean] =
+      opt[Boolean](required = false, descr = "Whether to generate maps")
+    val excludeNegativeZero: ScallopOption[Boolean] =
+      opt[Boolean](required = false, descr = "Whether to exclude negative zero")
   }
   addSubcommand(generateData)
   object generateQueries extends Subcommand("queries") {
-    val numFiles: ScallopOption[Int] = opt[Int](required = false)
-    val numQueries: ScallopOption[Int] = opt[Int](required = true)
+    val numFiles: ScallopOption[Int] =
+      opt[Int](required = false, descr = "Number of input files to use")
+    val numQueries: ScallopOption[Int] =
+      opt[Int](required = true, descr = "Number of queries to generate")
   }
   addSubcommand(generateQueries)
   object runQueries extends Subcommand("run") {
-    val filename: ScallopOption[String] = opt[String](required = true)
-    val numFiles: ScallopOption[Int] = opt[Int](required = false)
-    val showMatchingResults: ScallopOption[Boolean] = opt[Boolean](required = false)
+    val filename: ScallopOption[String] =
+      opt[String](required = true, descr = "File to write queries to")
+    val numFiles: ScallopOption[Int] =
+      opt[Int](required = false, descr = "Number of input files to use")
   }
   addSubcommand(runQueries)
   verify()
@@ -60,25 +72,28 @@ object Main {
     val conf = new Conf(args.toIndexedSeq)
     conf.subcommand match {
       case Some(conf.generateData) =>
-        DataGen.generateRandomFiles(
-          r,
-          spark,
-          numFiles = conf.generateData.numFiles(),
-          numRows = conf.generateData.numRows(),
-          numColumns = conf.generateData.numColumns(),
+        val options = DataGenOptions(
+          allowNull = true,
+          generateArray = conf.generateData.generateArrays(),
+          generateStruct = conf.generateData.generateStructs(),
+          generateMap = conf.generateData.generateMaps(),
           generateNegativeZero = !conf.generateData.excludeNegativeZero())
+        for (i <- 0 until conf.generateData.numFiles()) {
+          ParquetGenerator.makeParquetFile(
+            r,
+            spark,
+            s"test$i.parquet",
+            numRows = conf.generateData.numRows(),
+            options)
+        }
       case Some(conf.generateQueries) =>
         QueryGen.generateRandomQueries(
          r,
          spark,
          numFiles = conf.generateQueries.numFiles(),
          conf.generateQueries.numQueries())
       case Some(conf.runQueries) =>
-        QueryRunner.runQueries(
-          spark,
-          conf.runQueries.numFiles(),
-          conf.runQueries.filename(),
-          conf.runQueries.showMatchingResults())
+        QueryRunner.runQueries(spark, conf.runQueries.numFiles(), conf.runQueries.filename())
       case _ =>
         // scalastyle:off println
         println("Invalid subcommand")
```

fuzz-testing/src/main/scala/org/apache/comet/fuzz/Meta.scala

Lines changed: 9 additions & 1 deletion

```diff
@@ -103,8 +103,16 @@ object Meta {
   val miscScalarFunc: Seq[Function] =
     Seq(Function("isnan", 1), Function("isnull", 1), Function("isnotnull", 1))
 
+  val arrayScalarFunc: Seq[Function] = Seq(
+    Function("array", 2),
+    Function("array_remove", 2),
+    Function("array_insert", 2),
+    Function("array_contains", 2),
+    Function("array_intersect", 2),
+    Function("array_append", 2))
+
   val scalarFunc: Seq[Function] = stringScalarFunc ++ dateScalarFunc ++
-    mathScalarFunc ++ miscScalarFunc
+    mathScalarFunc ++ miscScalarFunc ++ arrayScalarFunc
 
   val aggFunc: Seq[Function] = Seq(
     Function("min", 1),
```

fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala

Lines changed: 32 additions & 19 deletions

```diff
@@ -21,6 +21,7 @@ package org.apache.comet.fuzz
 
 import java.io.{BufferedWriter, FileWriter, PrintWriter, StringWriter}
 
+import scala.collection.mutable.WrappedArray
 import scala.io.Source
 
 import org.apache.spark.sql.{Row, SparkSession}
@@ -31,7 +32,6 @@ object QueryRunner {
       spark: SparkSession,
       numFiles: Int,
       filename: String,
-      showMatchingResults: Boolean,
       showFailedSparkQueries: Boolean = false): Unit = {
 
     val outputFilename = s"results-${System.currentTimeMillis()}.md"
@@ -64,8 +64,12 @@
         val sparkRows = df.collect()
         val sparkPlan = df.queryExecution.executedPlan.toString
 
+        // execute with Comet
         try {
           spark.conf.set("spark.comet.enabled", "true")
+          // complex type support until we support it natively
+          spark.conf.set("spark.comet.sparkToColumnar.enabled", "true")
+          spark.conf.set("spark.comet.convert.parquet.enabled", "true")
           val df = spark.sql(sql)
           val cometRows = df.collect()
           val cometPlan = df.queryExecution.executedPlan.toString
@@ -77,17 +81,7 @@
               val r = cometRows(i)
               assert(l.length == r.length)
               for (j <- 0 until l.length) {
-                val same = (l(j), r(j)) match {
-                  case (a: Float, b: Float) if a.isInfinity => b.isInfinity
-                  case (a: Float, b: Float) if a.isNaN => b.isNaN
-                  case (a: Float, b: Float) => (a - b).abs <= 0.000001f
-                  case (a: Double, b: Double) if a.isInfinity => b.isInfinity
-                  case (a: Double, b: Double) if a.isNaN => b.isNaN
-                  case (a: Double, b: Double) => (a - b).abs <= 0.000001
-                  case (a: Array[Byte], b: Array[Byte]) => a.sameElements(b)
-                  case (a, b) => a == b
-                }
-                if (!same) {
+                if (!same(l(j), r(j))) {
                   showSQL(w, sql)
                   showPlans(w, sparkPlan, cometPlan)
                   w.write(s"First difference at row $i:\n")
@@ -138,14 +132,33 @@
     }
   }
 
+  private def same(l: Any, r: Any): Boolean = {
+    (l, r) match {
+      case (a: Float, b: Float) if a.isInfinity => b.isInfinity
+      case (a: Float, b: Float) if a.isNaN => b.isNaN
+      case (a: Float, b: Float) => (a - b).abs <= 0.000001f
+      case (a: Double, b: Double) if a.isInfinity => b.isInfinity
+      case (a: Double, b: Double) if a.isNaN => b.isNaN
+      case (a: Double, b: Double) => (a - b).abs <= 0.000001
+      case (a: Array[_], b: Array[_]) =>
+        a.length == b.length && a.zip(b).forall(x => same(x._1, x._2))
+      case (a: WrappedArray[_], b: WrappedArray[_]) =>
+        a.length == b.length && a.zip(b).forall(x => same(x._1, x._2))
+      case (a, b) => a == b
+    }
+  }
+
+  private def format(value: Any): String = {
+    value match {
+      case null => "NULL"
+      case v: WrappedArray[_] => s"[${v.map(format).mkString(",")}]"
+      case v: Array[Byte] => s"[${v.mkString(",")}]"
+      case other => other.toString
+    }
+  }
+
   private def formatRow(row: Row): String = {
-    row.toSeq
-      .map {
-        case null => "NULL"
-        case v: Array[Byte] => v.mkString
-        case other => other.toString
-      }
-      .mkString(",")
+    row.toSeq.map(format).mkString(",")
   }
 
   private def showSQL(w: BufferedWriter, sql: String, maxLength: Int = 120): Unit = {
```
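The key change in the result comparison is that the old inline match could not descend into array-valued columns, so the extracted `same` helper recurses. A standalone sketch (simplified from the helper above; `WrappedArray` is the Scala 2.12 wrapper Spark returns for array columns) of why the recursion matters:

```scala
import scala.collection.mutable.WrappedArray

// Simplified recursive comparison: floating-point tolerance must be applied
// element by element inside array values, not only to top-level columns.
def approxSame(l: Any, r: Any): Boolean = (l, r) match {
  case (a: Double, b: Double) => (a - b).abs <= 0.000001
  case (a: WrappedArray[_], b: WrappedArray[_]) =>
    a.length == b.length && a.zip(b).forall { case (x, y) => approxSame(x, y) }
  case (a, b) => a == b
}

val sparkResult: WrappedArray[Double] = Array(1.0, 2.0000000001)
val cometResult: WrappedArray[Double] = Array(1.0, 2.0)
assert(approxSame(sparkResult, cometResult)) // passes: per-element tolerance
assert(sparkResult != cometResult)           // plain equality would report a spurious mismatch
```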

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 10 additions & 6 deletions

```diff
@@ -2284,12 +2284,16 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim
           withInfo(expr, "unsupported arguments for GetArrayStructFields", child)
           None
         }
-      case expr if expr.prettyName == "array_remove" =>
-        createBinaryExpr(
-          expr.children(0),
-          expr.children(1),
-          inputs,
-          (builder, binaryExpr) => builder.setArrayRemove(binaryExpr))
+      case expr: ArrayRemove =>
+        if (CometArrayRemove.checkSupport(expr)) {
+          createBinaryExpr(
+            expr.children(0),
+            expr.children(1),
+            inputs,
+            (builder, binaryExpr) => builder.setArrayRemove(binaryExpr))
+        } else {
+          None
+        }
       case expr if expr.prettyName == "array_contains" =>
         createBinaryExpr(
           expr.children(0),
```
New file in package org.apache.comet.serde

Lines changed: 60 additions & 0 deletions

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet.serde

import org.apache.spark.sql.catalyst.expressions.{ArrayRemove, Expression}
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, StructType}

import org.apache.comet.CometSparkSessionExtensions.withInfo
import org.apache.comet.shims.CometExprShim

trait CometExpression {
  def checkSupport(expr: Expression): Boolean
}

object CometArrayRemove extends CometExpression with CometExprShim {

  def isTypeSupported(dt: DataType): Boolean = {
    import DataTypes._
    dt match {
      case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType |
          _: DecimalType | DateType | TimestampType | StringType | BinaryType =>
        true
      case t if isTimestampNTZType(t) => true
      case ArrayType(elementType, _) => isTypeSupported(elementType)
      case _: StructType =>
        // https://github.com/apache/datafusion-comet/issues/1307
        false
      case _ => false
    }
  }

  override def checkSupport(expr: Expression): Boolean = {
    val ar = expr.asInstanceOf[ArrayRemove]
    val inputTypes: Set[DataType] = ar.children.map(_.dataType).toSet
    for (dt <- inputTypes) {
      if (!isTypeSupported(dt)) {
        withInfo(expr, s"data type not supported: $dt")
        return false
      }
    }
    true
  }
}
```
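A quick illustration (assuming the Comet classes above are on the classpath) of how the support check behaves on a few input types: the recursion handles nested arrays, while struct element types are rejected until issue 1307 is resolved.

```scala
import org.apache.spark.sql.types._

// Illustrative only: expected results of the type check defined above.
CometArrayRemove.isTypeSupported(IntegerType)                                    // true
CometArrayRemove.isTypeSupported(ArrayType(ArrayType(StringType)))               // true (recurses into element types)
CometArrayRemove.isTypeSupported(StructType(Seq(StructField("a", IntegerType)))) // false (issue 1307)
CometArrayRemove.isTypeSupported(MapType(StringType, IntegerType))               // false (catch-all case)
```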
