Commit 3bb1b40

chore: Improve test coverage for count aggregates (#2406)
* refactor fuzz test
* link to issue
* add new test to CI
1 parent 341db1d commit 3bb1b40

File tree

5 files changed: +206 -119 lines changed

.github/workflows/pr_build_linux.yml
.github/workflows/pr_build_macos.yml
spark/src/test/scala/org/apache/comet/CometFuzzAggregateSuite.scala
spark/src/test/scala/org/apache/comet/CometFuzzTestBase.scala
spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala

.github/workflows/pr_build_linux.yml

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ jobs:
           - name: "fuzz"
             value: |
               org.apache.comet.CometFuzzTestSuite
+              org.apache.comet.CometFuzzAggregateSuite
               org.apache.comet.DataGeneratorSuite
           - name: "shuffle"
             value: |

.github/workflows/pr_build_macos.yml

Lines changed: 1 addition & 0 deletions
@@ -67,6 +67,7 @@ jobs:
           - name: "fuzz"
             value: |
               org.apache.comet.CometFuzzTestSuite
+              org.apache.comet.CometFuzzAggregateSuite
               org.apache.comet.DataGeneratorSuite
           - name: "shuffle"
             value: |
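
Both workflows add the new suite to the existing "fuzz" matrix entry, so it runs alongside CometFuzzTestSuite and DataGeneratorSuite in CI. To run just the new suite locally, something like the following should work, assuming the repository's standard Maven wrapper and scalatest-maven-plugin setup (the module path and the suites property are assumptions, not part of this commit):

    ./mvnw test -pl spark -Dsuites=org.apache.comet.CometFuzzAggregateSuite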
spark/src/test/scala/org/apache/comet/CometFuzzAggregateSuite.scala

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet

class CometFuzzAggregateSuite extends CometFuzzTestBase {

  test("count distinct") {
    val df = spark.read.parquet(filename)
    df.createOrReplaceTempView("t1")
    for (col <- df.columns) {
      val sql = s"SELECT count(distinct $col) FROM t1"
      // Comet does not support count distinct yet
      // https://github.com/apache/datafusion-comet/issues/2292
      val (_, cometPlan) = checkSparkAnswer(sql)
      if (usingDataSourceExec) {
        assert(1 == collectNativeScans(cometPlan).length)
      }
    }
  }

  test("count(*) group by single column") {
    val df = spark.read.parquet(filename)
    df.createOrReplaceTempView("t1")
    for (col <- df.columns) {
      // cannot run fully natively due to range partitioning and sort
      val sql = s"SELECT $col, count(*) FROM t1 GROUP BY $col ORDER BY $col"
      val (_, cometPlan) = checkSparkAnswer(sql)
      if (usingDataSourceExec) {
        assert(1 == collectNativeScans(cometPlan).length)
      }
    }
  }

  test("count(col) group by single column") {
    val df = spark.read.parquet(filename)
    df.createOrReplaceTempView("t1")
    val groupCol = df.columns.head
    for (col <- df.columns.drop(1)) {
      // cannot run fully natively due to range partitioning and sort
      val sql = s"SELECT $groupCol, count($col) FROM t1 GROUP BY $groupCol ORDER BY $groupCol"
      val (_, cometPlan) = checkSparkAnswer(sql)
      if (usingDataSourceExec) {
        assert(1 == collectNativeScans(cometPlan).length)
      }
    }
  }

  test("count(col1, col2, ..) group by single column") {
    val df = spark.read.parquet(filename)
    df.createOrReplaceTempView("t1")
    val groupCol = df.columns.head
    val otherCol = df.columns.drop(1)
    // cannot run fully natively due to range partitioning and sort
    val sql = s"SELECT $groupCol, count(${otherCol.mkString(", ")}) FROM t1 " +
      s"GROUP BY $groupCol ORDER BY $groupCol"
    val (_, cometPlan) = checkSparkAnswer(sql)
    if (usingDataSourceExec) {
      assert(1 == collectNativeScans(cometPlan).length)
    }
  }

  test("min/max aggregate") {
    val df = spark.read.parquet(filename)
    df.createOrReplaceTempView("t1")
    for (col <- df.columns) {
      // cannot run fully native due to HashAggregate
      val sql = s"SELECT min($col), max($col) FROM t1"
      val (_, cometPlan) = checkSparkAnswer(sql)
      if (usingDataSourceExec) {
        assert(1 == collectNativeScans(cometPlan).length)
      }
    }
  }
}
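
Note that every test above loops over each column of the fuzz-generated Parquet file, so one logical test expands into one query per column, and CometFuzzTestBase (below) further multiplies each test by six scan/shuffle configurations. A minimal sketch of the query expansion, assuming hypothetical column names c0 and c1 (actual names come from ParquetGenerator):

    // Hypothetical column names for illustration only.
    val columns = Seq("c0", "c1")
    val queries = columns.map(col => s"SELECT count(distinct $col) FROM t1")
    // queries == Seq("SELECT count(distinct c0) FROM t1",
    //                "SELECT count(distinct c1) FROM t1")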
spark/src/test/scala/org/apache/comet/CometFuzzTestBase.scala

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet

import java.io.File
import java.text.SimpleDateFormat

import scala.util.Random

import org.scalactic.source.Position
import org.scalatest.Tag

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.CometTestBase
import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.internal.SQLConf

import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}

class CometFuzzTestBase extends CometTestBase with AdaptiveSparkPlanHelper {

  var filename: String = null

  /**
   * We use Asia/Kathmandu because it has a non-zero number of minutes as the offset, so is an
   * interesting edge case. Also, this timezone tends to be different from the default system
   * timezone.
   *
   * Represents UTC+5:45
   */
  val defaultTimezone = "Asia/Kathmandu"

  override def beforeAll(): Unit = {
    super.beforeAll()
    val tempDir = System.getProperty("java.io.tmpdir")
    filename = s"$tempDir/CometFuzzTestSuite_${System.currentTimeMillis()}.parquet"
    val random = new Random(42)
    withSQLConf(
      CometConf.COMET_ENABLED.key -> "false",
      SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) {
      val options =
        DataGenOptions(
          generateArray = true,
          generateStruct = true,
          generateNegativeZero = false,
          // override base date due to known issues with experimental scans
          baseDate =
            new SimpleDateFormat("YYYY-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime)
      ParquetGenerator.makeParquetFile(random, spark, filename, 1000, options)
    }
  }

  protected override def afterAll(): Unit = {
    super.afterAll()
    FileUtils.deleteDirectory(new File(filename))
  }

  override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit
      pos: Position): Unit = {
    Seq("native", "jvm").foreach { shuffleMode =>
      Seq(
        CometConf.SCAN_NATIVE_COMET,
        CometConf.SCAN_NATIVE_DATAFUSION,
        CometConf.SCAN_NATIVE_ICEBERG_COMPAT).foreach { scanImpl =>
        super.test(testName + s" ($scanImpl, $shuffleMode shuffle)", testTags: _*) {
          withSQLConf(
            CometConf.COMET_NATIVE_SCAN_IMPL.key -> scanImpl,
            CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true",
            CometConf.COMET_SHUFFLE_MODE.key -> shuffleMode) {
            testFun
          }
        }
      }
    }
  }

  def collectNativeScans(plan: SparkPlan): Seq[SparkPlan] = {
    collect(plan) {
      case scan: CometScanExec => scan
      case scan: CometNativeScanExec => scan
    }
  }

  def collectCometShuffleExchanges(plan: SparkPlan): Seq[SparkPlan] = {
    collect(plan) { case exchange: CometShuffleExchangeExec =>
      exchange
    }
  }
}
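
The overridden test method is what gives the fuzz suites their breadth: each test defined in a subclass is registered six times, once per combination of the three native scan implementations and the two shuffle modes. A minimal sketch of the fan-out, assuming the scan config constants resolve to strings like those below (an assumption; the actual values live in CometConf):

    // Illustrative only: hypothetical string values standing in for
    // CometConf.SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION and SCAN_NATIVE_ICEBERG_COMPAT.
    val scanImpls = Seq("native_comet", "native_datafusion", "native_iceberg_compat")
    val shuffleModes = Seq("native", "jvm")
    val registered =
      for (shuffle <- shuffleModes; scan <- scanImpls)
        yield s"count distinct ($scan, $shuffle shuffle)"
    // registered.size == 6, e.g. "count distinct (native_comet, native shuffle)"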

spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala

Lines changed: 2 additions & 119 deletions
@@ -19,65 +19,18 @@
 
 package org.apache.comet
 
-import java.io.File
-import java.text.SimpleDateFormat
-
 import scala.util.Random
 
-import org.scalactic.source.Position
-import org.scalatest.Tag
-
 import org.apache.commons.codec.binary.Hex
-import org.apache.commons.io.FileUtils
-import org.apache.spark.sql.CometTestBase
-import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
-import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper}
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
 import org.apache.spark.sql.types._
 
 import org.apache.comet.DataTypeSupport.isComplexType
 import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}
 
-class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
-
-  private var filename: String = null
-
-  /**
-   * We use Asia/Kathmandu because it has a non-zero number of minutes as the offset, so is an
-   * interesting edge case. Also, this timezone tends to be different from the default system
-   * timezone.
-   *
-   * Represents UTC+5:45
-   */
-  private val defaultTimezone = "Asia/Kathmandu"
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    val tempDir = System.getProperty("java.io.tmpdir")
-    filename = s"$tempDir/CometFuzzTestSuite_${System.currentTimeMillis()}.parquet"
-    val random = new Random(42)
-    withSQLConf(
-      CometConf.COMET_ENABLED.key -> "false",
-      SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) {
-      val options =
-        DataGenOptions(
-          generateArray = true,
-          generateStruct = true,
-          generateNegativeZero = false,
-          // override base date due to known issues with experimental scans
-          baseDate =
-            new SimpleDateFormat("YYYY-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime)
-      ParquetGenerator.makeParquetFile(random, spark, filename, 1000, options)
-    }
-  }
-
-  protected override def afterAll(): Unit = {
-    super.afterAll()
-    FileUtils.deleteDirectory(new File(filename))
-  }
+class CometFuzzTestSuite extends CometFuzzTestBase {
 
   test("select *") {
     val df = spark.read.parquet(filename)

@@ -168,18 +121,6 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
-  test("count distinct") {
-    val df = spark.read.parquet(filename)
-    df.createOrReplaceTempView("t1")
-    for (col <- df.columns) {
-      val sql = s"SELECT count(distinct $col) FROM t1"
-      val (_, cometPlan) = checkSparkAnswer(sql)
-      if (usingDataSourceExec) {
-        assert(1 == collectNativeScans(cometPlan).length)
-      }
-    }
-  }
-
   test("order by multiple columns") {
     val df = spark.read.parquet(filename)
     df.createOrReplaceTempView("t1")

@@ -192,32 +133,6 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
-  test("aggregate group by single column") {
-    val df = spark.read.parquet(filename)
-    df.createOrReplaceTempView("t1")
-    for (col <- df.columns) {
-      // cannot run fully natively due to range partitioning and sort
-      val sql = s"SELECT $col, count(*) FROM t1 GROUP BY $col ORDER BY $col"
-      val (_, cometPlan) = checkSparkAnswer(sql)
-      if (usingDataSourceExec) {
-        assert(1 == collectNativeScans(cometPlan).length)
-      }
-    }
-  }
-
-  test("min/max aggregate") {
-    val df = spark.read.parquet(filename)
-    df.createOrReplaceTempView("t1")
-    for (col <- df.columns) {
-      // cannot run fully native due to HashAggregate
-      val sql = s"SELECT min($col), max($col) FROM t1"
-      val (_, cometPlan) = checkSparkAnswer(sql)
-      if (usingDataSourceExec) {
-        assert(1 == collectNativeScans(cometPlan).length)
-      }
-    }
-  }
-
   test("distribute by single column (complex types)") {
     val df = spark.read.parquet(filename)
     df.createOrReplaceTempView("t1")

@@ -371,36 +286,4 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
-  override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit
-      pos: Position): Unit = {
-    Seq("native", "jvm").foreach { shuffleMode =>
-      Seq(
-        CometConf.SCAN_NATIVE_COMET,
-        CometConf.SCAN_NATIVE_DATAFUSION,
-        CometConf.SCAN_NATIVE_ICEBERG_COMPAT).foreach { scanImpl =>
-        super.test(testName + s" ($scanImpl, $shuffleMode shuffle)", testTags: _*) {
-          withSQLConf(
-            CometConf.COMET_NATIVE_SCAN_IMPL.key -> scanImpl,
-            CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true",
-            CometConf.COMET_SHUFFLE_MODE.key -> shuffleMode) {
-            testFun
-          }
-        }
-      }
-    }
-  }
-
-  private def collectNativeScans(plan: SparkPlan): Seq[SparkPlan] = {
-    collect(plan) {
-      case scan: CometScanExec => scan
-      case scan: CometNativeScanExec => scan
-    }
-  }
-
-  private def collectCometShuffleExchanges(plan: SparkPlan): Seq[SparkPlan] = {
-    collect(plan) { case exchange: CometShuffleExchangeExec =>
-      exchange
-    }
-  }
 }
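
Net effect of the refactor: the shared fixture (Parquet data generation, the Kathmandu timezone edge case, the scan/shuffle test matrix, and the plan-collection helpers) moves out of CometFuzzTestSuite into the new CometFuzzTestBase; the existing count distinct, count(*) group by, and min/max tests move into CometFuzzAggregateSuite; and the count(col) and multi-column count variants are new coverage, with a link to the open count-distinct issue (#2292).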
