
Commit 921ef39

chore: Fallback to Spark for Windows
1 parent 941c300

6 files changed: +100 −108 lines


common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 1 addition & 1 deletion
@@ -251,7 +251,7 @@ object CometConf extends ShimCometConf {
   val COMET_EXEC_EXPAND_ENABLED: ConfigEntry[Boolean] =
     createExecEnabledConfig("expand", defaultValue = true)
   val COMET_EXEC_WINDOW_ENABLED: ConfigEntry[Boolean] =
-    createExecEnabledConfig("window", defaultValue = true)
+    createExecEnabledConfig("window", defaultValue = false)
   val COMET_EXEC_TAKE_ORDERED_AND_PROJECT_ENABLED: ConfigEntry[Boolean] =
     createExecEnabledConfig("takeOrderedAndProject", defaultValue = true)
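
For context, `createExecEnabledConfig` is the helper behind the per-operator `spark.comet.exec.<name>.enabled` flags documented in configs.md below. A hypothetical sketch of its shape, inferred only from the key names and the generated "Whether to enable X by default." descriptions; the real helper in CometConf (and its `conf(...)` builder factory, assumed here) may differ:

  // Hypothetical reconstruction for illustration, not the actual Comet source:
  // builds a boolean ConfigEntry keyed spark.comet.exec.<exec>.enabled.
  private def createExecEnabledConfig(exec: String, defaultValue: Boolean): ConfigEntry[Boolean] =
    conf(s"spark.comet.exec.$exec.enabled")
      .doc(s"Whether to enable $exec by default.")
      .booleanConf
      .createWithDefault(defaultValue)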

docs/source/user-guide/latest/configs.md

Lines changed: 17 additions & 17 deletions
@@ -139,23 +139,23 @@ These settings can be used to determine which parts of the plan are accelerated
 <!--BEGIN:CONFIG_TABLE[enable_exec]-->
 | Config | Description | Default Value |
 |--------|-------------|---------------|
-| `spark.comet.exec.aggregate.enabled` | Whether to enable aggregate by default. | true |
-| `spark.comet.exec.broadcastExchange.enabled` | Whether to enable broadcastExchange by default. | true |
-| `spark.comet.exec.broadcastHashJoin.enabled` | Whether to enable broadcastHashJoin by default. | true |
-| `spark.comet.exec.coalesce.enabled` | Whether to enable coalesce by default. | true |
-| `spark.comet.exec.collectLimit.enabled` | Whether to enable collectLimit by default. | true |
-| `spark.comet.exec.expand.enabled` | Whether to enable expand by default. | true |
-| `spark.comet.exec.filter.enabled` | Whether to enable filter by default. | true |
-| `spark.comet.exec.globalLimit.enabled` | Whether to enable globalLimit by default. | true |
-| `spark.comet.exec.hashJoin.enabled` | Whether to enable hashJoin by default. | true |
-| `spark.comet.exec.localLimit.enabled` | Whether to enable localLimit by default. | true |
-| `spark.comet.exec.project.enabled` | Whether to enable project by default. | true |
-| `spark.comet.exec.sort.enabled` | Whether to enable sort by default. | true |
-| `spark.comet.exec.sortMergeJoin.enabled` | Whether to enable sortMergeJoin by default. | true |
-| `spark.comet.exec.sortMergeJoinWithJoinFilter.enabled` | Experimental support for Sort Merge Join with filter | false |
-| `spark.comet.exec.takeOrderedAndProject.enabled` | Whether to enable takeOrderedAndProject by default. | true |
-| `spark.comet.exec.union.enabled` | Whether to enable union by default. | true |
-| `spark.comet.exec.window.enabled` | Whether to enable window by default. | true |
+| `spark.comet.exec.aggregate.enabled` | Whether to enable aggregate by default. | true |
+| `spark.comet.exec.broadcastExchange.enabled` | Whether to enable broadcastExchange by default. | true |
+| `spark.comet.exec.broadcastHashJoin.enabled` | Whether to enable broadcastHashJoin by default. | true |
+| `spark.comet.exec.coalesce.enabled` | Whether to enable coalesce by default. | true |
+| `spark.comet.exec.collectLimit.enabled` | Whether to enable collectLimit by default. | true |
+| `spark.comet.exec.expand.enabled` | Whether to enable expand by default. | true |
+| `spark.comet.exec.filter.enabled` | Whether to enable filter by default. | true |
+| `spark.comet.exec.globalLimit.enabled` | Whether to enable globalLimit by default. | true |
+| `spark.comet.exec.hashJoin.enabled` | Whether to enable hashJoin by default. | true |
+| `spark.comet.exec.localLimit.enabled` | Whether to enable localLimit by default. | true |
+| `spark.comet.exec.project.enabled` | Whether to enable project by default. | true |
+| `spark.comet.exec.sort.enabled` | Whether to enable sort by default. | true |
+| `spark.comet.exec.sortMergeJoin.enabled` | Whether to enable sortMergeJoin by default. | true |
+| `spark.comet.exec.sortMergeJoinWithJoinFilter.enabled` | Experimental support for Sort Merge Join with filter | false |
+| `spark.comet.exec.takeOrderedAndProject.enabled` | Whether to enable takeOrderedAndProject by default. | true |
+| `spark.comet.exec.union.enabled` | Whether to enable union by default. | true |
+| `spark.comet.exec.window.enabled` | Whether to enable window by default. | false |
 <!--END:CONFIG_TABLE-->

 ## Enabling or Disabling Individual Scalar Expressions
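
Since the window default is now false, users who want Comet's native window exec back must opt in explicitly. A minimal sketch; the session setup and app name are illustrative, and only the `spark.comet.exec.window.enabled` key comes from this commit:

import org.apache.spark.sql.SparkSession

// Illustrative session; the app name is a placeholder.
val spark = SparkSession
  .builder()
  .appName("comet-window-opt-in")
  .config("spark.comet.exec.window.enabled", "true")
  .getOrCreate()

// Or flip it at runtime on an existing session:
spark.conf.set("spark.comet.exec.window.enabled", "true")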

spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala

Lines changed: 1 addition & 23 deletions
@@ -30,10 +30,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{CometTestBase, DataFrame, Row}
 import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Literal, TruncDate, TruncTimestamp}
 import org.apache.spark.sql.catalyst.optimizer.SimplifyExtractValueOps
-import org.apache.spark.sql.comet.{CometColumnarToRowExec, CometProjectExec, CometWindowExec}
+import org.apache.spark.sql.comet.{CometColumnarToRowExec, CometProjectExec}
 import org.apache.spark.sql.execution.{InputAdapter, ProjectExec, SparkPlan, WholeStageCodegenExec}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
-import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.SESSION_LOCAL_TIMEZONE
@@ -3097,27 +3096,6 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }

-  test("window query with rangeBetween") {
-
-    // values are int
-    val df = Seq(1, 2, 4, 3, 2, 1).toDF("value")
-    val window = Window.orderBy($"value".desc)
-
-    // ranges are long
-    val df2 = df.select(
-      $"value",
-      sum($"value").over(window.rangeBetween(Window.unboundedPreceding, 1L)),
-      sum($"value").over(window.rangeBetween(1L, Window.unboundedFollowing)))
-
-    // Comet does not support RANGE BETWEEN
-    // https://github.com/apache/datafusion-comet/issues/1246
-    val (_, cometPlan) = checkSparkAnswer(df2)
-    val cometWindowExecs = collect(cometPlan) { case w: CometWindowExec =>
-      w
-    }
-    assert(cometWindowExecs.isEmpty)
-  }
-
   test("vectorized reader: missing all struct fields") {
     Seq(true, false).foreach { offheapEnabled =>
       withSQLConf(

spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala

Lines changed: 0 additions & 53 deletions
@@ -25,7 +25,6 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{CometTestBase, DataFrame, Row}
 import org.apache.spark.sql.catalyst.optimizer.EliminateSorts
 import org.apache.spark.sql.comet.CometHashAggregateExec
-import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.functions.{avg, count_distinct, sum}
 import org.apache.spark.sql.internal.SQLConf
@@ -94,58 +93,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }

-  test("lead/lag should return the default value if the offset row does not exist") {
-    withSQLConf(
-      CometConf.COMET_ENABLED.key -> "true",
-      CometConf.COMET_EXEC_SHUFFLE_ENABLED.key -> "true",
-      CometConf.COMET_SHUFFLE_MODE.key -> "jvm") {
-      checkSparkAnswer(sql("""
-        |SELECT
-        | lag(123, 100, 321) OVER (ORDER BY id) as lag,
-        | lead(123, 100, 321) OVER (ORDER BY id) as lead
-        |FROM (SELECT 1 as id) tmp
-        """.stripMargin))
-
-      checkSparkAnswer(sql("""
-        |SELECT
-        | lag(123, 100, a) OVER (ORDER BY id) as lag,
-        | lead(123, 100, a) OVER (ORDER BY id) as lead
-        |FROM (SELECT 1 as id, 2 as a) tmp
-        """.stripMargin))
-    }
-  }
-
-  // based on Spark's SQLWindowFunctionSuite test of the same name
-  test("window function: partition and order expressions") {
-    for (shuffleMode <- Seq("auto", "native", "jvm")) {
-      withSQLConf(CometConf.COMET_SHUFFLE_MODE.key -> shuffleMode) {
-        val df =
-          Seq((1, "a", 5), (2, "a", 6), (3, "b", 7), (4, "b", 8), (5, "c", 9), (6, "c", 10)).toDF(
-            "month",
-            "area",
-            "product")
-        df.createOrReplaceTempView("windowData")
-        val df2 = sql("""
-          |select month, area, product, sum(product + 1) over (partition by 1 order by 2)
-          |from windowData
-          """.stripMargin)
-        checkSparkAnswer(df2)
-        val cometShuffles = collect(df2.queryExecution.executedPlan) {
-          case _: CometShuffleExchangeExec => true
-        }
-        if (shuffleMode == "jvm" || shuffleMode == "auto") {
-          assert(cometShuffles.length == 1)
-        } else {
-          // we fall back to Spark for shuffle because we do not support
-          // native shuffle with a LocalTableScan input, and we do not fall
-          // back to Comet columnar shuffle due to
-          // https://github.com/apache/datafusion-comet/issues/1248
-          assert(cometShuffles.isEmpty)
-        }
-      }
-    }
-  }
-
   test("multiple column distinct count") {
     withSQLConf(
       CometConf.COMET_ENABLED.key -> "true",

spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala

Lines changed: 3 additions & 1 deletion
@@ -1697,7 +1697,9 @@ class CometExecSuite extends CometTestBase {

   test("TakeOrderedAndProjectExec") {
     Seq("true", "false").foreach(aqeEnabled =>
-      withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> aqeEnabled) {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> aqeEnabled,
+        CometConf.COMET_EXEC_WINDOW_ENABLED.key -> "true") {
         withTable("t1") {
           val numRows = 10
           spark
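
This test now pins the window flag on so its plan keeps exercising Comet's window exec despite the new default. For readers unfamiliar with the pattern, here is a minimal standalone sketch of what a `withSQLConf`-style helper does; Spark's real helper lives in its test utilities, and `withSQLConfSketch` is a hypothetical name:

import org.apache.spark.sql.SparkSession

// Illustrative standalone version: set the given SQL confs, run the body,
// then restore whatever values were there before.
def withSQLConfSketch(spark: SparkSession)(pairs: (String, String)*)(body: => Unit): Unit = {
  // Remember current values (None if unset) so they can be restored afterwards.
  val previous = pairs.map { case (k, _) => k -> spark.conf.getOption(k) }
  pairs.foreach { case (k, v) => spark.conf.set(k, v) }
  try body
  finally previous.foreach {
    case (k, Some(v)) => spark.conf.set(k, v)
    case (k, None) => spark.conf.unset(k)
  }
}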

spark/src/test/scala/org/apache/comet/exec/CometWindowExecSuite.scala

Lines changed: 78 additions & 13 deletions
@@ -24,8 +24,10 @@ import org.scalatest.Tag

 import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{CometTestBase, Row}
+import org.apache.spark.sql.comet.CometWindowExec
+import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
 import org.apache.spark.sql.expressions.Window
-import org.apache.spark.sql.functions.{count, lead}
+import org.apache.spark.sql.functions.{count, lead, sum}
 import org.apache.spark.sql.internal.SQLConf

 import org.apache.comet.CometConf
@@ -39,12 +41,86 @@ class CometWindowExecSuite extends CometTestBase {
     super.test(testName, testTags: _*) {
       withSQLConf(
         CometConf.COMET_EXEC_SHUFFLE_ENABLED.key -> "true",
+        CometConf.COMET_EXEC_WINDOW_ENABLED.key -> "true",
         CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_AUTO) {
         testFun
       }
     }
   }

+  test("lead/lag should return the default value if the offset row does not exist") {
+    withSQLConf(
+      CometConf.COMET_ENABLED.key -> "true",
+      CometConf.COMET_EXEC_SHUFFLE_ENABLED.key -> "true",
+      CometConf.COMET_SHUFFLE_MODE.key -> "jvm") {
+      checkSparkAnswer(sql("""
+        |SELECT
+        | lag(123, 100, 321) OVER (ORDER BY id) as lag,
+        | lead(123, 100, 321) OVER (ORDER BY id) as lead
+        |FROM (SELECT 1 as id) tmp
+        """.stripMargin))
+
+      checkSparkAnswer(sql("""
+        |SELECT
+        | lag(123, 100, a) OVER (ORDER BY id) as lag,
+        | lead(123, 100, a) OVER (ORDER BY id) as lead
+        |FROM (SELECT 1 as id, 2 as a) tmp
+        """.stripMargin))
+    }
+  }
+
+  test("window query with rangeBetween") {
+
+    // values are int
+    val df = Seq(1, 2, 4, 3, 2, 1).toDF("value")
+    val window = Window.orderBy($"value".desc)
+
+    // ranges are long
+    val df2 = df.select(
+      $"value",
+      sum($"value").over(window.rangeBetween(Window.unboundedPreceding, 1L)),
+      sum($"value").over(window.rangeBetween(1L, Window.unboundedFollowing)))
+
+    // Comet does not support RANGE BETWEEN
+    // https://github.com/apache/datafusion-comet/issues/1246
+    val (_, cometPlan) = checkSparkAnswer(df2)
+    val cometWindowExecs = collect(cometPlan) { case w: CometWindowExec =>
+      w
+    }
+    assert(cometWindowExecs.isEmpty)
+  }
+
+  // based on Spark's SQLWindowFunctionSuite test of the same name
+  test("window function: partition and order expressions") {
+    for (shuffleMode <- Seq("auto", "native", "jvm")) {
+      withSQLConf(CometConf.COMET_SHUFFLE_MODE.key -> shuffleMode) {
+        val df =
+          Seq((1, "a", 5), (2, "a", 6), (3, "b", 7), (4, "b", 8), (5, "c", 9), (6, "c", 10)).toDF(
+            "month",
+            "area",
+            "product")
+        df.createOrReplaceTempView("windowData")
+        val df2 = sql("""
+          |select month, area, product, sum(product + 1) over (partition by 1 order by 2)
+          |from windowData
+          """.stripMargin)
+        checkSparkAnswer(df2)
+        val cometShuffles = collect(df2.queryExecution.executedPlan) {
+          case _: CometShuffleExchangeExec => true
+        }
+        if (shuffleMode == "jvm" || shuffleMode == "auto") {
+          assert(cometShuffles.length == 1)
+        } else {
+          // we fall back to Spark for shuffle because we do not support
+          // native shuffle with a LocalTableScan input, and we do not fall
+          // back to Comet columnar shuffle due to
+          // https://github.com/apache/datafusion-comet/issues/1248
+          assert(cometShuffles.isEmpty)
+        }
+      }
+    }
+  }
+
   test(
     "fall back to Spark when the partition spec and order spec are not the same for window function") {
     withTempView("test") {
@@ -289,7 +365,7 @@ class CometWindowExecSuite extends CometTestBase {

   // TODO: COUNT with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW produces incorrect results
   // Returns wrong cnt values - ordering issue causes swapped values for rows with same partition
-  ignore("window: COUNT with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW") {
+  test("window: COUNT with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW") {
     withTempDir { dir =>
       (0 until 30)
         .map(i => (i % 3, i % 5, i))
@@ -310,7 +386,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: SUM with ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING produces incorrect results
-  // Returns wrong sum_c values - ordering issue causes swapped values for rows with same partition
   ignore("window: SUM with ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING") {
     withTempDir { dir =>
       (0 until 30)
@@ -354,7 +429,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: SUM with ROWS BETWEEN produces incorrect results
-  // Returns wrong sum_c values for some rows
   ignore("window: SUM with ROWS BETWEEN 2 PRECEDING AND CURRENT ROW") {
     withTempDir { dir =>
       (0 until 30)
@@ -530,7 +604,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: LAG produces incorrect results
-  // Returns wrong lag_c values - ordering issue in results
   ignore("window: LAG with default offset") {
     withTempDir { dir =>
       (0 until 30)
@@ -552,7 +625,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: LAG with offset 2 produces incorrect results
-  // Returns wrong lag_c_2 values - ordering issue in results
   ignore("window: LAG with offset 2 and default value") {
     withTempDir { dir =>
       (0 until 30)
@@ -574,7 +646,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: LEAD produces incorrect results
-  // Returns wrong lead_c values - ordering issue in results
   ignore("window: LEAD with default offset") {
     withTempDir { dir =>
       (0 until 30)
@@ -596,7 +667,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: LEAD with offset 2 produces incorrect results
-  // Returns wrong lead_c_2 values - ordering issue in results
   ignore("window: LEAD with offset 2 and default value") {
     withTempDir { dir =>
       (0 until 30)
@@ -662,7 +732,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: NTH_VALUE returns incorrect results - produces 0 instead of null for first row,
-  // and incorrect values for subsequent rows in partition
   ignore("window: NTH_VALUE with position 2") {
     withTempDir { dir =>
       (0 until 30)
@@ -706,7 +775,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: Multiple window functions with mixed frame types (RowFrame and RangeFrame)
-  // produces incorrect row_num values - ordering issue in results
   ignore("window: multiple window functions in single query") {
     withTempDir { dir =>
       (0 until 30)
@@ -933,7 +1001,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: ROWS BETWEEN with negative offset produces incorrect results
-  // Returns wrong values for avg_c calculation
   ignore("window: ROWS BETWEEN with negative offset") {
     withTempDir { dir =>
       (0 until 30)
@@ -955,7 +1022,6 @@ class CometWindowExecSuite extends CometTestBase {
   }

   // TODO: All ranking functions together produce incorrect row_num values
-  // Ordering issue causes row numbers to be swapped for rows with same partition/order values
   ignore("window: all ranking functions together") {
     withTempDir { dir =>
       (0 until 30)
@@ -980,5 +1046,4 @@ class CometWindowExecSuite extends CometTestBase {
       checkSparkAnswerAndOperator(df)
     }
   }
-
 }
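
Taken together, these suites verify which engine handled a window operator by collecting Comet operators from the executed plan. A minimal sketch of the same check outside a test suite, assuming a plain SparkSession with Comet on the classpath; note that with AQE enabled the plan may be wrapped, which is why the suites use AdaptiveSparkPlanHelper's collect rather than the plain TreeNode.collect shown here:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.comet.CometWindowExec

val spark = SparkSession.builder().appName("window-plan-check").getOrCreate() // illustrative
import spark.implicits._

val df = (1 to 10).toDF("id").selectExpr("id", "sum(id) over (order by id) as running")
df.collect() // force execution so executedPlan reflects the final plan

// Non-empty only when spark.comet.exec.window.enabled=true and the window is supported.
val cometWindows = df.queryExecution.executedPlan.collect { case w: CometWindowExec => w }
println(s"Comet handled the window: ${cometWindows.nonEmpty}")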
