Commit 7c2c84a
[SPARK-53243][PYTHON][SQL] List the supported eval types in arrow nodes
### What changes were proposed in this pull request?
List the supported eval types in arrow nodes.

### Why are the changes needed?
To validate the eval types and make the code more readable.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #51970 from zhengruifeng/arrow_check_eval_type.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
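The change is the same in all three nodes: statements in a Scala case class body run at construction time, so each exec node now rejects an unsupported eval type as soon as it is instantiated, instead of failing later during execution. A minimal standalone sketch of the pattern, assuming the same Spark imports (`ExampleArrowNode` is hypothetical, not part of this commit):

```scala
import org.apache.spark.SparkException
import org.apache.spark.api.python.PythonEvalType

// Hypothetical node showing the fail-fast pattern from this commit: the
// case class body executes on construction, so an unexpected eval type
// raises an internal error before any query execution begins.
case class ExampleArrowNode(evalType: Int) {
  if (!supportedPythonEvalTypes.contains(evalType)) {
    throw SparkException.internalError(s"Unexpected eval type $evalType")
  }

  // A def (rather than a val declared below the check) avoids
  // initialization-order pitfalls, since the class body runs top to bottom.
  private def supportedPythonEvalTypes: Array[Int] =
    Array(
      PythonEvalType.SQL_GROUPED_AGG_ARROW_UDF,
      PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF)
}
```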
1 parent 7fe2f5e commit 7c2c84a

3 files changed: +41, -11 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowAggregatePythonExec.scala

Lines changed: 11 additions & 4 deletions
```diff
@@ -21,8 +21,8 @@ import java.io.File
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.{JobArtifactSet, SparkEnv, TaskContext}
-import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.{JobArtifactSet, SparkEnv, SparkException, TaskContext}
+import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
@@ -47,8 +47,10 @@ case class ArrowAggregatePythonExec(
     aggExpressions: Seq[AggregateExpression],
     resultExpressions: Seq[NamedExpression],
     child: SparkPlan,
-    evalType: Int)
-  extends UnaryExecNode with PythonSQLMetrics {
+    evalType: Int) extends UnaryExecNode with PythonSQLMetrics {
+  if (!supportedPythonEvalTypes.contains(evalType)) {
+    throw SparkException.internalError(s"Unexpected eval type $evalType")
+  }
 
   override val output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
 
@@ -217,6 +219,11 @@ case class ArrowAggregatePythonExec(
 
       newIter
     }
+
+  private def supportedPythonEvalTypes: Array[Int] =
+    Array(
+      PythonEvalType.SQL_GROUPED_AGG_ARROW_UDF,
+      PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF)
 }
 
 object ArrowAggregatePythonExec {
```

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala

Lines changed: 18 additions & 5 deletions
```diff
@@ -19,8 +19,8 @@ package org.apache.spark.sql.execution.python
 
 import scala.jdk.CollectionConverters._
 
-import org.apache.spark.{JobArtifactSet, TaskContext}
-import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.{JobArtifactSet, SparkException, TaskContext}
+import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.errors.QueryExecutionErrors
@@ -62,9 +62,14 @@ private[spark] class BatchIterator[T](iter: Iterator[T], batchSize: Int)
 /**
  * A physical plan that evaluates a [[PythonUDF]].
  */
-case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute], child: SparkPlan,
-    evalType: Int)
-  extends EvalPythonExec with PythonSQLMetrics {
+case class ArrowEvalPythonExec(
+    udfs: Seq[PythonUDF],
+    resultAttrs: Seq[Attribute],
+    child: SparkPlan,
+    evalType: Int) extends EvalPythonExec with PythonSQLMetrics {
+  if (!supportedPythonEvalTypes.contains(evalType)) {
+    throw SparkException.internalError(s"Unexpected eval type $evalType")
+  }
 
   private[this] val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
 
@@ -85,6 +90,14 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute]
 
   override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan =
     copy(child = newChild)
+
+  private def supportedPythonEvalTypes: Array[Int] =
+    Array(
+      PythonEvalType.SQL_ARROW_BATCHED_UDF,
+      PythonEvalType.SQL_SCALAR_ARROW_UDF,
+      PythonEvalType.SQL_SCALAR_ARROW_ITER_UDF,
+      PythonEvalType.SQL_SCALAR_PANDAS_UDF,
+      PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF)
 }
 
 class ArrowEvalPythonEvaluatorFactory(
```
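A side note on the data structure, as a hedged alternative rather than what the commit does: a `Set` would give constant-time membership tests, though with at most five entries the commit's `Array.contains` linear scan is just as practical. A sketch of that alternative (`SupportedScalarEvalTypes` is an assumed name):

```scala
import org.apache.spark.api.python.PythonEvalType

// Sketch of an alternative (assumption, not the commit's code): keep the
// supported scalar eval types in a Set so membership checks are O(1).
object SupportedScalarEvalTypes {
  val values: Set[Int] = Set(
    PythonEvalType.SQL_ARROW_BATCHED_UDF,
    PythonEvalType.SQL_SCALAR_ARROW_UDF,
    PythonEvalType.SQL_SCALAR_ARROW_ITER_UDF,
    PythonEvalType.SQL_SCALAR_PANDAS_UDF,
    PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF)

  def isSupported(evalType: Int): Boolean = values.contains(evalType)
}
```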

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowWindowPythonExec.scala

Lines changed: 12 additions & 2 deletions
```diff
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.python
 
+import org.apache.spark.SparkException
+import org.apache.spark.api.python.PythonEvalType
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
@@ -74,8 +76,11 @@ case class ArrowWindowPythonExec(
     partitionSpec: Seq[Expression],
     orderSpec: Seq[SortOrder],
     child: SparkPlan,
-    evalType: Int)
-  extends WindowExecBase with PythonSQLMetrics {
+    evalType: Int) extends WindowExecBase with PythonSQLMetrics {
+  if (!supportedPythonEvalTypes.contains(evalType)) {
+    throw SparkException.internalError(s"Unexpected eval type $evalType")
+  }
+
   override lazy val metrics: Map[String, SQLMetric] = pythonMetrics ++ Map(
     "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size")
   )
@@ -105,6 +110,11 @@ case class ArrowWindowPythonExec(
 
   override protected def withNewChildInternal(newChild: SparkPlan): ArrowWindowPythonExec =
     copy(child = newChild)
+
+  private def supportedPythonEvalTypes: Array[Int] =
+    Array(
+      PythonEvalType.SQL_WINDOW_AGG_ARROW_UDF,
+      PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF)
 }
 
 object ArrowWindowPythonExec {
```
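Because the validation throws from the constructor, the behavior is directly testable at node-construction time. A hedged test-helper sketch (the helper and its message check are assumptions; the PR itself relies on existing tests):

```scala
import org.apache.spark.SparkException

// Hypothetical helper: assert that constructing a node with an unsupported
// eval type fails fast with the internal error added in this commit.
def assertUnexpectedEvalType(block: => Unit): Unit = {
  try {
    block
    throw new AssertionError("expected SparkException.internalError")
  } catch {
    // SparkException.internalError interpolates the supplied message, so it
    // should contain the "Unexpected eval type" text (formatting assumed).
    case e: SparkException if e.getMessage.contains("Unexpected eval type") =>
      () // expected outcome: the node rejected the eval type at construction
  }
}
```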
