Skip to content

Commit c129972

Browse files
committed
[SPARK-51474][SQL] Don't insert redundant ColumnarToRowExec for node supporting both columnar and row output
### What changes were proposed in this pull request?
This patch fixes a corner case in `ApplyColumnarRulesAndInsertTransitions`. When a plan is required to output rows, if the node supports both columnar and row output, the rule currently adds a redundant `ColumnarToRowExec` to its upstream.

### Why are the changes needed?
This fix avoids a redundant `ColumnarToRowExec`.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Unit test

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #50239 from viirya/fix_columnar.

Authored-by: Liang-Chi Hsieh <viirya@gmail.com>
Signed-off-by: Liang-Chi Hsieh <viirya@gmail.com>
1 parent 84f5fd9 commit c129972

File tree

2 files changed: +26 −0 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,10 @@ case class ApplyColumnarRulesAndInsertTransitions(
       case write: DataWritingCommandExec
           if write.cmd.isInstanceOf[V1WriteCommand] && conf.plannedWriteEnabled =>
         write.child.supportsColumnar
+      // If it is not required to output columnar (`outputsColumnar` is false), and the plan
+      // supports row-based and columnar, we don't need to output row-based data on its children
+      // nodes. So we set `outputsColumnar` to true.
+      case _ if plan.supportsColumnar && plan.supportsRowBased => true
       case _ =>
         false
     }

sql/core/src/test/scala/org/apache/spark/sql/execution/ColumnarRulesSuite.scala

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.vectorized.ColumnarBatch

 class ColumnarRulesSuite extends PlanTest with SharedSparkSession {
2829

@@ -51,6 +52,15 @@ class ColumnarRulesSuite extends PlanTest with SharedSparkSession {
     val appliedTwice = rules.apply(appliedOnce)
     assert(appliedTwice == expected)
   }
+
+  test("SPARK-51474: Don't insert redundant ColumnarToRowExec") {
+    val rules = ApplyColumnarRulesAndInsertTransitions(
+      spark.sessionState.columnarRules, false)
+
+    val plan = CanDoColumnarAndRowOp(UnaryOp(LeafOp(true), true))
+    val appliedOnce = rules.apply(plan)
+    assert(appliedOnce == plan)
+  }
 }
5565

5666
case class LeafOp(override val supportsColumnar: Boolean) extends LeafExecNode {
@@ -63,3 +73,15 @@ case class UnaryOp(child: SparkPlan, override val supportsColumnar: Boolean) ext
   override def output: Seq[Attribute] = child.output
   override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp = copy(child = newChild)
 }
+
+case class CanDoColumnarAndRowOp(child: SparkPlan) extends UnaryExecNode {
+  override val supportsRowBased: Boolean = true
+  override val supportsColumnar: Boolean = true
+
+  override protected def doExecute(): RDD[InternalRow] = throw SparkUnsupportedOperationException()
+  override protected def doExecuteColumnar(): RDD[ColumnarBatch] =
+    throw SparkUnsupportedOperationException()
+  override def output: Seq[Attribute] = child.output
+  override protected def withNewChildInternal(newChild: SparkPlan): CanDoColumnarAndRowOp =
+    copy(child = newChild)
+}

0 commit comments

Comments
 (0)