
Commit ee41857

asl3 authored and cloud-fan committed
Revert "[SPARK-54310][SQL] Add numSourceRows metric for MergeIntoExec"
### What changes were proposed in this pull request?

Clean revert of d65234b. We will later handle cases of sourceSide child nodes without `numOutputRows`, and will re-target the new implementation to a later Spark release.

### Why are the changes needed?

The current implementation may grab the incorrect `numOutputRows` metric if there is an intermediary node (such as a custom Spark operator) that does not support the metric. This is because we target the first source-side child node that has `numOutputRows`. If a SparkExtension node does not contain this metric but transforms the source table, the search could progress all the way down to the source table and grab the incorrect metric.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing CI, as this is a revert.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #53293 from asl3/numsourcerowsrevert.

Authored-by: Amanda Liu <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
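To make the failure mode concrete, here is a minimal sketch of the flawed lookup. All of the types below (`PlanNode`, `Scan`, `CustomExtensionOp`) are simplified stand-ins invented for illustration, not the real Spark classes: a pre-order search for the first descendant exposing `numOutputRows` walks straight past a metric-less custom operator and reports the scan's row count instead of the rows the MERGE actually consumed.

```scala
object FirstMetricSketch {
  // Simplified stand-ins for physical plan nodes; NOT the real Spark API.
  sealed trait PlanNode {
    def children: Seq[PlanNode]
    def metrics: Map[String, Long]
  }

  // A source-table scan that reports how many rows it produced.
  final case class Scan(rows: Long) extends PlanNode {
    val children: Seq[PlanNode] = Nil
    val metrics: Map[String, Long] = Map("numOutputRows" -> rows)
  }

  // A custom extension operator that transforms its input (e.g. filters it)
  // but does not register a numOutputRows metric of its own.
  final case class CustomExtensionOp(child: PlanNode) extends PlanNode {
    val children: Seq[PlanNode] = Seq(child)
    val metrics: Map[String, Long] = Map.empty
  }

  // The reverted heuristic, reduced to its essence: pre-order search for
  // the first descendant exposing numOutputRows.
  def firstNumOutputRows(plan: PlanNode): Option[Long] =
    if (plan.metrics.contains("numOutputRows")) plan.metrics.get("numOutputRows")
    else plan.children.view.flatMap(firstNumOutputRows).headOption

  def main(args: Array[String]): Unit = {
    // The extension operator may have filtered the 1000 scanned rows down to
    // far fewer, but the search skips it (no metric) and lands on the scan.
    val sourceSide: PlanNode = CustomExtensionOp(Scan(rows = 1000L))
    println(firstNumOutputRows(sourceSide)) // Some(1000): not what MERGE saw
  }
}
```

If the custom operator filtered the scan's output, the reported source-row count would overcount, which is exactly the scenario the revert guards against.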
1 parent 9969548 · commit ee41857

File tree

4 files changed: +227 −346 lines

sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/MergeSummary.java

Lines changed: 0 additions & 5 deletions
```diff
@@ -27,11 +27,6 @@
 @Evolving
 public interface MergeSummary extends WriteSummary {
 
-  /**
-   * Returns the number of source rows.
-   */
-  long numSourceRows();
-
   /**
    * Returns the number of target rows copied unmodified because they did not match any action,
    * or -1 if not found.
```
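For connectors that consume the summary, the surviving surface is the per-action target-row counters. Below is a hypothetical consumer sketch: the getter names mirror the `MergeSummaryImpl` fields visible in the diffs on this page, while `MergeMetricsLogger` and its `report` method are made up for illustration.

```scala
// Hypothetical consumer; only the MergeSummary getter names are taken from
// the diffs on this page, everything else is illustrative.
import org.apache.spark.sql.connector.write.{MergeSummary, WriteSummary}

object MergeMetricsLogger {
  def report(summary: WriteSummary): Unit = summary match {
    case m: MergeSummary =>
      // Each counter is -1 when the corresponding metric was not found.
      println(s"copied=${m.numTargetRowsCopied()}, " +
        s"deleted=${m.numTargetRowsDeleted()}, " +
        s"updated=${m.numTargetRowsUpdated()}")
    case _ =>
      println("not a MERGE write summary")
  }
}
```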

sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/MergeSummaryImpl.scala

Lines changed: 0 additions & 1 deletion
```diff
@@ -21,7 +21,6 @@ package org.apache.spark.sql.connector.write
  * Implementation of [[MergeSummary]] that provides MERGE operation summary.
  */
 private[sql] case class MergeSummaryImpl(
-    numSourceRows: Long,
     numTargetRowsCopied: Long,
     numTargetRowsDeleted: Long,
     numTargetRowsUpdated: Long,
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala

Lines changed: 1 addition & 38 deletions
```diff
@@ -31,11 +31,10 @@ import org.apache.spark.sql.catalyst.util.RowDeltaUtils.{DELETE_OPERATION, INSER
 import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Column, Identifier, StagedTable, StagingTableCatalog, Table, TableCatalog, TableInfo, TableWritePrivilege}
 import org.apache.spark.sql.connector.expressions.Transform
 import org.apache.spark.sql.connector.metric.CustomMetric
-import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, DeltaWrite, DeltaWriter, MergeSummaryImpl, PhysicalWriteInfoImpl, RowLevelOperationTable, Write, WriterCommitMessage, WriteSummary}
+import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, DeltaWrite, DeltaWriter, MergeSummaryImpl, PhysicalWriteInfoImpl, Write, WriterCommitMessage, WriteSummary}
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
 import org.apache.spark.sql.execution.{QueryExecution, SparkPlan, SQLExecution, UnaryExecNode}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
-import org.apache.spark.sql.execution.joins.BaseJoinExec
 import org.apache.spark.sql.execution.metric.{CustomMetrics, SQLMetric, SQLMetrics}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.SchemaValidationMode.PROHIBIT_CHANGES
@@ -493,9 +492,7 @@ trait V2TableWriteExec extends V2CommandExec with UnaryExecNode with AdaptiveSpa
   private def getWriteSummary(query: SparkPlan): Option[WriteSummary] = {
     collectFirst(query) { case m: MergeRowsExec => m }.map { n =>
       val metrics = n.metrics
-      val numSourceRows = getNumSourceRows(n)
       MergeSummaryImpl(
-        numSourceRows,
         metrics.get("numTargetRowsCopied").map(_.value).getOrElse(-1L),
         metrics.get("numTargetRowsDeleted").map(_.value).getOrElse(-1L),
         metrics.get("numTargetRowsUpdated").map(_.value).getOrElse(-1L),
@@ -507,40 +504,6 @@ trait V2TableWriteExec extends V2CommandExec with UnaryExecNode with AdaptiveSpa
       )
     }
   }
-
-  private def getNumSourceRows(mergeRowsExec: MergeRowsExec): Long = {
-    def hasTargetTable(plan: SparkPlan): Boolean = {
-      collectFirst(plan) {
-        case scan @ BatchScanExec(_, _, _, _, _: RowLevelOperationTable, _) => scan
-      }.isDefined
-    }
-
-    def findSourceScan(join: BaseJoinExec): Option[SparkPlan] = {
-      val leftHasTarget = hasTargetTable(join.left)
-      val rightHasTarget = hasTargetTable(join.right)
-
-      val sourceSide = if (leftHasTarget) {
-        Some(join.right)
-      } else if (rightHasTarget) {
-        Some(join.left)
-      } else {
-        None
-      }
-
-      sourceSide.flatMap { side =>
-        collectFirst(side) {
-          case source if source.metrics.contains("numOutputRows") =>
-            source
-        }
-      }
-    }
-
-    (for {
-      join <- collectFirst(mergeRowsExec.child) { case j: BaseJoinExec => j }
-      sourceScan <- findSourceScan(join)
-      metric <- sourceScan.metrics.get("numOutputRows")
-    } yield metric.value).getOrElse(-1L)
-  }
 }
 
 trait WritingSparkTask[W <: DataWriter[InternalRow]] extends Logging with Serializable {
```
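For contrast with the removed heuristic above, here is one deliberately conservative alternative, sketched with a simplified `Node` stand-in. This is purely illustrative and is not the re-targeted follow-up the commit message promises: it trusts `numOutputRows` only when the join's direct source-side child reports it, and otherwise returns the `-1` sentinel the `MergeSummary` contract already uses.

```scala
object ConservativeSketch {
  // `Node` is a simplified stand-in for SparkPlan; this is NOT the
  // follow-up implementation referenced in the commit message.
  final case class Node(metrics: Map[String, Long], children: Seq[Node] = Nil)

  // Trust numOutputRows only on the join's direct source-side child;
  // descending further risks reading a count taken before a custom
  // operator changed the rows. Fall back to the -1 sentinel otherwise.
  def conservativeNumSourceRows(sourceSide: Node): Long =
    sourceSide.metrics.getOrElse("numOutputRows", -1L)

  def main(args: Array[String]): Unit = {
    // A metric-less custom operator sits on top of the 1000-row scan.
    val sourceSide = Node(Map.empty, Seq(Node(Map("numOutputRows" -> 1000L))))
    assert(conservativeNumSourceRows(sourceSide) == -1L) // -1 beats a wrong count
  }
}
```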
