Skip to content

Commit c94ce2c

Browse files
committed
[SPARK-55302][SQL] Fix custom metrics in case of KeyGroupedPartitioning
### What changes were proposed in this pull request?

This PR adds a new `initMetricsValues()` method to `PartitionReader` so as to initialize the custom metrics returned by `currentMetricsValues()`. In the case of `KeyGroupedPartitioning`, multiple input partitions are grouped together, so multiple `PartitionReader`s belong to one output partition. A `PartitionReader` needs to be initialized with the metrics calculated by the previous `PartitionReader` of the same partition group so that the final metric values are computed correctly.

### Why are the changes needed?

To calculate custom metrics correctly.

### Does this PR introduce _any_ user-facing change?

It fixes metrics calculation.

### How was this patch tested?

New UT is added.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#54081 from peter-toth/SPARK-55302-fix-kgp-custom-metrics.

Authored-by: Peter Toth <peter.toth@gmail.com>
Signed-off-by: Peter Toth <peter.toth@gmail.com>
1 parent 15c6849 commit c94ce2c

File tree

6 files changed

+95
-28
lines changed

6 files changed

+95
-28
lines changed

sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReader.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,13 @@ default CustomTaskMetric[] currentMetricsValues() {
5858
CustomTaskMetric[] NO_METRICS = {};
5959
return NO_METRICS;
6060
}
61+
62+
/**
63+
* Sets the initial value of metrics before fetching any data from the reader. This is called
64+
* when multiple {@link PartitionReader}s are grouped into one partition in case of
65+
* {@link org.apache.spark.sql.connector.read.partitioning.KeyGroupedPartitioning} and the reader
66+
* is initialized with the metrics returned by the previous reader that belongs to the same
67+
* partition. By default, this method does nothing.
68+
*/
69+
default void initMetricsValues(CustomTaskMetric[] metrics) {}
6170
}

sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,10 @@ abstract class InMemoryBaseTable(
543543
}
544544
new BufferedRowsReaderFactory(metadataColumns.toSeq, nonMetadataColumns, tableSchema)
545545
}
546+
547+
override def supportedCustomMetrics(): Array[CustomMetric] = {
548+
Array(new RowsReadCustomMetric)
549+
}
546550
}
547551

548552
case class InMemoryBatchScan(
@@ -830,10 +834,13 @@ private class BufferedRowsReader(
830834
}
831835

832836
private var index: Int = -1
837+
private var rowsRead: Long = 0
833838

834839
override def next(): Boolean = {
835840
index += 1
836-
index < partition.rows.length
841+
val hasNext = index < partition.rows.length
842+
if (hasNext) rowsRead += 1
843+
hasNext
837844
}
838845

839846
override def get(): InternalRow = {
@@ -976,6 +983,22 @@ private class BufferedRowsReader(
976983

977984
private def castElement(elem: Any, toType: DataType, fromType: DataType): Any =
978985
Cast(Literal(elem, fromType), toType, None, EvalMode.TRY).eval(null)
986+
987+
override def initMetricsValues(metrics: Array[CustomTaskMetric]): Unit = {
988+
metrics.foreach { m =>
989+
m.name match {
990+
case "rows_read" => rowsRead = m.value()
991+
}
992+
}
993+
}
994+
995+
override def currentMetricsValues(): Array[CustomTaskMetric] = {
996+
val metric = new CustomTaskMetric {
997+
override def name(): String = "rows_read"
998+
override def value(): Long = rowsRead
999+
}
1000+
Array(metric)
1001+
}
9791002
}
9801003

9811004
private class BufferedRowsWriterFactory(schema: StructType)
@@ -1044,6 +1067,11 @@ class InMemoryCustomDriverTaskMetric(value: Long) extends CustomTaskMetric {
10441067
override def value(): Long = value
10451068
}
10461069

1070+
class RowsReadCustomMetric extends CustomSumMetric {
1071+
override def name(): String = "rows_read"
1072+
override def description(): String = "number of rows read"
1073+
}
1074+
10471075
case class Commit(id: Long, writeSummary: Option[WriteSummary] = None)
10481076

10491077
sealed trait Operation

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import org.apache.spark.deploy.SparkHadoopUtil
2424
import org.apache.spark.internal.Logging
2525
import org.apache.spark.rdd.RDD
2626
import org.apache.spark.sql.catalyst.InternalRow
27+
import org.apache.spark.sql.connector.metric.CustomTaskMetric
2728
import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
2829
import org.apache.spark.sql.errors.QueryExecutionErrors
2930
import org.apache.spark.sql.execution.metric.{CustomMetrics, SQLMetric}
@@ -97,7 +98,8 @@ class DataSourceRDD(
9798
}
9899

99100
// Once we advance to the next partition, update the metric callback for early finish
100-
partitionMetricCallback.advancePartition(iter, reader)
101+
val previousMetrics = partitionMetricCallback.advancePartition(iter, reader)
102+
previousMetrics.foreach(reader.initMetricsValues)
101103

102104
currentIter = Some(iter)
103105
hasNext
@@ -118,19 +120,26 @@ private class PartitionMetricCallback
118120
private var iter: MetricsIterator[_] = null
119121
private var reader: PartitionReader[_] = null
120122

121-
def advancePartition(iter: MetricsIterator[_], reader: PartitionReader[_]): Unit = {
122-
execute()
123+
def advancePartition(
124+
iter: MetricsIterator[_],
125+
reader: PartitionReader[_]): Option[Array[CustomTaskMetric]] = {
126+
val metrics = execute()
123127

124128
this.iter = iter
125129
this.reader = reader
130+
131+
metrics
126132
}
127133

128-
def execute(): Unit = {
134+
def execute(): Option[Array[CustomTaskMetric]] = {
129135
if (iter != null && reader != null) {
130-
CustomMetrics
131-
.updateMetrics(reader.currentMetricsValues.toImmutableArraySeq, customMetrics)
136+
val metrics = reader.currentMetricsValues
137+
CustomMetrics.updateMetrics(metrics.toImmutableArraySeq, customMetrics)
132138
iter.forceUpdateMetrics()
133139
reader.close()
140+
Some(metrics)
141+
} else {
142+
None
134143
}
135144
}
136145
}

sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2823,4 +2823,22 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
28232823
checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0)))
28242824
}
28252825
}
2826+
2827+
test("SPARK-55302: Custom metrics of grouped partitions") {
2828+
val items_partitions = Array(identity("id"))
2829+
createTable(items, itemsColumns, items_partitions)
2830+
2831+
sql(s"INSERT INTO testcat.ns.$items VALUES " +
2832+
"(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
2833+
"(4, 'bb', 10.0, cast('2021-01-01' as timestamp)), " +
2834+
"(4, 'cc', 15.5, cast('2021-02-01' as timestamp))")
2835+
2836+
val metrics = runAndFetchMetrics {
2837+
val df = sql(s"SELECT * FROM testcat.ns.$items")
2838+
val scans = collectScans(df.queryExecution.executedPlan)
2839+
assert(scans(0).inputRDD.partitions.length === 2, "items scan should have 2 partition groups")
2840+
df.collect()
2841+
}
2842+
assert(metrics("number of rows read") == "3")
2843+
}
28262844
}

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/InMemoryTableMetricSuite.scala

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources
1919
import java.util.Collections
2020

2121
import org.scalatest.BeforeAndAfter
22-
import org.scalatest.time.SpanSugar._
2322

2423
import org.apache.spark.sql.QueryTest
2524
import org.apache.spark.sql.connector.catalog.{Column, Identifier, InMemoryTable, InMemoryTableCatalog}
@@ -54,27 +53,8 @@ class InMemoryTableMetricSuite
5453
Array(Column.create("i", IntegerType)),
5554
Array.empty[Transform], Collections.emptyMap[String, String])
5655

57-
func("testcat.table_name")
56+
val metrics = runAndFetchMetrics(func("testcat.table_name"))
5857

59-
// Wait until the new execution is started and being tracked.
60-
eventually(timeout(10.seconds), interval(10.milliseconds)) {
61-
assert(statusStore.executionsCount() >= oldCount)
62-
}
63-
64-
// Wait for listener to finish computing the metrics for the execution.
65-
eventually(timeout(10.seconds), interval(10.milliseconds)) {
66-
assert(statusStore.executionsList().nonEmpty &&
67-
statusStore.executionsList().last.metricValues != null)
68-
}
69-
70-
val exec = statusStore.executionsList().last
71-
val execId = exec.executionId
72-
val sqlMetrics = exec.metrics.map { metric =>
73-
metric.accumulatorId -> metric.name
74-
}.toMap
75-
val metrics = statusStore.executionMetrics(execId).map { case (k, v) =>
76-
sqlMetrics(k) -> v
77-
}
7858
checker(metrics)
7959
}
8060
}

sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,29 @@ trait SharedSparkSession extends SQLTestUtils with SharedSparkSessionBase {
5454
doThreadPostAudit()
5555
}
5656
}
57+
58+
def runAndFetchMetrics(func: => Unit): Map[String, String] = {
59+
val statusStore = spark.sharedState.statusStore
60+
val oldCount = statusStore.executionsList().size
61+
62+
func
63+
64+
// Wait until the new execution is started and being tracked.
65+
eventually(timeout(10.seconds), interval(10.milliseconds)) {
66+
assert(statusStore.executionsCount() >= oldCount)
67+
}
68+
69+
// Wait for listener to finish computing the metrics for the execution.
70+
eventually(timeout(10.seconds), interval(10.milliseconds)) {
71+
assert(statusStore.executionsList().nonEmpty &&
72+
statusStore.executionsList().last.metricValues != null)
73+
}
74+
75+
val exec = statusStore.executionsList().last
76+
val execId = exec.executionId
77+
val sqlMetrics = exec.metrics.map { metric => metric.accumulatorId -> metric.name }.toMap
78+
statusStore.executionMetrics(execId).map { case (k, v) => sqlMetrics(k) -> v }
79+
}
5780
}
5881

5982
/**

0 commit comments

Comments
 (0)