
Commit cb66f9a

andygrove and claude committed
perf: cache and broadcast serialized plans across partitions
Serialize native query plans once and broadcast them to all executors, avoiding repeated protobuf serialization for each partition. This optimization:

- Adds a serializePlan() method to serialize an Operator once
- Adds a getCometIterator() overload accepting pre-serialized bytes
- Updates getNativeLimitRDD to broadcast the serialized plan
- Updates CometTakeOrderedAndProjectExec to broadcast the topK plan

For a query with 1000 partitions across 10 executors, this reduces plan serialization from 1000x to 1x, and plan transfer from 1000x to 10x (once per executor via broadcast).

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 86f6eb6 commit cb66f9a
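
The pattern in miniature, as a self-contained sketch (plain Spark with a stand-in byte payload; the object name and payload contents are illustrative, not Comet code): serialize on the driver once, broadcast, and read the executor-local copy inside each partition.

import org.apache.spark.sql.SparkSession

// Sketch: serialize once on the driver, broadcast, and reuse the
// executor-local copy in every partition.
object BroadcastPlanSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    val sc = spark.sparkContext

    // One serialization on the driver (a stand-in for the protobuf plan bytes).
    val serializedPlan: Array[Byte] = "LIMIT 10".getBytes("UTF-8")
    // One transfer per executor, via the broadcast mechanism.
    val broadcastPlan = sc.broadcast(serializedPlan)

    val perPartition = sc
      .parallelize(1 to 1000, numSlices = 1000)
      .mapPartitionsWithIndex { case (idx, iter) =>
        // No per-partition serialization: just a read of the cached local copy.
        val planBytes = broadcastPlan.value
        Iterator((idx, planBytes.length, iter.size))
      }
    println(perPartition.count()) // 1000
    spark.stop()
  }
}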

File tree

3 files changed: +56 -12 lines changed


spark/src/main/scala/org/apache/spark/sql/comet/CometExecUtils.scala

Lines changed: 6 additions & 2 deletions
@@ -53,9 +53,13 @@ object CometExecUtils {
       limit: Int,
       offset: Int = 0): RDD[ColumnarBatch] = {
     val numParts = childPlan.getNumPartitions
+    val numOutputCols = outputAttribute.length
+    // Serialize the plan once and broadcast to all executors to avoid repeated serialization
+    val serializedPlan = CometExec.serializePlan(
+      CometExecUtils.getLimitNativePlan(outputAttribute, limit, offset).get)
+    val broadcastPlan = childPlan.sparkContext.broadcast(serializedPlan)
     childPlan.mapPartitionsWithIndexInternal { case (idx, iter) =>
-      val limitOp = CometExecUtils.getLimitNativePlan(outputAttribute, limit, offset).get
-      CometExec.getCometIterator(Seq(iter), outputAttribute.length, limitOp, numParts, idx)
+      CometExec.getCometIterator(Seq(iter), numOutputCols, broadcastPlan.value, numParts, idx)
     }
   }

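Worth noting why the closure reads broadcastPlan.value instead of capturing serializedPlan directly: a captured array is serialized into every task closure (once per partition), while a broadcast ships only a small handle and the bytes move once per executor. A rough sketch of the contrast, with illustrative names (not Comet code):

import org.apache.spark.SparkContext

object CaptureVsBroadcast {
  // The captured `planBytes` array travels inside all 1000 task closures.
  def withClosureCapture(sc: SparkContext, planBytes: Array[Byte]): Long =
    sc.parallelize(1 to 1000, 1000)
      .mapPartitions(iter => Iterator(planBytes.length + iter.size))
      .count()

  // The broadcast handle is tiny; executors fetch the bytes once and cache them.
  def withBroadcast(sc: SparkContext, planBytes: Array[Byte]): Long = {
    val bc = sc.broadcast(planBytes)
    sc.parallelize(1 to 1000, 1000)
      .mapPartitions(iter => Iterator(bc.value.length + iter.size))
      .count()
  }
}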

spark/src/main/scala/org/apache/spark/sql/comet/CometTakeOrderedAndProjectExec.scala

Lines changed: 13 additions & 5 deletions
@@ -133,12 +133,20 @@ case class CometTakeOrderedAndProjectExec(
         CometExecUtils.getNativeLimitRDD(childRDD, child.output, limit)
       } else {
         val numParts = childRDD.getNumPartitions
+        val numOutputCols = child.output.length
+        // Serialize the plan once and broadcast to avoid repeated serialization
+        val serializedTopK = CometExec.serializePlan(
+          CometExecUtils
+            .getTopKNativePlan(child.output, sortOrder, child, limit)
+            .get)
+        val broadcastTopK = sparkContext.broadcast(serializedTopK)
         childRDD.mapPartitionsWithIndexInternal { case (idx, iter) =>
-          val topK =
-            CometExecUtils
-              .getTopKNativePlan(child.output, sortOrder, child, limit)
-              .get
-          CometExec.getCometIterator(Seq(iter), child.output.length, topK, numParts, idx)
+          CometExec.getCometIterator(
+            Seq(iter),
+            numOutputCols,
+            broadcastTopK.value,
+            numParts,
+            idx)
         }
       }

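Hoisting child.output.length into the local numOutputCols likely doubles as closure hygiene: the new lambda references only local vals and the broadcast handle, so the enclosing plan node no longer has to ride along in task serialization via `child`. The same idiom in miniature (Stage is an illustrative class, not Comet code):

import org.apache.spark.rdd.RDD

class Stage(val width: Int) {
  // `width` here is really `this.width`, so the closure captures the whole
  // Stage; if Stage is not serializable this fails at task-serialization time.
  def runCapturingThis(rdd: RDD[Int]): RDD[Int] =
    rdd.map(_ * width)

  // Hoist the field into a local val so only the Int is captured.
  def runHoisted(rdd: RDD[Int]): RDD[Int] = {
    val w = width
    rdd.map(_ * w)
  }
}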

spark/src/main/scala/org/apache/spark/sql/comet/operators.scala

Lines changed: 37 additions & 5 deletions
@@ -114,6 +114,20 @@ object CometExec {

   def newIterId: Long = curId.getAndIncrement()

+  /**
+   * Serializes a native plan operator to a byte array. This method should be called once outside
+   * of partition iteration, and the resulting bytes can be reused across all partitions to avoid
+   * repeated serialization overhead.
+   */
+  def serializePlan(nativePlan: Operator): Array[Byte] = {
+    val size = nativePlan.getSerializedSize
+    val bytes = new Array[Byte](size)
+    val codedOutput = CodedOutputStream.newInstance(bytes)
+    nativePlan.writeTo(codedOutput)
+    codedOutput.checkNoSpaceLeft()
+    bytes
+  }
+
   def getCometIterator(
       inputs: Seq[Iterator[ColumnarBatch]],
       numOutputCols: Int,

@@ -131,6 +145,28 @@ object CometExec {
       encryptedFilePaths = Seq.empty)
   }

+  /**
+   * Creates a CometExecIterator from pre-serialized plan bytes. Use this overload when the same
+   * plan is used across multiple partitions to avoid serializing the plan repeatedly.
+   */
+  def getCometIterator(
+      inputs: Seq[Iterator[ColumnarBatch]],
+      numOutputCols: Int,
+      serializedPlan: Array[Byte],
+      numParts: Int,
+      partitionIdx: Int): CometExecIterator = {
+    new CometExecIterator(
+      newIterId,
+      inputs,
+      numOutputCols,
+      serializedPlan,
+      CometMetricNode(Map.empty),
+      numParts,
+      partitionIdx,
+      broadcastedHadoopConfForEncryption = None,
+      encryptedFilePaths = Seq.empty)
+  }
+
   def getCometIterator(
       inputs: Seq[Iterator[ColumnarBatch]],
       numOutputCols: Int,

@@ -140,11 +176,7 @@ object CometExec {
       partitionIdx: Int,
       broadcastedHadoopConfForEncryption: Option[Broadcast[SerializableConfiguration]],
       encryptedFilePaths: Seq[String]): CometExecIterator = {
-    val size = nativePlan.getSerializedSize
-    val bytes = new Array[Byte](size)
-    val codedOutput = CodedOutputStream.newInstance(bytes)
-    nativePlan.writeTo(codedOutput)
-    codedOutput.checkNoSpaceLeft()
+    val bytes = serializePlan(nativePlan)
     new CometExecIterator(
       newIterId,
       inputs,
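
The extracted serializePlan follows the standard protobuf-java idiom: allocate a buffer of exactly getSerializedSize bytes, write through a CodedOutputStream, and assert the buffer was filled. A generic sketch over any MessageLite (toBytes is an illustrative name):

import com.google.protobuf.{CodedOutputStream, MessageLite}

object ProtoBytes {
  def toBytes(msg: MessageLite): Array[Byte] = {
    val bytes = new Array[Byte](msg.getSerializedSize) // exact size, no resizing
    val out = CodedOutputStream.newInstance(bytes)
    msg.writeTo(out)
    out.checkNoSpaceLeft() // throws if the message under-filled the buffer
    bytes
  }
}

protobuf-java's built-in msg.toByteArray() performs these same steps internally; keeping an explicit helper lets call sites name the operation and reuse the resulting bytes across partitions.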
