Skip to content

Commit 592c03f

Browse files
HyukjinKwon authored and sunchao committed
[SPARK-51316][PYTHON] Allow Arrow batches in bytes instead of number of rows
This PR allows Arrow batches to be limited in bytes instead of number of rows. We enabled `spark.sql.execution.pythonUDF.arrow.enabled` by default, and we should make sure users won't hit OOM. Yes. Now we will make the Arrow batches 256MB in bytes by default, and users can configure this. Tested by changing the default value to 1KB, and added a unit test. Also manually tested as below: ```python from pyspark.sql.functions import pandas_udf import pandas as pd pandas_udf("long") def func(s: pd.Series) -> pd.Series: return s a = spark.range(100000).select(func("id")).collect() ``` No. Closes #50080 from HyukjinKwon/bytes-arrow. Authored-by: Hyukjin Kwon <gurwls223@apache.org> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org> (cherry picked from commit 53fc763)
1 parent 076bf80 commit 592c03f

File tree

8 files changed

+141
-24
lines changed

8 files changed

+141
-24
lines changed

python/pyspark/sql/tests/test_arrow_map.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,35 @@ def test_self_join(self):
146146
expected = df1.join(df1).collect()
147147
self.assertEqual(sorted(actual), sorted(expected))
148148

149+
def test_map_in_arrow_with_barrier_mode(self):
150+
df = self.spark.range(10)
151+
152+
def func1(iterator):
153+
from pyspark import TaskContext, BarrierTaskContext
154+
155+
tc = TaskContext.get()
156+
assert tc is not None
157+
assert not isinstance(tc, BarrierTaskContext)
158+
for batch in iterator:
159+
yield batch
160+
161+
df.mapInArrow(func1, "id long", False).collect()
162+
163+
def func2(iterator):
164+
from pyspark import TaskContext, BarrierTaskContext
165+
166+
tc = TaskContext.get()
167+
assert tc is not None
168+
assert isinstance(tc, BarrierTaskContext)
169+
for batch in iterator:
170+
yield batch
171+
172+
df.mapInArrow(func2, "id long", True).collect()
173+
174+
def test_negative_and_zero_batch_size(self):
175+
for batch_size in [0, -1]:
176+
with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": batch_size}):
177+
MapInArrowTests.test_map_in_arrow(self)
149178

150179
class MapInArrowTests(MapInArrowTestsMixin, ReusedSQLTestCase):
151180
@classmethod
@@ -170,6 +199,15 @@ def tearDownClass(cls):
170199
ReusedSQLTestCase.tearDownClass()
171200

172201

202+
class MapInArrowWithArrowBatchSlicingTestsAndReducedBatchSizeTests(MapInArrowTests):
203+
@classmethod
204+
def setUpClass(cls):
205+
MapInArrowTests.setUpClass()
206+
# Set it to a small odd value to exercise batching logic for all test cases
207+
cls.spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "3")
208+
cls.spark.conf.set("spark.sql.execution.arrow.maxBytesPerBatch", "10")
209+
210+
173211
if __name__ == "__main__":
174212
from pyspark.sql.tests.test_arrow_map import * # noqa: F401
175213

sql/catalyst/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,16 @@ class ArrowWriter(val root: VectorSchemaRoot, fields: Array[ArrowFieldWriter]) {
103103
count += 1
104104
}
105105

106+
def sizeInBytes(): Int = {
107+
var i = 0
108+
var bytes = 0
109+
while (i < fields.size) {
110+
bytes += fields(i).getSizeInBytes()
111+
i += 1
112+
}
113+
bytes
114+
}
115+
106116
def finish(): Unit = {
107117
root.setRowCount(count)
108118
fields.foreach(_.finish())
@@ -141,6 +151,13 @@ private[arrow] abstract class ArrowFieldWriter {
141151
valueVector.setValueCount(count)
142152
}
143153

154+
def getSizeInBytes(): Int = {
155+
valueVector.setValueCount(count)
156+
// Before calling getBufferSizeFor, we need to call
157+
// `setValueCount`, see https://github.com/apache/arrow/pull/9187#issuecomment-763362710
158+
valueVector.getBufferSizeFor(count)
159+
}
160+
144161
def reset(): Unit = {
145162
valueVector.reset()
146163
count = 0

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2908,11 +2908,31 @@ object SQLConf {
29082908
.doc("When using Apache Arrow, limit the maximum number of records that can be written " +
29092909
"to a single ArrowRecordBatch in memory. This configuration is not effective for the " +
29102910
"grouping API such as DataFrame(.cogroup).groupby.applyInPandas because each group " +
2911-
"becomes each ArrowRecordBatch. If set to zero or negative there is no limit.")
2911+
"becomes each ArrowRecordBatch. If set to zero or negative there is no limit. " +
2912+
"See also spark.sql.execution.arrow.maxBytesPerBatch. If both are set, each batch " +
2913+
"is created when any condition of both is met.")
29122914
.version("2.3.0")
29132915
.intConf
29142916
.createWithDefault(10000)
29152917

2918+
val ARROW_EXECUTION_MAX_BYTES_PER_BATCH =
2919+
buildConf("spark.sql.execution.arrow.maxBytesPerBatch")
2920+
.internal()
2921+
.doc("When using Apache Arrow, limit the maximum bytes in each batch that can be written " +
2922+
"to a single ArrowRecordBatch in memory. This configuration is not effective for the " +
2923+
"grouping API such as DataFrame(.cogroup).groupby.applyInPandas because each group " +
2924+
"becomes each ArrowRecordBatch. Unlike 'spark.sql.execution.arrow.maxRecordsPerBatch', " +
2925+
"this configuration does not work for createDataFrame/toPandas with Arrow/pandas " +
2926+
"instances. " +
2927+
"See also spark.sql.execution.arrow.maxRecordsPerBatch. If both are set, each batch " +
2928+
"is created when any condition of both is met.")
2929+
.version("4.0.0")
2930+
.bytesConf(ByteUnit.BYTE)
2931+
.checkValue(x => x > 0 && x <= Int.MaxValue,
2932+
errorMsg = "The value of " +
2933+
"spark.sql.execution.arrow.maxBytesPerBatch should be greater " +
2934+
"than zero and less than INT_MAX.")
2935+
.createWithDefaultString("256MB")
29162936
val ARROW_EXECUTION_USE_LARGE_VAR_TYPES =
29172937
buildConf("spark.sql.execution.arrow.useLargeVarTypes")
29182938
.doc("When using Apache Arrow, use large variable width vectors for string and binary " +
@@ -5073,6 +5093,7 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
50735093

50745094
def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH)
50755095

5096+
def arrowMaxBytesPerBatch: Long = getConf(ARROW_EXECUTION_MAX_BYTES_PER_BATCH)
50765097
def arrowUseLargeVarTypes: Boolean = getConf(ARROW_EXECUTION_USE_LARGE_VAR_TYPES)
50775098

50785099
def pandasUDFBufferSize: Int = getConf(PANDAS_UDF_BUFFER_SIZE)

sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ import org.apache.spark.sql.catalyst.plans.ReferenceAllColumns
3636
import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, FunctionUtils, LogicalGroupState}
3737
import org.apache.spark.sql.catalyst.plans.physical._
3838
import org.apache.spark.sql.catalyst.types.DataTypeUtils
39-
import org.apache.spark.sql.execution.python.BatchIterator
4039
import org.apache.spark.sql.execution.r.ArrowRRunner
4140
import org.apache.spark.sql.execution.streaming.GroupStateImpl
4241
import org.apache.spark.sql.internal.SQLConf
@@ -219,17 +218,13 @@ case class MapPartitionsInRWithArrowExec(
219218
child: SparkPlan) extends UnaryExecNode {
220219
override def producedAttributes: AttributeSet = AttributeSet(output)
221220

222-
private val batchSize = conf.arrowMaxRecordsPerBatch
223-
224221
override def outputPartitioning: Partitioning = child.outputPartitioning
225222

226223
override protected def doExecute(): RDD[InternalRow] = {
227224
child.execute().mapPartitionsInternal { inputIter =>
228225
val outputTypes = schema.map(_.dataType)
229226

230-
// DO NOT use iter.grouped(). See BatchIterator.
231-
val batchIter =
232-
if (batchSize > 0) new BatchIterator(inputIter, batchSize) else Iterator(inputIter)
227+
val batchIter = Iterator(inputIter)
233228

234229
val runner = new ArrowRRunner(func, packageNames, broadcastVars, inputSchema,
235230
SQLConf.get.sessionLocalTimeZone, RRunnerModes.DATAFRAME_DAPPLY)

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute]
6262
evalType: Int)
6363
extends EvalPythonExec with PythonSQLMetrics {
6464

65-
private val batchSize = conf.arrowMaxRecordsPerBatch
6665
private val sessionLocalTimeZone = conf.sessionLocalTimeZone
6766
private val largeVarTypes = conf.arrowUseLargeVarTypes
6867
private val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf)
@@ -77,10 +76,9 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute]
7776

7877
val outputTypes = output.drop(child.output.length).map(_.dataType)
7978

80-
// DO NOT use iter.grouped(). See BatchIterator.
81-
val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter)
79+
val batchIter = Iterator(iter)
8280

83-
val columnarBatchIter = new ArrowPythonRunner(
81+
val pyRunner = new ArrowPythonRunner(
8482
funcs,
8583
evalType,
8684
argOffsets,
@@ -89,7 +87,8 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute]
8987
largeVarTypes,
9088
pythonRunnerConf,
9189
pythonMetrics,
92-
jobArtifactUUID).compute(batchIter, context.partitionId(), context)
90+
jobArtifactUUID) with BatchedPythonArrowInput
91+
val columnarBatchIter = pyRunner.compute(batchIter, context.partitionId(), context)
9392

9493
columnarBatchIter.flatMap { batch =>
9594
val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType())

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonUDTFRunner.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class ArrowPythonUDTFRunner(
4242
jobArtifactUUID: Option[String])
4343
extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch](
4444
Seq(ChainedPythonFunctions(Seq(udtf.func))), evalType, Array(offsets), jobArtifactUUID)
45-
with BasicPythonArrowInput
45+
with BatchedPythonArrowInput
4646
with BasicPythonArrowOutput {
4747

4848
override protected def writeUDF(

sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInBatchEvaluatorFactory.scala

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,9 @@ class MapInBatchEvaluatorFactory(
5858
// as a DataFrame.
5959
val wrappedIter = contextAwareIterator.map(InternalRow(_))
6060

61-
// DO NOT use iter.grouped(). See BatchIterator.
62-
val batchIter =
63-
if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
61+
val batchIter = Iterator(wrappedIter)
6462

65-
val columnarBatchIter = new ArrowPythonRunner(
63+
val pyRunner = new ArrowPythonRunner(
6664
chainedFunc,
6765
pythonEvalType,
6866
argOffsets,
@@ -71,7 +69,8 @@ class MapInBatchEvaluatorFactory(
7169
largeVarTypes,
7270
pythonRunnerConf,
7371
pythonMetrics,
74-
jobArtifactUUID).compute(batchIter, context.partitionId(), context)
72+
jobArtifactUUID) with BatchedPythonArrowInput
73+
val columnarBatchIter = pyRunner.compute(batchIter, context.partitionId(), context)
7574

7675
val unsafeProj = UnsafeProjection.create(output, output)
7776

sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowInput.scala

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions, Py
2727
import org.apache.spark.sql.catalyst.InternalRow
2828
import org.apache.spark.sql.execution.arrow.ArrowWriter
2929
import org.apache.spark.sql.execution.metric.SQLMetric
30+
import org.apache.spark.sql.internal.SQLConf
3031
import org.apache.spark.sql.types.StructType
3132
import org.apache.spark.sql.util.ArrowUtils
3233
import org.apache.spark.util.Utils
@@ -93,12 +94,14 @@ private[python] trait PythonArrowInput[IN] { self: BasePythonRunner[IN, _] =>
9394
val writer = new ArrowStreamWriter(root, null, dataOut)
9495
writer.start()
9596

96-
writeIteratorToArrowStream(root, writer, dataOut, inputIterator)
97-
98-
// end writes footer to the output stream and doesn't clean any resources.
99-
// It could throw exception if the output stream is closed, so it should be
100-
// in the try block.
101-
writer.end()
97+
Utils.tryWithSafeFinally {
98+
writeIteratorToArrowStream(root, writer, dataOut, inputIterator)
99+
} {
100+
// end writes footer to the output stream and doesn't clean any resources.
101+
// It could throw exception if the output stream is closed, so it should be
102+
// in the try block.
103+
writer.end()
104+
}
102105
} {
103106
// If we close root and allocator in TaskCompletionListener, there could be a race
104107
// condition where the writer thread keeps writing to the VectorSchemaRoot while
@@ -144,3 +147,48 @@ private[python] trait BasicPythonArrowInput extends PythonArrowInput[Iterator[In
144147
}
145148
}
146149
}
150+
151+
private[python] trait BatchedPythonArrowInput extends BasicPythonArrowInput {
152+
self: BasePythonRunner[Iterator[InternalRow], _] =>
153+
154+
private val arrowMaxRecordsPerBatch = SQLConf.get.arrowMaxRecordsPerBatch
155+
private val maxBytesPerBatch = SQLConf.get.arrowMaxBytesPerBatch
156+
157+
// Marker inside the input iterator to indicate the start of the next batch.
158+
private var nextBatchStart: Iterator[InternalRow] = Iterator.empty
159+
160+
override protected def writeIteratorToArrowStream(
161+
root: VectorSchemaRoot,
162+
writer: ArrowStreamWriter,
163+
dataOut: DataOutputStream,
164+
inputIterator: Iterator[Iterator[InternalRow]]): Unit = {
165+
val arrowWriter = ArrowWriter.create(root)
166+
167+
while (nextBatchStart.hasNext || inputIterator.hasNext) {
168+
if (!nextBatchStart.hasNext) {
169+
nextBatchStart = inputIterator.next()
170+
}
171+
172+
val startData = dataOut.size()
173+
var numRowsInBatch = 0
174+
175+
def underBatchSizeLimit: Boolean =
176+
maxBytesPerBatch == Int.MaxValue || arrowWriter.sizeInBytes() < maxBytesPerBatch
177+
178+
while (nextBatchStart.hasNext &&
179+
(arrowMaxRecordsPerBatch <= 0 || numRowsInBatch < arrowMaxRecordsPerBatch) &&
180+
underBatchSizeLimit) {
181+
arrowWriter.write(nextBatchStart.next())
182+
numRowsInBatch += 1
183+
}
184+
185+
if (numRowsInBatch > 0) {
186+
arrowWriter.finish()
187+
writer.writeBatch()
188+
arrowWriter.reset()
189+
val deltaData = dataOut.size() - startData
190+
pythonMetrics("pythonDataSent") += deltaData
191+
}
192+
}
193+
}
194+
}

0 commit comments

Comments
 (0)