Skip to content

Commit a3f4aec

Browse files
HyukjinKwon authored and sunchao committed
[SPARK-51316][PYTHON][FOLLOW-UP] Revert unrelated changes and mark mapInPandas/mapInArrow batched in byte size
This PR is a followup of #50096 that reverts unrelated changes and marks mapInPandas/mapInArrow as batched in byte size. Why: to make the original change self-contained, and to mark mapInPandas/mapInArrow batched in byte size for consistency. Does this introduce any user-facing change? No — the main change has not been released yet. How was this patch tested? Manually. Was generative AI tooling used? No. Closes #50111 from HyukjinKwon/SPARK-51316-followup. Authored-by: Hyukjin Kwon <gurwls223@apache.org> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org> (cherry picked from commit 5b45671) Signed-off-by: Hyukjin Kwon <gurwls223@apache.org> (cherry picked from commit 1df6fc6)
1 parent 592c03f commit a3f4aec

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.plans.ReferenceAllColumns
3636
import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, FunctionUtils, LogicalGroupState}
3737
import org.apache.spark.sql.catalyst.plans.physical._
3838
import org.apache.spark.sql.catalyst.types.DataTypeUtils
39+
import org.apache.spark.sql.execution.python.BatchIterator
3940
import org.apache.spark.sql.execution.r.ArrowRRunner
4041
import org.apache.spark.sql.execution.streaming.GroupStateImpl
4142
import org.apache.spark.sql.internal.SQLConf
@@ -218,13 +219,17 @@ case class MapPartitionsInRWithArrowExec(
218219
child: SparkPlan) extends UnaryExecNode {
219220
override def producedAttributes: AttributeSet = AttributeSet(output)
220221

222+
private val batchSize = conf.arrowMaxRecordsPerBatch
223+
221224
override def outputPartitioning: Partitioning = child.outputPartitioning
222225

223226
override protected def doExecute(): RDD[InternalRow] = {
224227
child.execute().mapPartitionsInternal { inputIter =>
225228
val outputTypes = schema.map(_.dataType)
226229

227-
val batchIter = Iterator(inputIter)
230+
// DO NOT use iter.grouped(). See BatchIterator.
231+
val batchIter =
232+
if (batchSize > 0) new BatchIterator(inputIter, batchSize) else Iterator(inputIter)
228233

229234
val runner = new ArrowRRunner(func, packageNames, broadcastVars, inputSchema,
230235
SQLConf.get.sessionLocalTimeZone, RRunnerModes.DATAFRAME_DAPPLY)

sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowInput.scala

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -150,8 +150,10 @@ private[python] trait BasicPythonArrowInput extends PythonArrowInput[Iterator[In
150150

151151
private[python] trait BatchedPythonArrowInput extends BasicPythonArrowInput {
152152
self: BasePythonRunner[Iterator[InternalRow], _] =>
153-
154-
private val arrowMaxRecordsPerBatch = SQLConf.get.arrowMaxRecordsPerBatch
153+
private val arrowMaxRecordsPerBatch = {
154+
val v = SQLConf.get.arrowMaxRecordsPerBatch
155+
if (v > 0) v else Int.MaxValue
156+
}
155157
private val maxBytesPerBatch = SQLConf.get.arrowMaxBytesPerBatch
156158

157159
// Marker inside the input iterator to indicate the start of the next batch.
@@ -176,7 +178,7 @@ private[python] trait BatchedPythonArrowInput extends BasicPythonArrowInput {
176178
maxBytesPerBatch == Int.MaxValue || arrowWriter.sizeInBytes() < maxBytesPerBatch
177179

178180
while (nextBatchStart.hasNext &&
179-
(arrowMaxRecordsPerBatch <= 0 || numRowsInBatch < arrowMaxRecordsPerBatch) &&
181+
numRowsInBatch < arrowMaxRecordsPerBatch &&
180182
underBatchSizeLimit) {
181183
arrowWriter.write(nextBatchStart.next())
182184
numRowsInBatch += 1

0 commit comments

Comments (0)