[GLUTEN-10933][VL] feat: Support cached the batches in cpu cache (#11758)

jinchengchenghh · web-flow · commit 8e8a81e3b0a9 · 2026-03-14T01:03:20.000+08:00
Cache the batch in cpu cache, and wait for the join threads to fetch one by one, the build threads will start to fetch as soon as possible, but the probe thread need to wait for build finished. The buffer size is controlled by `spark.gluten.sql.columnar.backend.velox.cudf.shuffleMaxPrefetchBytes` temporally, the size may be changed by the remaining memory in the server. Test: Test in local SF100, adjust the config to enable caching batch. ``` --conf spark.gluten.sql.columnar.backend.velox.cudf.batchSize=10000 \ --conf spark.gluten.sql.columnar.backend.velox.cudf.shuffleMaxPrefetchBytes=1024MB ``` The log prints `Prefetched 171 batches (24057900 bytes) before blocking on GPU lock` Next step: Prefetch the probe side batch when build starts. Related issue: #10933
diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/GpuBufferBatchResizerJniWrapper.java b/backends-velox/src/main/java/org/apache/gluten/utils/GpuBufferBatchResizerJniWrapper.java
@@ -36,5 +36,6 @@ public long rtHandle() {
     return runtime.getHandle();
   }
 
-  public native long create(int minOutputBatchSize, ColumnarBatchInIterator itr);
+  public native long create(
+      int minOutputBatchSize, long maxPrefetchBatchBytes, ColumnarBatchInIterator itr);
 }
diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
@@ -86,6 +86,8 @@ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
 
   def cudfBatchSize: Int = getConf(CUDF_BATCH_SIZE)
 
+  def cudfShuffleMaxPrefetchBytes: Long = getConf(CUDF_SHUFFLE_MAX_PREFETCH_BYTES)
+
   def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES)
 
   def parquetUseColumnNames: Boolean = getConf(PARQUET_USE_COLUMN_NAMES)
@@ -685,6 +687,13 @@ object VeloxConfig extends ConfigRegistry {
       .intConf
       .createWithDefault(Integer.MAX_VALUE)
 
+  val CUDF_SHUFFLE_MAX_PREFETCH_BYTES =
+    buildConf("spark.gluten.sql.columnar.backend.velox.cudf.shuffleMaxPrefetchBytes")
+      .doc("Maximum bytes to prefetch in CPU memory during GPU shuffle read while waiting" +
+        "for GPU available.")
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefaultString("1028MB")
+
   val MEMORY_DUMP_ON_EXIT =
     buildConf("spark.gluten.monitor.memoryDumpOnExit")
       .internal()
diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/GpuResizeBufferColumnarBatchExec.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/GpuResizeBufferColumnarBatchExec.scala
@@ -34,7 +34,10 @@ import scala.collection.JavaConverters._
 /**
  * An operator to resize input BufferBatches generated by shuffle reader, and convert to cudf table.
  */
-case class GpuResizeBufferColumnarBatchExec(override val child: SparkPlan, minOutputBatchSize: Int)
+case class GpuResizeBufferColumnarBatchExec(
+    override val child: SparkPlan,
+    minOutputBatchSize: Int,
+    maxPrefetchBatchBytes: Long)
   extends ColumnarToColumnarExec(child) {
 
   override protected def mapIterator(in: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = {
@@ -44,6 +47,7 @@ case class GpuResizeBufferColumnarBatchExec(override val child: SparkPlan, minOu
       .create(runtime)
       .create(
         minOutputBatchSize,
+        maxPrefetchBatchBytes,
         new ColumnarBatchInIterator(BackendsApiManager.getBackendName, in.asJava))
     new ColumnarBatchOutIterator(runtime, outHandle).asScala
   }
diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/GpuBufferBatchResizeForShuffleInputOutput.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/GpuBufferBatchResizeForShuffleInputOutput.scala
@@ -36,6 +36,7 @@ case class GpuBufferBatchResizeForShuffleInputOutput() extends Rule[SparkPlan] {
     val range = VeloxConfig.get.veloxResizeBatchesShuffleInputOutputRange
     val preferredBatchBytes = VeloxConfig.get.veloxPreferredBatchBytes
     val batchSize = VeloxConfig.get.cudfBatchSize
+    val prefetchBatchBytes = VeloxConfig.get.cudfShuffleMaxPrefetchBytes
     plan.transformUp {
       case shuffle: ColumnarShuffleExchangeExec
           if shuffle.shuffleWriterType == HashShuffleWriterType &&
@@ -46,36 +47,38 @@ case class GpuBufferBatchResizeForShuffleInputOutput() extends Rule[SparkPlan] {
       case a @ AQEShuffleReadExec(
             ShuffleQueryStageExec(_, _: ColumnarShuffleExchangeExecBase, _),
             _) =>
-        GpuResizeBufferColumnarBatchExec(a, batchSize)
+        GpuResizeBufferColumnarBatchExec(a, batchSize, prefetchBatchBytes)
       case a @ AQEShuffleReadExec(
             ShuffleQueryStageExec(_, ReusedExchangeExec(_, _: ColumnarShuffleExchangeExecBase), _),
             _) =>
-        GpuResizeBufferColumnarBatchExec(a, batchSize)
+        GpuResizeBufferColumnarBatchExec(a, batchSize, prefetchBatchBytes)
       // Since it's transformed in a bottom to up order, so we may first encounter
       // ShuffeQueryStageExec, which is transformed to VeloxResizeBatchesExec(ShuffeQueryStageExec),
       // then we see AQEShuffleReadExec
       case a @ AQEShuffleReadExec(
             GpuResizeBufferColumnarBatchExec(
               s @ ShuffleQueryStageExec(_, _: ColumnarShuffleExchangeExecBase, _),
+              _,
               _),
             _) =>
-        GpuResizeBufferColumnarBatchExec(a.copy(child = s), batchSize)
+        GpuResizeBufferColumnarBatchExec(a.copy(child = s), batchSize, prefetchBatchBytes)
       case a @ AQEShuffleReadExec(
             GpuResizeBufferColumnarBatchExec(
               s @ ShuffleQueryStageExec(
                 _,
                 ReusedExchangeExec(_, _: ColumnarShuffleExchangeExecBase),
                 _),
+              _,
               _),
             _) =>
-        GpuResizeBufferColumnarBatchExec(a.copy(child = s), batchSize)
+        GpuResizeBufferColumnarBatchExec(a.copy(child = s), batchSize, prefetchBatchBytes)
       case s @ ShuffleQueryStageExec(_, _: ColumnarShuffleExchangeExecBase, _) =>
-        GpuResizeBufferColumnarBatchExec(s, batchSize)
+        GpuResizeBufferColumnarBatchExec(s, batchSize, prefetchBatchBytes)
       case s @ ShuffleQueryStageExec(
             _,
             ReusedExchangeExec(_, _: ColumnarShuffleExchangeExecBase),
             _) =>
-        GpuResizeBufferColumnarBatchExec(s, batchSize)
+        GpuResizeBufferColumnarBatchExec(s, batchSize, prefetchBatchBytes)
     }
   }
 }
diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h
@@ -208,6 +208,8 @@ const std::string kVeloxPreferredBatchBytes = "spark.gluten.sql.columnar.backend
 const std::string kCudfEnableTableScan = "spark.gluten.sql.columnar.backend.velox.cudf.enableTableScan";
 const bool kCudfEnableTableScanDefault = false;
 const std::string kCudfHiveConnectorId = "cudf-hive";
+const std::string kCudfShuffleMaxPrefetchBytes = "spark.gluten.sql.columnar.backend.velox.cudf.shuffleMaxPrefetchBytes";
+const int64_t kCudfShuffleMaxPrefetchBytesDefault = 1028L * 1024 * 1024; // 1028MB
 
 const std::string kStaticBackendConfPrefix = "spark.gluten.velox.";
 const std::string kDynamicBackendConfPrefix = "spark.gluten.sql.columnar.backend.velox.";
diff --git a/cpp/velox/cudf/GpuLock.cc b/cpp/velox/cudf/GpuLock.cc
@@ -55,6 +55,19 @@ void lockGpu() {
     getGpuLockState().gGpuOwner = tid;
 }
 
+bool tryLockGpu() {
+    std::thread::id tid = std::this_thread::get_id();
+    std::unique_lock<std::mutex> lock(getGpuLockState().gGpuMutex);
+    if (getGpuLockState().gGpuOwner == tid) {
+        return true;
+    }
+    if (getGpuLockState().gGpuOwner.has_value()) {
+        return false;
+    }
+    getGpuLockState().gGpuOwner = tid;
+    return true;
+}
+
 void unlockGpu() {
     std::thread::id tid = std::this_thread::get_id();
     std::unique_lock<std::mutex> lock(getGpuLockState().gGpuMutex);
diff --git a/cpp/velox/cudf/GpuLock.h b/cpp/velox/cudf/GpuLock.h
@@ -21,14 +21,9 @@
 
 namespace gluten {
 
-/**
- * @brief Acquire the GPU lock (reentrant within the same thread)
- */
+/// Acquire the GPU lock, blocking until available. Reentrant for the same thread.
 void lockGpu();
-
-/**
- * @brief Release the GPU lock (must be called by the owning thread)
- */
+bool tryLockGpu();
 void unlockGpu();
 
 } // namespace gluten
diff --git a/cpp/velox/jni/VeloxJniWrapper.cc b/cpp/velox/jni/VeloxJniWrapper.cc
@@ -454,14 +454,15 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_utils_GpuBufferBatchResizerJniWra
     JNIEnv* env,
     jobject wrapper,
     jint minOutputBatchSize,
+    jlong maxPrefetchBatchBytes,
     jobject jIter) {
   JNI_METHOD_START
   auto ctx = getRuntime(env, wrapper);
   auto arrowPool = dynamic_cast<VeloxMemoryManager*>(ctx->memoryManager())->defaultArrowMemoryPool();
   auto pool = dynamic_cast<VeloxMemoryManager*>(ctx->memoryManager())->getLeafMemoryPool();
   auto iter = makeJniColumnarBatchIterator(env, jIter, ctx);
   auto appender = std::make_shared<ResultIterator>(
-      std::make_unique<GpuBufferBatchResizer>(arrowPool, pool.get(), minOutputBatchSize, std::move(iter)));
+      std::make_unique<GpuBufferBatchResizer>(arrowPool, pool.get(), minOutputBatchSize, maxPrefetchBatchBytes, std::move(iter)));
   return ctx->saveObject(appender);
   JNI_METHOD_END(kInvalidObjectHandle)
 }
diff --git a/cpp/velox/memory/GpuBufferColumnarBatch.cc b/cpp/velox/memory/GpuBufferColumnarBatch.cc
@@ -54,7 +54,9 @@ int64_t GpuBufferColumnarBatch::numBytes() {
   if (!numBytes_.has_value()) {
     int64_t bytes = 0;
     for (const auto& buffer : buffers_) {
-      bytes += buffer->size();
+      if (buffer) {
+        bytes += buffer->size();
+      }
     }
     numBytes_ = bytes;
   }
diff --git a/cpp/velox/tests/VeloxGpuShuffleWriterTest.cc b/cpp/velox/tests/VeloxGpuShuffleWriterTest.cc
@@ -116,6 +116,7 @@ RowVectorPtr mergeBufferColumnarBatches(std::vector<std::shared_ptr<GpuBufferCol
       getDefaultMemoryManager()->defaultArrowMemoryPool(),
       getDefaultMemoryManager()->getLeafMemoryPool().get(),
       1200, // output one batch
+      102400000, // prefetch up to 100MB data
       std::make_unique<ColumnarBatchArray>(bufferBatches));
   auto cb = resizer.next();
   auto batch = std::dynamic_pointer_cast<VeloxColumnarBatch>(cb);
diff --git a/cpp/velox/utils/GpuBufferBatchResizer.cc b/cpp/velox/utils/GpuBufferBatchResizer.cc
@@ -17,7 +17,6 @@
 
 #include "GpuBufferBatchResizer.h"
 #include "cudf/GpuLock.h"
-#include "memory/GpuBufferColumnarBatch.h"
 #include "utils/Timer.h"
 #include "velox/experimental/cudf/exec/GpuResources.h"
 #include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
@@ -168,41 +167,82 @@ GpuBufferBatchResizer::GpuBufferBatchResizer(
     arrow::MemoryPool* arrowPool,
     facebook::velox::memory::MemoryPool* pool,
     int32_t minOutputBatchSize,
+    int64_t maxPrefetchSize,
     std::unique_ptr<ColumnarBatchIterator> in)
     : arrowPool_(arrowPool),
       pool_(pool),
       minOutputBatchSize_(minOutputBatchSize),
+      maxPrefetchSize_(maxPrefetchSize),
       in_(std::move(in)) {
   VELOX_CHECK_GT(minOutputBatchSize_, 0, "minOutputBatchSize should be larger than 0");
+  VELOX_CHECK_GT(maxPrefetchSize_, 0, "maxPrefetchSize should be larger than 0");
 }
 
-std::shared_ptr<ColumnarBatch> GpuBufferBatchResizer::next() {
+std::shared_ptr<GpuBufferColumnarBatch> GpuBufferBatchResizer::fetchAndComposeBatch() {
   std::vector<std::shared_ptr<GpuBufferColumnarBatch>> cachedBatches;
   int32_t cachedRows = 0;
   while (cachedRows < minOutputBatchSize_) {
     auto nextCb = in_->next();
     if (!nextCb) {
-      // No more input.
       break;
     }
 
     auto nextBatch = std::dynamic_pointer_cast<GpuBufferColumnarBatch>(nextCb);
     VELOX_CHECK_NOT_NULL(nextBatch);
     if (nextBatch->numRows() == 0) {
-        continue;
+      continue;
     }
 
     cachedRows += nextBatch->numRows();
     cachedBatches.push_back(std::move(nextBatch));
   }
+
   if (cachedRows == 0) {
     return nullptr;
   }
 
-  // Compose all cached batches into one
-  auto batch = GpuBufferColumnarBatch::compose(arrowPool_, cachedBatches, cachedRows);
+  return GpuBufferColumnarBatch::compose(arrowPool_, cachedBatches, cachedRows);
+}
+
+std::shared_ptr<ColumnarBatch> GpuBufferBatchResizer::next() {
+  // Ensure at least one batch is in the prefetch queue.
+  if (prefetchQueue_.empty()) {
+    auto batch = fetchAndComposeBatch();
+    if (batch) {
+      prefetchedBytes_ += batch->numBytes();
+      prefetchQueue_.push_back(std::move(batch));
+    }
+  }
+
+  if (prefetchQueue_.empty()) {
+    return nullptr;
+  }
+
+  // Try to acquire the GPU lock non-blockingly. While we can't get it,
+  // keep prefetching more batches into CPU memory within the budget.
+  while (!tryLockGpu()) {
+    if (prefetchedBytes_ < maxPrefetchSize_) {
+      auto batch = fetchAndComposeBatch();
+      if (batch) {
+        prefetchedBytes_ += batch->numBytes();
+        prefetchQueue_.push_back(std::move(batch));
+      } else {
+        // All the batches consumed.
+        LOG(WARNING) << "Prefetched " << prefetchQueue_.size() << " batches (" << prefetchedBytes_ << " bytes) before blocking on GPU lock.";
+        lockGpu();
+        break;
+      }
+    } else {
+      LOG(WARNING) << "Prefetched " << prefetchQueue_.size() << " batches (" << prefetchedBytes_ << " bytes) before blocking on GPU lock.";
+      lockGpu();
+      break;
+    }
+  }
 
-  lockGpu();
+  // GPU lock acquired. Pop one batch and convert to cuDF on GPU.
+  auto batch = std::move(prefetchQueue_.front());
+  prefetchQueue_.pop_front();
+  prefetchedBytes_ -= batch->numBytes();
 
   return makeCudfTable(batch->getRowType(), batch->numRows(), batch->buffers(), pool_);
 }
diff --git a/cpp/velox/utils/GpuBufferBatchResizer.h b/cpp/velox/utils/GpuBufferBatchResizer.h
@@ -15,7 +15,12 @@
  * limitations under the License.
  */
 
+#pragma once
+
+#include <deque>
+
 #include "memory/ColumnarBatchIterator.h"
+#include "memory/GpuBufferColumnarBatch.h"
 #include "memory/VeloxColumnarBatch.h"
 #include "utils/Exception.h"
 #include "velox/common/memory/MemoryPool.h"
@@ -28,17 +33,26 @@ class GpuBufferBatchResizer : public ColumnarBatchIterator {
       arrow::MemoryPool* arrowPool,
       facebook::velox::memory::MemoryPool* pool,
       int32_t minOutputBatchSize,
+      int64_t maxPrefetchSize,
       std::unique_ptr<ColumnarBatchIterator> in);
 
   std::shared_ptr<ColumnarBatch> next() override;
 
   int64_t spillFixedSize(int64_t size) override;
 
  private:
+  /// Read and compose one batch from the input iterator (CPU-only work).
+  /// Returns nullptr if input is exhausted.
+  std::shared_ptr<GpuBufferColumnarBatch> fetchAndComposeBatch();
+
   arrow::MemoryPool* arrowPool_;
   facebook::velox::memory::MemoryPool* pool_;
   const int32_t minOutputBatchSize_;
+  const int64_t maxPrefetchSize_;
   std::unique_ptr<ColumnarBatchIterator> in_;
+
+  std::deque<std::shared_ptr<GpuBufferColumnarBatch>> prefetchQueue_;
+  int64_t prefetchedBytes_ = 0;
 };
 
 } // namespace gluten
diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md
@@ -25,6 +25,7 @@ nav_order: 16
 | spark.gluten.sql.columnar.backend.velox.cudf.enableValidation                    | true              | Heuristics you can apply to validate a cuDF/GPU plan and only offload when the entire stage can be fully and profitably executed on GPU                                                                                                                                                                                                                                                                                                               |
 | spark.gluten.sql.columnar.backend.velox.cudf.memoryPercent                       | 50                | The initial percent of GPU memory to allocate for memory resource for one thread.                                                                                                                                                                                                                                                                                                                                                                     |
 | spark.gluten.sql.columnar.backend.velox.cudf.memoryResource                      | async             | GPU RMM memory resource.                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| spark.gluten.sql.columnar.backend.velox.cudf.shuffleMaxPrefetchBytes             | 1028MB            | Maximum bytes to prefetch in CPU memory during GPU shuffle read while waitingfor GPU available.                                                                                                                                                                                                                                                                                                                                                       |
 | spark.gluten.sql.columnar.backend.velox.directorySizeGuess                       | 32KB              | Deprecated, rename to spark.gluten.sql.columnar.backend.velox.footerEstimatedSize                                                                                                                                                                                                                                                                                                                                                                     |
 | spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled                   | false             | Disables caching if false. File handle cache should be disabled if files are mutable, i.e. file content may change while file path stays the same.                                                                                                                                                                                                                                                                                                    |
 | spark.gluten.sql.columnar.backend.velox.filePreloadThreshold                     | 1MB               | Set the file preload threshold for velox file scan, refer to Velox's file-preload-threshold                                                                                                                                                                                                                                                                                                                                                           |

Original file line number	Diff line number	Diff line change
`@@ -36,5 +36,6 @@ public long rtHandle() {`
`36`	`36`	`return runtime.getHandle();`
`37`	`37`	`}`
`38`	`38`
`39`		`- public native long create(int minOutputBatchSize, ColumnarBatchInIterator itr);`
	`39`	`+ public native long create(`
	`40`	`+ int minOutputBatchSize, long maxPrefetchBatchBytes, ColumnarBatchInIterator itr);`
`40`	`41`	`}`
Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,9 @@ int64_t GpuBufferColumnarBatch::numBytes() {`
`54`	`54`	`if (!numBytes_.has_value()) {`
`55`	`55`	`int64_t bytes = 0;`
`56`	`56`	`for (const auto& buffer : buffers_) {`
`57`		`- bytes += buffer->size();`
	`57`	`+ if (buffer) {`
	`58`	`+ bytes += buffer->size();`
	`59`	`+ }`
`58`	`60`	`}`
`59`	`61`	`numBytes_ = bytes;`
`60`	`62`	`}`