Commit f39789c

banitag1 authored and pytorchmergebot committed
[PyTorch Pinned Allocator] Add support of reserved pinned memory segment to avoid slow paths (pytorch#164501)
Summary: This diff adds the ability to allocate a large pinned memory segment upfront, based on the provided config. That large segment is then used to serve all small pinned memory requests, avoiding expensive device-level APIs (slow paths).

Example: PYTORCH_CUDA_ALLOC_CONF=pinned_reserve_segment_size_mb:2048

This reserves a 2GB pinned memory segment for the process; all incoming small requests are then served from this segment, and no cudaHostAlloc/cudaHostRegister APIs are called.

Differential Revision: D83779074

Pull Request resolved: pytorch#164501

Approved by: https://github.com/yangw-dev
1 parent 3d9d41c commit f39789c
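As a usage illustration (not part of the commit): the feature is enabled purely through PYTORCH_CUDA_ALLOC_CONF, so the option has to be in the environment before the allocator reads its config. The sketch below sets it from a small C++ launcher; the launcher itself and train.py are hypothetical, and exporting the variable from a shell is equivalent.

// Hypothetical launcher: set PYTORCH_CUDA_ALLOC_CONF before the workload starts
// so the pinned allocator reserves a 2GB segment (the commit message example).
#include <cstdlib>
#include <unistd.h>

int main() {
  setenv("PYTORCH_CUDA_ALLOC_CONF", "pinned_reserve_segment_size_mb:2048", /*overwrite=*/1);

  // Hypothetical workload; it inherits the environment variable.
  execlp("python", "python", "train.py", (char*)nullptr);
  return 1;  // reached only if exec fails
}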

File tree: 5 files changed (+125, -13 lines)

aten/src/ATen/core/CachingHostAllocator.h
Lines changed: 55 additions & 11 deletions

@@ -50,6 +50,46 @@ namespace {
 constexpr size_t MAX_SIZE_INDEX = 64;
 }
 
+// A large reserved pinned memory segment that is created in advance which is used
+// to allocate small pinned memory requests to avoid calling into expensive APIs.
+// We never free this memory and move up the pointer as we allocate new blocks
+// and when blocks are freed, they are cached in the free lists.
+struct PinnedReserveSegment {
+  PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size),
+      current_ptr_(start_), initialized_(true) {}
+
+  PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {}
+
+  bool initialized() {
+    return initialized_;
+  }
+
+  void* allocate(size_t bytes) {
+    std::lock_guard<std::mutex> guard(mutex_);
+
+    // Round up the requested size to 4KB boundary for all including the small ones.
+    size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1);
+
+    if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) {
+      return nullptr;
+    }
+
+    void* ptr = current_ptr_;
+    current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes;
+    return ptr;
+  }
+
+  bool owns(void* ptr) {
+    return ptr >= start_ && ptr < (uint8_t*)start_ + size_;
+  }
+
+  std::mutex mutex_;
+  void* start_;
+  size_t size_;
+  void* current_ptr_;
+  bool initialized_;
+};
+
 // Struct containing memory allocator summary statistics for host.
 struct TORCH_API HostStats {
   // COUNT: total allocations (active)

@@ -203,17 +243,6 @@ struct CachingHostAllocatorImpl {
     // background.
     if (!pinned_use_background_threads()) {
       process_events();
-    } else {
-      // Launch the background thread and process events in a loop.
-      static bool background_thread_flag [[maybe_unused]] = [this] {
-        getBackgroundThreadPool()->run([&]() {
-          while (active_) {
-            process_events();
-            std::this_thread::sleep_for(std::chrono::microseconds(100));
-          }
-        });
-        return true;
-      }();
     }
 
     // Round up the allocation to the nearest power of two to improve reuse.

@@ -226,6 +255,21 @@ struct CachingHostAllocatorImpl {
       return {block->ptr_, reinterpret_cast<void*>(block)};
     }
 
+    // Check in the recently freed blocks with pending events to see if we
+    // can reuse them. Call get_free_block again after processing events
+    if (pinned_use_background_threads()) {
+      // Launch the background thread and process events in a loop.
+      static bool background_thread_flag [[maybe_unused]] = [this] {
+        getBackgroundThreadPool()->run([&]() {
+          while (active_) {
+            process_events();
+            std::this_thread::sleep_for(std::chrono::microseconds(100));
+          }
+        });
+        return true;
+      }();
+    }
+
     // Slow path: if we can't allocate from the cached free list, we need
     // to create a new block.
     void* ptr = nullptr;
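To make the bump-pointer behavior of PinnedReserveSegment concrete, here is a minimal sketch (not part of the commit) that exercises the struct above, with plain malloc standing in for the pinned host memory the allocator would normally hand it; it assumes the struct definition above is in scope.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  const size_t segment_size = 64 * 1024;       // 64KB demo segment
  void* backing = std::malloc(segment_size);   // stand-in for a pinned allocation
  PinnedReserveSegment segment(backing, segment_size);

  void* a = segment.allocate(100);   // rounds up to one 4KB slot
  void* b = segment.allocate(5000);  // rounds up to 8KB
  std::printf("a=%p b=%p gap=%td bytes\n", a, b,
              (uint8_t*)b - (uint8_t*)a);       // gap is 4096

  // When the bump pointer would run past the end, allocate() returns nullptr
  // and the caller (allocate_host_memory) falls back to the slow path.
  void* c = segment.allocate(segment_size);
  std::printf("oversized request -> %p\n", c);  // nullptr

  std::free(backing);  // only because this is a demo; the real segment is never freed
  return 0;
}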

aten/src/ATen/cuda/CachingHostAllocator.cpp
Lines changed: 38 additions & 2 deletions

@@ -9,7 +9,6 @@
 
 #include <cuda_runtime_api.h>
 #include <future>
-#include <unordered_map>
 
 namespace at::cuda {
 namespace {

@@ -72,9 +71,20 @@ using Block = HostBlock<CUDAStream>;
 struct CUDACachingHostAllocatorImpl
     : public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
  private:
-  std::unordered_map<void*, bool> use_host_register;
+  ska::flat_hash_map<void*, bool> use_host_register;
 
   void allocate_host_memory(size_t size, void** ptr) override {
+    // try allocating from reserve segment first before calling into expensive APIs
+    if (get_reserve_segment().initialized()) {
+      *ptr = get_reserve_segment().allocate(size);
+      if (*ptr != nullptr) {
+        return;
+      }
+    }
+    allocate_host_memory_slowpath(size, ptr);
+  }
+
+  void allocate_host_memory_slowpath(size_t size, void** ptr) {
     // Pinned memory pointers allocated by any device can be directly used by
     // any other device, regardless of the current device at the time of
     // allocation, since we assume unified addressing. So we grab any existing

@@ -113,6 +123,18 @@ struct CUDACachingHostAllocatorImpl
   }
 
   void free_block(Block* block) override {
+    // We never free blocks from the reserve segment
+    if (get_reserve_segment().initialized()) {
+      // Check if the block is from the reserve segment
+      if (get_reserve_segment().owns(block->ptr_)) {
+        return;
+      }
+    }
+
+    free_block_slowpath(block);
+  }
+
+  void free_block_slowpath(Block* block) {
     auto start = std::chrono::steady_clock::now();
     // Users may change the allocator config at will. torch unit tests do this.
     // However, allocations using cudaHostRegister should use corresonding

@@ -172,6 +194,20 @@ struct CUDACachingHostAllocatorImpl
     return event_pool->get(idx);
   }
 
+  PinnedReserveSegment& get_reserve_segment() {
+    static auto reserve_segment = [&]() {
+      if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) {
+        void *ptr;
+        size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024;
+        allocate_host_memory_slowpath(sz, &ptr);
+        return PinnedReserveSegment(ptr, sz);
+      } else {
+        return PinnedReserveSegment();
+      }
+    } ();
+    return reserve_segment;
+  }
+
   TaskThreadPool* getThreadPool() {
     static TaskThreadPool* pool = new TaskThreadPool(
         static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
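get_reserve_segment() relies on a function-local static initialized by an immediately-invoked lambda, so the segment is carved out lazily, exactly once, and thread-safely on the first pinned allocation. The sketch below illustrates only that initialization pattern; it is not PyTorch code, the names DEMO_SEGMENT_MB, Segment, and get_segment are made up, and malloc stands in for the pinned slow path.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct Segment {
  void* base = nullptr;
  size_t size = 0;
};

static size_t configured_segment_mb() {
  // Stand-in for CUDAAllocatorConfig::pinned_reserve_segment_size_mb().
  const char* env = std::getenv("DEMO_SEGMENT_MB");
  return env ? std::strtoull(env, nullptr, 10) : 0;
}

Segment& get_segment() {
  static Segment segment = [] {
    std::puts("initializing segment (runs exactly once)");
    size_t mb = configured_segment_mb();
    if (mb == 0) {
      return Segment{};  // feature disabled
    }
    size_t bytes = mb * 1024 * 1024;
    // Never freed, mirroring the reserve segment's lifetime.
    return Segment{std::malloc(bytes), bytes};
  }();
  return segment;
}

int main() {
  get_segment();  // first call initializes
  get_segment();  // later calls reuse the same static
  return 0;
}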

c10/cuda/CUDAAllocatorConfig.cpp
Lines changed: 20 additions & 0 deletions

@@ -15,6 +15,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
       m_max_non_split_rounding_size(kLargeBuffer),
       m_garbage_collection_threshold(0),
       m_pinned_num_register_threads(1),
+      m_pinned_reserve_segment_size_mb(0),
       m_expandable_segments(false),
 #if CUDA_VERSION >= 12030
       m_expandable_segments_handle_type(

@@ -371,6 +372,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
     } else if (config_item_view == "pinned_num_register_threads") {
       i = parsePinnedNumRegisterThreads(config, i);
       used_native_specific_option = true;
+    } else if (config_item_view == "pinned_reserve_segment_size_mb") {
+      i = parsePinnedReserveSegmentSize(config, i);
+      used_native_specific_option = true;
     } else if (config_item_view == "pinned_use_background_threads") {
       i = parsePinnedUseBackgroundThreads(config, i);
       used_native_specific_option = true;

@@ -451,6 +455,22 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
   return i;
 }
 
+size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  if (++i < config.size()) {
+    size_t val2 = stoi(config[i]);
+    TORCH_CHECK(
+        val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
+    m_pinned_reserve_segment_size_mb = val2;
+  } else {
+    TORCH_CHECK(
+        false, "Error, expecting pinned_reserve_segment_size_mb value", "");
+  }
+  return i;
+}
+
 size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
     const std::vector<std::string>& config,
     size_t i) {
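For readers unfamiliar with the config format, the sketch below (not the actual CUDAAllocatorConfig tokenizer, which walks a token vector via consumeToken/stoi as in parsePinnedReserveSegmentSize above) shows how a single key:value entry from the commit message is split and scaled to bytes.

#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  std::string entry = "pinned_reserve_segment_size_mb:2048";

  auto colon = entry.find(':');
  if (colon == std::string::npos) {
    throw std::runtime_error("expected key:value");
  }
  std::string key = entry.substr(0, colon);
  size_t value_mb = std::stoul(entry.substr(colon + 1));

  if (key == "pinned_reserve_segment_size_mb" && value_mb > 0) {
    // Same MB-to-bytes scaling that get_reserve_segment() applies.
    size_t bytes = value_mb * 1024 * 1024;
    std::cout << key << " = " << value_mb << " MB (" << bytes << " bytes)\n";
  }
  return 0;
}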

c10/cuda/CUDAAllocatorConfig.h
Lines changed: 8 additions & 0 deletions

@@ -70,6 +70,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_pinned_use_background_threads;
   }
 
+  static size_t pinned_reserve_segment_size_mb() {
+    return instance().m_pinned_reserve_segment_size_mb;
+  }
+
   static size_t pinned_max_register_threads() {
     // Based on the benchmark results, we see better allocation performance
     // with 8 threads. However on future systems, we may need more threads

@@ -143,6 +147,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
   size_t parsePinnedNumRegisterThreads(
       const std::vector<std::string>& config,
       size_t i);
+  size_t parsePinnedReserveSegmentSize(
+      const std::vector<std::string>& config,
+      size_t i);
   size_t parsePinnedUseBackgroundThreads(
       const std::vector<std::string>& config,
       size_t i);

@@ -155,6 +162,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
   std::vector<size_t> m_roundup_power2_divisions;
   std::atomic<double> m_garbage_collection_threshold;
   std::atomic<size_t> m_pinned_num_register_threads;
+  std::atomic<size_t> m_pinned_reserve_segment_size_mb;
   std::atomic<bool> m_expandable_segments;
   std::atomic<Expandable_Segments_Handle_Type>
       m_expandable_segments_handle_type;

docs/source/notes/cuda.rst
Lines changed: 4 additions & 0 deletions

@@ -608,6 +608,10 @@ Available options:
   for processing events. This avoids any slow path associated with querying/processing of
   events in the fast allocation path. This feature is disabled by default.
 
+* ``pinned_reserve_segment_size_mb`` option is a size in MB to reserve for a pinned memory
+  segment. This allocates a large segment of pinned memory upfront and then uses it to serve
+  small allocation requests. This helps reduce the number of expensive device library calls.
+
 * ``graph_capture_record_stream_reuse`` (experimental, default: `False`)
   If set to `True`, the CUDA caching allocator will attempt to reclaim device memory during
   CUDA Graph capture by using the graph topology (instead of CUDA events) to determine
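As a rough capacity check (an illustration, not from the commit): because PinnedReserveSegment::allocate rounds every request up to a 4KB boundary, the 2048MB segment from the example above can absorb at most 524288 small requests before allocations fall back to the slow path.

#include <cstddef>
#include <cstdio>

int main() {
  constexpr size_t kSegmentMB = 2048;              // pinned_reserve_segment_size_mb:2048
  constexpr size_t kSegmentBytes = kSegmentMB * 1024 * 1024;
  constexpr size_t kRounding = 4096;               // bump-pointer granularity (4KB)

  // Every request of 1..4096 bytes consumes one 4KB slot.
  constexpr size_t kMaxSmallRequests = kSegmentBytes / kRounding;
  std::printf("A %zu MB segment serves at most %zu requests of <= 4 KB\n",
              kSegmentMB, kMaxSmallRequests);      // 524288
  return 0;
}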
