Commit f39789c

banitag1 authored and pytorchmergebot committed
[PyTorch Pinned Allocator] Add support of reserved pinned memory segment to avoid slow paths (pytorch#164501)
Summary: This diff adds the ability to allocate a large pinned memory segment upfront, based on the provided config. That large segment is then used to serve all small pinned memory requests, avoiding expensive device-level APIs (slow paths).

Example: PYTORCH_CUDA_ALLOC_CONF=pinned_reserve_segment_size_mb:2048

This reserves a 2GB pinned memory segment for the process; all incoming small requests are then served from this segment, and no cudaHostAlloc/cudaHostRegister APIs are called.

Differential Revision: D83779074

Pull Request resolved: pytorch#164501

Approved by: https://github.com/yangw-dev
1 parent 3d9d41c commit f39789c
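As a usage illustration (not part of the commit): the feature is enabled purely through PYTORCH_CUDA_ALLOC_CONF, so the option has to be in the environment before the allocator reads its config. The sketch below sets it from a small C++ launcher; the launcher itself and train.py are hypothetical, and exporting the variable from a shell is equivalent.

// Hypothetical launcher: set PYTORCH_CUDA_ALLOC_CONF before the workload starts
// so the pinned allocator reserves a 2GB segment (the commit message example).
#include <cstdlib>
#include <unistd.h>

int main() {
  setenv("PYTORCH_CUDA_ALLOC_CONF", "pinned_reserve_segment_size_mb:2048", /*overwrite=*/1);

  // Hypothetical workload; it inherits the environment variable.
  execlp("python", "python", "train.py", (char*)nullptr);
  return 1;  // reached only if exec fails
}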

File tree: 5 files changed (+125, -13 lines)

aten/src/ATen/core/CachingHostAllocator.h
Lines changed: 55 additions & 11 deletions

@@ -50,6 +50,46 @@ namespace {
 constexpr size_t MAX_SIZE_INDEX = 64;
 }
 
+// A large reserved pinned memory segment that is created in advance which is used
+// to allocate small pinned memory requests to avoid calling into expensive APIs.
+// We never free this memory and move up the pointer as we allocate new blocks
+// and when blocks are freed, they are cached in the free lists.
+struct PinnedReserveSegment {
+  PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size),
+      current_ptr_(start_), initialized_(true) {}
+
+  PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {}
+
+  bool initialized() {
+    return initialized_;
+  }
+
+  void* allocate(size_t bytes) {
+    std::lock_guard<std::mutex> guard(mutex_);
+
+    // Round up the requested size to 4KB boundary for all including the small ones.
+    size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1);
+
+    if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) {
+      return nullptr;
+    }
+
+    void* ptr = current_ptr_;
+    current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes;
+    return ptr;
+  }
+
+  bool owns(void* ptr) {
+    return ptr >= start_ && ptr < (uint8_t*)start_ + size_;
+  }
+
+  std::mutex mutex_;
+  void* start_;
+  size_t size_;
+  void* current_ptr_;
+  bool initialized_;
+};
+
 // Struct containing memory allocator summary statistics for host.
 struct TORCH_API HostStats {
   // COUNT: total allocations (active)

@@ -203,17 +243,6 @@ struct CachingHostAllocatorImpl {
     // background.
     if (!pinned_use_background_threads()) {
       process_events();
-    } else {
-      // Launch the background thread and process events in a loop.
-      static bool background_thread_flag [[maybe_unused]] = [this] {
-        getBackgroundThreadPool()->run([&]() {
-          while (active_) {
-            process_events();
-            std::this_thread::sleep_for(std::chrono::microseconds(100));
-          }
-        });
-        return true;
-      }();
     }
 
     // Round up the allocation to the nearest power of two to improve reuse.

@@ -226,6 +255,21 @@ struct CachingHostAllocatorImpl {
       return {block->ptr_, reinterpret_cast<void*>(block)};
     }
 
+    // Check in the recently freed blocks with pending events to see if we
+    // can reuse them. Call get_free_block again after processing events
+    if (pinned_use_background_threads()) {
+      // Launch the background thread and process events in a loop.
+      static bool background_thread_flag [[maybe_unused]] = [this] {
+        getBackgroundThreadPool()->run([&]() {
+          while (active_) {
+            process_events();
+            std::this_thread::sleep_for(std::chrono::microseconds(100));
+          }
+        });
+        return true;
+      }();
+    }
+
     // Slow path: if we can't allocate from the cached free list, we need
     // to create a new block.
     void* ptr = nullptr;
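To make the bump-pointer behavior of PinnedReserveSegment concrete, here is a minimal sketch (not part of the commit) that exercises the struct above, with plain malloc standing in for the pinned host memory the allocator would normally hand it; it assumes the struct definition above is in scope.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  const size_t segment_size = 64 * 1024;       // 64KB demo segment
  void* backing = std::malloc(segment_size);   // stand-in for a pinned allocation
  PinnedReserveSegment segment(backing, segment_size);

  void* a = segment.allocate(100);   // rounds up to one 4KB slot
  void* b = segment.allocate(5000);  // rounds up to 8KB
  std::printf("a=%p b=%p gap=%td bytes\n", a, b,
              (uint8_t*)b - (uint8_t*)a);       // gap is 4096

  // When the bump pointer would run past the end, allocate() returns nullptr
  // and the caller (allocate_host_memory) falls back to the slow path.
  void* c = segment.allocate(segment_size);
  std::printf("oversized request -> %p\n", c);  // nullptr

  std::free(backing);  // only because this is a demo; the real segment is never freed
  return 0;
}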

aten/src/ATen/cuda/CachingHostAllocator.cpp
Lines changed: 38 additions & 2 deletions

@@ -9,7 +9,6 @@
 
 #include <cuda_runtime_api.h>
 #include <future>
-#include <unordered_map>
 
 namespace at::cuda {
 namespace {

@@ -72,9 +71,20 @@ using Block = HostBlock<CUDAStream>;
 struct CUDACachingHostAllocatorImpl
     : public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
  private:
-  std::unordered_map<void*, bool> use_host_register;
+  ska::flat_hash_map<void*, bool> use_host_register;
 
   void allocate_host_memory(size_t size, void** ptr) override {
+    // try allocating from reserve segment first before calling into expensive APIs
+    if (get_reserve_segment().initialized()) {
+      *ptr = get_reserve_segment().allocate(size);
+      if (*ptr != nullptr) {
+        return;
+      }
+    }
+    allocate_host_memory_slowpath(size, ptr);
+  }
+
+  void allocate_host_memory_slowpath(size_t size, void** ptr) {
     // Pinned memory pointers allocated by any device can be directly used by
     // any other device, regardless of the current device at the time of
     // allocation, since we assume unified addressing. So we grab any existing

@@ -113,6 +123,18 @@ struct CUDACachingHostAllocatorImpl
   }
 
   void free_block(Block* block) override {
+    // We never free blocks from the reserve segment
+    if (get_reserve_segment().initialized()) {
+      // Check if the block is from the reserve segment
+      if (get_reserve_segment().owns(block->ptr_)) {
+        return;
+      }
+    }
+
+    free_block_slowpath(block);
+  }
+
+  void free_block_slowpath(Block* block) {
     auto start = std::chrono::steady_clock::now();
     // Users may change the allocator config at will. torch unit tests do this.
     // However, allocations using cudaHostRegister should use corresonding

@@ -172,6 +194,20 @@ struct CUDACachingHostAllocatorImpl
     return event_pool->get(idx);
   }
 
+  PinnedReserveSegment& get_reserve_segment() {
+    static auto reserve_segment = [&]() {
+      if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) {
+        void *ptr;
+        size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024;
+        allocate_host_memory_slowpath(sz, &ptr);
+        return PinnedReserveSegment(ptr, sz);
+      } else {
+        return PinnedReserveSegment();
+      }
+    } ();
+    return reserve_segment;
+  }
+
   TaskThreadPool* getThreadPool() {
     static TaskThreadPool* pool = new TaskThreadPool(
         static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
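get_reserve_segment() relies on a function-local static initialized by an immediately-invoked lambda, so the segment is carved out lazily, exactly once, and thread-safely on the first pinned allocation. The sketch below illustrates only that initialization pattern; it is not PyTorch code, the names DEMO_SEGMENT_MB, Segment, and get_segment are made up, and malloc stands in for the pinned slow path.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct Segment {
  void* base = nullptr;
  size_t size = 0;
};

static size_t configured_segment_mb() {
  // Stand-in for CUDAAllocatorConfig::pinned_reserve_segment_size_mb().
  const char* env = std::getenv("DEMO_SEGMENT_MB");
  return env ? std::strtoull(env, nullptr, 10) : 0;
}

Segment& get_segment() {
  static Segment segment = [] {
    std::puts("initializing segment (runs exactly once)");
    size_t mb = configured_segment_mb();
    if (mb == 0) {
      return Segment{};  // feature disabled
    }
    size_t bytes = mb * 1024 * 1024;
    // Never freed, mirroring the reserve segment's lifetime.
    return Segment{std::malloc(bytes), bytes};
  }();
  return segment;
}

int main() {
  get_segment();  // first call initializes
  get_segment();  // later calls reuse the same static
  return 0;
}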

c10/cuda/CUDAAllocatorConfig.cpp
Lines changed: 20 additions & 0 deletions

@@ -15,6 +15,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
       m_max_non_split_rounding_size(kLargeBuffer),
       m_garbage_collection_threshold(0),
       m_pinned_num_register_threads(1),
+      m_pinned_reserve_segment_size_mb(0),
       m_expandable_segments(false),
 #if CUDA_VERSION >= 12030
       m_expandable_segments_handle_type(

@@ -371,6 +372,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
     } else if (config_item_view == "pinned_num_register_threads") {
       i = parsePinnedNumRegisterThreads(config, i);
       used_native_specific_option = true;
+    } else if (config_item_view == "pinned_reserve_segment_size_mb") {
+      i = parsePinnedReserveSegmentSize(config, i);
+      used_native_specific_option = true;
     } else if (config_item_view == "pinned_use_background_threads") {
       i = parsePinnedUseBackgroundThreads(config, i);
       used_native_specific_option = true;

@@ -451,6 +455,22 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
   return i;
 }
 
+size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  if (++i < config.size()) {
+    size_t val2 = stoi(config[i]);
+    TORCH_CHECK(
+        val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
+    m_pinned_reserve_segment_size_mb = val2;
+  } else {
+    TORCH_CHECK(
+        false, "Error, expecting pinned_reserve_segment_size_mb value", "");
+  }
+  return i;
+}
+
 size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
     const std::vector<std::string>& config,
     size_t i) {
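For readers unfamiliar with the config format, the sketch below (not the actual CUDAAllocatorConfig tokenizer, which walks a token vector via consumeToken/stoi as in parsePinnedReserveSegmentSize above) shows how a single key:value entry from the commit message is split and scaled to bytes.

#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  std::string entry = "pinned_reserve_segment_size_mb:2048";

  auto colon = entry.find(':');
  if (colon == std::string::npos) {
    throw std::runtime_error("expected key:value");
  }
  std::string key = entry.substr(0, colon);
  size_t value_mb = std::stoul(entry.substr(colon + 1));

  if (key == "pinned_reserve_segment_size_mb" && value_mb > 0) {
    // Same MB-to-bytes scaling that get_reserve_segment() applies.
    size_t bytes = value_mb * 1024 * 1024;
    std::cout << key << " = " << value_mb << " MB (" << bytes << " bytes)\n";
  }
  return 0;
}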

c10/cuda/CUDAAllocatorConfig.h
Lines changed: 8 additions & 0 deletions

@@ -70,6 +70,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_pinned_use_background_threads;
   }
 
+  static size_t pinned_reserve_segment_size_mb() {
+    return instance().m_pinned_reserve_segment_size_mb;
+  }
+
   static size_t pinned_max_register_threads() {
     // Based on the benchmark results, we see better allocation performance
     // with 8 threads. However on future systems, we may need more threads

@@ -143,6 +147,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
   size_t parsePinnedNumRegisterThreads(
       const std::vector<std::string>& config,
       size_t i);
+  size_t parsePinnedReserveSegmentSize(
+      const std::vector<std::string>& config,
+      size_t i);
   size_t parsePinnedUseBackgroundThreads(
       const std::vector<std::string>& config,
       size_t i);

@@ -155,6 +162,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
   std::vector<size_t> m_roundup_power2_divisions;
   std::atomic<double> m_garbage_collection_threshold;
   std::atomic<size_t> m_pinned_num_register_threads;
+  std::atomic<size_t> m_pinned_reserve_segment_size_mb;
   std::atomic<bool> m_expandable_segments;
   std::atomic<Expandable_Segments_Handle_Type>
       m_expandable_segments_handle_type;

docs/source/notes/cuda.rst
Lines changed: 4 additions & 0 deletions

@@ -608,6 +608,10 @@ Available options:
   for processing events. This avoids any slow path associated with querying/processing of
   events in the fast allocation path. This feature is disabled by default.
 
+* ``pinned_reserve_segment_size_mb`` option is a size in MB to reserve for a pinned memory
+  segment. This allocates a large segment of pinned memory upfront and then uses it to serve
+  small allocation requests. This helps reduce the number of expensive device library calls.
+
 * ``graph_capture_record_stream_reuse`` (experimental, default: `False`)
   If set to `True`, the CUDA caching allocator will attempt to reclaim device memory during
   CUDA Graph capture by using the graph topology (instead of CUDA events) to determine
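As a rough capacity check (an illustration, not from the commit): because PinnedReserveSegment::allocate rounds every request up to a 4KB boundary, the 2048MB segment from the example above can absorb at most 524288 small requests before allocations fall back to the slow path.

#include <cstddef>
#include <cstdio>

int main() {
  constexpr size_t kSegmentMB = 2048;              // pinned_reserve_segment_size_mb:2048
  constexpr size_t kSegmentBytes = kSegmentMB * 1024 * 1024;
  constexpr size_t kRounding = 4096;               // bump-pointer granularity (4KB)

  // Every request of 1..4096 bytes consumes one 4KB slot.
  constexpr size_t kMaxSmallRequests = kSegmentBytes / kRounding;
  std::printf("A %zu MB segment serves at most %zu requests of <= 4 KB\n",
              kSegmentMB, kMaxSmallRequests);      // 524288
  return 0;
}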
