Skip to content

Commit 7b055a0

Browse files
lakshayg authored and pytorchmergebot committed
Add per_process_memory_fraction to PYTORCH_CUDA_ALLOC_CONF (pytorch#161035)
torch.cuda.memory.set_per_process_memory_fraction allows setting an upper bound on how much device memory is allocated. This PR exposes this setting to an environment variable. For example, PYTORCH_CUDA_ALLOC_CONF="per_process_memory_fraction:0.5" will limit the device memory to half of the available memory. Pull Request resolved: pytorch#161035 Approved by: https://github.com/ngimel, https://github.com/eqy
1 parent da2eb31 commit 7b055a0

File tree

7 files changed

+108
-37
lines changed

7 files changed

+108
-37
lines changed

c10/cuda/CUDAAllocatorConfig.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
106106
} else if (key == "graph_capture_record_stream_reuse") {
107107
i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
108108
used_native_specific_option = true;
109+
} else if (key == "per_process_memory_fraction") {
110+
i = parsePerProcessMemoryFraction(tokenizer, i);
111+
used_native_specific_option = true;
109112
} else {
110113
const auto& keys =
111114
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
@@ -146,6 +149,18 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
146149
return i;
147150
}
148151

152+
double CUDAAllocatorConfig::parsePerProcessMemoryFraction(
153+
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
154+
size_t i) {
155+
tokenizer.checkToken(++i, ":");
156+
double val_env = tokenizer.toDouble(++i);
157+
TORCH_CHECK_VALUE(
158+
val_env >= 0.0 && val_env <= 1.0,
159+
"per_process_memory_fraction is invalid, set it in [0.0, 1.0]");
160+
m_per_process_memory_fraction = val_env;
161+
return i;
162+
}
163+
149164
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
150165
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
151166
size_t i) {

c10/cuda/CUDAAllocatorConfig.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
6161
return instance().m_graph_capture_record_stream_reuse;
6262
}
6363

64+
static double per_process_memory_fraction() {
65+
return instance().m_per_process_memory_fraction;
66+
}
67+
6468
/** Pinned memory allocator settings */
6569
static bool pinned_use_cuda_host_register() {
6670
return instance().m_pinned_use_cuda_host_register;
@@ -152,7 +156,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
152156
"pinned_use_hip_host_register",
153157
"graph_capture_record_stream_reuse",
154158
"pinned_reserve_segment_size_mb",
155-
"pinned_num_register_threads"};
159+
"pinned_num_register_threads",
160+
"per_process_memory_fraction"};
156161
return keys;
157162
}
158163

@@ -177,6 +182,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
177182
size_t parseGraphCaptureRecordStreamReuse(
178183
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
179184
size_t i);
185+
double parsePerProcessMemoryFraction(
186+
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
187+
size_t i);
180188

181189
std::atomic<size_t> m_pinned_num_register_threads{1};
182190
std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
@@ -189,6 +197,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
189197
std::atomic<bool> m_release_lock_on_cudamalloc{false};
190198
std::atomic<bool> m_pinned_use_cuda_host_register{false};
191199
std::atomic<bool> m_graph_capture_record_stream_reuse{false};
200+
std::atomic<double> m_per_process_memory_fraction{1.0};
192201
};
193202

194203
// Keep this for backwards compatibility

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,7 @@ class RingBuffer {
11001100
} // anonymous namespace
11011101
} // namespace Native
11021102

1103-
static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
1103+
static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
11041104
#ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
11051105
void* nvml_handle = DriverAPI::get_nvml_handle();
11061106
if (!nvml_handle) {
@@ -1111,9 +1111,6 @@ static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
11111111
return true;
11121112
}();
11131113

1114-
cudaDeviceProp prop{};
1115-
C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
1116-
11171114
// NOLINTNEXTLINE(*-c-arrays)
11181115
char pci_id[80];
11191116
snprintf(
@@ -1215,14 +1212,16 @@ class DeviceCachingAllocator {
12151212
// record used memory.
12161213
size_t total_allocated_memory = 0;
12171214

1218-
size_t allowed_memory_maximum = 0;
1215+
cudaDeviceProp device_prop;
1216+
1217+
// maximum amount of memory that device is allowed to
1218+
// allocate. This is set iff memory fraction is less than 1
1219+
std::optional<size_t> allowed_memory_maximum{std::nullopt};
12191220

12201221
// all live expandable segments
12211222
std::vector<ExpandableSegment*> expandable_segments_;
12221223
std::vector<c10::DeviceIndex> devices_with_peer_access_;
12231224

1224-
bool set_fraction = false;
1225-
12261225
bool record_history = false;
12271226

12281227
std::atomic<CreateContextFn> context_recorder_;
@@ -1264,6 +1263,9 @@ class DeviceCachingAllocator {
12641263
: device_id(id),
12651264
large_blocks(/*small=*/false),
12661265
small_blocks(/*small=*/true) {
1266+
C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id));
1267+
1268+
setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction());
12671269
stats.max_split_size =
12681270
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
12691271
context_recorder_.store(nullptr);
@@ -1399,7 +1401,7 @@ class DeviceCachingAllocator {
13991401
if (!block_found) {
14001402
// Do garbage collection if the flag is set.
14011403
if (C10_UNLIKELY(
1402-
set_fraction &&
1404+
allowed_memory_maximum.has_value() &&
14031405
AcceleratorAllocatorConfig::garbage_collection_threshold() >
14041406
0.0)) {
14051407
garbage_collect_cached_blocks(context);
@@ -1456,11 +1458,12 @@ class DeviceCachingAllocator {
14561458
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
14571459
std::string allowed_info;
14581460

1459-
if (set_fraction) {
1460-
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
1461+
if (allowed_memory_maximum.has_value()) {
1462+
allowed_info =
1463+
format_size(allowed_memory_maximum.value()) + " allowed; ";
14611464
}
14621465

1463-
std::string proc_info = reportProcessMemoryInfo(device_id);
1466+
std::string proc_info = reportProcessMemoryInfo(device_prop);
14641467

14651468
record_trace(
14661469
TraceEntry::OOM,
@@ -1518,7 +1521,7 @@ class DeviceCachingAllocator {
15181521
for (const auto& obs : observers_local) {
15191522
obs(device_id,
15201523
alloc_size,
1521-
set_fraction ? allowed_memory_maximum : device_total,
1524+
allowed_memory_maximum.value_or(device_total),
15221525
device_free);
15231526
}
15241527

@@ -2015,25 +2018,26 @@ class DeviceCachingAllocator {
20152018

20162019
/** get memory fraction limiting maximum allocated memory **/
20172020
double getMemoryFraction() {
2018-
if (!set_fraction) {
2021+
if (!allowed_memory_maximum.has_value()) {
20192022
return 1.0;
20202023
}
20212024

2022-
size_t device_free = 0;
2023-
size_t device_total = 0;
2024-
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
2025-
return static_cast<double>(allowed_memory_maximum) /
2026-
static_cast<double>(device_total);
2025+
return static_cast<double>(allowed_memory_maximum.value()) /
2026+
static_cast<double>(device_prop.totalGlobalMem);
20272027
}
20282028

20292029
/** set memory fraction to limit maximum allocated memory **/
20302030
void setMemoryFraction(double fraction) {
2031-
size_t device_free = 0;
2032-
size_t device_total = 0;
2033-
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
2034-
allowed_memory_maximum =
2035-
static_cast<size_t>(fraction * static_cast<double>(device_total));
2036-
set_fraction = true;
2031+
TORCH_CHECK(
2032+
0 <= fraction && fraction <= 1,
2033+
"invalid fraction:",
2034+
fraction,
2035+
". Please set within [0, 1].");
2036+
allowed_memory_maximum = std::nullopt;
2037+
if (fraction < 1.0) {
2038+
allowed_memory_maximum = static_cast<size_t>(
2039+
fraction * static_cast<double>(device_prop.totalGlobalMem));
2040+
}
20372041
}
20382042

20392043
/** get expandable segment size for all the streams on device **/
@@ -3010,7 +3014,7 @@ class DeviceCachingAllocator {
30103014
BlockPool& pool = *p.pool;
30113015

30123016
if (C10_UNLIKELY(
3013-
set_fraction &&
3017+
allowed_memory_maximum.has_value() &&
30143018
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
30153019
// Track block reuse interval only when garbage collection is enabled.
30163020
++pool.get_free_blocks_call_count;
@@ -3083,7 +3087,7 @@ class DeviceCachingAllocator {
30833087

30843088
size_t gc_threshold = static_cast<size_t>(
30853089
AcceleratorAllocatorConfig::garbage_collection_threshold() *
3086-
static_cast<double>(allowed_memory_maximum));
3090+
static_cast<double>(allowed_memory_maximum.value()));
30873091
// No need to trigger GC yet
30883092
if (total_allocated_memory <= gc_threshold) {
30893093
return;
@@ -3161,8 +3165,8 @@ class DeviceCachingAllocator {
31613165

31623166
bool active_pool =
31633167
p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator();
3164-
if (set_fraction &&
3165-
total_allocated_memory + size > allowed_memory_maximum) {
3168+
if (allowed_memory_maximum.has_value() &&
3169+
total_allocated_memory + size > allowed_memory_maximum.value()) {
31663170
p.err = cudaErrorMemoryAllocation;
31673171
return false;
31683172
// Temporarily disable checkpointing & cudagraphs internally
@@ -3859,7 +3863,6 @@ class NativeCachingAllocator : public CUDAAllocator {
38593863
"Allocator not initialized for device ",
38603864
device,
38613865
": did you call init?");
3862-
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
38633866
return device_allocator[device]->getMemoryFraction();
38643867
}
38653868

@@ -3869,12 +3872,6 @@ class NativeCachingAllocator : public CUDAAllocator {
38693872
"Allocator not initialized for device ",
38703873
device,
38713874
": did you call init?");
3872-
TORCH_CHECK(
3873-
0 <= fraction && fraction <= 1,
3874-
"invalid fraction:",
3875-
fraction,
3876-
". Please set within [0, 1].");
3877-
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
38783875
device_allocator[device]->setMemoryFraction(fraction);
38793876
}
38803877

c10/cuda/CUDACachingAllocator.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include <c10/core/AllocatorConfig.h>
44
#include <c10/core/CachingDeviceAllocator.h>
5+
#include <c10/cuda/CUDAAllocatorConfig.h>
56
#include <c10/cuda/CUDAGraphsC10Utils.h>
67
#include <c10/cuda/CUDAMacros.h>
78
#include <c10/cuda/CUDAStream.h>

c10/cuda/CUDAMallocAsyncAllocator.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
427427
// on the current device each later call sees.
428428
void init(int dev_count) override {
429429
static bool called = [](int dev_count) {
430-
;
431430
// Are there external guarantees init will be called before
432431
// any of the allocator's other functions?
433432
// std::lock_guard<std::mutex> lk(general_mutex);

docs/source/notes/cuda.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,10 @@ Available options:
619619
and reallocate buffers across multiple streams, especially when the capture DAG frequently
620620
reaches joined frontiers.
621621

622+
* ``per_process_memory_fraction`` option limits the amount of memory that can be allocated
623+
on all the CUDA devices to a specified fraction of the available memory. This is a value
624+
between 0 and 1. Attempting to allocate more memory will raise an out of memory error.
625+
622626
.. note::
623627

624628
Some stats reported by the

test/test_cuda.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4626,6 +4626,52 @@ def check_output(script: str) -> str:
46264626
rc = check_output(test_script)
46274627
self.assertEqual(rc, "cudaMallocAsync")
46284628

4629+
def test_allocator_memory_fraction_setting(self):
4630+
def make_env(fraction):
4631+
env = os.environ.copy()
4632+
var = "PYTORCH_CUDA_ALLOC_CONF"
4633+
key = "per_process_memory_fraction"
4634+
value = [
4635+
x
4636+
for x in env.get(var, "").split(",")
4637+
if len(x) > 0 and not x.startswith(f"{key}:")
4638+
]
4639+
value.append(f"{key}:{fraction}")
4640+
env[var] = ",".join(value)
4641+
return env
4642+
4643+
def run_test(value):
4644+
test_script = """\
4645+
import os
4646+
import torch
4647+
device = torch._C._cuda_getDevice()
4648+
value = torch.cuda.memory.get_per_process_memory_fraction(device)
4649+
print(value, end="")
4650+
"""
4651+
return subprocess.run(
4652+
[sys.executable, "-c", test_script],
4653+
env=make_env(value),
4654+
text=True,
4655+
check=True,
4656+
capture_output=True,
4657+
)
4658+
4659+
self.assertEqual(run_test(0.0).stdout, "0.0")
4660+
self.assertEqual(run_test(0.5).stdout, "0.5")
4661+
self.assertEqual(run_test(1.0).stdout, "1.0")
4662+
4663+
with self.assertRaises(subprocess.CalledProcessError) as e:
4664+
run_test(-0.1)
4665+
assert "per_process_memory_fraction is invalid" in e.exception.stderr, (
4666+
e.exception.stderr
4667+
)
4668+
4669+
with self.assertRaises(subprocess.CalledProcessError) as e:
4670+
run_test(1.1)
4671+
assert "per_process_memory_fraction is invalid" in e.exception.stderr, (
4672+
e.exception.stderr
4673+
)
4674+
46294675
def test_cachingAllocator_raw_alloc(self):
46304676
# Test that raw_alloc respects the setting that
46314677
# activates/deactivates the caching allocator

0 commit comments

Comments (0)