
Commit 26d4513

Cherry-pick gpu memory limit (#22838)
* add recorded cuda memory apis, fix typo, test=develop
* add more ut, test=develop
* follow comments, test=release/1.7
* fix py35 incompatible issues, test=release/1.7
1 parent a1c0b24 commit 26d4513

11 files changed: +440 / -50 lines

paddle/fluid/memory/allocation/cuda_allocator.cc

Lines changed: 22 additions & 13 deletions
@@ -25,39 +25,48 @@ namespace memory {
 namespace allocation {
 bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
 void CUDAAllocator::FreeImpl(Allocation* allocation) {
-  platform::CUDADeviceGuard guard(place_.device);
-  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()),
-                    place_);
-  PADDLE_ENFORCE(cudaFree(allocation->ptr()));
+  PADDLE_ENFORCE_EQ(
+      boost::get<platform::CUDAPlace>(allocation->place()), place_,
+      platform::errors::PermissionDenied(
+          "GPU memory is freed in incorrect device. This may be a bug"));
+  platform::RecordedCudaFree(allocation->ptr(), allocation->size(),
+                             place_.device);
   delete allocation;
 }
 
 Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); });
 
-  platform::CUDADeviceGuard guard(place_.device);
   void* ptr;
-  auto result = cudaMalloc(&ptr, size);
+  auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device);
   if (LIKELY(result == cudaSuccess)) {
     return new Allocation(ptr, size, platform::Place(place_));
   }
 
-  platform::RaiseNonOutOfMemoryError(&result);
+  size_t avail, total, actual_avail, actual_total;
+  bool is_limited = platform::RecordedCudaMemGetInfo(
+      &avail, &total, &actual_avail, &actual_total, place_.device);
 
-  size_t avail = 0, total = 0;
-  result = cudaMemGetInfo(&avail, &total);
-  if (result != cudaSuccess) avail = 0;
-  platform::RaiseNonOutOfMemoryError(&result);
+  std::string err_msg;
+  if (is_limited) {
+    auto limit_size = (total >> 20);
+    err_msg = string::Sprintf(
+        "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
+        "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
+        "GPU memory usage is limited to %d MB.\n"
+        "   The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+        limit_size, limit_size);
+  }
 
   PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
       "\n\nOut of memory error on GPU %d. "
       "Cannot allocate %s memory on GPU %d, "
       "available memory is only %s.\n\n"
      "Please check whether there is any other process using GPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
-      "2. If no, please decrease the batch size of your model.\n",
+      "2. If no, please decrease the batch size of your model. %s\n\n",
      place_.device, string::HumanReadableSize(size), place_.device,
-      string::HumanReadableSize(avail), place_.device));
+      string::HumanReadableSize(avail), place_.device, err_msg));
 }
 
 }  // namespace allocation
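Note: this allocator and GPUAllocator in system_allocator.cc below now share one failure pattern: try platform::RecordedCudaMalloc first, and only when it fails ask platform::RecordedCudaMemGetInfo whether a flag-imposed limit is in effect, so the thrown message can point at FLAGS_gpu_memory_limit_mb. Below is a condensed sketch of that pattern for reading convenience only; AllocOrThrow is a hypothetical name, and the snippet assumes the same headers and namespaces as cuda_allocator.cc.

// Illustrative sketch, not code from this commit.
void* AllocOrThrow(size_t size, int dev_id) {
  void* ptr = nullptr;
  if (platform::RecordedCudaMalloc(&ptr, size, dev_id) == cudaSuccess) {
    return ptr;
  }

  // On failure, report numbers that already account for the configured limit.
  size_t avail, total, actual_avail, actual_total;
  bool is_limited = platform::RecordedCudaMemGetInfo(
      &avail, &total, &actual_avail, &actual_total, dev_id);

  std::string hint;
  if (is_limited) {
    // `total` is clamped to FLAGS_gpu_memory_limit_mb, so the hint points at
    // the flag rather than at the physical card size.
    hint = string::Sprintf("FLAGS_gpu_memory_limit_mb is %d MB", total >> 20);
  }
  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "Out of memory on GPU %d, available memory is only %s. %s", dev_id,
      string::HumanReadableSize(avail), hint));
}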

paddle/fluid/memory/detail/system_allocator.cc

Lines changed: 18 additions & 28 deletions
@@ -110,29 +110,28 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // if size is 0. We just make sure it does.
   if (size <= 0) return nullptr;
 
-  paddle::platform::CUDADeviceGuard guard(gpu_id_);
-
   void* p;
-  cudaError_t result = cudaMalloc(&p, size);
+  auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_);
 
   if (result == cudaSuccess) {
     *index = 0;
     gpu_alloc_size_ += size;
     return p;
   } else {
-    platform::RaiseNonOutOfMemoryError(&result);
-
-    /**
-     * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
-     * if there is very little GPU memory left. In this case, we
-     * should consider the available GPU memory to be 0, and throw
-     * exception inside this function instead of throwing exception
-     * inside cudaMemGetInfo.
-     */
-    size_t avail = 0, total = 0;
-    result = cudaMemGetInfo(&avail, &total);
-    if (result != cudaSuccess) avail = 0;
-    platform::RaiseNonOutOfMemoryError(&result);
+    size_t avail, total, actual_avail, actual_total;
+    bool is_limited = platform::RecordedCudaMemGetInfo(
+        &avail, &total, &actual_avail, &actual_total, gpu_id_);
+
+    std::string err_msg;
+    if (is_limited) {
+      auto limit_size = (total >> 20);
+      err_msg = string::Sprintf(
+          "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
+          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
+          "maximum GPU memory usage is limited to %d MB.\n"
+          "   The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+          limit_size, limit_size);
+    }
 
     PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
         "\n\nOut of memory error on GPU %d. "
@@ -145,28 +144,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
        " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
        "please set it to a higher value but less than 1.0.\n"
        "   The command is "
-        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.\n\n",
+        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
        gpu_id_, string::HumanReadableSize(size), gpu_id_,
        string::HumanReadableSize(avail), gpu_id_,
-        FLAGS_fraction_of_gpu_memory_to_use));
+        FLAGS_fraction_of_gpu_memory_to_use, err_msg));
   }
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
-  cudaError_t err;
   PADDLE_ENFORCE_EQ(index, 0);
   PADDLE_ENFORCE_GE(gpu_alloc_size_, size);
   gpu_alloc_size_ -= size;
-  err = cudaFree(p);
 
-  // Purposefully allow cudaErrorCudartUnloading, because
-  // that is returned if you ever call cudaFree after the
-  // driver has already shutdown. This happens only if the
-  // process is terminating, in which case we don't care if
-  // cudaFree succeeds.
-  if (err != cudaErrorCudartUnloading) {
-    PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free.");
-  }
+  platform::RecordedCudaFree(p, size, gpu_id_);
 }
 
 bool GPUAllocator::UseGpu() const { return true; }

paddle/fluid/platform/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -117,6 +117,8 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
 
+nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+
 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
 
 if(NOT APPLE AND NOT WIN32)

paddle/fluid/platform/flags.cc

Lines changed: 8 additions & 0 deletions
@@ -449,6 +449,14 @@ DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
               "size specified by this flag. Else Paddle will reallocate by "
               "FLAGS_fraction_of_gpu_memory_to_use");
 
+DEFINE_uint64(gpu_memory_limit_mb, 0UL,
+              "The maximum gpu memory limit that the process can allocate. "
+              "If it is equal to 0, there would be no limit and all gpu memory "
+              "would be available to the process. If it is larger than 0, "
+              "the process would raise out of memory error if the allocated "
+              "memory exceeds the limit even though there is available "
+              "memory on the gpu card. The unit is MB and default value is 0.");
+
 #endif
 
 /**
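FLAGS_gpu_memory_limit_mb is an ordinary gflags variable, so it can be set either through the environment variable the new error messages mention (`export FLAGS_gpu_memory_limit_mb=500`) or from C++ code that declares it, as gpu_info.cc does. One caveat that follows from the implementation in gpu_info.cc: the limit is read once, when RecordedCudaMallocHelper lazily creates its per-device instances, so it must be set before the first recorded allocation. A minimal sketch; the helper function name below is illustrative.

#include "gflags/gflags.h"

// Mirrors the DECLARE_uint64 used in paddle/fluid/platform/gpu_info.cc.
DECLARE_uint64(gpu_memory_limit_mb);

// Illustrative helper: cap this process at ~500 MB of GPU memory per device.
// Must run before the first RecordedCudaMalloc/RecordedCudaFree call.
void CapGpuMemoryAt500Mb() {
  FLAGS_gpu_memory_limit_mb = 500;  // 0 (the default) means no limit
}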

paddle/fluid/platform/gpu_info.cc

Lines changed: 162 additions & 6 deletions
@@ -15,17 +15,22 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #include <algorithm>
 #include <cstdlib>
+#include <memory>
 #include <string>
 
 #include "gflags/gflags.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+#include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/string/split.h"
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_string(selected_gpus);
+DECLARE_uint64(gpu_memory_limit_mb);
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
@@ -241,11 +246,9 @@ void SetDeviceId(int id) {
 }
 
 void GpuMemoryUsage(size_t *available, size_t *total) {
-  auto error_code = cudaMemGetInfo(available, total);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemGetInfo failed in "
-                 "paddle::platform::GetMemoryUsage, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
+  size_t actual_available, actual_total;
+  RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total,
+                         platform::GetCurrentDeviceId());
 }
 
 size_t GpuAvailableMemToAlloc() {
@@ -359,7 +362,7 @@ void GpuStreamSync(cudaStream_t stream) {
                     error_code, CudaErrorWebsite()));
 }
 
-void RaiseNonOutOfMemoryError(cudaError_t *status) {
+static void RaiseNonOutOfMemoryError(cudaError_t *status) {
   if (*status == cudaErrorMemoryAllocation) {
     *status = cudaSuccess;
   }
@@ -374,5 +377,158 @@ void RaiseNonOutOfMemoryError(cudaError_t *status) {
   PADDLE_ENFORCE_CUDA_SUCCESS(*status);
 }
 
+class RecordedCudaMallocHelper {
+ private:
+  explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0)
+      : dev_id_(dev_id), limit_size_(limit_size) {
+    if (NeedRecord()) {
+      mtx_.reset(new std::mutex());
+    }
+  }
+
+  DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper);
+
+ public:
+  static RecordedCudaMallocHelper *Instance(int dev_id) {
+    std::call_once(once_flag_, [] {
+      int dev_cnt = GetCUDADeviceCount();
+      instances_.reserve(dev_cnt);
+      for (int i = 0; i < dev_cnt; ++i) {
+        instances_.emplace_back(
+            new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
+      }
+    });
+
+    PADDLE_ENFORCE_GE(
+        dev_id, 0,
+        platform::errors::OutOfRange(
+            "Device id must be not less than 0, but got %d", dev_id));
+    PADDLE_ENFORCE_LT(
+        dev_id, instances_.size(),
+        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d",
+                                     dev_id, instances_.size()));
+    return instances_[dev_id].get();
+  }
+
+  /**
+   * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation
+   * or cudaSuccess would be returned, and the cudaGetLastError() flag
+   * would be clear.
+   */
+  cudaError_t Malloc(void **ptr, size_t size) {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
+      return cudaErrorMemoryAllocation;
+    }
+
+    CUDADeviceGuard guard(dev_id_);
+    auto result = cudaMalloc(ptr, size);
+    if (result == cudaSuccess) {
+      if (NeedRecord()) {
+        cur_size_ += size;
+      }
+      return cudaSuccess;
+    } else {
+      RaiseNonOutOfMemoryError(&result);
+      // Non out of memory error would be raised inside
+      // RaiseNonOutOfMemoryError. Therefore, we can
+      // return cudaErrorMemoryAllocation directly here.
+      return cudaErrorMemoryAllocation;
+    }
+  }
+
+  /**
+   * Free gpu memory. Usually, free is not allowed to raise error.
+   * If it does raise error, the process should be crashed.
+   */
+  void Free(void *ptr, size_t size) {
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    CUDADeviceGuard guard(dev_id_);
+    auto err = cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          err, platform::errors::External("cudaFree raises unexpected error"));
+      if (NeedRecord()) {
+        std::lock_guard<std::mutex> guard(*mtx_);
+        cur_size_ -= size;
+      }
+    } else {
+      cudaGetLastError();  // clear the error flag when cudaErrorCudartUnloading
+    }
+  }
+
+  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                  size_t *actual_total) {
+    {
+      CUDADeviceGuard guard(dev_id_);
+      auto result = cudaMemGetInfo(actual_avail, actual_total);
+      if (result != cudaSuccess) {
+        *actual_avail = 0;
+      }
+      RaiseNonOutOfMemoryError(&result);
+    }
+
+    if (NeedRecord()) {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      *avail = std::min(*actual_avail, limit_size_ - cur_size_);
+      *total = std::min(*actual_total, limit_size_);
+      return *total < *actual_total;
+    } else {
+      *avail = *actual_avail;
+      *total = *actual_total;
+      return false;
+    }
+  }
+
+  inline bool NeedRecord() const { return limit_size_ != 0; }
+
+  uint64_t RecordedSize() const {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    return NeedRecord() ? cur_size_ : 0;
+  }
+
+  uint64_t LimitSize() const { return limit_size_; }
+
+ private:
+  const int dev_id_;
+  const uint64_t limit_size_;
+  uint64_t cur_size_{0};
+
+  mutable std::unique_ptr<std::mutex> mtx_;
+
+  static std::once_flag once_flag_;
+  static std::vector<std::unique_ptr<RecordedCudaMallocHelper>> instances_;
+};
+
+std::once_flag RecordedCudaMallocHelper::once_flag_;
+std::vector<std::unique_ptr<RecordedCudaMallocHelper>>
+    RecordedCudaMallocHelper::instances_;
+
+cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size);
+}
+
+void RecordedCudaFree(void *p, size_t size, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size);
+}
+
+bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                            size_t *actual_total, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo(
+      avail, total, actual_avail, actual_total);
+}
+
+uint64_t RecordedCudaMallocSize(int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize();
+}
+
+bool IsCudaMallocRecorded(int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord();
+}
+
 }  // namespace platform
 }  // namespace paddle
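RecordedCudaMallocHelper keeps one lazily created singleton per device and only takes its mutex when a limit is active. Below is a hedged illustration of the resulting semantics; the sizes, device id, and expected outcomes are made up for the example and are not taken from the commit's unit test.

// Illustrative only. Assumes FLAGS_gpu_memory_limit_mb was set to 100 before
// the first recorded call, so each device is capped at 100 MB.
void RecordedMallocExample() {
  void* p = nullptr;
  cudaError_t err = platform::RecordedCudaMalloc(&p, 50 << 20, /*dev_id=*/0);
  // err == cudaSuccess; RecordedCudaMallocSize(0) == (50 << 20) bytes.

  void* q = nullptr;
  err = platform::RecordedCudaMalloc(&q, 80 << 20, /*dev_id=*/0);
  // 50 MB + 80 MB exceeds the 100 MB limit, so the helper rejects the request
  // without calling cudaMalloc: err == cudaErrorMemoryAllocation.

  platform::RecordedCudaFree(p, 50 << 20, /*dev_id=*/0);
  // The recorded size drops back to 0.
}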

paddle/fluid/platform/gpu_info.h

Lines changed: 14 additions & 2 deletions
@@ -104,8 +104,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 //! Blocks until stream has completed all operations.
 void GpuStreamSync(cudaStream_t stream);
 
-//! Raise error if status is not cudaSuccess or OOM, otherwise reset status.
-void RaiseNonOutOfMemoryError(cudaError_t *status);
+//! CudaMalloc with recorded info
+cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
+
+//! CudaFree with recorded info
+void RecordedCudaFree(void *p, size_t size, int dev_id);
+
+//! Get available and total gpu memory with considering limitation
+bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                            size_t *actual_total, int dev_id);
+
+//! Get recorded cudaMalloc size. If record is disabled, return 0.
+uint64_t RecordedCudaMallocSize(int dev_id);
+
+bool IsCudaMallocRecorded(int dev_id);
 
 }  // namespace platform
 }  // namespace paddle
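The bool returned by RecordedCudaMemGetInfo tells callers whether the reported numbers were clamped by FLAGS_gpu_memory_limit_mb rather than by the physical card; the two allocators use it to decide whether to mention the flag in their OOM messages. A short hedged sketch of reading it from a caller's side; the function and variable names are local to this sketch.

// Illustrative caller of the header API above.
void PrintGpuMemInfo(int dev_id) {
  size_t avail, total, actual_avail, actual_total;
  bool limited = platform::RecordedCudaMemGetInfo(
      &avail, &total, &actual_avail, &actual_total, dev_id);
  // actual_avail/actual_total come straight from cudaMemGetInfo. When a limit
  // is set, avail/total are clamped to it (minus what is already recorded),
  // and `limited` is true only if the clamp reduced `total` below the card's
  // actual capacity.
  VLOG(10) << "avail=" << avail << " total=" << total
           << " limited=" << limited;
}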
