Commit 737f21b

cherry-pick refine default gpu memory, test=release/1.5 (#19281)

1 parent 71168da
File tree: 4 files changed, +181 -76 lines


paddle/fluid/memory/detail/buddy_allocator.cc

Lines changed: 7 additions & 2 deletions
@@ -23,6 +23,10 @@ DEFINE_bool(free_idle_memory, false,
             "If it is true, Paddle will try to free idle memory trunks during "
             "running time.");
 
+#ifdef PADDLE_WITH_CUDA
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -200,8 +204,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     // Compute the allocation size for gpu for the first allocation.
     allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
   } else {
-    // Reallocation size
-    if (realloc_size_ == 0) {
+    // Compute the re-allocation size; we cache it only when the user sets
+    // FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
+    if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
       realloc_size_ = platform::GpuReallocSize();
     }
     allocate_bytes = std::max(realloc_size_, request_bytes);
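
The new condition reuses the cached reallocation size only when the user pins FLAGS_reallocate_gpu_memory_in_mb to a fixed value; in fraction mode (flag == 0) the size must be recomputed on every refill because it depends on how much GPU memory is currently free. A minimal compilable sketch of that rule, with the gflag and platform::GpuReallocSize() stubbed; the stub value and the helper name NextAllocateBytes are illustrative, not part of the commit:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-ins for the real gflag and platform query.
static uint64_t FLAGS_reallocate_gpu_memory_in_mb = 0;     // 0 => fraction mode
static size_t GpuReallocSizeStub() { return 500u << 20; }  // pretend 500 MiB

// Mirrors RefillPool: recompute on first use, and keep recomputing while the
// flag is 0; reuse the cached size once the flag pins it.
static size_t NextAllocateBytes(size_t request_bytes, size_t* realloc_cache) {
  if (*realloc_cache == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
    *realloc_cache = GpuReallocSizeStub();
  }
  return std::max(*realloc_cache, request_bytes);
}

int main() {
  size_t cache = 0;
  std::printf("%zu MiB\n", NextAllocateBytes(100u << 20, &cache) >> 20);  // 500
  return 0;
}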

paddle/fluid/memory/detail/buddy_allocator_test.cc

Lines changed: 134 additions & 25 deletions
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
@@ -31,29 +33,37 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-constexpr static int test_gpu_id = 0;
+constexpr static int TEST_GPU_ID = 0;
 
-void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
+                        bool use_system_allocator = false,
+                        bool free_ptr = true) {
   bool freed = false;
   size_t used_bytes = allocator->Used();
 
   if (size_bytes > 0) {
     void* p = allocator->Alloc(size_bytes);
 
     EXPECT_NE(p, nullptr);
+
 #ifdef PADDLE_WITH_CUDA
-    if (size_bytes < platform::GpuMaxChunkSize()) {
+    if (size_bytes < allocator->GetMaxChunkSize()) {
 #else
-    if (size_bytes < platform::CpuMaxChunkSize()) {
+    if (size_bytes < allocator->GetMaxChunkSize()) {
 #endif
       // Not allocate from SystemAllocator
+      EXPECT_FALSE(use_system_allocator);
       EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
     } else {
       // Allocate from SystemAllocator doesn't count in Used()
+      EXPECT_TRUE(use_system_allocator);
       EXPECT_EQ(allocator->Used(), used_bytes);
     }
 
     int* intp = static_cast<int*>(p);
+    if (!free_ptr) {
+      return intp;
+    }
     std::shared_ptr<int> ptr(intp, [&](void* p) {
       allocator->Free(intp);
       freed = true;
@@ -64,20 +74,30 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
 
   EXPECT_EQ(used_bytes, allocator->Used());
   EXPECT_TRUE(freed);
+  return nullptr;
 }
 
 #ifdef PADDLE_WITH_CUDA
 TEST(BuddyAllocator, GpuFraction) {
+  // In a 16 GB machine, the pool size will be about 160 MB
  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
+  // Less than pool size
   TestBuddyAllocator(&buddy_allocator, 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 499 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
 
 TEST(BuddyAllocator, InitRealloc) {
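
The reworked helper now returns the raw pointer and takes two extra arguments: use_system_allocator asserts where the block must be served from, and free_ptr lets a test keep the block alive across later allocations. A sketch of the calling patterns this enables, assuming a buddy_allocator constructed as in the tests above:

// Below the max chunk size: served from the managed pool, freed by the helper.
TestBuddyAllocator(&buddy_allocator, 10 << 20);

// Above the max chunk size: must fall through to the SystemAllocator,
// which does not count into Used().
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
                   /* use_system_allocator = */ true);

// Keep the block alive; the caller is responsible for freeing it.
int* p = TestBuddyAllocator(&buddy_allocator, 10 << 20,
                            /* use_system_allocator = */ false,
                            /* free_ptr = */ false);
buddy_allocator.Free(p);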
@@ -87,19 +107,19 @@ TEST(BuddyAllocator, InitRealloc) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
   // Less then initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
   // Between initial size and reallocate size and not exceed pool
   TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Less then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 40 << 20);
-  // Greater then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 99 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 101 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
 
 TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
@@ -109,23 +129,112 @@ TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
-  // Less then initial size and reallocate size
+  // Less than initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 1 << 20);
-  // Between initial size and reallocate size and not exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then initial size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7
-  // MB)
-  TestBuddyAllocator(&buddy_allocator, 7 << 20);
-  // Less then reallocate size and exceed pool
+  // Between initial size and reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 6 << 20);
   TestBuddyAllocator(&buddy_allocator, 8 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 9 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 11 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
+}
+
+TEST(BuddyAllocator, FractionRefillPool) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.6;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t max_chunk_size = platform::GpuMaxChunkSize();
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), max_chunk_size);
+
+  // Less than pool size
+  int* p0 = TestBuddyAllocator(&buddy_allocator, max_chunk_size - 1000,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should be same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  size_t alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceed pool trigger refilling size of fraction of available gpu, and should
+  // be able to alloc 60% of the remaining GPU
+  int* p1 = TestBuddyAllocator(&buddy_allocator, alloc,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should be same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceed pool trigger refilling size of fraction of available gpu, and should
+  // be able to alloc 60% of the remaining GPU
+  TestBuddyAllocator(&buddy_allocator, alloc,
+                     /* use_system_allocator = */ false);
+  // Max chunk size should be same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  buddy_allocator.Free(p0);
+  buddy_allocator.Free(p1);
+}
+
+TEST(BuddyAllocator, AllocFromAvailable) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.7;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t total = 0, available = 0;
+  platform::SetDeviceId(TEST_GPU_ID);
+  platform::GpuMemoryUsage(&available, &total);
+
+  // Take half of available GPU
+  void* p;
+  cudaError_t result = cudaMalloc(&p, available >> 1);
+  EXPECT_TRUE(result == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
 }
+
+TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
+  FLAGS_fraction_of_gpu_memory_to_use = 1.0;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  void* p = nullptr;
+  EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(4) << 30) == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(5) << 30);
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
+}
+
 #endif
 
 }  // namespace detail
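
FractionRefillPool depends on each refill taking the configured fraction of whatever is still free, so successive chunks shrink geometrically while GetMaxChunkSize() stays fixed. A small sketch of the expected pool growth, assuming an illustrative 15 GB of free device memory (the numbers are assumptions, not measurements):

#include <cstdio>

int main() {
  const double fraction = 0.6;  // FLAGS_fraction_of_gpu_memory_to_use
  double remaining_gb = 15.0;   // assumed free GPU memory
  for (int refill = 0; refill < 3; ++refill) {
    double chunk_gb = remaining_gb * fraction;  // size of this refill
    remaining_gb -= chunk_gb;
    std::printf("refill %d: %.2f GB\n", refill, chunk_gb);  // 9.00, 3.60, 1.44
  }
  return 0;
}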

paddle/fluid/platform/gpu_info.cc

Lines changed: 36 additions & 49 deletions
@@ -215,70 +215,57 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
 
-size_t GpuMaxAllocSize() {
-  return std::max(GpuInitAllocSize(), GpuReallocSize());
-}
-
-size_t GpuInitAllocSize() {
-  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
-    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
-  }
-
-  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
-  // fraction
+size_t GpuAvailableMemToAlloc() {
   size_t total = 0;
   size_t available = 0;
-
   GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If available size is less than minimum chunk size, no usable memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
 }
 
-size_t GpuReallocSize() {
-  if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
-    // Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
-  }
+size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
 
-  // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated
-  // by fraction
-  size_t total = 0;
-  size_t available = 0;
+static size_t GpuAllocSize(bool realloc) {
+  size_t available_to_alloc = GpuAvailableMemToAlloc();
+  PADDLE_ENFORCE_GT(available_to_alloc, 0, "No enough available GPU memory");
+  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
+  // allocated by fraction
+  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
+                           : FLAGS_initial_gpu_memory_in_mb;
+  size_t alloc_bytes =
+      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
+                                           FLAGS_fraction_of_gpu_memory_to_use);
+  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
+                    "No enough available GPU memory");
+  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
+           << " MiB, is it Re-alloc: " << realloc;
+  return alloc_bytes;
+}
 
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
+size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
-}
+size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
 size_t GpuMinChunkSize() {
   // Allow to allocate the minimum chunk size is 256 bytes.
   return 1 << 8;
 }
 
 size_t GpuMaxChunkSize() {
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-  // If available less than minimum chunk size, no usable memory exists.
-  available =
-      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               total - reserving);
-
-  size_t allocating = GpuMaxAllocSize();
-
-  PADDLE_ENFORCE_LE(allocating, available,
-                    "Insufficient GPU memory to allocation.");
-
-  return allocating;
+  size_t max_chunk_size = GpuMaxAllocSize();
+  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
+  return max_chunk_size;
 }
 
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
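
The refactor funnels both sizing paths through one helper: a non-zero flag pins the size in MB, otherwise the configured fraction of currently available memory is used. A compilable sketch of that selection rule with stubbed flags and a stubbed availability query (the stub values and the names GpuAllocSizeSketch / GpuAvailableMemToAllocStub are assumptions; plain asserts stand in for the PADDLE_ENFORCE checks):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Stand-ins for the real gflags and platform query (values are assumptions).
static uint64_t FLAGS_initial_gpu_memory_in_mb = 0;
static uint64_t FLAGS_reallocate_gpu_memory_in_mb = 0;
static double FLAGS_fraction_of_gpu_memory_to_use = 0.5;
static size_t GpuAvailableMemToAllocStub() {
  return static_cast<size_t>(8) << 30;  // pretend 8 GiB is available
}

// Mirrors the selection rule of GpuAllocSize(): flag wins, else fraction.
static size_t GpuAllocSizeSketch(bool realloc) {
  size_t available_to_alloc = GpuAvailableMemToAllocStub();
  assert(available_to_alloc > 0);
  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                           : FLAGS_initial_gpu_memory_in_mb;
  size_t alloc_bytes =
      flag_mb > 0ul ? (flag_mb << 20)
                    : static_cast<size_t>(available_to_alloc *
                                          FLAGS_fraction_of_gpu_memory_to_use);
  assert(available_to_alloc >= alloc_bytes);
  return alloc_bytes;
}

int main() {
  std::printf("init: %zu MiB\n", GpuAllocSizeSketch(false) >> 20);   // 4096
  FLAGS_reallocate_gpu_memory_in_mb = 256;
  std::printf("realloc: %zu MiB\n", GpuAllocSizeSketch(true) >> 20); // 256
  return 0;
}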

paddle/fluid/platform/gpu_info.h

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,10 @@ void SetDeviceId(int device_id);
 //! Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t *available, size_t *total);
 
+//! Get the available memory to allocate, which is the size of available gpu
+//! minus reserving.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
 
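
A hypothetical caller of the new declaration, e.g. a pre-flight check before allocating a large tensor; CanFitOnGpu is an invented name for illustration, not a Paddle API:

#include <cstddef>

#include "paddle/fluid/platform/gpu_info.h"

// Hypothetical helper: does `bytes` fit in what the allocator could still
// hand out (currently available memory minus the reserved slice)?
bool CanFitOnGpu(size_t bytes) {
  return bytes <= paddle::platform::GpuAvailableMemToAlloc();
}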
