@@ -15,17 +15,22 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #include <algorithm>
 #include <cstdlib>
+#include <memory>
 #include <string>
 
 #include "gflags/gflags.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+#include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/string/split.h"
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_string(selected_gpus);
+DECLARE_uint64(gpu_memory_limit_mb);
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
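The newly declared `FLAGS_gpu_memory_limit_mb` flag is the per-device budget, in MB, enforced by the recorded allocator added at the end of this file; a value of 0 disables recording. A minimal standalone sketch, using a hypothetical flag value, of the MB-to-bytes conversion the helper below performs with `<< 20`:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical flag value; the real value comes from gflags at runtime.
  uint64_t gpu_memory_limit_mb = 512;
  // Shifting left by 20 multiplies by 1024 * 1024, i.e. MB -> bytes.
  uint64_t limit_bytes = gpu_memory_limit_mb << 20;
  std::cout << limit_bytes << " bytes\n";  // prints 536870912
  return 0;
}
```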
@@ -241,11 +246,9 @@ void SetDeviceId(int id) {
 }
 
 void GpuMemoryUsage(size_t *available, size_t *total) {
-  auto error_code = cudaMemGetInfo(available, total);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemGetInfo failed in "
-                 "paddle::platform::GetMemoryUsage, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
+  size_t actual_available, actual_total;
+  RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total,
+                         platform::GetCurrentDeviceId());
 }
 
 size_t GpuAvailableMemToAlloc() {
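`GpuMemoryUsage` now reports memory through `RecordedCudaMemGetInfo`, so when a limit is configured the available/total figures are clamped to the remaining budget rather than the raw device numbers. An illustrative sketch, with hypothetical names, of that clamping rule (it mirrors `GetMemInfo` further down; `limit == 0` means no limit):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

struct MemView {
  size_t avail;
  size_t total;
};

// Hypothetical helper, not part of the diff: clamp the values reported by
// cudaMemGetInfo to the configured per-device budget.
MemView ClampToLimit(size_t actual_avail, size_t actual_total, uint64_t limit,
                     uint64_t cur_size) {
  if (limit == 0) {
    // No limit configured: pass the raw device numbers through.
    return {actual_avail, actual_total};
  }
  return {std::min<size_t>(actual_avail, limit - cur_size),
          std::min<size_t>(actual_total, limit)};
}
```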
@@ -359,7 +362,7 @@ void GpuStreamSync(cudaStream_t stream) {
                         error_code, CudaErrorWebsite()));
 }
 
-void RaiseNonOutOfMemoryError(cudaError_t *status) {
+static void RaiseNonOutOfMemoryError(cudaError_t *status) {
   if (*status == cudaErrorMemoryAllocation) {
     *status = cudaSuccess;
   }
@@ -374,5 +377,158 @@ void RaiseNonOutOfMemoryError(cudaError_t *status) {
   PADDLE_ENFORCE_CUDA_SUCCESS(*status);
 }
 
+class RecordedCudaMallocHelper {
+ private:
+  explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0)
+      : dev_id_(dev_id), limit_size_(limit_size) {
+    if (NeedRecord()) {
+      mtx_.reset(new std::mutex());
+    }
+  }
+
+  DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper);
+
+ public:
+  static RecordedCudaMallocHelper *Instance(int dev_id) {
+    std::call_once(once_flag_, [] {
+      int dev_cnt = GetCUDADeviceCount();
+      instances_.reserve(dev_cnt);
+      for (int i = 0; i < dev_cnt; ++i) {
+        instances_.emplace_back(
+            new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
+      }
+    });
+
+    PADDLE_ENFORCE_GE(
+        dev_id, 0,
+        platform::errors::OutOfRange(
+            "Device id must be not less than 0, but got %d", dev_id));
+    PADDLE_ENFORCE_LT(
+        dev_id, instances_.size(),
+        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d",
+                                     dev_id, instances_.size()));
+    return instances_[dev_id].get();
+  }
+
+  /**
+   * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation
+   * or cudaSuccess would be returned, and the cudaGetLastError() flag
+   * would be cleared.
+   */
+  cudaError_t Malloc(void **ptr, size_t size) {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
+      return cudaErrorMemoryAllocation;
+    }
+
+    CUDADeviceGuard guard(dev_id_);
+    auto result = cudaMalloc(ptr, size);
+    if (result == cudaSuccess) {
+      if (NeedRecord()) {
+        cur_size_ += size;
+      }
+      return cudaSuccess;
+    } else {
+      RaiseNonOutOfMemoryError(&result);
+      // Any non-out-of-memory error would be raised inside
+      // RaiseNonOutOfMemoryError. Therefore, we can
+      // return cudaErrorMemoryAllocation directly here.
+      return cudaErrorMemoryAllocation;
+    }
+  }
+
+  /**
+   * Free gpu memory. Usually, free is not allowed to raise an error.
+   * If it does, the process should crash.
+   */
+  void Free(void *ptr, size_t size) {
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    CUDADeviceGuard guard(dev_id_);
+    auto err = cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          err, platform::errors::External("cudaFree raises unexpected error"));
+      if (NeedRecord()) {
+        std::lock_guard<std::mutex> guard(*mtx_);
+        cur_size_ -= size;
+      }
+    } else {
+      cudaGetLastError();  // clear the error flag when cudaErrorCudartUnloading
+    }
+  }
+
+  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                  size_t *actual_total) {
+    {
+      CUDADeviceGuard guard(dev_id_);
+      auto result = cudaMemGetInfo(actual_avail, actual_total);
+      if (result != cudaSuccess) {
+        *actual_avail = 0;
+      }
+      RaiseNonOutOfMemoryError(&result);
+    }
+
+    if (NeedRecord()) {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      *avail = std::min(*actual_avail, limit_size_ - cur_size_);
+      *total = std::min(*actual_total, limit_size_);
+      return *total < *actual_total;
+    } else {
+      *avail = *actual_avail;
+      *total = *actual_total;
+      return false;
+    }
+  }
+
+  inline bool NeedRecord() const { return limit_size_ != 0; }
+
+  uint64_t RecordedSize() const {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    return NeedRecord() ? cur_size_ : 0;
+  }
+
+  uint64_t LimitSize() const { return limit_size_; }
+
+ private:
+  const int dev_id_;
+  const uint64_t limit_size_;
+  uint64_t cur_size_{0};
+
+  mutable std::unique_ptr<std::mutex> mtx_;
+
+  static std::once_flag once_flag_;
+  static std::vector<std::unique_ptr<RecordedCudaMallocHelper>> instances_;
+};
+
+std::once_flag RecordedCudaMallocHelper::once_flag_;
+std::vector<std::unique_ptr<RecordedCudaMallocHelper>>
+    RecordedCudaMallocHelper::instances_;
+
+cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size);
+}
+
+void RecordedCudaFree(void *p, size_t size, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size);
+}
+
+bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                            size_t *actual_total, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo(
+      avail, total, actual_avail, actual_total);
+}
+
+uint64_t RecordedCudaMallocSize(int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize();
+}
+
+bool IsCudaMallocRecorded(int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord();
+}
+
 }  // namespace platform
 }  // namespace paddle
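For context, a hedged caller-side sketch of how the new free functions are meant to be used; it assumes the matching declarations are exposed from paddle/fluid/platform/gpu_info.h and that CUDA is available:

```cpp
#include "paddle/fluid/platform/gpu_info.h"

// Hypothetical caller, not part of this diff. A cudaErrorMemoryAllocation
// return means either the device is out of memory or the per-device
// gpu_memory_limit_mb budget would be exceeded.
void AllocateScratch(int dev_id, size_t bytes) {
  void *ptr = nullptr;
  auto err = paddle::platform::RecordedCudaMalloc(&ptr, bytes, dev_id);
  if (err != cudaSuccess) {
    return;  // out of memory (device or recorded budget)
  }
  // ... use ptr on device dev_id ...
  paddle::platform::RecordedCudaFree(ptr, bytes, dev_id);
}
```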