
Commit 26d4513

Cherry-pick gpu memory limit (#22838)
* add recorded cuda memory apis, fix typo, test=develop
* add more ut, test=develop
* follow comments, test=release/1.7
* fix py35 incompatible issues, test=release/1.7
1 parent a1c0b24 commit 26d4513

11 files changed: +440 / -50 lines

paddle/fluid/memory/allocation/cuda_allocator.cc

Lines changed: 22 additions & 13 deletions
@@ -25,39 +25,48 @@ namespace memory {
 namespace allocation {
 bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
 void CUDAAllocator::FreeImpl(Allocation* allocation) {
-  platform::CUDADeviceGuard guard(place_.device);
-  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()),
-                    place_);
-  PADDLE_ENFORCE(cudaFree(allocation->ptr()));
+  PADDLE_ENFORCE_EQ(
+      boost::get<platform::CUDAPlace>(allocation->place()), place_,
+      platform::errors::PermissionDenied(
+          "GPU memory is freed in incorrect device. This may be a bug"));
+  platform::RecordedCudaFree(allocation->ptr(), allocation->size(),
+                             place_.device);
   delete allocation;
 }
 
 Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); });
 
-  platform::CUDADeviceGuard guard(place_.device);
   void* ptr;
-  auto result = cudaMalloc(&ptr, size);
+  auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device);
   if (LIKELY(result == cudaSuccess)) {
     return new Allocation(ptr, size, platform::Place(place_));
   }
 
-  platform::RaiseNonOutOfMemoryError(&result);
+  size_t avail, total, actual_avail, actual_total;
+  bool is_limited = platform::RecordedCudaMemGetInfo(
+      &avail, &total, &actual_avail, &actual_total, place_.device);
 
-  size_t avail = 0, total = 0;
-  result = cudaMemGetInfo(&avail, &total);
-  if (result != cudaSuccess) avail = 0;
-  platform::RaiseNonOutOfMemoryError(&result);
+  std::string err_msg;
+  if (is_limited) {
+    auto limit_size = (total >> 20);
+    err_msg = string::Sprintf(
+        "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
+        "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
+        "GPU memory usage is limited to %d MB.\n"
+        "   The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+        limit_size, limit_size);
+  }
 
   PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
       "\n\nOut of memory error on GPU %d. "
       "Cannot allocate %s memory on GPU %d, "
       "available memory is only %s.\n\n"
      "Please check whether there is any other process using GPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
-      "2. If no, please decrease the batch size of your model.\n",
+      "2. If no, please decrease the batch size of your model. %s\n\n",
      place_.device, string::HumanReadableSize(size), place_.device,
-      string::HumanReadableSize(avail), place_.device));
+      string::HumanReadableSize(avail), place_.device, err_msg));
 }
 
 }  // namespace allocation
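Note: this allocator and GPUAllocator in system_allocator.cc below now share one failure pattern: try platform::RecordedCudaMalloc first, and only when it fails ask platform::RecordedCudaMemGetInfo whether a flag-imposed limit is in effect, so the thrown message can point at FLAGS_gpu_memory_limit_mb. Below is a condensed sketch of that pattern for reading convenience only; AllocOrThrow is a hypothetical name, and the snippet assumes the same headers and namespaces as cuda_allocator.cc.

// Illustrative sketch, not code from this commit.
void* AllocOrThrow(size_t size, int dev_id) {
  void* ptr = nullptr;
  if (platform::RecordedCudaMalloc(&ptr, size, dev_id) == cudaSuccess) {
    return ptr;
  }

  // On failure, report numbers that already account for the configured limit.
  size_t avail, total, actual_avail, actual_total;
  bool is_limited = platform::RecordedCudaMemGetInfo(
      &avail, &total, &actual_avail, &actual_total, dev_id);

  std::string hint;
  if (is_limited) {
    // `total` is clamped to FLAGS_gpu_memory_limit_mb, so the hint points at
    // the flag rather than at the physical card size.
    hint = string::Sprintf("FLAGS_gpu_memory_limit_mb is %d MB", total >> 20);
  }
  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "Out of memory on GPU %d, available memory is only %s. %s", dev_id,
      string::HumanReadableSize(avail), hint));
}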

paddle/fluid/memory/detail/system_allocator.cc

Lines changed: 18 additions & 28 deletions
@@ -110,29 +110,28 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // if size is 0. We just make sure it does.
   if (size <= 0) return nullptr;
 
-  paddle::platform::CUDADeviceGuard guard(gpu_id_);
-
   void* p;
-  cudaError_t result = cudaMalloc(&p, size);
+  auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_);
 
   if (result == cudaSuccess) {
     *index = 0;
     gpu_alloc_size_ += size;
     return p;
   } else {
-    platform::RaiseNonOutOfMemoryError(&result);
-
-    /**
-     * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
-     * if there is very little GPU memory left. In this case, we
-     * should consider the available GPU memory to be 0, and throw
-     * exception inside this function instead of throwing exception
-     * inside cudaMemGetInfo.
-     */
-    size_t avail = 0, total = 0;
-    result = cudaMemGetInfo(&avail, &total);
-    if (result != cudaSuccess) avail = 0;
-    platform::RaiseNonOutOfMemoryError(&result);
+    size_t avail, total, actual_avail, actual_total;
+    bool is_limited = platform::RecordedCudaMemGetInfo(
+        &avail, &total, &actual_avail, &actual_total, gpu_id_);
+
+    std::string err_msg;
+    if (is_limited) {
+      auto limit_size = (total >> 20);
+      err_msg = string::Sprintf(
+          "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
+          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
+          "maximum GPU memory usage is limited to %d MB.\n"
+          "   The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+          limit_size, limit_size);
+    }
 
     PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
         "\n\nOut of memory error on GPU %d. "
@@ -145,28 +144,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
        " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
        "please set it to a higher value but less than 1.0.\n"
        "   The command is "
-        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.\n\n",
+        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
        gpu_id_, string::HumanReadableSize(size), gpu_id_,
        string::HumanReadableSize(avail), gpu_id_,
-        FLAGS_fraction_of_gpu_memory_to_use));
+        FLAGS_fraction_of_gpu_memory_to_use, err_msg));
   }
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
-  cudaError_t err;
   PADDLE_ENFORCE_EQ(index, 0);
   PADDLE_ENFORCE_GE(gpu_alloc_size_, size);
   gpu_alloc_size_ -= size;
-  err = cudaFree(p);
 
-  // Purposefully allow cudaErrorCudartUnloading, because
-  // that is returned if you ever call cudaFree after the
-  // driver has already shutdown. This happens only if the
-  // process is terminating, in which case we don't care if
-  // cudaFree succeeds.
-  if (err != cudaErrorCudartUnloading) {
-    PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free.");
-  }
+  platform::RecordedCudaFree(p, size, gpu_id_);
 }
 
 bool GPUAllocator::UseGpu() const { return true; }

paddle/fluid/platform/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -117,6 +117,8 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
 
+nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+
 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
 
 if(NOT APPLE AND NOT WIN32)

paddle/fluid/platform/flags.cc

Lines changed: 8 additions & 0 deletions
@@ -449,6 +449,14 @@ DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
               "size specified by this flag. Else Paddle will reallocate by "
               "FLAGS_fraction_of_gpu_memory_to_use");
 
+DEFINE_uint64(gpu_memory_limit_mb, 0UL,
+              "The maximum gpu memory limit that the process can allocate. "
+              "If it is equal to 0, there would be no limit and all gpu memory "
+              "would be available to the process. If it is larger than 0, "
+              "the process would raise out of memory error if the allocated "
+              "memory exceeds the limit even though there is available "
+              "memory on the gpu card. The unit is MB and default value is 0.");
+
 #endif
 
 /**
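FLAGS_gpu_memory_limit_mb is an ordinary gflags variable, so it can be set either through the environment variable the new error messages mention (`export FLAGS_gpu_memory_limit_mb=500`) or from C++ code that declares it, as gpu_info.cc does. One caveat that follows from the implementation in gpu_info.cc: the limit is read once, when RecordedCudaMallocHelper lazily creates its per-device instances, so it must be set before the first recorded allocation. A minimal sketch; the helper function name below is illustrative.

#include "gflags/gflags.h"

// Mirrors the DECLARE_uint64 used in paddle/fluid/platform/gpu_info.cc.
DECLARE_uint64(gpu_memory_limit_mb);

// Illustrative helper: cap this process at ~500 MB of GPU memory per device.
// Must run before the first RecordedCudaMalloc/RecordedCudaFree call.
void CapGpuMemoryAt500Mb() {
  FLAGS_gpu_memory_limit_mb = 500;  // 0 (the default) means no limit
}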

paddle/fluid/platform/gpu_info.cc

Lines changed: 162 additions & 6 deletions
@@ -15,17 +15,22 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #include <algorithm>
 #include <cstdlib>
+#include <memory>
 #include <string>
 
 #include "gflags/gflags.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+#include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/string/split.h"
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_string(selected_gpus);
+DECLARE_uint64(gpu_memory_limit_mb);
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
@@ -241,11 +246,9 @@ void SetDeviceId(int id) {
 }
 
 void GpuMemoryUsage(size_t *available, size_t *total) {
-  auto error_code = cudaMemGetInfo(available, total);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemGetInfo failed in "
-                 "paddle::platform::GetMemoryUsage, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
+  size_t actual_available, actual_total;
+  RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total,
+                         platform::GetCurrentDeviceId());
 }
 
 size_t GpuAvailableMemToAlloc() {
@@ -359,7 +362,7 @@ void GpuStreamSync(cudaStream_t stream) {
                     error_code, CudaErrorWebsite()));
 }
 
-void RaiseNonOutOfMemoryError(cudaError_t *status) {
+static void RaiseNonOutOfMemoryError(cudaError_t *status) {
   if (*status == cudaErrorMemoryAllocation) {
     *status = cudaSuccess;
   }
@@ -374,5 +377,158 @@ void RaiseNonOutOfMemoryError(cudaError_t *status) {
   PADDLE_ENFORCE_CUDA_SUCCESS(*status);
 }
 
+class RecordedCudaMallocHelper {
+ private:
+  explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0)
+      : dev_id_(dev_id), limit_size_(limit_size) {
+    if (NeedRecord()) {
+      mtx_.reset(new std::mutex());
+    }
+  }
+
+  DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper);
+
+ public:
+  static RecordedCudaMallocHelper *Instance(int dev_id) {
+    std::call_once(once_flag_, [] {
+      int dev_cnt = GetCUDADeviceCount();
+      instances_.reserve(dev_cnt);
+      for (int i = 0; i < dev_cnt; ++i) {
+        instances_.emplace_back(
+            new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
+      }
+    });
+
+    PADDLE_ENFORCE_GE(
+        dev_id, 0,
+        platform::errors::OutOfRange(
+            "Device id must be not less than 0, but got %d", dev_id));
+    PADDLE_ENFORCE_LT(
+        dev_id, instances_.size(),
+        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d",
+                                     dev_id, instances_.size()));
+    return instances_[dev_id].get();
+  }
+
+  /**
+   * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation
+   * or cudaSuccess would be returned, and the cudaGetLastError() flag
+   * would be clear.
+   */
+  cudaError_t Malloc(void **ptr, size_t size) {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
+      return cudaErrorMemoryAllocation;
+    }
+
+    CUDADeviceGuard guard(dev_id_);
+    auto result = cudaMalloc(ptr, size);
+    if (result == cudaSuccess) {
+      if (NeedRecord()) {
+        cur_size_ += size;
+      }
+      return cudaSuccess;
+    } else {
+      RaiseNonOutOfMemoryError(&result);
+      // Non out of memory error would be raised inside
+      // RaiseNonOutOfMemoryError. Therefore, we can
+      // return cudaErrorMemoryAllocation directly here.
+      return cudaErrorMemoryAllocation;
+    }
+  }
+
+  /**
+   * Free gpu memory. Usually, free is not allowed to raise error.
+   * If it does raise error, the process should be crashed.
+   */
+  void Free(void *ptr, size_t size) {
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    CUDADeviceGuard guard(dev_id_);
+    auto err = cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          err, platform::errors::External("cudaFree raises unexpected error"));
+      if (NeedRecord()) {
+        std::lock_guard<std::mutex> guard(*mtx_);
+        cur_size_ -= size;
+      }
+    } else {
+      cudaGetLastError();  // clear the error flag when cudaErrorCudartUnloading
+    }
+  }
+
+  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                  size_t *actual_total) {
+    {
+      CUDADeviceGuard guard(dev_id_);
+      auto result = cudaMemGetInfo(actual_avail, actual_total);
+      if (result != cudaSuccess) {
+        *actual_avail = 0;
+      }
+      RaiseNonOutOfMemoryError(&result);
+    }
+
+    if (NeedRecord()) {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      *avail = std::min(*actual_avail, limit_size_ - cur_size_);
+      *total = std::min(*actual_total, limit_size_);
+      return *total < *actual_total;
+    } else {
+      *avail = *actual_avail;
+      *total = *actual_total;
+      return false;
+    }
+  }
+
+  inline bool NeedRecord() const { return limit_size_ != 0; }
+
+  uint64_t RecordedSize() const {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    return NeedRecord() ? cur_size_ : 0;
+  }
+
+  uint64_t LimitSize() const { return limit_size_; }
+
+ private:
+  const int dev_id_;
+  const uint64_t limit_size_;
+  uint64_t cur_size_{0};
+
+  mutable std::unique_ptr<std::mutex> mtx_;
+
+  static std::once_flag once_flag_;
+  static std::vector<std::unique_ptr<RecordedCudaMallocHelper>> instances_;
+};
+
+std::once_flag RecordedCudaMallocHelper::once_flag_;
+std::vector<std::unique_ptr<RecordedCudaMallocHelper>>
+    RecordedCudaMallocHelper::instances_;
+
+cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size);
+}
+
+void RecordedCudaFree(void *p, size_t size, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size);
+}
+
+bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                            size_t *actual_total, int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo(
+      avail, total, actual_avail, actual_total);
+}
+
+uint64_t RecordedCudaMallocSize(int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize();
+}
+
+bool IsCudaMallocRecorded(int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord();
+}
+
 }  // namespace platform
 }  // namespace paddle
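RecordedCudaMallocHelper keeps one lazily created singleton per device and only takes its mutex when a limit is active. Below is a hedged illustration of the resulting semantics; the sizes, device id, and expected outcomes are made up for the example and are not taken from the commit's unit test.

// Illustrative only. Assumes FLAGS_gpu_memory_limit_mb was set to 100 before
// the first recorded call, so each device is capped at 100 MB.
void RecordedMallocExample() {
  void* p = nullptr;
  cudaError_t err = platform::RecordedCudaMalloc(&p, 50 << 20, /*dev_id=*/0);
  // err == cudaSuccess; RecordedCudaMallocSize(0) == (50 << 20) bytes.

  void* q = nullptr;
  err = platform::RecordedCudaMalloc(&q, 80 << 20, /*dev_id=*/0);
  // 50 MB + 80 MB exceeds the 100 MB limit, so the helper rejects the request
  // without calling cudaMalloc: err == cudaErrorMemoryAllocation.

  platform::RecordedCudaFree(p, 50 << 20, /*dev_id=*/0);
  // The recorded size drops back to 0.
}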

paddle/fluid/platform/gpu_info.h

Lines changed: 14 additions & 2 deletions
@@ -104,8 +104,20 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 //! Blocks until stream has completed all operations.
 void GpuStreamSync(cudaStream_t stream);
 
-//! Raise error if status is not cudaSuccess or OOM, otherwise reset status.
-void RaiseNonOutOfMemoryError(cudaError_t *status);
+//! CudaMalloc with recorded info
+cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
+
+//! CudaFree with recorded info
+void RecordedCudaFree(void *p, size_t size, int dev_id);
+
+//! Get available and total gpu memory with considering limitation
+bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                            size_t *actual_total, int dev_id);
+
+//! Get recorded cudaMalloc size. If record is disabled, return 0.
+uint64_t RecordedCudaMallocSize(int dev_id);
+
+bool IsCudaMallocRecorded(int dev_id);
 
 }  // namespace platform
 }  // namespace paddle
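The bool returned by RecordedCudaMemGetInfo tells callers whether the reported numbers were clamped by FLAGS_gpu_memory_limit_mb rather than by the physical card; the two allocators use it to decide whether to mention the flag in their OOM messages. A short hedged sketch of reading it from a caller's side; the function and variable names are local to this sketch.

// Illustrative caller of the header API above.
void PrintGpuMemInfo(int dev_id) {
  size_t avail, total, actual_avail, actual_total;
  bool limited = platform::RecordedCudaMemGetInfo(
      &avail, &total, &actual_avail, &actual_total, dev_id);
  // actual_avail/actual_total come straight from cudaMemGetInfo. When a limit
  // is set, avail/total are clamped to it (minus what is already recorded),
  // and `limited` is true only if the clamp reduced `total` below the card's
  // actual capacity.
  VLOG(10) << "avail=" << avail << " total=" << total
           << " limited=" << limited;
}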
