Skip to content

Commit 046bb5c

Browse files
velconia and typhoonzero
authored and committed
Fix NCCLBcast hang up bug in Parallel Executor (#11377)
* 1. Create buddy allocator in each place before NcclBcast the variables
  2. Check the memory usage of ALL gpus rather than the first one
* 1. Make NCCLGroupGuard guard only the ncclBcast part, which avoids ncclGroupEnd blocking the exception throwing
  2. NOTE the usage of NCCLGroupGuard
* Remove the memory usage check of gpus
* Fix code style
1 parent cbaa24f commit 046bb5c

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

paddle/fluid/framework/parallel_executor.cc

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
145145
auto &dims = main_tensor.dims();
146146
if (paddle::platform::is_gpu_place(main_tensor.place())) {
147147
#ifdef PADDLE_WITH_CUDA
148+
std::vector<void *> buffers;
148149
size_t numel = main_tensor.numel();
149150
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
150-
platform::NCCLGroupGuard guard;
151151
for (size_t i = 0; i < member_->places_.size(); ++i) {
152152
auto place = member_->places_[i];
153153
void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
159159
t->Resize(dims);
160160
buffer = t->mutable_data(place, main_tensor.type());
161161
}
162-
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
163-
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
164-
nccl_ctx.comm_, nccl_ctx.stream());
162+
buffers.push_back(buffer);
165163
}
166-
member_->nccl_ctxs_->WaitAll();
164+
165+
PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
166+
"variables' buffer size to bcast NOT equal to places");
167+
{
168+
platform::NCCLGroupGuard guard;
169+
for (size_t i = 0; i < member_->places_.size(); ++i) {
170+
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
171+
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
172+
nccl_ctx.comm_, nccl_ctx.stream());
173+
}
174+
member_->nccl_ctxs_->WaitAll();
175+
}
176+
167177
#else
168178
PADDLE_THROW("Not compiled with CUDA");
169179
#endif

paddle/fluid/platform/nccl_helper.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
4141
}
4242
}
4343

44+
// NOTE(minqiyang): according to the ncclGroupEnd documentations:
45+
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
46+
// ncclGroupEnd will wait for all communicators to be initialized, which will
47+
// cause blocking problem when a runtime_error was thrown, so try only guard
48+
// NCCL actions when use it.
4449
class NCCLGroupGuard {
4550
public:
4651
static std::mutex &NCCLMutex() {

0 commit comments

Comments (0)