Skip to content

Commit 046bb5c

Browse files
velconia and typhoonzero
authored and committed
Fix NCCLBcast hang up bug in Parallel Executor (#11377)
* 1. Create buddy allocator in each place before NcclBcast the variables
  2. Check the memory usage of ALL gpus rather than the first one
* 1. Make NCCLGroupGuard guard only the ncclBcast part, which avoids ncclGroupEnd blocking the exception throwing
  2. NOTE the usage of NCCLGroupGuard
* Remove the memory usage check of gpus
* Fix code style
1 parent cbaa24f commit 046bb5c

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

paddle/fluid/framework/parallel_executor.cc

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
145145
auto &dims = main_tensor.dims();
146146
if (paddle::platform::is_gpu_place(main_tensor.place())) {
147147
#ifdef PADDLE_WITH_CUDA
148+
std::vector<void *> buffers;
148149
size_t numel = main_tensor.numel();
149150
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
150-
platform::NCCLGroupGuard guard;
151151
for (size_t i = 0; i < member_->places_.size(); ++i) {
152152
auto place = member_->places_[i];
153153
void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
159159
t->Resize(dims);
160160
buffer = t->mutable_data(place, main_tensor.type());
161161
}
162-
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
163-
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
164-
nccl_ctx.comm_, nccl_ctx.stream());
162+
buffers.push_back(buffer);
165163
}
166-
member_->nccl_ctxs_->WaitAll();
164+
165+
PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
166+
"variables' buffer size to bcast NOT equal to places");
167+
{
168+
platform::NCCLGroupGuard guard;
169+
for (size_t i = 0; i < member_->places_.size(); ++i) {
170+
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
171+
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
172+
nccl_ctx.comm_, nccl_ctx.stream());
173+
}
174+
member_->nccl_ctxs_->WaitAll();
175+
}
176+
167177
#else
168178
PADDLE_THROW("Not compiled with CUDA");
169179
#endif

paddle/fluid/platform/nccl_helper.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
4141
}
4242
}
4343

44+
// NOTE(minqiyang): according to the ncclGroupEnd documentations:
45+
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
46+
// ncclGroupEnd will wait for all communicators to be initialized, which will
47+
// cause blocking problem when a runtime_error was thrown, so try only guard
48+
// NCCL actions when use it.
4449
class NCCLGroupGuard {
4550
public:
4651
static std::mutex &NCCLMutex() {

0 commit comments

Comments (0)