
Commit b09ba8a

Author: chengduo
Cherry pick Fix Bug-prone code of PE (#18355)
* update pe reduce config test=release/1.5
* drop the local_exe_scopes of the previous parallel_executor test=release/1.5
1 parent: 401c03f, commit: b09ba8a

File tree: 2 files changed (+35 / -31 lines)


paddle/fluid/framework/parallel_executor.cc

Lines changed: 33 additions & 31 deletions
@@ -311,8 +311,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
   member_->build_strategy_ = build_strategy;
-  member_->use_all_reduce_ =
-      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
+  member_->use_all_reduce_ = member_->build_strategy_.reduce_ ==
+                             BuildStrategy::ReduceStrategy::kAllReduce;
   member_->nranks_ = build_strategy.num_trainers_ * places.size();
   if (!member_->use_all_reduce_ && member_->nranks_ == 1) {
     LOG(INFO) << "If you set build_strategy.reduce with 'Reduce',"
@@ -348,7 +348,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 
   std::vector<ir::Graph *> graphs;
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     PADDLE_ENFORCE(!member_->use_cuda_,
                    "gpu mode does not support async_mode_ now!");
     graphs.push_back(graph);
@@ -362,17 +362,18 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   // FIXME(Yancey1989): parallel graph mode get better performance
   // in GPU allreduce distributed training. Need an elegant way to
   // choice the execution strategy.
-  build_strategy.enable_parallel_graph_ =
-      EnableParallelGraphExecution(*graph, exec_strategy, build_strategy);
-  if (build_strategy.enable_parallel_graph_) {
+  member_->build_strategy_.enable_parallel_graph_ =
+      EnableParallelGraphExecution(*graph, exec_strategy,
+                                   member_->build_strategy_);
+  if (member_->build_strategy_.enable_parallel_graph_) {
     LOG(INFO) << "The Executor would execute the graph by ParallelGraph "
                  "Execution which can get better performance,"
              << "you can force it off by env FLAGS_enable_parallel_graph=0";
   }
 
   if (member_->use_cuda_ && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    member_->InitOrGetNCCLCommunicator(scope, build_strategy);
+    member_->InitOrGetNCCLCommunicator(scope, member_->build_strategy_);
 
     // Initialize device context's nccl comm, will be used by normal
     // Operators like sync_batch_norm, and collective ops.
@@ -395,7 +396,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
   // broadcast parameters from the 0th device to others:
   auto need_broadcast = [&]() -> bool {
-    if (build_strategy.num_trainers_ > 1) {
+    if (member_->build_strategy_.num_trainers_ > 1) {
       // 1. num_tariners would be grater than 1 for nccl distributed training.
       return true;
     } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
@@ -407,7 +408,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   };
   // Bcast Parameters to all GPUs
   if (need_broadcast()) {
-    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
+    BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_);
   }
 
   // Startup Program has been run. All local scopes has correct parameters.
@@ -416,39 +417,40 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   // ncclOp
   std::vector<ir::Graph *> async_graphs(places.size());
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
-    graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
-                                 {member_->local_scopes_[0]}, 1,
-                                 member_->use_cuda_, member_->nccl_ctxs_);
+    graph = member_->build_strategy_.Apply(
+        graph, {member_->places_[0]}, loss_var_name,
+        {member_->local_scopes_[0]}, 1, member_->use_cuda_,
+        member_->nccl_ctxs_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      graphs[i] =
-          build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
-                               {member_->local_scopes_[i]}, 1,
-                               member_->use_cuda_, member_->nccl_ctxs_);
+      graphs[i] = member_->build_strategy_.Apply(
+          graphs[i], {member_->places_[i]}, loss_var_name,
+          {member_->local_scopes_[i]}, 1, member_->use_cuda_,
+          member_->nccl_ctxs_);
       async_graphs[i] = graphs[i];
     }
   } else {
-    graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
-                                 member_->local_scopes_, member_->nranks_,
-                                 member_->use_cuda_, member_->nccl_ctxs_);
+    graph = member_->build_strategy_.Apply(
+        graph, member_->places_, loss_var_name, member_->local_scopes_,
+        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_);
   }
 #else
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
-    graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
-                                 {member_->local_scopes_[0]}, 1,
-                                 member_->use_cuda_);
+    graph = member_->build_strategy_.Apply(
+        graph, {member_->places_[0]}, loss_var_name,
+        {member_->local_scopes_[0]}, 1, member_->use_cuda_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      graphs[i] = build_strategy.Apply(
+      graphs[i] = member_->build_strategy_.Apply(
          graphs[i], {member_->places_[i]}, loss_var_name,
          {member_->local_scopes_[i]}, 1, member_->use_cuda_);
       async_graphs[i] = graphs[i];
     }
   } else {
-    graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
-                                 member_->local_scopes_, member_->nranks_,
-                                 member_->use_cuda_);
+    graph = member_->build_strategy_.Apply(
+        graph, member_->places_, loss_var_name, member_->local_scopes_,
+        member_->nranks_, member_->use_cuda_);
   }
 #endif
 
@@ -489,11 +491,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     }
   }
 
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use AsyncSSAGraphExecutor";
     member_->executor_.reset(new details::AsyncSSAGraphExecutor(
         exec_strategy, member_->local_scopes_, member_->places_, async_graphs));
-  } else if (build_strategy.enable_parallel_graph_) {
+  } else if (member_->build_strategy_.enable_parallel_graph_) {
     VLOG(3) << "use ParallelSSAGraphExecutor";
 #ifdef PADDLE_WITH_CUDA
     // TODO(Yancey1989): Remove passing in the main_program when
@@ -517,7 +519,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 
   VLOG(3) << "use ScopeBufferedSSAGraphExecutor";
-  if (!build_strategy.async_mode_) {
+  if (!member_->build_strategy_.async_mode_) {
     member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
         exec_strategy, member_->local_scopes_, std::move(var_infos),
         member_->places_, std::move(member_->executor_)));
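Why the patch touches so many call sites: the constructor copies the incoming build_strategy into member_->build_strategy_ early on, but the old code kept reading and even mutating the constructor argument (e.g. build_strategy.enable_parallel_graph_), so the two copies could silently disagree. The patch routes every later access through the single member copy. Below is a minimal C++ sketch of the bug-prone pattern, using hypothetical Config/Executor stand-ins rather than the actual Paddle classes:

#include <iostream>

// Hypothetical stand-ins for BuildStrategy / ParallelExecutor; not Paddle code.
struct Config {
  bool enable_parallel_graph = false;
};

class Executor {
 public:
  explicit Executor(Config config) : config_(config) {  // member copy, like member_->build_strategy_
    // Bug-prone pattern (pre-patch): later decisions are written to the
    // constructor argument...
    config.enable_parallel_graph = true;
    // ...while other code paths consult the member copy, which is now stale.
    std::cout << "argument:    " << config.enable_parallel_graph << "\n";   // prints 1
    std::cout << "member copy: " << config_.enable_parallel_graph << "\n";  // prints 0
  }

 private:
  Config config_;
};

int main() {
  Executor e{Config{}};  // the two copies already disagree inside the ctor
  return 0;
}

Keeping a single source of truth (member_->build_strategy_) removes this kind of divergence, which is exactly what the diff above does mechanically.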

python/paddle/fluid/compiler.py

Lines changed: 2 additions & 0 deletions
@@ -274,6 +274,8 @@ def _compile_data_parallel(self, use_cuda=False, scope=None):
                     "share_vars_from is not compiled and run, so there is no "
                     "var to share.")
             self._local_scopes = self._share_vars_from._executor.local_scopes()
+            # drop the local_exe_scopes of the previous parallel_executor
+            self._share_vars_from._executor.drop_local_exe_scopes()
         else:
             assert scope is not None, ""
             self._local_scopes = []
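For context, _executor here is the underlying core ParallelExecutor, and drop_local_exe_scopes() asks the executor whose variables are being shared to release its local execution scopes before the new program reuses them. A rough usage sketch of the share_vars_from path this change guards, assuming the PaddlePaddle 1.5 fluid API; the toy network, shapes, and CPU place are illustrative only:

import numpy as np
import paddle.fluid as fluid

# Build a tiny train program and clone a test program from it.
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
    test_program = train_program.clone(for_test=True)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_program)

compiled_train = fluid.CompiledProgram(train_program).with_data_parallel(
    loss_name=loss.name)
# When compiled_test is first run, _compile_data_parallel() sees share_vars_from
# and (with this patch) also drops the local execution scopes of the training
# executor it borrows variables from.
compiled_test = fluid.CompiledProgram(test_program).with_data_parallel(
    share_vars_from=compiled_train)

feed = {'x': np.random.rand(2, 4).astype('float32')}
exe.run(compiled_train, feed=feed, fetch_list=[loss.name])  # must run first
exe.run(compiled_test, feed=feed, fetch_list=[loss.name])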
