@@ -311,8 +311,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
   member_->build_strategy_ = build_strategy;
-  member_->use_all_reduce_ =
-      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
+  member_->use_all_reduce_ = member_->build_strategy_.reduce_ ==
+                             BuildStrategy::ReduceStrategy::kAllReduce;
   member_->nranks_ = build_strategy.num_trainers_ * places.size();
   if (!member_->use_all_reduce_ && member_->nranks_ == 1) {
     LOG(INFO) << "If you set build_strategy.reduce with 'Reduce',"
@@ -348,7 +348,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 
   std::vector<ir::Graph *> graphs;
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     PADDLE_ENFORCE(!member_->use_cuda_,
                    "gpu mode does not support async_mode_ now!");
     graphs.push_back(graph);
@@ -362,17 +362,18 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   // FIXME(Yancey1989): parallel graph mode get better performance
   // in GPU allreduce distributed training. Need an elegant way to
   // choice the execution strategy.
-  build_strategy.enable_parallel_graph_ =
-      EnableParallelGraphExecution(*graph, exec_strategy, build_strategy);
-  if (build_strategy.enable_parallel_graph_) {
+  member_->build_strategy_.enable_parallel_graph_ =
+      EnableParallelGraphExecution(*graph, exec_strategy,
+                                   member_->build_strategy_);
+  if (member_->build_strategy_.enable_parallel_graph_) {
     LOG(INFO) << "The Executor would execute the graph by ParallelGraph "
                  "Execution which can get better performance,"
               << " you can force it off by env FLAGS_enable_parallel_graph=0";
   }
 
   if (member_->use_cuda_ && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    member_->InitOrGetNCCLCommunicator(scope, build_strategy);
+    member_->InitOrGetNCCLCommunicator(scope, member_->build_strategy_);
 
     // Initialize device context's nccl comm, will be used by normal
     // Operators like sync_batch_norm, and collective ops.
@@ -395,7 +396,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
   // broadcast parameters from the 0th device to others:
   auto need_broadcast = [&]() -> bool {
-    if (build_strategy.num_trainers_ > 1) {
+    if (member_->build_strategy_.num_trainers_ > 1) {
       // 1. num_tariners would be grater than 1 for nccl distributed training.
       return true;
     } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
@@ -407,7 +408,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   };
   // Bcast Parameters to all GPUs
   if (need_broadcast()) {
-    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
+    BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_);
   }
 
   // Startup Program has been run. All local scopes has correct parameters.
@@ -416,39 +417,40 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   // ncclOp
   std::vector<ir::Graph *> async_graphs(places.size());
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
-    graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
-                                 {member_->local_scopes_[0]}, 1,
-                                 member_->use_cuda_, member_->nccl_ctxs_);
+    graph = member_->build_strategy_.Apply(
+        graph, {member_->places_[0]}, loss_var_name,
+        {member_->local_scopes_[0]}, 1, member_->use_cuda_,
+        member_->nccl_ctxs_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      graphs[i] =
-          build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
-                               {member_->local_scopes_[i]}, 1,
-                               member_->use_cuda_, member_->nccl_ctxs_);
+      graphs[i] = member_->build_strategy_.Apply(
+          graphs[i], {member_->places_[i]}, loss_var_name,
+          {member_->local_scopes_[i]}, 1, member_->use_cuda_,
+          member_->nccl_ctxs_);
       async_graphs[i] = graphs[i];
     }
   } else {
-    graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
-                                 member_->local_scopes_, member_->nranks_,
-                                 member_->use_cuda_, member_->nccl_ctxs_);
+    graph = member_->build_strategy_.Apply(
+        graph, member_->places_, loss_var_name, member_->local_scopes_,
+        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_);
   }
 #else
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
-    graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
-                                 {member_->local_scopes_[0]}, 1,
-                                 member_->use_cuda_);
+    graph = member_->build_strategy_.Apply(
+        graph, {member_->places_[0]}, loss_var_name,
+        {member_->local_scopes_[0]}, 1, member_->use_cuda_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      graphs[i] = build_strategy.Apply(
+      graphs[i] = member_->build_strategy_.Apply(
           graphs[i], {member_->places_[i]}, loss_var_name,
           {member_->local_scopes_[i]}, 1, member_->use_cuda_);
       async_graphs[i] = graphs[i];
     }
   } else {
-    graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
-                                 member_->local_scopes_, member_->nranks_,
-                                 member_->use_cuda_);
+    graph = member_->build_strategy_.Apply(
+        graph, member_->places_, loss_var_name, member_->local_scopes_,
+        member_->nranks_, member_->use_cuda_);
   }
 #endif
 
@@ -489,11 +491,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     }
   }
 
-  if (build_strategy.async_mode_) {
+  if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use AsyncSSAGraphExecutor";
     member_->executor_.reset(new details::AsyncSSAGraphExecutor(
         exec_strategy, member_->local_scopes_, member_->places_, async_graphs));
-  } else if (build_strategy.enable_parallel_graph_) {
+  } else if (member_->build_strategy_.enable_parallel_graph_) {
     VLOG(3) << "use ParallelSSAGraphExecutor";
 #ifdef PADDLE_WITH_CUDA
     // TODO(Yancey1989): Remove passing in the main_program when
@@ -517,7 +519,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 
   VLOG(3) << "use ScopeBufferedSSAGraphExecutor";
-  if (!build_strategy.async_mode_) {
+  if (!member_->build_strategy_.async_mode_) {
     member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
         exec_strategy, member_->local_scopes_, std::move(var_infos),
         member_->places_, std::move(member_->executor_)));
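
The pattern this diff applies throughout the ParallelExecutor constructor is to cache the caller-supplied BuildStrategy in the executor's private state (member_->build_strategy_) and then read and mutate only that cached copy, so later adjustments such as enable_parallel_graph_ persist with the executor rather than on a local parameter. Below is a minimal standalone C++ sketch of that idea; the Executor/ExecutorPrivate types and the main() driver are simplified stand-ins for illustration, not the actual PaddlePaddle classes.

// Sketch: cache a strategy struct in the executor's private state and
// consult only the cached copy afterwards, so mutations made during
// construction remain visible to later decisions.
#include <iostream>
#include <memory>

struct BuildStrategy {
  bool async_mode_ = false;
  bool enable_parallel_graph_ = false;
  int num_trainers_ = 1;
};

struct ExecutorPrivate {
  BuildStrategy build_strategy_;  // cached copy owned by the executor
};

class Executor {
 public:
  explicit Executor(const BuildStrategy &build_strategy)
      : member_(new ExecutorPrivate) {
    member_->build_strategy_ = build_strategy;  // cache once
    // Later logic must consult the member, not the constructor parameter,
    // so updates like this one stay attached to the executor state.
    member_->build_strategy_.enable_parallel_graph_ =
        !member_->build_strategy_.async_mode_;
  }

  bool UsesParallelGraph() const {
    return member_->build_strategy_.enable_parallel_graph_;
  }

 private:
  std::unique_ptr<ExecutorPrivate> member_;
};

int main() {
  BuildStrategy strategy;
  strategy.async_mode_ = false;
  Executor exec(strategy);
  std::cout << std::boolalpha << exec.UsesParallelGraph() << "\n";  // true
  return 0;
}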