@@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector<std::shared_ptr<imperative::VarBase>> &vars,
       is_sparse_gradient_(is_sparse_gradient),
       parallel_ctx_(parallel_ctx),
       group_size_limits_(group_size_limits),
-      find_unused_vars_(find_unused_vars) {
+      find_unused_vars_each_step_(find_unused_vars) {
   VLOG(3) << "Start construct the Reducer ...";
   nrings_ = parallel_ctx->GetNRings();
   nranks_ = parallel_ctx->GetNRanks();
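The constructor hunk only renames the member initialized from the user's find_unused_parameters flag; later hunks also read a `find_unused_vars_once_` member whose declaration (presumably in reducer.h) is outside this excerpt. A minimal, assumed sketch of how the two flags relate — a stand-in class, not Paddle's real Reducer:

```cpp
// Sketch only: a stripped-down stand-in for the Reducer, assuming the header
// declares both flags (the real reducer.h hunk is not part of this excerpt).
#include <iostream>

class ReducerSketch {
 public:
  explicit ReducerSketch(bool find_unused_vars)
      : find_unused_vars_each_step_(find_unused_vars) {}

  bool EachStep() const { return find_unused_vars_each_step_; }
  bool Once() const { return find_unused_vars_once_; }

 private:
  // User-facing switch (find_unused_parameters): traverse the backward graph
  // in every step.
  bool find_unused_vars_each_step_;
  // Internal switch: traverse once in the first step regardless of the user
  // setting, then turn itself off (see PrepareForBackward below).
  bool find_unused_vars_once_{true};
};

int main() {
  ReducerSketch reducer(/*find_unused_vars=*/false);
  std::cout << "each_step=" << reducer.EachStep()
            << " once=" << reducer.Once() << "\n";  // each_step=0 once=1
  return 0;
}
```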
@@ -457,42 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set<GradOpNode *> &init_nodes) {
   }
 }
 
-// After each batch is calculated, the counter of each group (group.pending_)
-// and the allreduce sequence counter (next_group_) will be cleaned up again.
-void Reducer::PrepareForBackward(
+void Reducer::TraverseBackwardGraph(
     const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
-  VLOG(3) << "after forward, then reset count for backward.";
-  next_group_ = 0;
-  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
-    group.pending_ = group.variable_indices_.size();
-    group.sparse_contents_ = nullptr;
-  });
-
-  // reinitialize vars_marked_ready_ for next iteration
-  vars_marked_ready_.clear();
-  vars_marked_ready_.resize(vars_.size(), false);
-
-  PADDLE_ENFORCE_EQ(
-      groups_need_finalize_, false,
-      platform::errors::PreconditionNotMet(
-          "A serious error has occurred here. There may be several reasons: "
-          "1) Please note that all forward outputs derived from the module "
-          "parameters must participate in the calculation of losses and "
-          "subsequent gradient calculations. If not, the wrapper will hang, "
-          "waiting for autograd to generate gradients for these parameters. "
-          "You can use detach or stop_gradient to make the unused parameters "
-          "detached from the autograd graph. "
-          "2) Used multiple forwards and one backward. You may be able to wrap "
-          "multiple forwards in a model."));
-
-  // The first var to trigger the unused parameter
-  has_marked_unused_vars_ = false;
-  unused_vars_.clear();
-
-  if (!find_unused_vars_) {
-    return;
-  }
-
   node_deps_.clear();
   std::queue<std::shared_ptr<GradOpNode>> q;
   std::unordered_set<VariableWrapper *> var_visited;
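The graph walk itself is unchanged by this hunk; it is only moved out of PrepareForBackward into its own method so the caller can decide per step whether to run it. A minimal sketch of that extraction, with Paddle's types replaced by placeholders (illustrative only, not the real implementation):

```cpp
// Illustrative sketch only: the backward-graph walk becomes its own method
// that fills unused_vars_, so PrepareForBackward (re-added in the next hunk)
// can decide per step whether to call it.
#include <cstddef>
#include <vector>

struct VarBaseStub {};  // stand-in for imperative::VarBase

class ReducerSketch {
 public:
  void TraverseBackwardGraph(const std::vector<VarBaseStub*>& outputs) {
    // ... BFS from `outputs` over grad-op dependencies; every parameter index
    // the walk never reaches ends up in unused_vars_ ...
    (void)outputs;
  }

  std::vector<std::size_t> unused_vars_;  // consumed later by AddDistHook
};

int main() {
  ReducerSketch reducer;
  reducer.TraverseBackwardGraph({});  // now called on demand instead of inline
  return 0;
}
```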
@@ -554,8 +520,50 @@ void Reducer::PrepareForBackward(
             << "] is not used";
     }
   }
+}
 
-  if (unused_vars_.empty()) {
+// After each batch is calculated, the counter of each group (group.pending_)
+// and the allreduce sequence counter (next_group_) will be cleaned up again.
+void Reducer::PrepareForBackward(
+    const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
+  VLOG(3) << "after forward, then reset count for backward.";
+  next_group_ = 0;
+  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
+    group.pending_ = group.variable_indices_.size();
+    group.sparse_contents_ = nullptr;
+  });
+
+  // reinitialize vars_marked_ready_ for next iteration
+  vars_marked_ready_.clear();
+  vars_marked_ready_.resize(vars_.size(), false);
+
+  PADDLE_ENFORCE_EQ(
+      groups_need_finalize_, false,
+      platform::errors::PreconditionNotMet(
+          "A serious error has occurred here. Please "
+          "set find_unused_parameters=True to traverse the backward graph "
+          "in each step to prepare the reduce in advance. If you have "
+          "set it, there may be several reasons for this error: "
+          "1) Please note that all forward outputs derived from the module "
+          "parameters must participate in the calculation of losses and "
+          "subsequent gradient calculations. If not, the wrapper will hang, "
+          "waiting for autograd to generate gradients for these parameters. "
+          "You can use detach or stop_gradient to make the unused parameters "
+          "detached from the autograd graph. "
+          "2) Used multiple forwards and one backward. You may be able to wrap "
+          "multiple forwards in a model."));
+
+  // The first var to trigger the unused parameter
+  has_marked_unused_vars_ = false;
+
+  if (find_unused_vars_once_ || find_unused_vars_each_step_) {
+    unused_vars_.clear();
+    TraverseBackwardGraph(outputs);
+    // only check once in the first step
+    find_unused_vars_once_ = false;
+  }
+
+  if (find_unused_vars_each_step_ && unused_vars_.empty()) {
     LOG_FIRST_N(WARNING, 1)
         << "All parameters are involved in the backward pass. "
           "It is recommended to set find_unused_parameters to False "
@@ -564,7 +572,9 @@ void Reducer::PrepareForBackward(
           "will occur. Please make it clear that in the subsequent "
           "training, there will be no parameters that are not used "
           "in the backward pass, and then set find_unused_parameters";
-  } else if (unused_vars_.size() == vars_.size()) {
+  }
+
+  if (unused_vars_.size() == vars_.size()) {
     LOG_FIRST_N(WARNING, 1)
         << "There is no parameter in the device involved "
           "in the backward calculation. If there are "
@@ -595,13 +605,13 @@ void Reducer::AddDistHook(size_t var_index) {
 
   local_used_vars_[var_index] = 1;
 
-  // rebuild group when find_unused_vars_ is false
+  // rebuild group when find_unused_vars_each_step_ is false
   if (NeedRebuildGroup()) {
     rebuild_vars_.push_back(vars_[var_index]);
     rebuild_var_indices_.push_back(var_index);
   }
 
-  if (!has_marked_unused_vars_ && find_unused_vars_) {
+  if (!has_marked_unused_vars_) {
     has_marked_unused_vars_ = true;
     for (const auto &unused_index : unused_vars_) {
       MarkVarReady(unused_index, false);
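Presumably the flag check can be dropped here because unused_vars_ now encodes the answer on its own: it is empty when no traversal found anything (and iterating an empty list marks nothing), while in the first-step-only mode the list produced in step one simply keeps being applied. A simplified, standalone model of the hook (illustrative names; the real hook also handles group rebuilding and gradient bookkeeping):

```cpp
// Sketch of the simplified hook: the first gradient hook of a step flushes the
// unused-parameter list, whatever its size happens to be.
#include <cstddef>
#include <cstdio>
#include <vector>

struct ReducerSketch {
  bool has_marked_unused_vars_ = false;
  std::vector<std::size_t> unused_vars_;  // filled (or left empty) by the traversal

  void MarkVarReady(std::size_t index, bool is_used) {
    std::printf("mark var %zu ready (used=%d)\n", index, is_used);
  }

  void AddDistHook(std::size_t var_index) {
    if (!has_marked_unused_vars_) {
      has_marked_unused_vars_ = true;
      for (const auto &unused_index : unused_vars_) {
        MarkVarReady(unused_index, /*is_used_var=*/false);
      }
    }
    MarkVarReady(var_index, /*is_used_var=*/true);
  }
};

int main() {
  ReducerSketch reducer;
  reducer.unused_vars_ = {3};  // pretend the first-step traversal found var 3 unused
  reducer.AddDistHook(0);      // marks 3 (unused), then 0 (used)
  reducer.AddDistHook(1);      // marks only 1
  return 0;
}
```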
@@ -622,7 +632,9 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
   if (vars_marked_ready_[var_index]) {
     auto error_info = string::Sprintf(
         "Error happened, when parameter[%d][%s] has been ready before. "
-        "There may be several reasons for this error: "
+        "Please set find_unused_parameters=True to traverse the backward graph "
+        "in each step to prepare the reduce in advance. If you have set it, "
+        "there may be several reasons for this error: "
         "1) In multiple reentrant backward phase, some parameters are reused."
         "2) Using model parameters outside of forward function. Please "
         "make sure that model parameters are not shared in concurrent "
@@ -690,10 +702,16 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
     }
   } else {
     // process sparse group
-    PADDLE_ENFORCE_EQ(HasGrad(var_index), true,
-                      platform::errors::PreconditionNotMet(
-                          "The sparse parameter[%d][%s] must have a gradient",
-                          var_index, vars_[var_index]->Name()));
+    PADDLE_ENFORCE_EQ(
+        HasGrad(var_index), true,
+        platform::errors::PreconditionNotMet(
+            "The sparse parameter[%d][%s] should have a gradient. "
+            "Currently, DataParallel does not support sparse "
+            "parameters that do not generate gradients during training. "
+            "For example, if is_sparse=True is used in Embedding and this "
+            "parameter cannot generate a gradient in the current step "
+            "because of stop_gradient/detach, an error will occur.",
+            var_index, vars_[var_index]->Name()));
     auto var_base = vars_[var_index]->GradVarBase();
     // need to check tensor type
     PADDLE_ENFORCE_EQ(
@@ -943,7 +961,7 @@ void Reducer::FinalizeBackward() {
     InitializeGroups(group_indices_);
   }
 
-  if (find_unused_vars_) {
+  if (find_unused_vars_each_step_) {
    // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ProcessUnusedDenseVars();