
Commit 266cdf7

Fix dgc bug. (#16709)
1 parent 7e56055 commit 266cdf7

File tree

3 files changed: +14 -4 lines changed

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 5 additions & 1 deletion
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
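
The change to RunImplEncoded() improves diagnosability only: when the encoded gradient variable is missing from the local scope, the enforce now reports which name failed the lookup instead of raising anonymously. A minimal Python sketch of the naming scheme being checked; the suffix value is a hypothetical stand-in, since the diff only shows the C++ constant g_dgc_encoded:

    DGC_ENCODED_SUFFIX = "__dgc_encoded__"  # assumed placeholder, not the verified constant

    def encoded_grad_name(grad_name):
        # Mirrors: auto encode_var_name = original_name + g_dgc_encoded;
        return grad_name + DGC_ENCODED_SUFFIX

    # The old check raised with no variable name; the new message pinpoints it:
    print("%s should not be null" % encoded_grad_name("fc_0.w_0@GRAD"))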

python/paddle/fluid/optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -752,7 +752,7 @@ def _append_dgc_ops(self, param_and_grads):
             force_cpu=True)

         for param_var, grad_var in param_and_grads:
-            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
             if var_numel < 16384 or \
                 param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
                 grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
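
This one-line change is the core of the fix. A parameter's shape can carry -1 for a dimension that is unknown at graph-build time, which makes the raw product of the dimensions negative; every such parameter then satisfied var_numel < 16384 and silently skipped DGC regardless of its true size. Wrapping the product in abs() restores a meaningful size check. A standalone illustration (the 16384 threshold comes from the diff; the shape is made up):

    from functools import reduce

    shape = (-1, 128, 256)                   # hypothetical shape with an inferred dim
    raw = reduce(lambda x, y: x * y, shape)  # -32768: negative, so always < 16384
    numel = abs(raw)                         # 32768: now comparable to the threshold

    # Mirrors the guard in _append_dgc_ops: only genuinely small tensors skip DGC.
    print(numel < 16384)                     # False once abs() is applied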

python/paddle/fluid/parallel_executor.py

Lines changed: 8 additions & 2 deletions
@@ -104,10 +104,11 @@ def __init__(self,
         self._scope = scope if scope is not None else executor.global_scope()

         if main_program is not None and main_program._enable_dgc:
+            assert num_trainers > 1, "dgc is not useful for single trainer training."
             assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
             assert num_trainers * len(
-                self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda
+                self._places) > 1, "dgc is not useful for single card training."
+            assert use_cuda, "dgc only used when cuda is used."

         main_program = main_program if main_program is not None \
             else framework.default_main_program()
@@ -123,6 +124,11 @@ def __init__(self,
             exec_strategy=exec_strategy,
             share_vars_from=share_vars_from._compiled_program
             if share_vars_from else None)
+
+        # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
+        if main_program._enable_dgc:
+            self._compiled_program._build_strategy.is_distribution = True
+
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)
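
Taken together, the constructor now fails fast when DGC is enabled in a configuration where it cannot help: a single trainer, a reduce strategy other than AllReduce, a single card overall, or CPU execution. The final hunk also forces is_distribution on the compiled program's build strategy whenever DGC is on, which the FIXME flags as a stopgap until DGC moves from dist mode to allreduce mode. A condensed sketch of the preconditions (argument names follow the diff; the string "AllReduce" stands in for BuildStrategy.ReduceStrategy.AllReduce, so this is not a drop-in check):

    def check_dgc_preconditions(num_trainers, places, use_cuda, reduce_strategy):
        # Condensed from the asserts in the first hunk above.
        assert num_trainers > 1, "dgc is not useful for single trainer training."
        assert reduce_strategy == "AllReduce"
        assert num_trainers * len(places) > 1, \
            "dgc is not useful for single card training."
        assert use_cuda, "dgc only used when cuda is used."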
