
Commit 266cdf7

Fix dgc bug. (#16709)
1 parent 7e56055 commit 266cdf7

File tree

3 files changed: +14 -4 lines changed

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 5 additions & 1 deletion
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
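
The change to RunImplEncoded() improves diagnosability only: when the encoded gradient variable is missing from the local scope, the enforce now reports which name failed the lookup instead of raising anonymously. A minimal Python sketch of the naming scheme being checked; the suffix value is a hypothetical stand-in, since the diff only shows the C++ constant g_dgc_encoded:

    DGC_ENCODED_SUFFIX = "__dgc_encoded__"  # assumed placeholder, not the verified constant

    def encoded_grad_name(grad_name):
        # Mirrors: auto encode_var_name = original_name + g_dgc_encoded;
        return grad_name + DGC_ENCODED_SUFFIX

    # The old check raised with no variable name; the new message pinpoints it:
    print("%s should not be null" % encoded_grad_name("fc_0.w_0@GRAD"))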

python/paddle/fluid/optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -752,7 +752,7 @@ def _append_dgc_ops(self, param_and_grads):
             force_cpu=True)

         for param_var, grad_var in param_and_grads:
-            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
             if var_numel < 16384 or \
                 param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
                 grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
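
This one-line change is the core of the fix. A parameter's shape can carry -1 for a dimension that is unknown at graph-build time, which makes the raw product of the dimensions negative; every such parameter then satisfied var_numel < 16384 and silently skipped DGC regardless of its true size. Wrapping the product in abs() restores a meaningful size check. A standalone illustration (the 16384 threshold comes from the diff; the shape is made up):

    from functools import reduce

    shape = (-1, 128, 256)                   # hypothetical shape with an inferred dim
    raw = reduce(lambda x, y: x * y, shape)  # -32768: negative, so always < 16384
    numel = abs(raw)                         # 32768: now comparable to the threshold

    # Mirrors the guard in _append_dgc_ops: only genuinely small tensors skip DGC.
    print(numel < 16384)                     # False once abs() is applied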

python/paddle/fluid/parallel_executor.py

Lines changed: 8 additions & 2 deletions
@@ -104,10 +104,11 @@ def __init__(self,
         self._scope = scope if scope is not None else executor.global_scope()

         if main_program is not None and main_program._enable_dgc:
+            assert num_trainers > 1, "dgc is not useful for single trainer training."
             assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
             assert num_trainers * len(
-                self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda
+                self._places) > 1, "dgc is not useful for single card training."
+            assert use_cuda, "dgc only used when cuda is used."

         main_program = main_program if main_program is not None \
             else framework.default_main_program()
@@ -123,6 +124,11 @@ def __init__(self,
             exec_strategy=exec_strategy,
             share_vars_from=share_vars_from._compiled_program
             if share_vars_from else None)
+
+        # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
+        if main_program._enable_dgc:
+            self._compiled_program._build_strategy.is_distribution = True
+
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)
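
Taken together, the constructor now fails fast when DGC is enabled in a configuration where it cannot help: a single trainer, a reduce strategy other than AllReduce, a single card overall, or CPU execution. The final hunk also forces is_distribution on the compiled program's build strategy whenever DGC is on, which the FIXME flags as a stopgap until DGC moves from dist mode to allreduce mode. A condensed sketch of the preconditions (argument names follow the diff; the string "AllReduce" stands in for BuildStrategy.ReduceStrategy.AllReduce, so this is not a drop-in check):

    def check_dgc_preconditions(num_trainers, places, use_cuda, reduce_strategy):
        # Condensed from the asserts in the first hunk above.
        assert num_trainers > 1, "dgc is not useful for single trainer training."
        assert reduce_strategy == "AllReduce"
        assert num_trainers * len(places) > 1, \
            "dgc is not useful for single card training."
        assert use_cuda, "dgc only used when cuda is used."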
