
Commit e60551b

Fix the counter when the worker retries gradient aggregation in ElasticDL (#2497)

* Fix the counter when retrying
* Fix by comments
* Polish
1 parent: 0c00b66
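
For context, the control flow this commit fixes can be reduced to a short, self-contained sketch. This assumes TensorFlow 2.x; fake_allreduce is a hypothetical stand-in for the optimizer's _allreduce_grads_helper, and the constants are illustrative:

import tensorflow as tf

# Hypothetical stand-in for self._allreduce_grads_helper; a real
# implementation would average gradients across workers.
def fake_allreduce(grads):
    return [g / 2.0 for g in grads]

counter = tf.constant(5)                   # a retry already pushed it past 4
backward_passes_per_step = tf.constant(4)
grads = [tf.ones([2, 2])]

# Fixed predicate: keep the locally aggregated gradients only while the
# counter is strictly below the threshold; allreduce once it reaches or
# exceeds it.
out = tf.cond(
    tf.math.less(counter, backward_passes_per_step),
    lambda: grads,                         # still accumulating locally
    lambda: fake_allreduce(grads),         # counter >= threshold
)

With the counter at 5, this takes the allreduce branch; the old tf.equal predicate would have returned False and skipped the allreduce indefinitely.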

File tree

1 file changed (+7, −5)


elasticai_api/tensorflow/optimizer.py

Lines changed: 7 additions & 5 deletions
@@ -330,15 +330,17 @@ def compute_gradients(self, grads):
         grads = get_not_none_from_list(grads)
         assert len(grads) == len(self.locally_aggregated_grads)
 
-        # Allreduce locally aggregated gradients when the counter is
-        # equivalent to `backward_passes_per_step`. This the condition is
-        # true, it also resets the counter back to 0.
+        # Allreduce locally aggregated gradients when the counter equals
+        # or exceeds backward_passes_per_step. The counter may exceed
+        # backward_passes_per_step because of retries in the fault-tolerant
+        # allreduce. When the condition is true, it also resets the counter
+        # back to 0.
         allreduced_grads = tf.cond(
-            tf.equal(
+            tf.math.less(
                 self.counter, self.mutable_local_backward_passes_per_step
             ),
-            lambda: self._allreduce_grads_helper(grads),
             lambda: grads,
+            lambda: self._allreduce_grads_helper(grads),
         )
 
         # Handle case where there is only one variable.
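
To see why the equality check breaks under retries, here is a small plain-Python simulation; the driver loop, simulate, and retry_at are illustrative names, not code from the repository:

BACKWARD_PASSES_PER_STEP = 4

def simulate(should_allreduce, steps=8, retry_at=4):
    """Run `steps` backward passes, injecting one retry (an extra
    counter increment) at step `retry_at`."""
    counter = 0
    log = []
    for step in range(1, steps + 1):
        counter += 1
        if step == retry_at:
            counter += 1  # the retried backward pass bumps the counter again
        if should_allreduce(counter):
            log.append((step, counter, "allreduce"))
            counter = 0   # the allreduce branch resets the counter
        else:
            log.append((step, counter, "accumulate"))
    return log

old = simulate(lambda c: c == BACKWARD_PASSES_PER_STEP)  # pre-fix predicate
new = simulate(lambda c: c >= BACKWARD_PASSES_PER_STEP)  # post-fix predicate
print("old:", old)  # the counter jumps from 3 to 5, so equality never fires
print("new:", new)  # allreduces at step 4 and resets, as intended

The threshold check is idempotent under the extra increments a retry can cause, while the exact-equality check deadlocks once the counter skips past backward_passes_per_step.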
