Commit a7b7cb7 (parent 6a8b2f5)

Update distributed.py (bigscience-workshop#275)

1 file changed: 0 additions, 1 deletion

megatron/model/distributed.py

@@ -217,7 +217,6 @@ def allreduce_gradients(self):
             if tp not in buckets:
                 buckets[tp] = []
             buckets[tp].append(param)
-            param.main_grad = param.grad
 
     # For each bucket, all-reduce and copy all-reduced grads.
     for tp in buckets:
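To see what the hunk changes, here is a minimal, self-contained sketch of the bucketing loop it touches. The `Param` dataclass and the dtype-based bucket key `tp` are illustrative assumptions (the hunk does not show how `tp` is computed); the real code operates on torch Parameters inside `allreduce_gradients` in `megatron/model/distributed.py`.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Param:               # hypothetical stand-in for torch.nn.Parameter
    name: str
    dtype: str             # stand-in for something like param.data.type()
    grad: Optional[list]   # stand-in for the gradient tensor


def bucket_params(params):
    """Group params that have gradients into per-key buckets, as in the hunk."""
    buckets = {}
    for param in params:
        if param.grad is None:      # mirrors skipping params without grads
            continue
        tp = param.dtype            # assumed bucket key for this sketch
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(param)
        # The deleted line `param.main_grad = param.grad` used to alias the
        # gradient onto the parameter here; commit a7b7cb7 removes it, so
        # bucketing no longer has that side effect on each parameter.
    return buckets


if __name__ == "__main__":
    params = [
        Param("w1", "fp32", [1.0]),
        Param("w2", "fp16", [2.0]),
        Param("w3", "fp32", None),   # no gradient: skipped
        Param("w4", "fp32", [4.0]),
    ]
    buckets = bucket_params(params)
    print(sorted(buckets))                    # ['fp16', 'fp32']
    print([p.name for p in buckets["fp32"]])  # ['w1', 'w4']
```

After this grouping, the loop that follows the hunk (`for tp in buckets:`) can flatten each bucket and issue one all-reduce per key, rather than one collective call per parameter.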

0 commit comments
