
Commit 15ad92b

Fix ping-pong buffer index reset and remove redundant stream sync (deepspeedai#7805)
Fix deepspeedai#7804 and deepspeedai#7188.

After investigating the code in `deepspeed/runtime/zero/stage_1_and_2.py`, I have identified the root cause. The regression in communication overlap was introduced in PR deepspeedai#7371. While the additional two-stream synchronization in that PR fixes gradient corruption, it effectively disables the overlapping behavior.

The underlying gradient corruption (which deepspeedai#7371 attempted to fix) was actually introduced in PR deepspeedai#6993. In that PR, `bucket.clear()` incorrectly resets the ping-pong buffer index to 0 at the end of `reduce_ipg_grads`. This disrupts the buffer index swapping mechanism in `reduce_independent_p_g_buckets_and_remove_grads`.

To fix this, line 121 of `deepspeed/runtime/zero/stage_1_and_2.py` is removed so the buffer index is no longer reset, and the stream synchronization logic introduced in deepspeedai#7371 is removed to restore the `overlap_comm=True` functionality.

---------

Signed-off-by: szlent <metarufolds@gmail.com>
Signed-off-by: Masahiro Tanaka <mtanaka@anyscale.com>
Co-authored-by: Masahiro Tanaka <mtanaka@anyscale.com>
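To illustrate why resetting the index breaks overlap, here is a minimal, self-contained sketch of a two-slot "ping-pong" bucket. The class and method names (`PingPongBucket`, `clear_current`, `swap`) are hypothetical and simplified, not DeepSpeed's actual data structures: while the slot at `index` is being reduced asynchronously, new gradients accumulate into slot `1 - index`; if `clear()` forced `index` back to 0, the producer could write into the slot still being consumed, and correctness would then require exactly the kind of full stream synchronization that kills overlap.

```python
class PingPongBucket:
    """Simplified double-buffered gradient bucket (illustrative only)."""

    def __init__(self):
        self.slots = [[], []]  # two gradient buffers
        self.index = 0         # slot currently being filled

    def add(self, grad):
        self.slots[self.index].append(grad)

    def clear_current(self):
        # Clear only the slot contents; do NOT reset self.index here,
        # or the next fill would collide with the in-flight reduce.
        self.slots[self.index].clear()

    def swap(self):
        # After launching an async reduce on the current slot,
        # switch filling to the other slot so compute and
        # communication can proceed concurrently.
        self.index = 1 - self.index


bucket = PingPongBucket()
bucket.add("g0")       # fill slot 0
bucket.swap()          # reduce of slot 0 in flight; now fill slot 1
bucket.add("g1")
assert bucket.index == 1
assert bucket.slots == [["g0"], ["g1"]]
```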
1 parent 3bc882f commit 15ad92b

File tree

1 file changed (+7, −8 lines)

deepspeed/runtime/zero/stage_1_and_2.py

Lines changed: 7 additions & 8 deletions
@@ -118,7 +118,6 @@ def clear(self):
         self.params.clear()
         self.grads.clear()
         self.elements = 0
-        self.index = 0
         self.has_moe_params = False


@@ -1052,11 +1051,8 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param, i):
         bucket = self.ipg_buckets[comm_dtype]
         if bucket.elements + param.numel() > self.reduce_bucket_size:
             self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.numel())
-            self.reduce_ipg_grads()
+            self.reduce_ipg_grads(comm_dtype=comm_dtype)
             if self.contiguous_gradients and self.overlap_comm:
-                if not get_accelerator().resolves_data_dependency():
-                    self.reduction_stream.wait_stream(get_accelerator().current_stream())
-                get_accelerator().current_stream().wait_stream(self.reduction_stream)
                 # Swap index between 0 and 1
                 bucket.index = 1 - bucket.index
             self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", param.numel())
@@ -1500,8 +1496,11 @@ def copy_grads_in_partition(self, param):
             #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}")
             self.grads_in_partition_offset += param.numel()

-    def reduce_ipg_grads(self):
-        for comm_dtype in sort_dtypes(self.ipg_buckets.keys()):
+    def reduce_ipg_grads(self, comm_dtype=None):
+        dtypes = sort_dtypes(self.ipg_buckets.keys())
+        if comm_dtype is not None:
+            dtypes = [comm_dtype]
+        for comm_dtype in dtypes:
             bucket = self.ipg_buckets[comm_dtype]

             if self.contiguous_gradients:
@@ -1536,7 +1535,7 @@ def reduce_ipg_grads(self):
             stream = get_accelerator().current_stream()

             with get_accelerator().stream(stream):
-                for comm_dtype in sort_dtypes(self.ipg_buckets.keys()):
+                for comm_dtype in dtypes:
                     bucket = self.ipg_buckets[comm_dtype]

                     for group_idx, param_idx_in_group, param_id in bucket.params:
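The new `comm_dtype` parameter narrows the reduce to a single bucket when called from the hot path, instead of flushing every dtype's bucket. A minimal sketch of that selection logic, with a hypothetical `select_dtypes` helper standing in for the inline code (the real `sort_dtypes` is assumed to produce a deterministic ordering across ranks):

```python
def select_dtypes(all_dtypes, comm_dtype=None):
    # Sort for a deterministic order across ranks, as the real
    # sort_dtypes() is assumed to do; str() as key is illustrative.
    dtypes = sorted(all_dtypes, key=str)
    # When a specific dtype is requested, reduce only that bucket.
    if comm_dtype is not None:
        dtypes = [comm_dtype]
    return dtypes


# No dtype given: all buckets, in sorted order.
assert select_dtypes(["fp32", "bf16"]) == ["bf16", "fp32"]
# Specific dtype given: only that bucket is reduced.
assert select_dtypes(["fp32", "bf16"], comm_dtype="bf16") == ["bf16"]
```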
