
Commit 3e64f49

[bug]: fixed comm_dtype in extra_large_param_to_reduce (#7660)
Fixes #7653. The extra-large params were recorded under `param.dtype`, but the reducer looks them up using `comm_dtype`:
https://github.com/deepspeedai/DeepSpeed/blob/d56e847bac2853d5b8819ce176eeafff65a3798e/deepspeed/runtime/zero/stage_1_and_2.py#L1461

cc @sfc-gh-truwase

Signed-off-by: Naveenraj Kamalakannan <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
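To make the key mismatch concrete, here is a minimal, self-contained sketch of the bug pattern. The `Reducer` class below is a hypothetical stand-in, not DeepSpeed's actual implementation; only `get_param_comm_dtype`, `extra_large_param_to_reduce`, and the comm/storage dtype split mirror names from the real code. A gradient recorded under the parameter's storage dtype is never found by a lookup keyed on the communication dtype:

```python
import torch

# Hypothetical stand-in for the ZeRO stage 1/2 optimizer state; only the
# pieces needed to show the dtype-key mismatch are modeled here.
class Reducer:
    def __init__(self, comm_dtype=torch.bfloat16):
        self.comm_dtype = comm_dtype
        self.extra_large_param_to_reduce = {}  # dict keyed by dtype

    def get_param_comm_dtype(self, param):
        # Gradients may be communicated in a lower-precision dtype than the
        # parameter's storage dtype (e.g. bf16 comm for fp32 params).
        return self.comm_dtype

    def record_buggy(self, param):
        # Before the fix: recorded under the *storage* dtype.
        self.extra_large_param_to_reduce[param.dtype] = param

    def record_fixed(self, param):
        # After the fix: recorded under the *communication* dtype,
        # matching the key the reducer uses below.
        self.extra_large_param_to_reduce[self.get_param_comm_dtype(param)] = param

    def reduce(self, param):
        # The reducer always looks up by comm dtype.
        return self.extra_large_param_to_reduce.get(self.get_param_comm_dtype(param))

param = torch.nn.Parameter(torch.zeros(10))  # fp32 storage

r = Reducer()
r.record_buggy(param)
print(r.reduce(param))   # None -- recorded under fp32, looked up under bf16

r = Reducer()
r.record_fixed(param)
print(r.reduce(param))   # the param -- record and lookup keys now agree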
1 parent: d56e847

1 file changed

deepspeed/runtime/zero/stage_1_and_2.py

Lines changed: 3 additions & 2 deletions
@@ -1002,7 +1002,8 @@ def flatten_dense_tensors_aligned(self, tensor_list, alignment, use_cpu_data=Fal
     def reduce_independent_p_g_buckets_and_remove_grads(self, param, i):
 
         grad_reduc = self.get_gradient_for_reduction(param)
-        bucket = self.ipg_buckets[self.get_param_comm_dtype(param)]
+        comm_dtype = self.get_param_comm_dtype(param)
+        bucket = self.ipg_buckets[comm_dtype]
         if bucket.elements + param.numel() > self.reduce_bucket_size:
             self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.numel())
             self.reduce_ipg_grads()
@@ -1022,7 +1023,7 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param, i):
 
         if self.contiguous_gradients:
             if param.numel() > self.reduce_bucket_size:
-                self.extra_large_param_to_reduce[param.dtype] = param
+                self.extra_large_param_to_reduce[comm_dtype] = param
             else:
                 # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening
                 new_grad_tensor = bucket.buffer[bucket.index].narrow(0, bucket.elements, param.numel())
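In short, the fix computes `comm_dtype = self.get_param_comm_dtype(param)` once at the top of `reduce_independent_p_g_buckets_and_remove_grads` and reuses it both for the `ipg_buckets` lookup and as the key into `extra_large_param_to_reduce`, so the key written here matches the `comm_dtype` key the reducer reads back (the lookup linked in the commit message). Previously, an extra-large param whose storage dtype differed from its communication dtype was filed under a key the reducer never consulted.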
