We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 28c7221 · commit ba456fd (Copy full SHA for ba456fd)
megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
@@ -3177,7 +3177,7 @@ def _bucket_group_gradient_reduce(
3177
# Scale gradients.
3178
scaling_factor = gbuf.gradient_scaling_factor
3179
reduce_op = gradient_reduce_preprocessing(
3180
- gbuf.data, scaling_factor, gbuf.ddp_config
+ bucket.data, scaling_factor, gbuf.ddp_config
3181
)
3182
if not gbuf.is_data_distributed:
3183
# All-reduce the gradients on every rank. No scattering
0 commit comments