Skip to content

Commit 80d2d1c

Browse files
authored
Merge branch 'dev' into gdn_thd
2 parents e8ed23c + f6f2abe commit 80d2d1c

File tree

3 files changed

+65
-8
lines changed

3 files changed

+65
-8
lines changed

megatron/core/distributed/param_and_grad_buffer.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ class _ParamAndGradBucket:
7878
communication. Its application is twofold: it facilitates the averaging of gradients
7979
and the scaling of gradients in the context of the Mixture of Experts (MoE) model.
8080
bucket_id: Index of bucket in buffer.
81+
param_index_map: Mapping from param to (start, end, bucket_id) in the global buffer.
82+
Used to derive bucket-local offsets for param_to_index.
8183
"""
8284

8385
def __init__(
@@ -89,6 +91,7 @@ def __init__(
8991
numel_unpadded: int,
9092
gradient_scaling_factor: float,
9193
bucket_id: int,
94+
param_index_map: Dict[torch.nn.Parameter, tuple],
9295
):
9396
self.params_list = params
9497
self.params = set(params)
@@ -102,11 +105,11 @@ def __init__(
102105
self.numel_unpadded = numel_unpadded
103106
self.gradient_scaling_factor = gradient_scaling_factor
104107
self.bucket_id = bucket_id
108+
# Derive bucket-local param offsets by subtracting this bucket's global
# start offset (self.offset) from each param's global (start, end) range.
105109
self.param_to_index = {}
106-
offset = 0
107110
for param in params:
108-
self.param_to_index[param] = (offset, offset + param.numel())
109-
offset += param.numel()
111+
global_start, global_end, _ = param_index_map[param]
112+
self.param_to_index[param] = (global_start - self.offset, global_end - self.offset)
110113

111114

112115
class _ParamAndGradBucketGroup:
@@ -926,6 +929,7 @@ def _new_bucket(
926929
numel_unpadded=numel_unpadded,
927930
gradient_scaling_factor=self.gradient_scaling_factor,
928931
bucket_id=bucket_id,
932+
param_index_map=self.param_index_map,
929933
)
930934
for bucket_param in bucket_params:
931935
assert bucket_param not in self.param_to_bucket

tests/test_utils/recipes/mamba-static-inference.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ products:
5959
- environment: [dev]
6060
scope: [mr-broken, mr-github-broken]
6161
platforms: [dgx_h100]
62-
- test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs]
63-
products:
64-
- environment: [dev]
65-
scope: [mr]
66-
platforms: [dgx_h100]
62+
# - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs]
63+
# products:
64+
# - environment: [dev]
65+
# scope: [mr]
66+
# platforms: [dgx_h100] # Broken after dev2main sync 01/27

tests/unit_tests/distributed/test_param_and_grad_buffer.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,59 @@ def _pad_param_if_needed(numel_unpadded):
162162
Utils.destroy_model_parallel()
163163

164164

165+
def test_param_to_index_alignment_with_padding():
166+
"""Ensure bucket-local param offsets honor padding when DistOpt pads params."""
167+
Utils.initialize_model_parallel()
168+
169+
# With input_dim=4, output_dim=4:
170+
# - weight: 4*4 = 16 elements
171+
# - bias: 4 elements
172+
# Since 16 % 64 != 0, the bias must be padded away from the weight,
173+
# making padding observable.
174+
input_dim = 4
175+
output_dim = 4
176+
model, param_and_grad_buffer, _ = get_model_and_buffers(
177+
input_dim=input_dim,
178+
output_dim=output_dim,
179+
num_layers=1,
180+
bias=True,
181+
shared_embedding=False,
182+
bucket_size=None, # single bucket
183+
use_distributed_optimizer=True, # enforces 64-element alignment
184+
overlap_grad_reduce=True,
185+
average_in_collective=False,
186+
)
187+
188+
bucket = param_and_grad_buffer.buckets[0]
189+
naive_offset = 0
190+
padding_observed = False
191+
192+
for param in bucket.params_list:
193+
global_start, global_end, _ = param_and_grad_buffer.param_index_map[param]
194+
expected_local_start = global_start - bucket.offset
195+
expected_local_end = global_end - bucket.offset
196+
local_start, local_end = bucket.param_to_index[param]
197+
198+
# param_to_index should match the padded offsets used in the global buffer.
199+
assert (local_start, local_end) == (expected_local_start, expected_local_end)
200+
201+
# At least one param should have been padded relative to naive packing.
202+
if local_start != naive_offset:
203+
padding_observed = True
204+
naive_offset = local_end
205+
206+
# Verify the slice retrieved via param_to_index matches param.data view.
207+
param_slice = bucket.param_data.view(-1)[local_start:local_end]
208+
torch.testing.assert_close(param_slice, param.data.view(-1))
209+
210+
assert padding_observed, (
211+
"Expected padding to be applied between params. "
212+
"Ensure model dimensions are chosen such that param sizes are not multiples of 64."
213+
)
214+
215+
Utils.destroy_model_parallel()
216+
217+
165218
@pytest.mark.parametrize("use_distributed_optimizer", [False, True])
166219
@pytest.mark.parametrize("overlap_grad_reduce", [False, True])
167220
@pytest.mark.parametrize("average_in_collective", [False, True])

0 commit comments

Comments
 (0)