@@ -172,12 +172,6 @@ def forward_loop(model):
             dist.ReduceOp.AVG,
             group=tp_group,
         )
-        _reduce_quantizer_attr(
-            model.fc2.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-            group=tp_group,
-        )
 
     dist.destroy_process_group()
 
@@ -191,6 +185,9 @@ def forward_loop(model):
 
     model = mtq.quantize(model, config, forward_loop)
 
+    # Sanity check
+    forward_loop(model)
+
     # Input quantizer amax
     if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
         _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX, group=group)
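Note (illustrative, not part of the diff): the "# Sanity check" added above simply re-runs the calibration loop on the already-quantized model to confirm it still executes. A minimal standalone sketch of that pattern, with a toy model and data as placeholders and a CUDA device assumed as in the test:

    # Sketch only: quantize, then immediately re-run a forward pass as a sanity check.
    import torch
    import torch.nn as nn
    import modelopt.torch.quantization as mtq

    model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16)).cuda()
    calib_data = torch.randn(8, 16, device="cuda")  # placeholder calibration batch

    def forward_loop(m):
        m(calib_data)

    model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)

    # Sanity check: the quantized model should still run on the same data.
    forward_loop(model)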
@@ -226,105 +223,46 @@ def forward_loop(model):
 
 @patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
 def data_tensor_context_parallel_test_helper(model, config, dp_group, tp_group, mock_awq_lite):
-    # Print rank information for debugging
-    world_rank = dist.get_rank()
-    world_size = dist.get_world_size()
-
-    print("\n=== RANK INFORMATION ===")
-    print(f"World Rank: {world_rank}, World Size: {world_size}")
-
-    # Get group information with actual ranks
-    def get_group_ranks(group):
-        if group is None:
-            return None
-        ranks = []
-        ranks = [
-            i for i in range(world_size) if dist.get_rank(group=group) == dist.get_rank(group=group)
-        ]
-        return ranks
-
-    if dp_group is not None:
-        dp_rank = dist.get_rank(group=dp_group)
-        dp_size = dist.get_world_size(group=dp_group)
-        print(f"DP Group - Rank: {dp_rank}, Size: {dp_size}")
-
-    if tp_group is not None:
-        tp_rank = dist.get_rank(group=tp_group)
-        tp_size = dist.get_world_size(group=tp_group)
-        print(f"TP Group - Rank: {tp_rank}, Size: {tp_size}")
-
-    print("=== END RANK INFO ===\n")
-
-    # Print a summary of all ranks
-    print("=== ALL RANKS SUMMARY ===")
-    print(f"Total GPUs: {world_size}")
-    print(f"Current rank: {world_rank}")
-    if dp_group is not None:
-        print(f"DP groups: {dp_size} groups of {world_size // dp_size} ranks each")
-    if tp_group is not None:
-        print(f"TP groups: {tp_size} groups of {world_size // tp_size} ranks each")
-    print("=== END SUMMARY ===\n")
-
-    calib_data = model.get_dummy_input().cuda()
-    # data should be same across each TP rank
-    dist.all_reduce(calib_data, op=dist.ReduceOp.AVG, group=tp_group)
+    # Calib data should be same across each DP rank
+    dp_rank = dist.get_rank(group=dp_group)
+    calib_data = model.get_dummy_input(seed=dp_rank).cuda()
 
     def forward_loop(model):
         model(calib_data)
 
     model = mtq.quantize(model, config, forward_loop)
 
     def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX):
-        world_rank = dist.get_rank()
-        print(f"\n--- Rank {world_rank}: Reducing {attr} ---")
-        from megatron.core.parallel_state import (
-            _CONTEXT_PARALLEL_GLOBAL_RANKS,
-            _DATA_PARALLEL_GLOBAL_RANKS,
-            _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP,
-            _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS,
-        )
-
-        print(f"DATA_PARALLEL_GLOBAL_RANKS: {_DATA_PARALLEL_GLOBAL_RANKS}")
-        print(f"CONTEXT_PARALLEL_GLOBAL_RANKS: {_CONTEXT_PARALLEL_GLOBAL_RANKS}")
-        print(f"DATA_PARALLEL_GLOBAL_RANKS_WITH_CP: {_DATA_PARALLEL_GLOBAL_RANKS_WITH_CP}")
-        print(f"TENSOR_MODEL_PARALLEL_GLOBAL_RANKS: {_TENSOR_MODEL_PARALLEL_GLOBAL_RANKS}")
         quantizer_attr = getattr(quantizer, attr).clone()
-        print(f"Rank {world_rank} - quantizer_attr before reduce", quantizer_attr)
-        print(f"Rank {world_rank} - quantizer.attr before reduce", getattr(quantizer, attr))
 
         # Perform all-reduce operations
-        if tp_group is not None:
-            tp_rank = dist.get_rank(group=tp_group)
-            print(f"Rank {world_rank} - TP reduce (TP rank {tp_rank})")
-            dist.all_reduce(quantizer_attr, op=op, group=tp_group)
+        dist.all_reduce(quantizer_attr, op=op, group=tp_group)
 
-        if dp_group is not None:
-            dp_rank = dist.get_rank(group=dp_group)
-            print(f"Rank {world_rank} - DP reduce (DP rank {dp_rank})")
-            dist.all_reduce(quantizer_attr, op=op, group=dp_group)
+        dist.all_reduce(quantizer_attr, op=op, group=dp_group)
 
-        print(f"Rank {world_rank} - quantizer_attr after reduce", quantizer_attr)
-        print(f"Rank {world_rank} - quantizer.attr after reduce", getattr(quantizer, attr))
-        print(f"--- End Rank {world_rank} ---\n")
-
-        assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
+        assert torch.allclose(quantizer_attr, getattr(quantizer, attr)), getattr(quantizer, attr)
 
     # Input quantizer amax
     if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
         _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX)
         _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX)
 
-    if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
-        for quantizer in model.fc1.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
-    else:
-        _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX)
-
-    if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
-        for quantizer in model.fc2.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
-    else:
-        _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX)
+    # Per-tensor quantization (FP8/NVFP4) expects same amax across row and column parallel ranks
+    # Channel-wise (INT8) only expects same amax across row parallel ranks
+    # Block-wise quantization does not expect same amax across row and column parallel ranks
+    if config in [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG]:
+        if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
+            for quantizer in model.fc1.weight_quantizer:
+                _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
+        else:
+            _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX)
+
+    if config in [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG, mtq.INT8_DEFAULT_CFG]:
+        if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
+            for quantizer in model.fc2.weight_quantizer:
+                _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
+        else:
+            _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX)
 
     # Check act scale
     if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
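Note (illustrative, not part of the diff): the rewritten `_reduce_quantizer_attr` above is a reduce-and-compare check. If calibration already synchronized a quantizer attribute across the tensor- and data-parallel groups, an extra all-reduce over those groups must leave it unchanged. A minimal sketch of that idiom, where `quantizer`, `tp_group`, and `dp_group` stand in for whatever the test provides:

    import torch
    import torch.distributed as dist

    def assert_attr_synced(quantizer, attr, op, tp_group, dp_group):
        # Clone so the reduction does not overwrite the quantizer's own value.
        reduced = getattr(quantizer, attr).clone()
        dist.all_reduce(reduced, op=op, group=tp_group)  # reduce across TP ranks
        dist.all_reduce(reduced, op=op, group=dp_group)  # then across DP ranks
        # If the attribute was already in sync, reducing it is a no-op.
        assert torch.allclose(reduced, getattr(quantizer, attr)), getattr(quantizer, attr)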
@@ -333,11 +271,6 @@ def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX):
             "act_scale",
             dist.ReduceOp.AVG,
         )
-        _reduce_quantizer_attr(
-            model.fc2.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-        )
 
 
 def auto_quantize_helper(model):
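Note (illustrative, not part of the diff): the calibration-data change earlier in this diff seeds the dummy input by data-parallel rank. A sketch of that idea, assuming the usual Megatron layout where ranks in the same tensor-parallel group share a DP rank index; the function name, shapes, and generator-based sampling are placeholders, not the test's `get_dummy_input`:

    import torch
    import torch.distributed as dist

    def make_calib_data(dp_group, batch_size=4, hidden=32):
        # Same seed for all ranks that share a DP rank index (i.e. TP peers),
        # different data across DP ranks.
        dp_rank = dist.get_rank(group=dp_group)
        gen = torch.Generator().manual_seed(dp_rank)
        return torch.randn(batch_size, hidden, generator=gen).cuda()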