@@ -119,21 +119,20 @@ def save_restore_test(model_cls, device, quant_config, compress=False, version=N
     mto.restore_from_modelopt_state(model_ref, state_dict)


-def _reduce_quantizer_attr(quantizer, attr: str, op=dist.ReduceOp.MAX, group=None):
-    quantizer_attr = getattr(quantizer, attr).clone()
-    print("quantizer.attr before reduce", getattr(quantizer, attr))
-    dist.all_reduce(quantizer_attr, op=op, group=group)
-    print("quantizer.attr after reduce", getattr(quantizer, attr))
-    print("quantizer_attr after reduce", quantizer_attr)
+def _distributed_attr_check(quantizer, attr: str, op=dist.ReduceOp.MAX, groups=[]):
+    for group in groups:
+        if group is not None:
+            quantizer_attr = getattr(quantizer, attr).clone()
+            dist.all_reduce(quantizer_attr, op=op, group=group)
     assert torch.allclose(quantizer_attr, getattr(quantizer, attr))


 original_awq_lite = model_calib_module.awq_lite


-def _debug_awq_lite(model, forward_loop, alpha_step=0.1, debug=True):
+def _debug_awq_lite(model, forward_loop, alpha_step=0.1, debug=True, **kwargs):
     """Function to mock awq_lite function to always use debug=True for testing"""
-    return original_awq_lite(model, forward_loop, alpha_step, debug=True)
+    return original_awq_lite(model, forward_loop, alpha_step, debug=True, **kwargs)


 @patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
@@ -151,125 +150,101 @@ def forward_loop(model):

     if config in [mtq.INT8_DEFAULT_CFG, mtq.FP8_DEFAULT_CFG, mtq.INT8_SMOOTHQUANT_CFG]:
         # Lets check the amax for row parallel input quantizer; it should be the same across all tp ranks
-        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, group=tp_group)
+        _distributed_attr_check(
+            model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, groups=[tp_group]
+        )
         # Lets check the row parallel weight amax; it should be the same across all tp ranks
-        _reduce_quantizer_attr(
-            model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, group=tp_group
+        _distributed_attr_check(
+            model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, groups=[tp_group]
         )

     if config in [mtq.INT8_SMOOTHQUANT_CFG, mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
         # Lets check the column parallel pre_quant_scale; it should be the same across all tp ranks
         input_quantizer = model.fc1.input_quantizer
-        _reduce_quantizer_attr(
-            input_quantizer, "pre_quant_scale", dist.ReduceOp.MAX, group=tp_group
+        _distributed_attr_check(
+            input_quantizer, "pre_quant_scale", dist.ReduceOp.MAX, groups=[tp_group]
         )

     if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
         # Check activation scale for AWQ lite
-        _reduce_quantizer_attr(
+        _distributed_attr_check(
             model.fc1.awq_lite,
             "act_scale",
             dist.ReduceOp.AVG,
-            group=tp_group,
+            groups=[tp_group],
         )

     dist.destroy_process_group()


 @patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
-def dp_cp_parallel_test_helper(model, config, group, mock_awq_lite):
-    calib_data = model.get_dummy_input().cuda()
-
-    def forward_loop(model):
-        model(calib_data)
-
-    model = mtq.quantize(model, config, forward_loop)
-
-    # Sanity check
-    forward_loop(model)
-
-    # Input quantizer amax
-    if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-
-    # Weight quantizer amax
-    if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
-        for quantizer in model.fc1.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=group)
-    else:
-        _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-    if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
-        for quantizer in model.fc2.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=group)
-    else:
-        _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-
-    if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
-        # Check act scale
-        _reduce_quantizer_attr(
-            model.fc1.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-            group=group,
-        )
-        _reduce_quantizer_attr(
-            model.fc2.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-            group=group,
-        )
-
-
-@patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
-def data_tensor_context_parallel_test_helper(model, config, dp_group, tp_group, mock_awq_lite):
-    # Calib data should be same across each DP rank
+def data_tensor_context_parallel_test_helper(
+    model, config, mock_awq_lite, dp_group=None, tp_group=None
+):
+    # Calib data should be different across each DP rank
     dp_rank = dist.get_rank(group=dp_group)
     calib_data = model.get_dummy_input(seed=dp_rank).cuda()

+    if tp_group is not None:
+        # The input to first layer, the column parallel should be the same across all tp ranks
+        dist.all_reduce(calib_data, op=dist.ReduceOp.AVG, group=tp_group)
+
     def forward_loop(model):
         model(calib_data)

     model = mtq.quantize(model, config, forward_loop)

-    def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX):
-        quantizer_attr = getattr(quantizer, attr).clone()
-
-        # Perform all-reduce operations
-        dist.all_reduce(quantizer_attr, op=op, group=tp_group)
-
-        dist.all_reduce(quantizer_attr, op=op, group=dp_group)
-
-        assert torch.allclose(quantizer_attr, getattr(quantizer, attr)), getattr(quantizer, attr)
-
     # Input quantizer amax
     if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX)
-        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX)
+        _distributed_attr_check(
+            model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
+        )
+        _distributed_attr_check(
+            model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
+        )

     # Per-tensor quantization (FP8/NVFP4) expects same amax across row and column parallel ranks
     # Channel-wise (INT8) only expects same amax across row parallel ranks
     # Block-wise quantization does not expect same amax across row and column parallel ranks
     if config in [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG]:
         if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
             for quantizer in model.fc1.weight_quantizer:
-                _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
+                _distributed_attr_check(
+                    quantizer, "amax", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
+                )
         else:
-            _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX)
-
-    if config in [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG, mtq.INT8_DEFAULT_CFG]:
+            _distributed_attr_check(
+                model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
+            )
+
+    if config in [
+        mtq.FP8_DEFAULT_CFG,
+        mtq.NVFP4_DEFAULT_CFG,
+        mtq.INT8_DEFAULT_CFG,
+        mtq.INT8_SMOOTHQUANT_CFG,
+    ]:
         if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
             for quantizer in model.fc2.weight_quantizer:
-                _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
+                _distributed_attr_check(
+                    quantizer, "amax", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
+                )
         else:
-            _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX)
+            _distributed_attr_check(
+                model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
+            )
+
+    # Lets check the column parallel pre_quant_scale; it should be the same across all tp ranks
+    # It is different across DP/CP ranks since the input is different
+    if tp_group and config in [mtq.INT8_SMOOTHQUANT_CFG, mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
+        input_quantizer = model.fc1.input_quantizer
+        _distributed_attr_check(
+            input_quantizer, "pre_quant_scale", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
+        )

     # Check act scale
     if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
-        _reduce_quantizer_attr(
-            model.fc1.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
+        _distributed_attr_check(
+            model.fc1.awq_lite, "act_scale", dist.ReduceOp.AVG, groups=[dp_group, tp_group]
         )


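For reference, the invariant the new _distributed_attr_check helper asserts can be reproduced standalone: each rank all-reduces a copy of a calibrated attribute across a process group and checks that the reduced tensor still matches its local value, which only holds if calibration synchronized the attribute across that group. A minimal two-rank sketch, assuming a torchrun launch and a hypothetical pre-synced amax tensor (the file name, backend choice, and tensor values are illustrative assumptions, not part of this change):

# sync_check_sketch.py -- hypothetical standalone repro of the check pattern.
# Launch with: torchrun --nproc_per_node=2 sync_check_sketch.py
import torch
import torch.distributed as dist


def check_synced(tensor: torch.Tensor, op=dist.ReduceOp.MAX, group=None) -> None:
    reduced = tensor.clone()
    dist.all_reduce(reduced, op=op, group=group)
    # If the value was already identical on every rank in the group,
    # the reduction leaves it unchanged and the assertion passes.
    assert torch.allclose(reduced, tensor), (reduced, tensor)


if __name__ == "__main__":
    dist.init_process_group(backend="gloo")  # gloo keeps the sketch CPU-only
    amax = torch.tensor([2.5])  # stand-in for a calibrated, already-synced amax
    check_synced(amax, op=dist.ReduceOp.MAX)
    dist.destroy_process_group()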