Commit 50000dd

fix bug
Signed-off-by: Jennifer Chen <[email protected]>
1 parent a106dd9 commit 50000dd

3 files changed, 11 insertions(+), 4 deletions(-)

modelopt/torch/quantization/model_calib.py

Lines changed: 1 addition & 1 deletion
@@ -619,7 +619,7 @@ def sync_act_scale_across_dp(module, data_parallel_group):
     has_nan_local = torch.any(torch.isnan(module.awq_lite.act_scale)) or torch.any(
         torch.isnan(module.awq_lite.weight_scale)
     )
-    has_nan = torch.tensor(int(has_nan_local), device=module.weight.device)
+    has_nan = torch.tensor(int(has_nan_local), device=module.awq_lite.act_scale.device)
     if module.parallel_state.data_parallel_group.is_initialized():
         dist.all_reduce(
             has_nan,
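
For context, a hedged sketch of the synchronization pattern this hunk touches: each DP rank folds its local NaN check into a tensor and all-reduces it so every rank agrees on whether any rank produced a NaN scale. The helper name, the MAX reduce op, and the offloaded-weight scenario are assumptions for illustration, not taken from the commit.

import torch
import torch.distributed as dist

def any_rank_has_nan(act_scale, weight_scale, group=None):
    """Return True on every rank if any rank observed a NaN scale."""
    has_nan_local = torch.any(torch.isnan(act_scale)) or torch.any(
        torch.isnan(weight_scale)
    )
    # The fix above: allocate the flag on the scales' device rather than the
    # weight's. If the weight lives elsewhere (e.g. offloaded or on a meta
    # device), the collective would otherwise target the wrong device.
    has_nan = torch.tensor(int(has_nan_local), device=act_scale.device)
    if dist.is_initialized():
        dist.all_reduce(has_nan, op=dist.ReduceOp.MAX, group=group)
    return bool(has_nan.item())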

tests/_test_utils/torch_quantization/quantize_common.py

Lines changed: 7 additions & 3 deletions
@@ -120,9 +120,9 @@ def save_restore_test(model_cls, device, quant_config, compress=False, version=N
 
 
 def _distributed_attr_check(quantizer, attr: str, op=dist.ReduceOp.MAX, groups=[]):
+    quantizer_attr = getattr(quantizer, attr).clone()
     for group in groups:
         if group is not None:
-            quantizer_attr = getattr(quantizer, attr).clone()
             dist.all_reduce(quantizer_attr, op=op, group=group)
     assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
 
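
The hoisted clone fixes two problems that can be read off the hunk: when every entry in groups is None (e.g. a single-process run), quantizer_attr was never bound and the final assert raised UnboundLocalError; and when several groups are passed, re-cloning inside the loop discarded the previous group's reduction, so only the last group was effectively checked. A toy reproduction of the control-flow half (illustrative function names, no distributed setup needed):

def buggy(groups):
    for group in groups:
        if group is not None:
            attr = "clone"  # bound only when a non-None group exists
    return attr  # UnboundLocalError for groups=[None]

def fixed(groups):
    attr = "clone"  # hoisted before the loop, as in the commit
    for group in groups:
        if group is not None:
            pass  # successive all_reduce calls now chain on one tensor
    return attr

fixed([None])   # ok
# buggy([None])  # raises UnboundLocalError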

@@ -137,7 +137,7 @@ def _debug_awq_lite(model, forward_loop, alpha_step=0.1, debug=True, **kwargs):
 
 @patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
 def data_tensor_context_parallel_test_helper(
-    model, config, mock_awq_lite, dp_group=None, tp_group=None
+    model, config, mock_awq_lite, dp_group=None, tp_group=None, test_pre_quant_scale=True
 ):
     # Calib data should be different across each DP rank
     dp_rank = dist.get_rank(group=dp_group)
@@ -193,7 +193,11 @@ def forward_loop(model):
 
     # Lets check the column parallel pre_quant_scale; it should be the same across all tp ranks
     # It is different across DP/CP ranks since the input is different
-    if tp_group and config in [mtq.INT8_SMOOTHQUANT_CFG, mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
+    if (
+        test_pre_quant_scale
+        and tp_group
+        and config in [mtq.INT8_SMOOTHQUANT_CFG, mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]
+    ):
         input_quantizer = model.fc1.input_quantizer
         _distributed_attr_check(
             input_quantizer, "pre_quant_scale", dist.ReduceOp.MAX, groups=[dp_group, tp_group]
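
The new test_pre_quant_scale flag (default True, preserving existing behavior) lets callers opt out of the cross-rank pre_quant_scale equality check. A hedged usage sketch mirroring the call sites in this commit; the surrounding model and process-group setup is assumed:

# Default: pre_quant_scale is checked for equality across ranks for the
# SmoothQuant/AWQ configs listed in the condition above.
data_tensor_context_parallel_test_helper(
    model, mtq.INT4_AWQ_CFG, dp_group=dp_group, tp_group=tp_group
)

# Opt out when ranks intentionally calibrate on different data, so the
# scales legitimately differ and the equality check would misfire.
data_tensor_context_parallel_test_helper(
    model, mtq.INT4_AWQ_CFG, dp_group=dp_group, tp_group=tp_group,
    test_pre_quant_scale=False,
)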

tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 3 additions & 0 deletions
@@ -98,6 +98,7 @@ def _test_parallelism_helper(
     tensor_model_parallel_size=1,
     context_parallel_size=1,
     use_rank_in_seed=False,
+    test_pre_quant_scale=True,
 ):
     """
     Unified helper for testing different parallelism configurations.
@@ -133,6 +134,7 @@ def _test_parallelism_helper(
         config,
         dp_group=dp_group,
         tp_group=tp_group,
+        test_pre_quant_scale=test_pre_quant_scale,
     )
 
 
@@ -219,6 +221,7 @@ def test_data_tensor_context_parallel(need_8_gpus, config):
         tensor_model_parallel_size=2,
         context_parallel_size=2,
         use_rank_in_seed=True,
+        test_pre_quant_scale=False,
     ),
     backend="nccl",
 )
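
This caller disables the check because use_rank_in_seed=True gives every rank different calibration data (cf. the "Calib data should be different across each DP rank" comment in the helper above), so pre_quant_scale is expected to diverge across DP/CP ranks and an equality assertion would fail spuriously. A minimal sketch of what rank-dependent seeding implies; the function name and shapes are illustrative, not the test's actual data pipeline:

import torch
import torch.distributed as dist

def make_calib_batch(base_seed=1234, shape=(4, 32)):
    # Each rank seeds differently, so each rank draws different calibration
    # data; activation statistics (and hence pre_quant_scale) then diverge
    # across DP/CP ranks by design.
    torch.manual_seed(base_seed + dist.get_rank())
    return torch.randn(shape)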
