@@ -117,6 +117,12 @@ def save_restore_test(model_cls, device, quant_config, compress=False, version=N
    mto.restore_from_modelopt_state(model_ref, state_dict)


+def _reduce_quantizer_attr(quantizer, attr: str, op=dist.ReduceOp.MAX, group=None):
+    quantizer_attr = getattr(quantizer, attr).clone()
+    dist.all_reduce(quantizer_attr, op=op, group=group)
+    assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
+
+
def tensor_parallel_test_helper(model, config, tp_group):
    # The input to first layer, the column parallel should be the same across all tp ranks
    calib_data = model.get_dummy_input().cuda()
@@ -126,27 +132,39 @@ def forward_loop(model):
        model(calib_data)

    model = mtq.quantize(model, config, forward_loop)
-
    # Sanity check
    forward_loop(model)

    if config in [mtq.INT8_DEFAULT_CFG, mtq.FP8_DEFAULT_CFG, mtq.INT8_SMOOTHQUANT_CFG]:
        # Lets check the amax for row parallel input quantizer; it should be the same across all tp ranks
-        activation_amax = model.fc2.input_quantizer.amax.clone()
-        dist.all_reduce(activation_amax, op=dist.ReduceOp.MAX, group=tp_group)
-        assert torch.allclose(activation_amax, model.fc2.input_quantizer.amax)
+        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, group=tp_group)

        # Lets check the row parallel weight amax; it should be the same across all tp ranks
-        weight_amax = model.fc2.weight_quantizer.amax.clone()
-        dist.all_reduce(weight_amax, op=dist.ReduceOp.MAX, group=tp_group)
-        assert torch.allclose(weight_amax, model.fc2.weight_quantizer.amax)
+        _reduce_quantizer_attr(
+            model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, group=tp_group
+        )

    if config in [mtq.INT8_SMOOTHQUANT_CFG, mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
        # Lets check the column parallel pre_quant_scale; it should be the same across all tp ranks
        input_quantizer = model.fc1.input_quantizer
-        pre_quant_scale = input_quantizer.pre_quant_scale.clone()
-        dist.all_reduce(pre_quant_scale, op=dist.ReduceOp.MAX, group=tp_group)
-        assert torch.allclose(pre_quant_scale, input_quantizer.pre_quant_scale)
+        _reduce_quantizer_attr(
+            input_quantizer, "pre_quant_scale", dist.ReduceOp.MAX, group=tp_group
+        )
+
+    if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
+        # Check act scale
+        _reduce_quantizer_attr(
+            model.fc1.weight_quantizer.awq_lite,
+            "act_scale",
+            dist.ReduceOp.AVG,
+            group=tp_group,
+        )
+        _reduce_quantizer_attr(
+            model.fc2.weight_quantizer.awq_lite,
+            "act_scale",
+            dist.ReduceOp.AVG,
+            group=tp_group,
+        )

    dist.destroy_process_group()

@@ -159,27 +177,37 @@ def forward_loop(model):

    model = mtq.quantize(model, config, forward_loop)

-    def reduce_amax(quantizer):
-        amax = quantizer.amax.clone()
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=group)
-        assert torch.allclose(amax, quantizer.amax)
-
    # Input quantizer amax
    if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        reduce_amax(model.fc1.input_quantizer)
-        reduce_amax(model.fc2.input_quantizer)
+        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX, group=group)
+        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, group=group)

    # Weight quantizer amax
    if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
        for quantizer in model.fc1.weight_quantizer:
-            reduce_amax(quantizer)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=group)
    else:
-        reduce_amax(model.fc1.weight_quantizer)
+        _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX, group=group)
    if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
        for quantizer in model.fc2.weight_quantizer:
-            reduce_amax(quantizer)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=group)
    else:
-        reduce_amax(model.fc2.weight_quantizer)
+        _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, group=group)
+
+    if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
+        # Check act scale
+        _reduce_quantizer_attr(
+            model.fc1.weight_quantizer.awq_lite,
+            "act_scale",
+            dist.ReduceOp.AVG,
+            group=group,
+        )
+        _reduce_quantizer_attr(
+            model.fc2.weight_quantizer.awq_lite,
+            "act_scale",
+            dist.ReduceOp.AVG,
+            group=group,
+        )


def data_tensor_context_parallel_test_helper(model, config, dp_group, tp_group, cp_group):
@@ -192,33 +220,52 @@ def forward_loop(model):

    model = mtq.quantize(model, config, forward_loop)

-    def reduce_amax(quantizer):
-        amax = quantizer.amax.clone()
-        print("amax before reduce", amax)
-        print("quantizer.amax before reduce", quantizer.amax)
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=dp_group)
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=cp_group)
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=tp_group)
-        print("amax after reduce", amax)
-        print("quantizer.amax after reduce", quantizer.amax)
-        assert torch.allclose(amax, quantizer.amax)
+    def _reduce_quantizer_attr(quantizer, attr: str, op=dist.ReduceOp.MAX):
+        quantizer_attr = getattr(quantizer, attr).clone()
+        print("quantizer_attr before reduce", quantizer_attr)
+        print("quantizer.attr before reduce", getattr(quantizer, attr))
+        dist.all_reduce(quantizer_attr, op=op, group=dp_group)
+        dist.all_reduce(quantizer_attr, op=op, group=cp_group)
+        dist.all_reduce(quantizer_attr, op=op, group=tp_group)
+        print("quantizer_attr after reduce", quantizer_attr)
+        print("quantizer.attr after reduce", getattr(quantizer, attr))
+        assert torch.allclose(quantizer_attr, getattr(quantizer, attr))

    # Input quantizer amax
    if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        reduce_amax(model.fc1.input_quantizer)
-        reduce_amax(model.fc2.input_quantizer)
+        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX)
+        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX)

    if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
        for quantizer in model.fc1.weight_quantizer:
-            reduce_amax(quantizer)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
    else:
-        reduce_amax(model.fc1.weight_quantizer)
+        _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX)

    if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
        for quantizer in model.fc2.weight_quantizer:
-            reduce_amax(quantizer)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
    else:
-        reduce_amax(model.fc2.weight_quantizer)
+        _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX)
+
+    # Check act scale
+    if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
+        _reduce_quantizer_attr(
+            model.fc1.weight_quantizer.awq_lite,
+            "act_scale",
+            dist.ReduceOp.AVG,
+        )
+        _reduce_quantizer_attr(
+            model.fc2.weight_quantizer.awq_lite,
+            "act_scale",
+            dist.ReduceOp.AVG,
+        )


def auto_quantize_helper(model):
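For readers skimming the patch, here is a minimal, self-contained sketch of the consistency check that _reduce_quantizer_attr performs: each rank clones a quantizer statistic, all-reduces the clone across the process group, and asserts that the reduced value still equals the local one, which only holds if calibration already synchronized that statistic across ranks. The SimpleQuantizer stand-in and the single-process "gloo" setup below are assumptions for illustration; the actual tests operate on modelopt quantizers inside a multi-rank NCCL job.

# Illustrative sketch only -- SimpleQuantizer and the single-process "gloo" group
# are assumptions; the real tests exercise modelopt quantizers under NCCL.
import os

import torch
import torch.distributed as dist


class SimpleQuantizer:
    """Stand-in exposing an ``amax`` tensor, similar to a modelopt quantizer."""

    def __init__(self, amax: torch.Tensor):
        self.amax = amax


def _reduce_quantizer_attr(quantizer, attr: str, op=dist.ReduceOp.MAX, group=None):
    # Clone the statistic, reduce it across the group, and verify the reduced
    # value equals the local one -- i.e. calibration already synced it.
    quantizer_attr = getattr(quantizer, attr).clone()
    dist.all_reduce(quantizer_attr, op=op, group=group)
    assert torch.allclose(quantizer_attr, getattr(quantizer, attr))


if __name__ == "__main__":
    # With world_size=1 the all_reduce is a no-op, so the check trivially
    # passes; with several ranks it fails if any rank holds a different amax.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    _reduce_quantizer_attr(SimpleQuantizer(torch.tensor([1.0])), "amax")
    dist.destroy_process_group()

The choice of reduction op matters: amax is a running maximum, so MAX is the natural way to combine it, while the AWQ-lite act_scale is a per-channel average of activation magnitudes, which is consistent with the patch reducing it with AVG.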