
Commit d02365c

awq test

Signed-off-by: Jennifer Chen <[email protected]>

1 parent 71a9f7a · commit d02365c

File tree: 5 files changed, +56 -15 lines changed

modelopt/torch/quantization/model_calib.py

Lines changed: 9 additions & 9 deletions
@@ -617,20 +617,20 @@ def sync_act_scale_across_dp_cp(module, data_parallel_group, context_parallel_group):
             and hasattr(module, "awq_lite")
             and module.awq_lite.num_cache_steps > 0
         ):
+            # Hack: MoEs forward all tokens through all experts if _if_calib is True
+            module._if_calib = True
             module.awq_lite.act_scale = module.awq_lite.act_scale / module.awq_lite.num_cache_steps
-
+
             if torch.any(torch.isnan(module.awq_lite.act_scale)) or torch.any(
                 torch.isnan(module.awq_lite.weight_scale)
             ):
                 module.awq_lite.is_enabled = False
-
-            sync_act_scale_across_dp_cp(
-                module,
-                module.parallel_state.data_parallel_group,
-                module.parallel_state.context_parallel_group,
-            )
-            # Hack: MoEs forward all tokens through all experts if _if_calib is True
-            module._if_calib = True
+            else:
+                sync_act_scale_across_dp_cp(
+                    module,
+                    module.parallel_state.data_parallel_group,
+                    module.parallel_state.context_parallel_group,
+                )
 
     AWQLiteHelper.cache_mode = False
     print_rank_0("awq_lite: Searching parameters...")

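The reordering above enables the MoE calibration hack before the cached activation scales are averaged, and it now synchronizes act_scale across data- and context-parallel ranks only when the scales are finite; if a NaN shows up in either scale, awq_lite is disabled for that module instead. As a minimal sketch (an assumption, not the code from model_calib.py), sync_act_scale_across_dp_cp could be an averaging all-reduce over both groups, mirroring the AVG reduction used by the new test at the end of this commit:

    import torch.distributed as dist

    def sync_act_scale_across_dp_cp(module, data_parallel_group, context_parallel_group):
        # Sketch only: average the cached activation scale over the DP group and then the
        # CP group so every replica searches AWQ-lite parameters from identical statistics.
        for group in (data_parallel_group, context_parallel_group):
            if group is not None:
                dist.all_reduce(module.awq_lite.act_scale, op=dist.ReduceOp.AVG, group=group)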
tests/_test_utils/torch_dist/plugins/megatron_common.py

Lines changed: 4 additions & 1 deletion
@@ -384,7 +384,10 @@ def run_mcore_inference_with_dummy_input(
 
 
 def initialize_for_megatron(
-    tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, seed=1234
+    tensor_model_parallel_size=1,
+    pipeline_model_parallel_size=1,
+    seed=1234,
+    context_parallel_size=1,
 ):
     """Initialize Megatron model parallelism.

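Note that reordering defaulted parameters is only transparent to callers that pass them by keyword: under the old signature a positional call such as initialize_for_megatron(1, 1, 2, 1234) set context_parallel_size=2, whereas under the new one the 2 would bind to seed. The call sites touched by this commit all use keywords, for example (SEED, rank, and size come from the test helpers below):

    initialize_for_megatron(seed=SEED + rank)
    initialize_for_megatron(context_parallel_size=size, seed=SEED + rank)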
tests/_test_utils/torch_quantization/quantize_common.py

Lines changed: 6 additions & 2 deletions
@@ -194,9 +194,13 @@ def forward_loop(model):
 
     def reduce_amax(quantizer):
         amax = quantizer.amax.clone()
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=tp_group)
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=cp_group)
+        print("amax before reduce", amax)
+        print("quantizer.amax before reduce", quantizer.amax)
         dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=dp_group)
+        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=cp_group)
+        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=tp_group)
+        print("amax after reduce", amax)
+        print("quantizer.amax after reduce", quantizer.amax)
         assert torch.allclose(amax, quantizer.amax)
 
     # Input quantizer amax

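Since max is commutative and associative, reducing over the dp, cp, and tp groups in any order yields the same global maximum, so swapping the reduction order does not change what the assertion verifies; the extra prints are plain debug output around it. The invariant can be restated compactly as a sketch (dp_group, cp_group, and tp_group are assumed to be the process groups the test helper already builds):

    import torch
    import torch.distributed as dist

    def assert_amax_synced(quantizer, groups):
        # Reduce a copy of this rank's amax to the maximum over all groups; if calibration
        # already synchronized amax, the reduced value equals the stored one on every rank.
        amax = quantizer.amax.clone()
        for group in groups:  # order does not matter for a MAX reduction
            dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=group)
        assert torch.allclose(amax, quantizer.amax)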
tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 4 additions & 3 deletions
@@ -123,8 +123,7 @@ def test_tensor_parallel(need_2_gpus, config):
 
 # 2. Data Parallel Test
 def _test_data_parallel_helper(config, rank, size):
-    # TODO does this model automatically get copied to both DP ranks?
-    initialize_for_megatron(seed=SEED)
+    initialize_for_megatron(seed=SEED + rank)  # modify seed so data is different across ranks
     model = MegatronModel().cuda()
 
     dp_cp_parallel_test_helper(model, config, get_data_parallel_group())
@@ -148,7 +147,9 @@ def test_data_parallel(need_2_gpus, config):
 
 # 3. Context Parallel Test
 def _test_context_parallel_helper(config, rank, size):
-    initialize_for_megatron(context_parallel_size=size, seed=SEED)
+    initialize_for_megatron(
+        context_parallel_size=size, seed=SEED + rank
+    )  # modify seed so data is different across ranks
     model = MegatronModel(cp_size=size).cuda()
 
     dp_cp_parallel_test_helper(model, config, get_context_parallel_group())
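Seeding each rank with SEED + rank makes the dummy calibration data differ across data- and context-parallel ranks, so the per-rank statistics genuinely diverge before synchronization and the helper's assertions become meaningful. A hedged sketch of how these helpers are typically driven, assuming spawn_multiprocess_job passes (rank, size) to the job the same way the new AWQ test below does:

    from functools import partial

    # config is bound up front; spawn_multiprocess_job supplies rank and size per process.
    spawn_multiprocess_job(size=2, job=partial(_test_data_parallel_helper, config), backend="nccl")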
New test file

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+import torch
+import torch.distributed as dist
+from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job
+from _test_utils.torch_dist.plugins.megatron_common import MegatronModel, initialize_for_megatron
+from megatron.core.parallel_state import get_data_parallel_group
+
+from modelopt.torch.quantization.model_calib import awq_lite
+
+
+def _test_awq_lite_act_scale_sync_helper(rank, size):
+    initialize_for_megatron(seed=1234 + rank)
+    model = MegatronModel().cuda()
+
+    calib_data = model.get_dummy_input().cuda()
+
+    def forward_loop(model):
+        model(calib_data)
+
+    model = awq_lite(model, forward_loop)
+    # Sanity check
+    forward_loop(model)
+
+    act_scale = model.fc1.weight_quantizer.awq_lite.act_scale.clone()
+    dist.all_reduce(act_scale, op=dist.ReduceOp.AVG, group=get_data_parallel_group())
+    assert torch.allclose(act_scale, model.fc1.weight_quantizer.awq_lite.act_scale)
+
+    act_scale = model.fc2.weight_quantizer.awq_lite.act_scale.clone()
+    dist.all_reduce(act_scale, op=dist.ReduceOp.AVG, group=get_data_parallel_group())
+    assert torch.allclose(act_scale, model.fc2.weight_quantizer.awq_lite.act_scale)
+
+
+def test_awq_lite_act_scale_sync(need_2_gpus):
+    spawn_multiprocess_job(size=2, job=_test_awq_lite_act_scale_sync_helper, backend="nccl")

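The new test calibrates a two-rank data-parallel model on per-rank data and then checks that each layer's act_scale already equals the data-parallel average: averaging a tensor that is already the group mean is a no-op, so the allclose checks pass only if the scales were synchronized during calibration. A self-contained illustration of that idempotence (hypothetical two-rank values, no distributed setup required):

    import torch

    rank_scales = [torch.tensor([0.9, 1.1]), torch.tensor([1.3, 0.7])]  # pre-sync act_scale per rank
    group_mean = torch.stack(rank_scales).mean(dim=0)                   # what the DP sync produces
    # Averaging tensors that already equal the group mean returns the same tensor.
    assert torch.allclose(torch.stack([group_mean, group_mean]).mean(dim=0), group_mean)

On a machine with two GPUs, the case can be selected with pytest -k test_awq_lite_act_scale_sync (both the need_2_gpus fixture and the nccl backend require them).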