Commit 34c11ef

fix test

Signed-off-by: Jennifer Chen <[email protected]>

1 parent: 95da832 · commit: 34c11ef

3 files changed: +13 additions, -20 deletions

3 files changed

+13
-20
lines changed

examples/nemo_run/qat/README.md

Lines changed: 4 additions & 3 deletions
@@ -56,15 +56,16 @@ The resulting exported checkpoint also is much smaller in memory at 6.4GB compar
 
 You can run the example either locally or on a [Slurm cluster](ADVANCED.md).
 
-To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.07 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container.
+To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container.
 
 - `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git`
-- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout 676ed1a`
+- `git clone https://github.com/NVIDIA-NeMo/NeMo.git`
+- `git clone https://github.com/NVIDIA/Megatron-LM.git`
 
 Example docker command:
 
 ```bash
-docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.07 bash
+docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt -v /home/user/Megatron-LM:/opt/megatron-lm --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash
 ```
 
 You will also need to set your Huggingface token with `export HF_TOKEN=<your-token>`. You may also need to enable write access to the docker container to the `examples/nemo_run` folder by doing `chmod 777 nemo_run` so that logs can be written.

tests/_test_utils/torch_quantization/quantize_common.py

Lines changed: 7 additions & 15 deletions
@@ -23,6 +23,7 @@
 
 import modelopt.torch.opt as mto
 import modelopt.torch.quantization as mtq
+import modelopt.torch.quantization.model_calib as model_calib_module  # needed for patching awq_lite
 from modelopt.torch.quantization.backends.gemm_registry import enable_real_quant_gemm
 from modelopt.torch.quantization.nn.modules.tensor_quantizer import SequentialQuantizer
 from modelopt.torch.quantization.utils import is_quantized_linear
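
The import added above is there so the tests can monkey-patch `awq_lite`; the next hunk keeps a module-level handle to the original function as `original_awq_lite`. A minimal sketch of that patch-and-restore pattern follows; the wrapper and helper names (`_spy_awq_lite`, `run_with_patched_awq_lite`) are hypothetical and not part of this commit:

```python
import modelopt.torch.quantization.model_calib as model_calib_module

# Module-level handle so the real implementation can always be restored.
original_awq_lite = model_calib_module.awq_lite


def _spy_awq_lite(*args, **kwargs):
    # Hypothetical wrapper: a test could record the call or inspect the
    # arguments here before delegating to the real implementation.
    return original_awq_lite(*args, **kwargs)


def run_with_patched_awq_lite(fn):
    """Hypothetical helper: run `fn` with awq_lite patched, then restore it."""
    model_calib_module.awq_lite = _spy_awq_lite
    try:
        return fn()
    finally:
        model_calib_module.awq_lite = original_awq_lite
```

Restoring in a `finally` block keeps later tests from seeing the patched function even when an assertion fails.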
@@ -127,9 +128,6 @@ def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX, group=None
     assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
 
 
-# Store the original function before patching
-import modelopt.torch.quantization.model_calib as model_calib_module
-
 original_awq_lite = model_calib_module.awq_lite
 
 
@@ -252,38 +250,32 @@ def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX):
 
     # Input quantizer amax
     if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
-        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
+        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX)
+        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX)
 
     if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
         for quantizer in model.fc1.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
     else:
-        _reduce_quantizer_attr(
-            model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group
-        )
+        _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX)
 
     if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
         for quantizer in model.fc2.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
     else:
-        _reduce_quantizer_attr(
-            model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group
-        )
+        _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX)
 
     # Check act scale
     if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
         _reduce_quantizer_attr(
             model.fc1.awq_lite,
             "act_scale",
             dist.ReduceOp.AVG,
-            group=tp_group,
         )
         _reduce_quantizer_attr(
             model.fc2.awq_lite,
             "act_scale",
             dist.ReduceOp.AVG,
-            group=tp_group,
         )
 
 
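
For reference, `_reduce_quantizer_attr` all-reduces a quantizer attribute across ranks and asserts that the local value already matches the reduced one (the `assert torch.allclose(...)` shown in the earlier hunk). A simplified sketch of that check, assuming `torch.distributed` is already initialized with a backend that supports `ReduceOp.AVG` (e.g. NCCL) and using the default process group now that the explicit `group=` arguments are gone:

```python
import torch
import torch.distributed as dist


def _reduce_quantizer_attr(quantizer, attr="amax", op=dist.ReduceOp.MAX):
    # Simplified sketch of the test helper: reduce the attribute over the
    # default process group and check that this rank already holds that value.
    quantizer_attr = getattr(quantizer, attr).clone()
    dist.all_reduce(quantizer_attr, op=op)  # MAX for amax, AVG for act_scale
    assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
```

With `ReduceOp.MAX` the assertion only passes if `amax` was already synchronized to the group maximum; with `ReduceOp.AVG` each rank's `act_scale` must already equal the group average.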

tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 2 additions & 2 deletions
@@ -214,7 +214,7 @@ def _gpt_model_provider(tp_size: int, hidden_size=256, vocab_size=64, meta_devic
         tensor_model_parallel_size=tp_size,
         num_layers=4,
         ffn_hidden_size=None,
-        num_attention_heads=4,
+        num_attention_heads=8,
         activation_func="squared_relu",
         transformer_impl="local",
         hidden_size=hidden_size,
@@ -226,7 +226,7 @@ def _gpt_model_provider(tp_size: int, hidden_size=256, vocab_size=64, meta_devic
         tensor_model_parallel_size=tp_size,
         num_layers=4,
         ffn_hidden_size=None,
-        num_attention_heads=4,
+        num_attention_heads=8,
         activation_func="squared_relu",
         transformer_impl="local",
         hidden_size=hidden_size,
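
For context (not stated in the commit): Megatron GPT configs require `hidden_size` to be divisible by `num_attention_heads` and the head count to be divisible by the tensor-parallel size, so a quick sanity check for the test's settings looks like this, with `tp_size = 2` chosen purely for illustration:

```python
# Illustrative check only; tp_size is a hypothetical value, not taken from the test.
hidden_size = 256
num_attention_heads = 8
tp_size = 2

assert hidden_size % num_attention_heads == 0  # per-head dim: 256 // 8 == 32
assert num_attention_heads % tp_size == 0      # heads split evenly across TP ranks
```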
