Commit 34c11ef

fix test

Signed-off-by: Jennifer Chen <[email protected]>

1 parent: 95da832 · commit: 34c11ef

3 files changed: +13 additions, -20 deletions

3 files changed

+13
-20
lines changed

examples/nemo_run/qat/README.md

Lines changed: 4 additions & 3 deletions
@@ -56,15 +56,16 @@ The resulting exported checkpoint also is much smaller in memory at 6.4GB compar
 
 You can run the example either locally or on a [Slurm cluster](ADVANCED.md).
 
-To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.07 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container.
+To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container.
 
 - `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git`
-- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout 676ed1a`
+- `git clone https://github.com/NVIDIA-NeMo/NeMo.git`
+- `git clone https://github.com/NVIDIA/Megatron-LM.git`
 
 Example docker command:
 
 ```bash
-docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.07 bash
+docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt -v /home/user/Megatron-LM:/opt/megatron-lm --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash
 ```
 
 You will also need to set your Huggingface token with `export HF_TOKEN=<your-token>`. You may also need to enable write access to the docker container to the `examples/nemo_run` folder by doing `chmod 777 nemo_run` so that logs can be written.

tests/_test_utils/torch_quantization/quantize_common.py

Lines changed: 7 additions & 15 deletions
@@ -23,6 +23,7 @@
 
 import modelopt.torch.opt as mto
 import modelopt.torch.quantization as mtq
+import modelopt.torch.quantization.model_calib as model_calib_module  # needed for patching awq_lite
 from modelopt.torch.quantization.backends.gemm_registry import enable_real_quant_gemm
 from modelopt.torch.quantization.nn.modules.tensor_quantizer import SequentialQuantizer
 from modelopt.torch.quantization.utils import is_quantized_linear
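
The import added above is there so the tests can monkey-patch `awq_lite`; the next hunk keeps a module-level handle to the original function as `original_awq_lite`. A minimal sketch of that patch-and-restore pattern follows; the wrapper and helper names (`_spy_awq_lite`, `run_with_patched_awq_lite`) are hypothetical and not part of this commit:

```python
import modelopt.torch.quantization.model_calib as model_calib_module

# Module-level handle so the real implementation can always be restored.
original_awq_lite = model_calib_module.awq_lite


def _spy_awq_lite(*args, **kwargs):
    # Hypothetical wrapper: a test could record the call or inspect the
    # arguments here before delegating to the real implementation.
    return original_awq_lite(*args, **kwargs)


def run_with_patched_awq_lite(fn):
    """Hypothetical helper: run `fn` with awq_lite patched, then restore it."""
    model_calib_module.awq_lite = _spy_awq_lite
    try:
        return fn()
    finally:
        model_calib_module.awq_lite = original_awq_lite
```

Restoring in a `finally` block keeps later tests from seeing the patched function even when an assertion fails.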
@@ -127,9 +128,6 @@ def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX, group=None
     assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
 
 
-# Store the original function before patching
-import modelopt.torch.quantization.model_calib as model_calib_module
-
 original_awq_lite = model_calib_module.awq_lite
 
 
@@ -252,38 +250,32 @@ def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX):
 
     # Input quantizer amax
     if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
-        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
+        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX)
+        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX)
 
     if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
         for quantizer in model.fc1.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
     else:
-        _reduce_quantizer_attr(
-            model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group
-        )
+        _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX)
 
     if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
         for quantizer in model.fc2.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=dp_group)
+            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
     else:
-        _reduce_quantizer_attr(
-            model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, group=dp_group
-        )
+        _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX)
 
     # Check act scale
     if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
         _reduce_quantizer_attr(
             model.fc1.awq_lite,
             "act_scale",
             dist.ReduceOp.AVG,
-            group=tp_group,
         )
         _reduce_quantizer_attr(
             model.fc2.awq_lite,
             "act_scale",
             dist.ReduceOp.AVG,
-            group=tp_group,
         )
 
 
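
For reference, `_reduce_quantizer_attr` all-reduces a quantizer attribute across ranks and asserts that the local value already matches the reduced one (the `assert torch.allclose(...)` shown in the earlier hunk). A simplified sketch of that check, assuming `torch.distributed` is already initialized with a backend that supports `ReduceOp.AVG` (e.g. NCCL) and using the default process group now that the explicit `group=` arguments are gone:

```python
import torch
import torch.distributed as dist


def _reduce_quantizer_attr(quantizer, attr="amax", op=dist.ReduceOp.MAX):
    # Simplified sketch of the test helper: reduce the attribute over the
    # default process group and check that this rank already holds that value.
    quantizer_attr = getattr(quantizer, attr).clone()
    dist.all_reduce(quantizer_attr, op=op)  # MAX for amax, AVG for act_scale
    assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
```

With `ReduceOp.MAX` the assertion only passes if `amax` was already synchronized to the group maximum; with `ReduceOp.AVG` each rank's `act_scale` must already equal the group average.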

tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 2 additions & 2 deletions
@@ -214,7 +214,7 @@ def _gpt_model_provider(tp_size: int, hidden_size=256, vocab_size=64, meta_devic
         tensor_model_parallel_size=tp_size,
         num_layers=4,
         ffn_hidden_size=None,
-        num_attention_heads=4,
+        num_attention_heads=8,
         activation_func="squared_relu",
         transformer_impl="local",
         hidden_size=hidden_size,
@@ -226,7 +226,7 @@ def _gpt_model_provider(tp_size: int, hidden_size=256, vocab_size=64, meta_devic
         tensor_model_parallel_size=tp_size,
         num_layers=4,
         ffn_hidden_size=None,
-        num_attention_heads=4,
+        num_attention_heads=8,
         activation_func="squared_relu",
         transformer_impl="local",
         hidden_size=hidden_size,
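
For context (not stated in the commit): Megatron GPT configs require `hidden_size` to be divisible by `num_attention_heads` and the head count to be divisible by the tensor-parallel size, so a quick sanity check for the test's settings looks like this, with `tp_size = 2` chosen purely for illustration:

```python
# Illustrative check only; tp_size is a hypothetical value, not taken from the test.
hidden_size = 256
num_attention_heads = 8
tp_size = 2

assert hidden_size % num_attention_heads == 0  # per-head dim: 256 // 8 == 32
assert num_attention_heads % tp_size == 0      # heads split evenly across TP ranks
```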
