Commit e599d43

unit test
Signed-off-by: weimingc <[email protected]>
1 parent d1c5d19 commit e599d43

2 files changed: +102 / -1 lines changed

examples/vllm_serve/vllm_serve_fakequant.py

Lines changed: 3 additions & 1 deletion

@@ -97,7 +97,8 @@ def disable_compilation(model):
 quant_config: dict[str, Any] = {
     "quant_dataset": "cnn_dailymail",
     "quant_num_samples": 512,
-    "quant_format": "NVFP4_DEFAULT_CFG",
+    # "quant_format": "NVFP4_DEFAULT_CFG",
+    "quant_format": "NVFP4_AWQ_LITE_CFG",
     "amax_file_path": None,  # Optional: path to pre-computed amax values (e.g., "/path/to/amax.pt")
 }

@@ -176,6 +177,7 @@ def calibrate_loop(model: Any = None) -> None:

         quant_cfg = getattr(mtq, quant_config["quant_format"])

+        print(f"Quantizing model with {quant_config['quant_format']} format")
         with disable_compilation(self.model):
             mtq.quantize(self.model, quant_cfg, forward_loop=calibrate_loop)
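
Note: the change above only swaps which ModelOpt config name the example resolves at runtime via getattr(mtq, ...). Below is a minimal standalone sketch of that resolve-and-quantize flow, assuming a CUDA device, an illustrative Hugging Face checkpoint, and a one-sample calibration loop in place of the example's cnn_dailymail-driven one:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import modelopt.torch.quantization as mtq

# Placeholder checkpoint and calibration text, for illustration only.
model_id = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

quant_format = "NVFP4_AWQ_LITE_CFG"  # or "NVFP4_DEFAULT_CFG", as before this change
quant_cfg = getattr(mtq, quant_format)


def calibrate_loop(m=None):
    # Stand-in for the example's dataset-driven calibration loop.
    m = m if m is not None else model
    inputs = tokenizer("a short calibration sample", return_tensors="pt").to("cuda")
    with torch.no_grad():
        m(**inputs)


print(f"Quantizing model with {quant_format} format")
mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)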

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+pytest.importorskip("transformers")
+
+from transformers import LlamaConfig, LlamaForCausalLM
+
+import modelopt.torch.quantization as mtq
+from modelopt.torch.export.quant_utils import pattern_fuse_prequant
+
+
+def get_tiny_llama(attention_heads=4, key_value_heads=4):
+    """Create a tiny Llama model for testing."""
+    config = LlamaConfig(
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=attention_heads,
+        num_key_value_heads=key_value_heads,
+        max_position_embeddings=128,
+        vocab_size=256,
+    )
+    return LlamaForCausalLM(config)
+
+
+@pytest.mark.parametrize(
+    "quant_config",
+    [
+        mtq.INT4_AWQ_CFG,
+        mtq.NVFP4_AWQ_LITE_CFG,
+    ],
+)
+@pytest.mark.parametrize(
+    "attention_kv_heads_pair",
+    [
+        (4, 4),  # MHA
+        (4, 2),  # GQA
+        (4, 1),  # MQA
+    ],
+)
+def test_pattern_fuse_prequant(quant_config, attention_kv_heads_pair):
+    """Test pattern_fuse_prequant on modules from a tiny Llama model."""
+    model = get_tiny_llama(attention_kv_heads_pair[0], attention_kv_heads_pair[1]).to("cuda")
+
+    # Quantize the model
+    dummy_input = torch.randint(0, 256, (1, 16), device="cuda")
+    mtq.quantize(model, quant_config, lambda m: m(dummy_input))
+
+    # Run a forward pass before fusion
+    model.eval()
+    with torch.no_grad():
+        output_before_fuse = model(dummy_input)
+
+    target_module_name_list = [
+        "model.layers.0.self_attn.o_proj",
+        "model.layers.0.mlp.down_proj",
+        "model.layers.1.self_attn.o_proj",
+        "model.layers.1.mlp.down_proj",
+    ]
+
+    # Apply fusion
+    pattern_fuse_prequant(model)
+
+    # Check that pre_quant_scale was removed and the fused_with_prequant flag was set
+    for target_module_name in target_module_name_list:
+        target_module = model.get_submodule(target_module_name)
+
+        # Verify pre_quant_scale was removed
+        assert not hasattr(target_module.input_quantizer, "_pre_quant_scale"), (
+            f"{target_module_name}: pre_quant_scale should be removed after fusion"
+        )
+
+        # Verify fused_with_prequant flag was set
+        assert (
+            hasattr(target_module, "fused_with_prequant") and target_module.fused_with_prequant
+        ), f"{target_module_name}: fused_with_prequant flag should be set"
+
+    # Verify the output stays close to the original output
+    with torch.no_grad():
+        output_after_fuse = model(dummy_input)
+    # Small differences are expected from quantization error after fusing pre_quant_scale into the weights
+    assert torch.allclose(
+        output_before_fuse.logits, output_after_fuse.logits, rtol=1e-1, atol=5e-1
+    ), "Outputs should match closely before and after fusion"
