@@ -1,6 +1,11 @@
+import json
+import tempfile
+from pathlib import Path
+
 import pytest
 
 from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
 from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig
 
@@ -71,6 +76,105 @@ def test_llm_fp8_quantization_modelOpt_ckpt():
     assert output.outputs[0].text == " D E F G H I"
 
 
+def test_quant_cfg_from_quant_cfg_json():
+    """
+    Test loading MIXED_PRECISION config from quant_cfg.json with per-layer quantization.
+    This supports the workflow from examples/quantization/quantize_mixed_precision_moe.py.
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        model_dir = Path(tmp_dir)
+
+        # Create dummy quant_cfg.json
+        quant_cfg_content = {
+            "quant_algo": "MIXED_PRECISION",
+            "kv_cache_quant_algo": "FP8",
+            "quantized_layers": {
+                "model.layers.0.self_attn.q_proj": {
+                    "quant_algo": "FP8"
+                },
+                "model.layers.0.self_attn.k_proj": {
+                    "quant_algo": "FP8"
+                },
+                "model.layers.1.mlp.gate_proj": {
+                    "quant_algo": "W4A8_AWQ",
+                    "group_size": 128
+                }
+            }
+        }
+
+        quant_cfg_file = model_dir / "quant_cfg.json"
+        with open(quant_cfg_file, 'w') as f:
+            json.dump(quant_cfg_content, f)
+
+        # Create dummy hf_quant_config.json
+        hf_quant_config_content = {
+            "quantization": {
+                "quant_algo": "MIXED_PRECISION",
+                "kv_cache_quant_algo": None,
+            }
+        }
+
+        hf_quant_config_file = model_dir / "hf_quant_config.json"
+        with open(hf_quant_config_file, 'w') as f:
+            json.dump(hf_quant_config_content, f)
+
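+        # NOTE: hf_quant_config.json above deliberately leaves
+        # kv_cache_quant_algo unset; with MIXED_PRECISION the per-layer details
+        # come from quant_cfg.json when it is present, so the FP8 value
+        # asserted below can only originate from quant_cfg.json.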
+        quant_config, layer_quant_config = ModelConfig.load_modelopt_quant_config(
+            hf_quant_config_file, model_dir, None)
+
+        # Verify quant_cfg.json was loaded
+        assert quant_config.quant_algo == QuantAlgo.MIXED_PRECISION
+        assert quant_config.kv_cache_quant_algo == "FP8"
+
+        # Verify layer configs were created correctly
+        assert layer_quant_config[
+            "model.layers.0.self_attn.q_proj"].quant_algo == "FP8"
+        assert layer_quant_config[
+            "model.layers.0.self_attn.q_proj"].kv_cache_quant_algo == "FP8"
+        assert layer_quant_config[
+            "model.layers.1.mlp.gate_proj"].quant_algo == "W4A8_AWQ"
+        assert layer_quant_config[
+            "model.layers.1.mlp.gate_proj"].group_size == 128
+
+
+def test_quant_cfg_from_hf_quant_config():
+    """Test fallback to hf_quant_config.json when quant_cfg.json is missing."""
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        model_dir = Path(tmp_dir)
+
+        # Create dummy hf_quant_config.json
+        hf_quant_config_content = {
+            "quantization": {
+                "quant_algo": "MIXED_PRECISION",
+                "kv_cache_quant_algo": "FP8",
+                "quantized_layers": {
+                    "model.layers.0.self_attn.q_proj": {
+                        "quant_algo": "FP8"
+                    },
+                    "model.layers.0.mlp.up_proj": {
+                        "quant_algo": "W4A16_AWQ",
+                        "group_size": 64
+                    }
+                }
+            }
+        }
+        hf_quant_config_file = model_dir / "hf_quant_config.json"
+        with open(hf_quant_config_file, 'w') as f:
+            json.dump(hf_quant_config_content, f)
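+        # No quant_cfg.json is written here, so the loader is expected to fall
+        # back to the per-layer entries embedded in hf_quant_config.json.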
+        quant_config, layer_quant_config = ModelConfig.load_modelopt_quant_config(
+            hf_quant_config_file, model_dir, None)
+
+        # Verify layer configs
+        assert quant_config.quant_algo == QuantAlgo.MIXED_PRECISION
+        assert quant_config.kv_cache_quant_algo == "FP8"
+        assert layer_quant_config[
+            "model.layers.0.self_attn.q_proj"].quant_algo == "FP8"
+        assert layer_quant_config[
+            "model.layers.0.mlp.up_proj"].quant_algo == "W4A16_AWQ"
+        assert layer_quant_config["model.layers.0.mlp.up_proj"].group_size == 64
+
+
 if __name__ == "__main__":
     test_llm_int4_awq_quantization()
     test_llm_fp8_quantization_modelOpt_ckpt()
+    test_quant_cfg_from_quant_cfg_json()
+    test_quant_cfg_from_hf_quant_config()