
Commit 9409412

Fix AutoQuantize config for MoE (#641)
## What does this PR do?

**Type of change:** Bug fix

[aq_Qwen3-235B-A22B-Thinking-2507_scores.html](https://github.com/user-attachments/files/23915284/aq_Qwen3-235B-A22B-Thinking-2507_scores.html)

**Overview:** For Qwen3 MoE, sensitivity should be measured at `layer.x.mlp`, not `layer.x.mlp.experts`: the forward of the `layer.x.mlp.experts` module is never called, so sensitivity was not estimated correctly before. After this PR, Qwen3 MoE sensitivity is estimated correctly (see the attached scores file above).

## Usage

A sketch of a typical invocation is included below the commit metadata.

## Testing

`test_autoquantize_huggingface` is now parametrized over a tiny Qwen3 MoE model in addition to the tiny Llama (see the test diff below).

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

## Additional Information

---------

Signed-off-by: realAsma <[email protected]>
1 parent 9e280f4 commit 9409412
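For the Usage section above, here is a minimal sketch of how AutoQuantize might be driven after this change, roughly mirroring `examples/llm_ptq/hf_ptq.py`. The model, dataloader, constraint value, and format list are illustrative assumptions (not part of this PR), and the keyword names should be checked against the `mtq.auto_quantize` API of the installed ModelOpt version:

```python
import modelopt.torch.quantization as mtq
from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg

# Assumed setup: any HF Qwen3-MoE causal LM plus a small calibration dataloader.
model = ...             # e.g. AutoModelForCausalLM.from_pretrained(<qwen3-moe checkpoint>)
calib_dataloader = ...  # yields dicts of model inputs (including labels)


def forward_step(model, batch):
    return model(**batch)


model, search_state = mtq.auto_quantize(
    model,
    constraints={"effective_bits": 4.8},                            # illustrative target
    quantization_formats=["NVFP4_DEFAULT_CFG", "FP8_DEFAULT_CFG"],  # illustrative candidates
    data_loader=calib_dataloader,
    forward_step=forward_step,
    loss_func=lambda output, batch: output.loss,  # assumes batches carry labels
    num_score_steps=min(len(calib_dataloader), 128),
    verbose=True,
    # The fix: disable every pattern ModelOpt disables by default
    # (lm_head, mlp.gate, router, ...), not just "*lm_head*".
    disabled_layers=list(_default_disabled_quantizer_cfg.keys()),
)
```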

File tree

4 files changed: +33 −9 lines changed

examples/llm_ptq/hf_ptq.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -50,7 +50,7 @@
     get_model_type,
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
-from modelopt.torch.quantization.config import need_calibration
+from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
 from modelopt.torch.utils.dataset_utils import (
@@ -155,7 +155,8 @@ def forward_step(model, batch):
         # AutoQuantize scoring is the costly phase; allow smaller sample counts than calibration.
         num_score_steps=min(len(calib_dataloader), max(auto_quantize_score_size // batch_size, 1)),
         verbose=True,
-        disabled_layers=["*lm_head*"],
+        # Disable all default disabled layers such as lm_head, mlp.gate, router etc.
+        disabled_layers=list(_default_disabled_quantizer_cfg.keys()),
         method=auto_quantize_method,
         checkpoint=auto_quantize_checkpoint,
     )
```
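The broadened `disabled_layers` now comes straight from ModelOpt's default disabled-quantizer config. A quick way to inspect what that covers (the exact patterns depend on the installed ModelOpt version; the diff comment above cites lm_head, mlp.gate, and router):

```python
from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg

# Wildcard patterns for modules that ModelOpt never quantizes by default;
# hf_ptq.py now forwards these keys verbatim as AutoQuantize's disabled_layers.
print(list(_default_disabled_quantizer_cfg.keys()))
```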

modelopt/torch/quantization/algorithms.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -755,7 +755,7 @@ class AutoQuantizeGradientSearcher(_AutoQuantizeBaseSearcher):
 
     score_module_rules = [
         # Use MLP layer output for gate_proj, up_proj, down_proj for Qwen3 like MoE models (local and shared experts)
-        r"^(.*?\.mlp\.experts)\.\d+\.(gate_proj|up_proj|down_proj)$",
+        r"^(.*?\.mlp)\.experts\.\d+\.(gate_proj|up_proj|down_proj)$",
         r"^(.*?)\.(\d+\.(w1|w2|w3))$",  # mixtral experts
         r"^(.*?)\.((w1_linear|w2_linear|w3_linear)\.\d+)$",  # dbrx experts
     ]
```
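To make the regex change concrete, here is how the captured score module shifts for a Hugging Face Qwen3-MoE projection name. The example name is illustrative, following HF naming; per the comment on `score_module_rules`, group 1 names the module whose output is used for scoring:

```python
import re

name = "model.layers.0.mlp.experts.3.gate_proj"  # typical HF Qwen3-MoE expert projection

old_rule = r"^(.*?\.mlp\.experts)\.\d+\.(gate_proj|up_proj|down_proj)$"
new_rule = r"^(.*?\.mlp)\.experts\.\d+\.(gate_proj|up_proj|down_proj)$"

print(re.match(old_rule, name).group(1))  # model.layers.0.mlp.experts -> ModuleList, its forward never runs
print(re.match(new_rule, name).group(1))  # model.layers.0.mlp -> sparse MoE block, its forward does run
```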

tests/_test_utils/torch/transformers_models.py

Lines changed: 24 additions & 0 deletions

```diff
@@ -30,6 +30,8 @@
     LlamaForCausalLM,
     Qwen3Config,
     Qwen3ForCausalLM,
+    Qwen3MoeConfig,
+    Qwen3MoeForCausalLM,
     T5Config,
     T5ForConditionalGeneration,
     T5Tokenizer,
@@ -61,6 +63,28 @@ def get_tiny_qwen3(**config_kwargs) -> "Qwen3ForCausalLM":
     return tiny_qwen3
 
 
+def get_tiny_qwen3_moe(**config_kwargs) -> "Qwen3MoeForCausalLM":
+    set_seed(SEED)
+
+    kwargs = {
+        "hidden_size": 32,
+        "intermediate_size": 32,
+        "moe_intermediate_size": 32,
+        "num_hidden_layers": 2,
+        "num_attention_heads": 16,
+        "num_key_value_heads": 2,
+        "max_position_embeddings": 32,
+        "vocab_size": 32,
+        "num_experts": 4,
+        "num_experts_per_tok": 2,
+        "decoder_sparse_step": 1,
+    }
+    kwargs.update(**config_kwargs)
+    tiny_qwen3_moe = Qwen3MoeForCausalLM(Qwen3MoeConfig(**kwargs))
+
+    return tiny_qwen3_moe
+
+
 def get_tiny_llama(**config_kwargs) -> LlamaForCausalLM:
     set_seed(SEED)
     kwargs = {
```
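The new tiny fixture also makes the underlying bug easy to reproduce by hand: hook every `...mlp` and `...mlp.experts` module and see which forwards actually fire. This is a sketch, assuming a transformers version that ships `Qwen3MoeForCausalLM`; it mirrors the fixture's config rather than importing the test helper:

```python
import torch
from transformers import Qwen3MoeConfig, Qwen3MoeForCausalLM

# Same tiny configuration as get_tiny_qwen3_moe above.
model = Qwen3MoeForCausalLM(
    Qwen3MoeConfig(
        hidden_size=32, intermediate_size=32, moe_intermediate_size=32,
        num_hidden_layers=2, num_attention_heads=16, num_key_value_heads=2,
        max_position_embeddings=32, vocab_size=32, num_experts=4,
        num_experts_per_tok=2, decoder_sparse_step=1,
    )
)

called = set()
for name, module in model.named_modules():
    if name.endswith(".mlp") or name.endswith(".mlp.experts"):
        module.register_forward_hook(lambda mod, args, out, n=name: called.add(n))

model(torch.randint(0, 32, (1, 8)))
print(sorted(called))
# Only the "...mlp" names appear: "...mlp.experts" is an nn.ModuleList whose own
# forward is never invoked, which is why sensitivity has to be scored at "...mlp".
```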

tests/unit/torch/quantization/plugins/test_huggingface.py

Lines changed: 5 additions & 6 deletions

```diff
@@ -24,6 +24,7 @@
 from _test_utils.torch.transformers_models import (
     create_tiny_llama_dir,
     get_tiny_llama,
+    get_tiny_qwen3_moe,
     tf_modelopt_state_and_output_tester,
 )
 
@@ -137,12 +138,10 @@ def test_dbrx():
     assert torch.allclose(out_1[0], out_2[0])
 
 
-@pytest.mark.parametrize(
-    "method",
-    ["gradient", "kl_div"],
-)
-def test_autoquantize_huggingface(method):
-    model = get_tiny_llama()
+@pytest.mark.parametrize("method", ["gradient", "kl_div"])
+@pytest.mark.parametrize("model_provider", [get_tiny_llama, get_tiny_qwen3_moe])
+def test_autoquantize_huggingface(model_provider, method):
+    model = model_provider()
     input_ids = model.dummy_inputs["input_ids"]
 
     def forward_step(model, batch):
```
