@@ -938,9 +938,8 @@ def all_items_same(item_list):
 ]
 
 
-# TODO: make this more general instead of rule based
-def pattern_fuse_prequant(model: torch.nn.Module, fuse_mismatch_dim=False):
-    """Fuse pre_quant_scale to the linear weights.
+def fuse_prequant_to_linear(model: torch.nn.Module, fuse_grouped_heads=False):
+    """Fuse pre_quant_scale to the linear weights if possible.
 
     For example, we can fuse the pre_quant_scale of o_proj to the output_dimension of v_proj, such that
     the results are mathematically equivalent to the following::
@@ -955,26 +954,13 @@ def pattern_fuse_prequant(model: torch.nn.Module, fuse_mismatch_dim=False):
 
     Args:
         model: The model to fuse pre_quant_scale to.
-        fuse_mismatch_dim: If True, fuse the pre_quant_scale even if dimension between pre_quant_scale
+        fuse_grouped_heads: If True, fuse the pre_quant_scale even if the dimension between pre_quant_scale
             and linear weights is not the same. This is useful for GQA/MQA models but may lead to accuracy
             drop.
 
     Note:
-        This is an experimental feature, and it might mess up the quantization errors
-        of fused linear modules.
+        ``fuse_grouped_heads`` is useful for GQA/MQA models but may lead to an accuracy drop.
     """
-    # For MoE models, let's first resmooth the w1 and w3 in experts to get the average pre_quant_scale
-    for _, module in model.named_modules():
-        if (
-            hasattr(module, "experts")
-            and "Qwen3MoeSparseMoeBlock".lower() in type(module).__name__.lower()
-        ):
-            linear_list = []
-            linear_list.extend([getattr(expert, "up_proj") for expert in module.experts])
-            linear_list.extend([getattr(expert, "gate_proj") for expert in module.experts])
-            preprocess_linear_fusion(linear_list, resmooth_only=True)
-
-    # import pdb; pdb.set_trace()
     # Fuse pre_quant_scale to the linear weights
     for _, module in model.named_modules():
         for module_map in PQS_FUSE_MODULE_MAPPING:
@@ -988,10 +974,10 @@ def pattern_fuse_prequant(model: torch.nn.Module, fuse_mismatch_dim=False):
             ):
                 pre_quant_scale = linear_pqs_from.input_quantizer._pre_quant_scale
 
-                # for GQA/MQA models, we apply averaging to the pre_quant_scale for shared head groups
+                # for GQA/MQA models, we can apply averaging to the pre_quant_scale for shared head groups
                 if pre_quant_scale.numel() != linear_fuse_into.weight.shape[-2]:
                     if (
-                        not fuse_mismatch_dim
+                        not fuse_grouped_heads
                         or "attention" not in type(module).__name__.lower()
                     ):
                         warn(
@@ -1041,7 +1027,7 @@ def _update_pre_quant_scale(module, new_pre_quant_scale):
                     # Use averaged scale (flattened) for v_proj fusion
                     pre_quant_scale = averaged_scale.reshape(-1)
 
-                # Fuse the pre_quant_scale to v_proj weight
+                # Fuse the pre_quant_scale into the weight
                 linear_fuse_into.weight = torch.nn.Parameter(
                     linear_fuse_into.weight * pre_quant_scale.view(-1, 1)
                 )
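
For reference, here is a minimal sketch of why folding o_proj's pre_quant_scale into the output channels of v_proj's weight is mathematically equivalent (module names and dimensions are illustrative, not taken from the diff; the equivalence extends through attention because the softmax weighting mixes tokens per head, not channels, when the head counts match):

```python
import torch

torch.manual_seed(0)
hidden = 8
x = torch.randn(4, hidden)

# Stand-ins for v_proj / o_proj and o_proj's per-channel pre_quant_scale.
v_proj = torch.nn.Linear(hidden, hidden, bias=False)
o_proj = torch.nn.Linear(hidden, hidden, bias=False)
pre_quant_scale = torch.rand(hidden) + 0.5

# Reference path: o_proj's input is multiplied element-wise by its pre_quant_scale.
ref = o_proj(pre_quant_scale * v_proj(x))

# Fused path: fold the scale into v_proj's output channels by scaling the rows of
# its weight, mirroring `linear_fuse_into.weight * pre_quant_scale.view(-1, 1)`.
fused_v = torch.nn.Linear(hidden, hidden, bias=False)
fused_v.weight = torch.nn.Parameter(v_proj.weight * pre_quant_scale.view(-1, 1))
out = o_proj(fused_v(x))

assert torch.allclose(ref, out, atol=1e-6)
```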
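When `fuse_grouped_heads` is set and the scale length does not match v_proj's output dimension (GQA/MQA), the scale is averaged over the query heads that share a KV head before fusing. A hedged sketch of one plausible layout for that averaging (head counts and the exact reshape are assumptions, since that code is elided from the hunks shown):

```python
import torch

# Illustrative GQA shapes (assumed): 8 query heads share 2 KV heads.
num_q_heads, num_kv_heads, head_dim = 8, 2, 16
n_rep = num_q_heads // num_kv_heads

# o_proj's pre_quant_scale has one entry per o_proj input channel.
o_scale = torch.rand(num_q_heads * head_dim) + 0.5

# v_proj only has num_kv_heads * head_dim output channels, so an exact fusion is
# impossible; average the scale over each group of query heads sharing a KV head.
averaged_scale = o_scale.view(num_kv_heads, n_rep, head_dim).mean(dim=1)

# Flatten for the weight fusion, as in `pre_quant_scale = averaged_scale.reshape(-1)`.
fuse_scale = averaged_scale.reshape(-1)
assert fuse_scale.numel() == num_kv_heads * head_dim
```

Because each per-query-head scale is replaced by its group average, the fusion is only approximate in this case, which is why the docstring warns about a possible accuracy drop.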