@@ -478,7 +478,7 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
 
     if input_quantizer is not None and hasattr(input_quantizer, "_pre_quant_scale"):
         return QUANTIZATION_NVFP4_AWQ
-    if getattr(layer, "fused_with_layernorm", False):
+    if getattr(layer, "fused_with_prequant", False):
         return QUANTIZATION_NVFP4_AWQ
     assert input_quantizer is not None, (
         f"input_quantizer is None for {quantizer_attr_names}"
@@ -923,18 +923,77 @@ def all_items_same(item_list):
     return all(x == item_list[0] for x in item_list)
 
 
+PQS_FUSE_MODULE_MAPPING = [
+    # format: (list of target modules, tuple of (linear_pqs_fuse_to, linear_pqs_from), dim to fuse)
+    (["LlamaAttention", "Qwen3Attention", "Qwen3MoeAttention"], ("v_proj", "o_proj"), "input"),
+    (["LlamaMLP", "Qwen3MLP", "Qwen3MoeMLP"], ("up_proj", "down_proj"), "output"),
+]
+
+
+# TODO: make this more general instead of rule based
+def pattern_fuse_prequant(model: torch.nn.Module):
+    """Fuse pre_quant_scale into the weights of the preceding linear layer.
+
+    For example, the pre_quant_scale of o_proj can be fused into the output dimension of v_proj,
+    which is mathematically equivalent:
+
+        o_proj.input = attn_weights @ v_proj.output
+        o_proj.output = (o_proj.input * pre_quant_scale) @ o_proj.weight.T
+                      = (attn_weights @ (v_proj.output * pre_quant_scale)) @ o_proj.weight.T
+
+    Note: this is an experimental feature; it may affect the quantization error of the fused linear modules.
+    """
+    for _, module in model.named_modules():
+        for module_map in PQS_FUSE_MODULE_MAPPING:
+            target_module_list = module_map[0]
+            linear_pair = module_map[1]
+            dim_to_fuse = module_map[2]
+            if any(module_name in type(module).__name__ for module_name in target_module_list):
+                linear_to = module.get_submodule(linear_pair[0])
+                linear_from = module.get_submodule(linear_pair[1])
+                if hasattr(linear_from, "input_quantizer") and hasattr(
+                    linear_from.input_quantizer, "_pre_quant_scale"
+                ):
+                    pre_quant_scale = linear_from.input_quantizer._pre_quant_scale
+                    # reshape so the scale broadcasts over the output or the input dim of the weight
+                    pre_quant_scale = (
+                        pre_quant_scale.view(-1, 1)
+                        if dim_to_fuse == "output"
+                        else pre_quant_scale.view(1, -1)
+                    )
964+ linear_to .weight = torch .nn .Parameter (linear_to .weight * pre_quant_scale )
965+ if hasattr (linear_to , "bias" ) and linear_to .bias is not None :
966+ linear_to .bias = torch .nn .Parameter (linear_to .bias * pre_quant_scale )
+                    delattr(linear_from.input_quantizer, "_pre_quant_scale")
+                    setattr(linear_from, "fused_with_prequant", True)
+
+
 def fuse_prequant_layernorm(
     layernorm_module: torch.nn.Module,
     modules: list[torch.Tensor],
 ):
-    """Scales layernorm weights with avg_pre_quant_scale of the modules list and sets pre_quant_scales to be deleted."""
+    """Scales layernorm weights with avg_pre_quant_scale of the modules list and sets pre_quant_scales to be deleted.
+
+    original:
+        layernorm_output = (normalization(input) * weight) + bias
+        layernorm_output_scaled = layernorm_output * pre_quant_scale
+
+    fused:
+        fused_weight = weight * avg_pre_quant_scale
+        fused_bias = bias * avg_pre_quant_scale
+        layernorm_output_scaled = (normalization(input) * fused_weight) + fused_bias
+    """
     layernorm_module.weight = torch.nn.Parameter(
         layernorm_module.weight * getattr(modules[0].input_quantizer, "_pre_quant_scale")
     )
+    if getattr(layernorm_module, "bias", None) is not None:
+        layernorm_module.bias = torch.nn.Parameter(
+            layernorm_module.bias * getattr(modules[0].input_quantizer, "_pre_quant_scale")
+        )
     # Pre_quant_scales of modules must not be exported, since they have been fused with layernorm
     for module in modules:
         delattr(module.input_quantizer, "_pre_quant_scale")
-        setattr(module, "fused_with_layernorm", True)
+        setattr(module, "fused_with_prequant", True)
 
 
 def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False):
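
As a sanity check on the algebra in pattern_fuse_prequant's docstring, here is a minimal standalone sketch with plain torch tensors. All names and shapes are made up for illustration, grouped-query attention is ignored (so the scale length matches the v projection's output width), and the scale is folded into the output features (rows) of the v weight as the docstring describes; no modelopt modules are involved.

import torch

torch.manual_seed(0)
seq, hidden = 4, 8
x = torch.randn(seq, hidden)                  # attention input (toy shape)
w_v = torch.randn(hidden, hidden)             # v_proj weight, [out_features, in_features]
w_o = torch.randn(hidden, hidden)             # o_proj weight, [out_features, in_features]
attn = torch.softmax(torch.randn(seq, seq), dim=-1)
s = torch.rand(hidden) + 0.5                  # stands in for o_proj's pre_quant_scale

# Reference: scale o_proj's input explicitly.
v = x @ w_v.T
ref = ((attn @ v) * s) @ w_o.T

# Fused: fold the scale into the output features (rows) of the v weight instead.
w_v_fused = w_v * s.view(-1, 1)
fused = (attn @ (x @ w_v_fused.T)) @ w_o.T

print(torch.allclose(ref, fused, atol=1e-5))  # True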
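
A similar sketch for fuse_prequant_layernorm: scaling the norm's weight and bias by the pre_quant_scale reproduces scaling the norm's output. A stock torch.nn.LayerNorm stands in for the model's norm module, and the toy tensors below are made up for illustration.

import torch

torch.manual_seed(0)
hidden = 8
x = torch.randn(4, hidden)
s = torch.rand(hidden) + 0.5                      # stands in for avg_pre_quant_scale

ln = torch.nn.LayerNorm(hidden)
with torch.no_grad():
    ln.weight.uniform_(0.5, 1.5)
    ln.bias.uniform_(-0.5, 0.5)
ref = ln(x) * s                                   # scale applied after the norm

fused = torch.nn.LayerNorm(hidden)
with torch.no_grad():
    fused.weight.copy_(ln.weight * s)             # fused_weight = weight * avg_pre_quant_scale
    fused.bias.copy_(ln.bias * s)                 # fused_bias = bias * avg_pre_quant_scale

print(torch.allclose(ref, fused(x), atol=1e-6))   # True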