
Commit 5c4d2d6

Fix SmoothQuant to smooth all experts in MoE models
Replace get_matching_layer() with match_named_modules() to iterate over ALL matched layers instead of returning only the first match. This fixes a critical bug where only expert.0 was smoothed in MoE models, leaving all other experts unsmoothed and causing severe accuracy degradation.

Changes:
- Use match_named_modules from compressed_tensors.utils to iterate over all matching modules
- Search for balance layers within the parent module scope for better locality
- Follow the same pattern already proven to work in AWQModifier

This fix ensures all experts in MoE models (Mixtral, Qwen3, Phi, DeepSeek) are properly smoothed during quantization.

Signed-off-by: Rahul-Tuli <[email protected]>
1 parent 190337a commit 5c4d2d6
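
To make the failure mode concrete, here is a minimal, self-contained sketch (not from the commit) contrasting first-match resolution, which the commit message attributes to get_matching_layer(), with all-matches resolution as done by match_named_modules(). The module names and regex below are illustrative.

```python
import re

# Fully-qualified module names as they might appear in an MoE decoder layer.
names = [
    "model.layers.0.mlp.experts.0.gate_proj",
    "model.layers.0.mlp.experts.1.gate_proj",
    "model.layers.0.mlp.experts.2.gate_proj",
]
pattern = r".*experts\.\d+\.gate_proj"

# First-match resolution: stops at the first hit, as described for the old code.
first = next((n for n in names if re.match(pattern, n)), None)

# All-matches resolution: keeps every hit, as match_named_modules does.
hits = [n for n in names if re.match(pattern, n)]

print(first)      # model.layers.0.mlp.experts.0.gate_proj -> only expert 0
print(len(hits))  # 3 -> every expert gets a balance layer
```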

File tree

1 file changed: +30 −27 lines
  • src/llmcompressor/modifiers/smoothquant/base.py


src/llmcompressor/modifiers/smoothquant/base.py

Lines changed: 30 additions & 27 deletions
```diff
@@ -2,7 +2,7 @@
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import torch
-from compressed_tensors.utils import align_module_device
+from compressed_tensors.utils import align_module_device, match_named_modules
 from loguru import logger
 from pydantic import ConfigDict, Field
 from torch.nn import Module
@@ -14,11 +14,7 @@
     handle_mapping_resolution_errors,
 )
 from llmcompressor.utils.fsdp.helpers import get_fsdp_parent
-from llmcompressor.utils.pytorch.module import (
-    get_layers,
-    get_matching_layer,
-    match_targets,
-)
+from llmcompressor.utils.pytorch.module import get_layer_by_name
 
 MINIMUM_SMOOTHING_SCALE = 1e-5
 
@@ -196,31 +192,38 @@ def _resolve_mappings(self, model: Module) -> List[SmoothQuantMapping]:
         Transforms the list of activations to smooth and their corresponding weights
         into SmoothQuantMapping objects, resolving regular expressions.
 
-        For each activation in the mapping list, we find the corresponding weight to
-        balance by searching for the longest substring. For instance, if our balance
-        weight is ".*re:.*q_proj" and the activation is "re:.*self_attn_layer_norm" we
-        would match model.layer.0.p_proj to model.layer.0.self_attn_layer_norm and
-        repeat for model.layer.1 and so on
+        For each activation in the mapping list, we find ALL corresponding weights to
+        balance by matching within the parent scope. This ensures all matching layers
+        are included, which is critical for MoE models where multiple experts need to
+        be balanced.
         """
         resolved_mappings = []
         for to_balance, to_smooth in self.mappings:
-            to_smooth_layers = get_layers(to_smooth, model)
-            for layer_name, smooth_layer in to_smooth_layers.items():
-                if not match_targets(layer_name, self.ignore)[0]:
-                    balance_layers = []
-                    for balance_suffix in to_balance:
-                        # find the submodule that matches the activation layer
-                        _, balance_layer = get_matching_layer(
-                            balance_suffix, layer_name, model
-                        )
-                        if balance_layer:
-                            balance_layers.append(balance_layer)
-                    # each mapping can contain multiple layers to balance, but only
-                    # one layer to smooth
-                    mapping = SmoothQuantMapping(
-                        layer_name, smooth_layer, balance_layers
+            to_smooth_list = [to_smooth] if isinstance(to_smooth, str) else to_smooth
+
+            for smooth_name, smooth_layer in match_named_modules(
+                model, to_smooth_list, self.ignore
+            ):
+                # Search for balance layers within the parent scope
+                smooth_parent_name = ".".join(smooth_name.split(".")[:-1])
+                smooth_parent = (
+                    get_layer_by_name(smooth_parent_name, model)
+                    if smooth_parent_name
+                    else model
+                )
+
+                balance_layers = []
+                for balance_regex in to_balance:
+                    for _, balance_layer in match_named_modules(
+                        smooth_parent, [balance_regex], self.ignore
+                    ):
+                        balance_layers.append(balance_layer)
+
+                if balance_layers:
+                    resolved_mappings.append(
+                        SmoothQuantMapping(smooth_name, smooth_layer, balance_layers)
                     )
-                    resolved_mappings.append(mapping)
+
         return resolved_mappings
 
     def _setup_scale_hooks(self):
```
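For reference, here is a small sketch of the parent-scope trick the new code uses, written against plain torch.nn rather than the compressed_tensors/llmcompressor helpers; ToyDecoderLayer and the regex are illustrative stand-ins.

```python
import re
import torch.nn as nn

class ToyDecoderLayer(nn.Module):
    def __init__(self, num_experts: int = 4):
        super().__init__()
        self.post_attention_layernorm = nn.LayerNorm(16)
        self.experts = nn.ModuleList(nn.Linear(16, 16) for _ in range(num_experts))

model = nn.ModuleDict(
    {"layers": nn.ModuleList(ToyDecoderLayer() for _ in range(2))}
)

smooth_name = "layers.0.post_attention_layernorm"

# Same derivation as the diff: the parent is everything before the last dot.
parent_name = ".".join(smooth_name.split(".")[:-1])  # -> "layers.0"
parent = model.get_submodule(parent_name)

# Searching only inside the parent keeps the mapping local to one decoder
# layer, so experts from layers.1 never leak into layers.0's mapping.
balance = [n for n, _ in parent.named_modules() if re.fullmatch(r"experts\.\d+", n)]
print(balance)  # ['experts.0', 'experts.1', 'experts.2', 'experts.3']
```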
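Finally, a hedged usage sketch of how the fix lands in a recipe, assuming the usual SmoothQuantModifier constructor arguments (smoothing_strength, mappings); the regexes are illustrative and depend on the target model's module names. With this commit, the balance patterns resolve to every expert inside each decoder layer rather than just experts.0.

```python
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

recipe = SmoothQuantModifier(
    smoothing_strength=0.8,
    mappings=[
        # Balance every expert's input projections against the shared norm.
        (
            [r"re:.*experts\.\d+\.gate_proj", r"re:.*experts\.\d+\.up_proj"],
            r"re:.*post_attention_layernorm",
        ),
    ],
)
```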