@@ -76,6 +76,10 @@ class AWQModifier(Modifier, QuantizationMixin):
7676 balance_layers: ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
7777 - smooth_layer: "re:.*final_layer_norm"
7878 balance_layers: ["re:.*fc1"]
79+ # activation_hook_target specifies which submodule of the parent to hook
80+ # for activation caching.
81+ # This change is only useful for MoE models with parallel transformer blocks,
82+ # and one should use the default value (None) in most cases.
7983 ignore: ["lm_head"]
8084 config_groups:
8185 group_0:
@@ -122,6 +126,11 @@ class AWQModifier(Modifier, QuantizationMixin):
122126 to smoothed) and the second entry is the layer whose output is scaled to
123127 achieve the smoothing.
124128 If regex is used, it matches layers with the largest overlap in module name.
129+ Each mapping may also include an ``activation_hook_target``: a dotted
130+ attribute path relative to the parent module (lowest common ancestor)
131+ specifying which submodule to hook for activation caching. This is useful
132+ for parallel transformer blocks where the default (hooking
133+ ``balance_layers[0]``) would capture the wrong activations.
125134 :param ignore: list of layers to ignore during quantization (not smoothed).
126135 It should match the name of layers whose outputs are scaled to achieve
127136 smoothing (the second entry of the mappings list).
@@ -389,6 +398,17 @@ def _set_resolved_mappings(self, model: Module) -> None:
389398 balance_names, model, torch.nn.ModuleList
390399 )
391400
401+ activation_hook_target = None
402+ if mapping.activation_hook_target:
403+ activation_hook_target = getattr_chain(
404+ ancestor, mapping.activation_hook_target
405+ )
406+ if activation_hook_target is None:
407+ raise ValueError(
408+ f"activation_hook_target '{mapping.activation_hook_target}'"
409+ f" not found on parent module '{ancestor_name}'"
410+ )
411+
392412 resolved_mappings.append(
393413 ResolvedMapping(
394414 smooth_name,
@@ -397,6 +417,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
397417 balance_names=balance_names,
398418 parent=ancestor,
399419 parent_name=ancestor_name,
420+ activation_hook_target=activation_hook_target,
400421 )
401422 )
402423 self._resolved_mappings = resolved_mappings
@@ -468,16 +489,14 @@ def cache_smooth_activations_hook(
468489 # input activations to balance layers needed for loss function
469490 # storing inputs to first balance layer is sufficient
470491 # other balance layers get the same input
471-
472- # The line below is useful for models that use parallel transformer block,
473- # such as gemma 3, command A. Need a better way to integrate it to the code.
474- # layer_to_hook = (
475- # mapping.parent.mlp
476- # if hasattr(mapping.parent, 'mlp')
477- # else mapping.balance_layers[0]
478- # )
492+ #
493+ # For parallel transformer blocks (e.g. Command A, Gemma 3) the first
494+ # balance layer may not receive the right activations. When
495+ # activation_hook_target is set on the mapping, hook that module
496+ # instead of balance_layers[0].
497 layer_to_hook = mapping.activation_hook_target or mapping.balance_layers[0]
498 self.register_hook(
480- mapping.balance_layers[0],
499 layer_to_hook,
481500 create_cache_smooth_activations_hook_fn(mapping.smooth_name),
482501 "forward",
483502 )
0 commit comments