Skip to content

Commit cc6a964

Browse files
fix: add norm calibration context for unit-offset RMSNorm (Gemma/Qwen3Next) (#2500)
## SUMMARY Some architectures (Gemma, Gemma2, Gemma3, Qwen3Next) use an offset normalization where the forward computes `output * (1 + weight)` instead of `output * weight`. This breaks any modifier that smooths norm weights (AWQ, SmoothQuant, SpinQuant, QuIP) because dividing a `(1+weight)` parameter by scales produces `1 + weight/scales` instead of `(1 + weight)/scales`. Following @brian-dellabetta's suggestion, this adds a `norm_calibration_context` that temporarily replaces offset-norm modules with standard-norm equivalents during calibration, following the same pattern as `moe_calibration_context`. On entry, offset norms are replaced with `CalibrationOffsetNorm` modules (`weight = 1 + original`). On exit, modules are restored with updated weights (`weight = smoothed - 1`). Only norms operating on `hidden_size` are converted. Norms operating on `head_dim` (e.g. `q_norm`/`k_norm` in Gemma3 attention) are skipped since no modifier smooths them. ## TEST PLAN Unit tests (8/8 passing): - Weight conversion and dtype preservation - Forward equivalence with original norm - Restore roundtrip (with and without smoothing) - Registry detection (positive and negative) - `hidden_size` filter: `q_norm`/`k_norm` correctly skipped E2E validation: | Model | Modifier | Norms converted | Output | |---|---|---|---| | `google/gemma-2-2b-it` | AWQ W4A16 | 105 | Coherent | | `google/medgemma-27b-text-it` | AWQ W4A16 | 249 (373 total, 124 q/k skipped) | Coherent | | upstream (no fix) on medgemma | AWQ W4A16 | 0 | Garbage | Qwen3-Next architecture verified structurally: `hidden_size=2048`, `head_dim=256`, `Qwen3NextRMSNorm` uses same `(1+weight)` pattern. No smaller Qwen3-Next model exists for e2e testing (80B MoE only). Fixes #2365 Fixes #2102 Related to #2202 Related to #2059 Signed-off-by: Gilles Turpin <turpingilles15@gmail.com> Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
1 parent 4e6aa76 commit cc6a964

File tree

4 files changed

+355
-2
lines changed

4 files changed

+355
-2
lines changed

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from llmcompressor.datasets import get_calibration_dataloader
2424
from llmcompressor.entrypoints.utils import post_process, pre_process
2525
from llmcompressor.modeling.moe_context import moe_calibration_context
26+
from llmcompressor.modeling.offset_norm import norm_calibration_context
2627
from llmcompressor.pipelines import CalibrationPipeline
2728

2829
__all__ = ["Oneshot", "oneshot"]
@@ -217,8 +218,8 @@ def apply_recipe_modifiers(
217218
session.reset()
218219

219220
# (Helen INFERENG-661): validate recipe modifiers before initialization
220-
# Apply MoE calibration context for the entire calibration process
221-
with moe_calibration_context(
221+
# Apply calibration contexts for the entire calibration process
222+
with norm_calibration_context(self.model), moe_calibration_context(
222223
self.model,
223224
calibrate_all_experts=self.dataset_args.moe_calibrate_all_experts,
224225
):

src/llmcompressor/modeling/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from .qwen3_5_moe import CalibrationQwen3_5MoeSparseMoeBlock
1919
from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401
2020
from .qwen3_next_moe import CalibrationQwen3NextSparseMoeBlock # noqa: F401
21+
from .offset_norm import CalibrationOffsetNorm # noqa: F401
2122
# TODO: add granite4
2223

2324
from .fuse import *
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""
2+
Calibration context for offset-norm layers.
3+
4+
Some architectures (Gemma, Qwen3Next) use an offset normalization pattern where
5+
the forward pass computes ``output * (1 + weight)`` instead of the standard
6+
``output * weight``. This breaks any modifier that smooths norm weights
7+
(AWQ, SmoothQuant, SpinQuant) because dividing a (1+weight) parameter by scales
8+
produces incorrect results.
9+
10+
This module provides the infrastructure to temporarily replace offset-norm
11+
modules with standard-norm equivalents during calibration, and restore the
12+
original convention after modifiers have run.
13+
14+
Key components:
15+
- NormCalibrationModule: Abstract base class for norm calibration modules
16+
- norm_calibration_context: Context manager that applies norm conversion
17+
"""
18+
19+
import contextlib
20+
from abc import ABC, abstractmethod
21+
22+
import torch
23+
from compressed_tensors.registry import RegistryMixin, standardize_lookup_name
24+
from loguru import logger
25+
from transformers import PreTrainedModel
26+
27+
__all__ = [
28+
"NormCalibrationModule",
29+
"norm_calibration_context",
30+
]
31+
32+
33+
class NormCalibrationModule(ABC, torch.nn.Module, RegistryMixin):
    """
    Abstract base for norm calibration replacement modules.

    Subclasses stand in for original norm modules while calibration runs,
    so that modifiers observe standard ``output * weight`` semantics
    instead of the architecture's native convention.
    """

    # When False, the replacement is swapped back out after calibration.
    is_permanent: bool = False

    @abstractmethod
    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
        """
        Write this module's (possibly modified) weights back into
        ``original``, converted to the original module's convention.

        Returns:
            The original module with weights converted back to offset convention
        """
        ...
52+
53+
54+
@NormCalibrationModule.register(
    "GemmaRMSNorm",
    alias=["Gemma2RMSNorm", "Gemma3RMSNorm", "Qwen3NextRMSNorm"],
)
class CalibrationOffsetNorm(NormCalibrationModule):
    """
    Standard-norm stand-in for offset-norm modules.

    Offset norms compute ``output * (1 + weight)``; this replacement exposes
    the equivalent standard form ``output * weight`` so that weight-smoothing
    modifiers operate on the full effective scale.

    On enter: ``self.weight = 1 + original.weight``
    On restore: ``original.weight = self.weight - 1``
    """

    is_permanent = False

    def __init__(self, original: torch.nn.Module, config):
        """
        :param original: offset-norm module being replaced; must expose
            ``eps`` and ``weight``
        :param config: model config (unused here; kept so all calibration
            modules share a uniform constructor signature)
        """
        super().__init__()
        self.eps = original.eps
        self._orig_dtype = original.weight.dtype
        # Fold the +1 offset into the parameter; compute in fp32 for accuracy.
        folded = original.weight.data.float() + 1.0
        self.weight = torch.nn.Parameter(folded.to(self._orig_dtype))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        # RMS normalization using the original module's epsilon.
        variance = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(variance + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Compute in fp32 (matching the offset norms), then cast back.
        normed = self._norm(x.float()) * self.weight.float()
        return normed.type_as(x)

    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
        # Convert back to offset convention: offset weight = standard - 1.
        original.weight.data = (self.weight.data.float() - 1.0).to(self._orig_dtype)
        return original
88+
89+
90+
@contextlib.contextmanager
def norm_calibration_context(model: PreTrainedModel):
    """
    Context manager that converts offset-norm modules to standard-norm.

    This scans all modules in the model and replaces any offset-norm modules
    (``output * (1 + weight)``) with standard-norm equivalents
    (``output * weight``). After the context exits, modules are restored
    to their original convention with updated weights.

    Only norms operating on ``model.config.hidden_size`` are converted.
    Norms operating on other dimensions (e.g. ``q_norm``/``k_norm`` acting
    on head_dim in Gemma3 attention) are skipped, since no modifier smooths
    them and converting them would only add a lossy (1 + w) round-trip.

    The model is modified in-place, so the same model object should be used
    within the context.

    Args:
        model: The model to apply norm conversion to (modified in-place)

    Example:
        with norm_calibration_context(model):
            # Modifiers see standard norm weights
            run_calibration(model)
        # Norms restored to offset convention with smoothed weights
    """
    replaced = {}

    # Step 1: Collect all offset-norm modules that need replacement
    logger.debug("Entering norm calibration context")
    hidden_size = getattr(model.config, "hidden_size", None)
    modules_to_replace = []
    for name, module in model.named_modules():
        class_name = module.__class__.__name__
        if not _is_registered(class_name, NormCalibrationModule):
            continue
        # Skip norms whose weight does not span hidden_size (e.g. head_dim
        # q_norm/k_norm): modifiers never smooth them, so conversion is
        # unnecessary. When hidden_size is unavailable, convert everything.
        weight = getattr(module, "weight", None)
        if (
            hidden_size is not None
            and weight is not None
            and weight.numel() != hidden_size
        ):
            continue
        modules_to_replace.append((name, module, class_name))

    # Step 2: Replace modules
    if modules_to_replace:
        logger.info(f"Found {len(modules_to_replace)} offset-norm modules to convert")
    for name, module, class_name in modules_to_replace:
        replacement = NormCalibrationModule.load_from_registry(
            class_name,
            original=module,
            config=model.config,
        )
        model.set_submodule(name, replacement)
        replaced[name] = (module, replacement)

    try:
        yield
    finally:
        # Step 3: Restore original modules with updated weights
        if replaced:
            logger.info(f"Restoring {len(replaced)} norm modules to offset convention")
        for name, (original, replacement) in replaced.items():
            restored = replacement.restore(original)
            model.set_submodule(name, restored)
144+
145+
146+
def _is_registered(name: str, subclass: type[RegistryMixin]) -> bool:
    """
    Check whether a module class name is registered under ``subclass``.

    Args:
        name: the ``__class__.__name__`` of a candidate module
        subclass: the registry class (not an instance) to look up in

    Returns:
        True if ``name`` matches a registered name or alias
    """
    lookup = standardize_lookup_name(name)
    return (
        lookup in subclass.registered_names() or lookup in subclass.registered_aliases()
    )
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
from types import SimpleNamespace
2+
3+
import pytest
4+
import torch
5+
from torch import nn
6+
7+
from llmcompressor.modeling.offset_norm import (
8+
CalibrationOffsetNorm,
9+
NormCalibrationModule,
10+
norm_calibration_context,
11+
)
12+
13+
# ---------------------------------------------------------------------------
14+
# Mock offset-norm module matching Gemma's (1 + weight) convention
15+
# ---------------------------------------------------------------------------
16+
17+
18+
class FakeGemmaRMSNorm(nn.Module):
    """Minimal stand-in replicating GemmaRMSNorm: output * (1 + weight)."""

    def __init__(self, dim, eps=1e-6, dtype=torch.bfloat16):
        super().__init__()
        self.eps = eps
        # Gemma initializes the offset weight at zero (effective scale of 1).
        self.weight = nn.Parameter(torch.zeros(dim, dtype=dtype))

    def _norm(self, x):
        rms_scale = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * rms_scale

    def forward(self, x):
        normed = self._norm(x.float()) * (1.0 + self.weight.float())
        return normed.type_as(x)
33+
34+
35+
# Rename the mock so registry detection (which is keyed on
# module.__class__.__name__) treats it as the real GemmaRMSNorm.
FakeGemmaRMSNorm.__name__ = "GemmaRMSNorm"
FakeGemmaRMSNorm.__qualname__ = "GemmaRMSNorm"
38+
39+
40+
# ---------------------------------------------------------------------------
41+
# Tests
42+
# ---------------------------------------------------------------------------
43+
44+
45+
@pytest.mark.unit
class TestCalibrationOffsetNormInit:
    """Test that __init__ converts weights and stores dtype."""

    def test_weight_conversion(self):
        # The calibration module should hold weight = 1 + original.weight.
        norm = FakeGemmaRMSNorm(dim=4)
        norm.weight.data = torch.tensor([0.1, -0.05, 0.0, 0.2])

        converted = CalibrationOffsetNorm(norm, config=None)

        assert torch.allclose(
            converted.weight.data, torch.tensor([1.1, 0.95, 1.0, 1.2])
        )

    def test_dtype_stored(self):
        # The original dtype must be remembered for restore().
        norm = FakeGemmaRMSNorm(dim=4, dtype=torch.bfloat16)

        converted = CalibrationOffsetNorm(norm, config=None)

        assert converted._orig_dtype == torch.bfloat16
        assert converted.weight.dtype == torch.bfloat16
63+
64+
65+
@pytest.mark.unit
class TestCalibrationOffsetNormForward:
    """Test that forward produces the same result as the original."""

    def test_output_matches_original(self):
        norm = FakeGemmaRMSNorm(dim=8, dtype=torch.float32)
        norm.weight.data = torch.randn(8) * 0.1
        converted = CalibrationOffsetNorm(norm, config=None)

        hidden = torch.randn(2, 4, 8)

        # (1 + w) applied by the original equals the folded standard weight.
        assert torch.allclose(norm(hidden), converted(hidden), atol=1e-5)
79+
80+
81+
@pytest.mark.unit
class TestCalibrationOffsetNormRestore:
    """Test that restore reconverts weights correctly."""

    def test_restore_roundtrip(self):
        norm = FakeGemmaRMSNorm(dim=4, dtype=torch.bfloat16)
        norm.weight.data = torch.tensor([0.1, -0.05, 0.0, 0.2], dtype=torch.bfloat16)
        before = norm.weight.data.clone()

        converted = CalibrationOffsetNorm(norm, config=None)
        converted.restore(norm)

        # bf16 -> fp32 -> bf16 round-trip may lose a little precision.
        assert norm.weight.dtype == torch.bfloat16
        assert torch.allclose(norm.weight.data.float(), before.float(), atol=2e-2)

    def test_restore_after_smoothing(self):
        norm = FakeGemmaRMSNorm(dim=4, dtype=torch.float32)
        norm.weight.data = torch.tensor([0.1, -0.05, 0.0, 0.2])

        converted = CalibrationOffsetNorm(norm, config=None)
        # Simulate a modifier dividing weights by scales=2
        converted.weight.data.div_(2.0)
        converted.restore(norm)

        # Standard weight after smoothing: [1.1, 0.95, 1.0, 1.2] / 2
        # = [0.55, 0.475, 0.5, 0.6]
        # Restored offset weight: standard - 1
        # = [-0.45, -0.525, -0.5, -0.4]
        assert torch.allclose(
            norm.weight.data,
            torch.tensor([-0.45, -0.525, -0.5, -0.4]),
            atol=1e-5,
        )

        # Verify: 1 + restored_weight == smoothed standard weight
        assert torch.allclose(
            1.0 + norm.weight.data,
            torch.tensor([0.55, 0.475, 0.5, 0.6]),
            atol=1e-5,
        )
118+
119+
120+
@pytest.mark.unit
class TestNormRegistration:
    """Test that registered norms are detected and standard norms are not."""

    def test_gemma_detected(self):
        """GemmaRMSNorm (and aliases) should be in the registry."""
        known = (
            NormCalibrationModule.registered_names()
            + NormCalibrationModule.registered_aliases()
        )
        expected = [
            "gemmarmsnorm",
            "gemma2rmsnorm",
            "gemma3rmsnorm",
            "qwen3nextrmsnorm",
        ]
        for name in expected:
            assert name in known, f"{name} not in registry"

    def test_standard_norm_not_detected(self):
        """Standard LayerNorm should not be in the registry."""
        names = NormCalibrationModule.registered_names()
        assert "layernorm" not in names
        assert "rmsnorm" not in names
142+
143+
144+
@pytest.mark.unit
class TestNormCalibrationContext:
    """Test that norm_calibration_context replaces and restores modules."""

    @staticmethod
    def _build_model(dim, dtype, norm_names):
        # Assemble a minimal model: one layer holding the requested norms,
        # plus a config exposing hidden_size (read by the context manager).
        layer = nn.Module()
        for attr in norm_names:
            setattr(layer, attr, FakeGemmaRMSNorm(dim=dim, dtype=dtype))
        model = nn.Module()
        model.layer = layer
        model.config = SimpleNamespace(hidden_size=dim)
        return model, layer

    def test_modules_replaced_inside_context(self):
        """Offset norms should be replaced with CalibrationOffsetNorm inside."""
        model, layer = self._build_model(
            8, torch.float32, ["input_layernorm", "post_attention_layernorm"]
        )

        with norm_calibration_context(model):
            assert isinstance(layer.input_layernorm, CalibrationOffsetNorm)
            assert isinstance(layer.post_attention_layernorm, CalibrationOffsetNorm)

    def test_modules_restored_after_context(self):
        """Original modules should be restored with correct weights."""
        model, layer = self._build_model(4, torch.bfloat16, ["input_layernorm"])
        layer.input_layernorm.weight.data = torch.tensor(
            [0.1, -0.05, 0.0, 0.2], dtype=torch.bfloat16
        )
        before = layer.input_layernorm.weight.data.clone()

        with norm_calibration_context(model):
            pass

        restored = layer.input_layernorm
        assert isinstance(restored, FakeGemmaRMSNorm)
        assert restored.weight.dtype == torch.bfloat16
        assert torch.allclose(restored.weight.data.float(), before.float(), atol=2e-2)

    def test_weights_updated_after_smoothing(self):
        """Weights modified inside the context should be reflected after."""
        model, layer = self._build_model(4, torch.float32, ["norm"])
        layer.norm.weight.data = torch.tensor([0.1, -0.05, 0.0, 0.2])

        with norm_calibration_context(model):
            # Simulate modifier dividing weights by scales=2
            layer.norm.weight.data.div_(2.0)

        # Standard weight was [1.1, 0.95, 1.0, 1.2] / 2 = [0.55, 0.475, 0.5, 0.6]
        # Restored offset weight: standard - 1 = [-0.45, -0.525, -0.5, -0.4]
        expected = torch.tensor([-0.45, -0.525, -0.5, -0.4])
        assert torch.allclose(layer.norm.weight.data, expected, atol=1e-5)

0 commit comments

Comments
 (0)