
Commit 5392b2b

cleanup
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 9e975d3 commit 5392b2b

5 files changed: +64 additions, -53 deletions

src/llmcompressor/modeling/fuse.py

Lines changed: 30 additions & 33 deletions
@@ -6,58 +6,55 @@
     get_execution_device,
     update_offload_parameter,
 )
-from transformers.models.llama.modeling_llama import LlamaRMSNorm
 
-__all__ = ["normalize_embedding", "fuse_norm_linears"]
+__all__ = ["center_embeddings", "fuse_norm_linears"]
 
 
 PRECISION = torch.float64
 
 
-def normalize_embedding(embedding: torch.nn.Module):
+def center_embeddings(embedding: torch.nn.Module):
     """
-    Normalize each embedding to have a mean of zero
+    Shift each embedding to have a mean of zero
 
     :param embedding: embedding module containing embeddings to center
     """
-    if isinstance(embedding, (torch.nn.Embedding)):
-        with align_module_device(embedding):
-            weight_dtype = embedding.weight.dtype
-            weight = embedding.weight.to(PRECISION)
-            new_weight = weight - weight.mean(dim=-1, keepdim=True)
-            new_weight = new_weight.to(weight_dtype)
+    if not hasattr(embedding, "weight"):
+        raise ValueError(f"Cannot fuse norm of type {type(embedding)}")
 
-            update_offload_parameter(embedding, "weight", new_weight)
+    with align_module_device(embedding):
+        weight_dtype = embedding.weight.dtype
+        weight = embedding.weight.to(PRECISION)
+        new_weight = weight - weight.mean(dim=-1, keepdim=True)
+        new_weight = new_weight.to(weight_dtype)
 
-    else:
-        raise ValueError(f"Cannot normalize embedding of type {type(embedding)}")
+    update_offload_parameter(embedding, "weight", new_weight)
 
 
 def fuse_norm_linears(norm: torch.nn.Module, linears: Iterable[torch.nn.Linear]):
     """
-    Fuse a norm layer into subsequent linear layers. This useful for ensuring transform
-    invariance between norm and linear layers.
+    Fuse the scaling operation of norm layer into subsequent linear layers.
+    This useful for ensuring transform invariance between norm and linear layers.
 
-    Note that a model cannot be properly trained after its norms have been fused
+    Note that unitary transforms (rotation) commute with normalization, but not scaling
 
     :param norm: norm layer whose weight will be fused into subsequent linears
     :param linears: linear layers which directly follow the norm layer
     """
-    if isinstance(norm, (torch.nn.RMSNorm, LlamaRMSNorm, torch.nn.LayerNorm)):
-        for linear in linears:
-            # NOTE: spinquant does this op in float64
-            exec_device = get_execution_device(norm)
-            with align_module_device(norm, exec_device), align_module_device(
-                linear, exec_device
-            ):
-                weight_dtype = linear.weight.dtype
-                new_weight = linear.weight.to(PRECISION) * norm.weight.to(PRECISION)
-                new_weight = new_weight.to(weight_dtype)
-
-                update_offload_parameter(linear, "weight", new_weight)
-
-        new_norm_weight = torch.ones_like(norm.weight, device="cpu")
-        update_offload_parameter(norm, "weight", new_norm_weight)
-
-    else:
+    if not hasattr(norm, "weight"):
         raise ValueError(f"Cannot fuse norm of type {type(norm)}")
+
+    for linear in linears:
+        # NOTE: spinquant does this op in float64
+        exec_device = get_execution_device(norm)
+        with align_module_device(norm, exec_device), align_module_device(
+            linear, exec_device
+        ):
+            weight_dtype = linear.weight.dtype
+            new_weight = linear.weight.to(PRECISION) * norm.weight.to(PRECISION)
+            new_weight = new_weight.to(weight_dtype)
+
+            update_offload_parameter(linear, "weight", new_weight)
+
+    new_norm_weight = torch.ones_like(norm.weight, device="cpu")
+    update_offload_parameter(norm, "weight", new_norm_weight)
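
As a quick illustration of how the renamed helpers are meant to be called, here is a minimal sketch (not part of this commit; torch.nn.RMSNorm stands in for a model norm layer, and the tolerances are illustrative):

import torch

from llmcompressor.modeling.fuse import center_embeddings, fuse_norm_linears

# center_embeddings shifts every embedding vector to zero mean
embedding = torch.nn.Embedding(10, 16)
center_embeddings(embedding)
assert torch.allclose(embedding.weight.mean(dim=-1), torch.zeros(10), atol=1e-5)

# fuse_norm_linears folds the norm scale into the following linear weight and
# resets the norm weight to ones, so the composed output is preserved
norm = torch.nn.RMSNorm(16)  # requires a recent PyTorch; stands in for a model norm
norm.weight.data.uniform_(0.5, 1.5)  # non-trivial scale so the check is meaningful
linear = torch.nn.Linear(16, 32, bias=False)

x = torch.randn(4, 16)
reference = linear(norm(x))

fuse_norm_linears(norm, [linear])
assert torch.allclose(linear(norm(x)), reference, atol=1e-5)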

src/llmcompressor/modifiers/transform/quip/base.py

Lines changed: 11 additions & 7 deletions
@@ -1,11 +1,13 @@
 from typing import List, Literal, Optional, Union
 
+import torch
 from compressed_tensors.transform import (
     TransformArgs,
     TransformConfig,
     TransformScheme,
     apply_transform_config,
 )
+from compressed_tensors.utils import TorchDtype
 from pydantic import Field, ValidationInfo, field_validator
 
 from llmcompressor.core import Event, EventType, State
@@ -36,17 +38,19 @@ class QuIPModifier(Modifier):
         `"random-matrix"` has the greatest performance cost, but supports any size
     :param randomize: If true, create distinct transforms for each application
     :param learnable: If true, attach gradients to transform weights for training
+    :param precision: Precision at which all transforms should be applied. This applies
+        to both weight fusing and online rotations
     :param ignore: Modules to ignore when attaching transforms
     :param transform_config: Optional transform config for overriding provided arguments
     """
 
     transform_type: Literal["hadamard", "random-hadamard", "random-matrix"] = Field(
-        default="hadamard", exclude=True
+        default="random-hadamard"
     )
-    randomize: bool = Field(default=False, exclude=True)
-    learnable: bool = Field(default=False, exclude=True)
-    precision:
-    ignore: Union[str, List[str]] = Field(default="lm_head", exclude=True)
+    randomize: bool = Field(default=False)
+    learnable: bool = Field(default=False)
+    precision: TorchDtype = Field(default=torch.float64)
+    ignore: Union[str, List[str]] = Field(default="lm_head")
 
     # optional override for more fine-grained control
     # also included in recipe serialization
@@ -105,21 +109,20 @@ def _create_config(self) -> TransformConfig:
                         TransformArgs(
                             targets=["Linear"],
                             location="weight_input",
-                            # location="input",
                             inverse=True,
                             ignore=self.ignore,
                         ),
                     ],
                     randomize=self.randomize,
                     requires_grad=self.learnable,
+                    precision=self.precision,
                 ),
                 "u": TransformScheme(
                     type=self.transform_type,
                     apply=[
                         TransformArgs(
                             targets=["Linear"],
                             location="weight_output",
-                            # location="output",
                             ignore=self.ignore,
                         ),
                         TransformArgs(
@@ -131,6 +134,7 @@ def _create_config(self) -> TransformConfig:
                     ],
                     randomize=self.randomize,
                     requires_grad=self.learnable,
+                    precision=self.precision,
                 ),
             }
         )
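
For reference, a minimal usage sketch of the new precision field (assumed usage, not part of this commit): with exclude=True dropped from the Fields, the settings survive recipe serialization, and precision is forwarded into both transform schemes.

import torch

from llmcompressor.modifiers.transform import QuIPModifier

# precision controls the dtype used for weight fusing and online rotations
modifier = QuIPModifier(transform_type="random-hadamard", precision=torch.float64)

# with exclude=True removed, the settings round-trip through serialization
dump = modifier.model_dump()
assert QuIPModifier.model_validate(dump) == modifier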

tests/llmcompressor/modeling/test_fuse.py

Lines changed: 3 additions & 3 deletions
@@ -1,13 +1,13 @@
 import pytest
 import torch
 
-from llmcompressor.modeling.fuse import fuse_norm_linears, normalize_embedding
+from llmcompressor.modeling.fuse import center_embeddings, fuse_norm_linears
 
 
 @pytest.mark.unit
-def test_normalize_embedding():
+def test_center_embeddings():
     embedding = torch.nn.Embedding(10, 10)
-    normalize_embedding(embedding)
+    center_embeddings(embedding)
 
     assert torch.allclose(
         embedding.weight.mean(dim=1), torch.zeros(embedding.num_embeddings), atol=1e-5

tests/llmcompressor/modifiers/transform/test_correctness.py renamed to tests/llmcompressor/modifiers/transform/quip/test_correctness.py

Lines changed: 13 additions & 10 deletions
@@ -1,4 +1,5 @@
 import os
+
 import pytest
 import torch
 from transformers import AutoModelForCausalLM
@@ -9,23 +10,25 @@
 
 
 @requires_gpu
-# @pytest.mark.skipif(
-#     (not os.getenv("HF_TOKEN")),
-#     reason="Skipping tracing tests requiring gated model access",
-# )
+@pytest.mark.skipif(
+    (not os.getenv("HF_TOKEN")),
+    reason="Skipping tracing tests requiring gated model access",
+)
 @pytest.mark.parametrize(
-    "dtype,exp_mse",
+    "model_dtype,precision,exp_mse",
     [
-        (torch.bfloat16, 5e-3),
-        (torch.float32, 5e-11),
+        (torch.bfloat16, torch.bfloat16, 5e-3),  # 0.0019
+        (torch.bfloat16, torch.float32, 5e-3),  # 0.0022
+        (torch.float32, torch.float32, 5e-10),  # 1.0777e-10
+        (torch.float32, torch.float64, 5e-11),  # 2.6632e-11
     ],
 )
-def test_apply_correctness(dtype, exp_mse):
+def test_apply_correctness(model_dtype, precision, exp_mse):
     model = AutoModelForCausalLM.from_pretrained(
-        "meta-llama/Llama-3.2-1B-Instruct", device_map="cuda", torch_dtype=dtype
+        "meta-llama/Llama-3.2-1B-Instruct", device_map="cuda", torch_dtype=model_dtype
     )
     state = State(model=model)
-    modifier = QuIPModifier(transform_type="random-hadamard")
+    modifier = QuIPModifier(transform_type="random-hadamard", precision=precision)
 
     input = {k: v.to("cuda") for k, v in model.dummy_inputs.items()}
     with torch.no_grad():
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+from llmcompressor.modifiers.transform import QuIPModifier
+
+
+def test_reload():
+    modifier = QuIPModifier(transform_type="hadamard")
+    dump = modifier.model_dump()
+    assert QuIPModifier.model_validate(dump) == modifier
