
Commit a9b2f51

R3 and R4 rotations work, but not with the SDPA attention implementation.

Signed-off-by: Kyle Sayers <[email protected]>
1 parent 5aa3586

File tree: 4 files changed, +60 −46 lines

examples/transform/spinquant_example.py
Lines changed: 11 additions & 6 deletions

@@ -13,7 +13,7 @@
     MODEL_ID,
     torch_dtype="auto",
 )
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, attn_implementation="eager")

 # Select calibration dataset.
 DATASET_ID = "HuggingFaceH4/ultrachat_200k"
@@ -58,8 +58,10 @@ def tokenize(sample):
 # * apply spinquant transforms to model in order to make quantization easier
 # * quantize the weights to 4 bit with GPTQ with a group size 128
 recipe = [
-    SpinQuantModifier(rotations=["R1", "R2"], transform_type="hadamard"),
-    QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+    SpinQuantModifier(
+        rotations=["R1", "R2", "R3", "R4"], transform_type="random-hadamard"
+    ),
+    # QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
 ]

 # Apply algorithms.
@@ -75,9 +77,12 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
+from llmcompressor.utils import calibration_forward_context
+
+with calibration_forward_context(model):
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+    output = model.generate(input_ids, max_new_tokens=100)
+    print(tokenizer.decode(output[0]))
 print("==========================================\n\n")

 # Save to disk compressed.
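For orientation, the sketch below shows roughly how the changed pieces of this example fit together end to end. It is a minimal sketch, not the example file itself: the model id is a placeholder, the import paths for oneshot, SpinQuantModifier, and dispatch_for_generation are assumed from this repository's layout, and attn_implementation="eager" is passed on the model load, since in transformers that kwarg is consumed by the model rather than the tokenizer.

# Minimal sketch (assumptions noted above); not the example file itself.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.transform import SpinQuantModifier
from llmcompressor.utils import calibration_forward_context
from llmcompressor.utils.dev import dispatch_for_generation

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder model id

# Eager attention (per the commit message, SDPA does not work with R3/R4 yet).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype="auto", attn_implementation="eager"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# All four rotations; weight quantization is commented out in this commit.
recipe = [
    SpinQuantModifier(
        rotations=["R1", "R2", "R3", "R4"], transform_type="random-hadamard"
    )
]
oneshot(model=model, recipe=recipe)  # calibration dataset arguments omitted for brevity

# R3/R4 are applied online at runtime, so generation runs under the calibration
# forward context, matching the change to the example above.
dispatch_for_generation(model)
with calibration_forward_context(model):
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))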

src/llmcompressor/modifiers/quantization/calibration.py
Lines changed: 3 additions & 1 deletion

@@ -21,6 +21,8 @@
 from llmcompressor.utils.helpers import getattr_chain

 if TYPE_CHECKING:
+    from compressed_tensors.modeling.attention import CompressedAttentionImpl
+
     from llmcompressor.modifiers.utils.hooks import HooksMixin


@@ -213,7 +215,7 @@ def calibrate_activations(module: Module, value: torch.Tensor, base_name: str):


 def register_calibrate_attn_hooks(
-    modifier: HooksMixin, attention_impl
+    modifier: "HooksMixin", attention_impl: "CompressedAttentionImpl"
 ) -> Set[RemovableHandle]:
     return {
         modifier.register_hook(
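The new annotations lean on the standard TYPE_CHECKING pattern: the compressed_tensors import only runs under a type checker, so the annotations are written as strings (forward references) and add no runtime import cost. A generic, self-contained sketch of that pattern (the imported module below is hypothetical, not an llmcompressor API):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers (mypy, pyright), never at runtime,
    # so an expensive or circular import stays out of the runtime import graph.
    from heavy_package import HeavyType  # hypothetical module

def describe(value: "HeavyType") -> str:
    # Quoted annotation = forward reference; the name need not exist at runtime.
    return f"got {type(value).__name__}"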

src/llmcompressor/modifiers/quantization/quantization/mixin.py
Lines changed: 5 additions & 4 deletions

@@ -242,10 +242,6 @@ def _initialize_observers(self, module: torch.nn.Module):
     def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]:
         hooks = set()

-        # TODO: attnq
-        # attention_impl = get_compressed_attention_impl()
-        # hooks |= register_calibrate_attn_hooks(self, attention_impl)
-
         for module in model.modules():
             if not hasattr(module, "quantization_scheme"):
                 continue
@@ -264,6 +260,11 @@ def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]:
                    self.register_hook(module, calibrate_input_hook, "forward_pre")
                )

+            # TODO: attnq
+            # if is_attention:
+            #     attention_impl = CompressedAttentionImpl.from_module(module)
+            #     hooks |= register_calibrate_attn_hooks(self, attention_impl)
+
             # kv_cache activations. Within `apply_quantization_config`, the config is
             # modified to use attention output quantization if a kv_cache_scheme exists
             if is_attention and output:
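The surrounding method accumulates torch RemovableHandle objects so every calibration hook can be torn down after the oneshot run. Below is a self-contained sketch of that hook lifecycle in plain torch, not the HooksMixin API used in this file:

import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.Linear(8, 4))
hooks = set()

def calibrate_input_hook(module, args):
    # forward_pre hook: inspect inputs while calibration data flows through
    print(f"{module.__class__.__name__} input shape: {tuple(args[0].shape)}")

for module in model.modules():
    if isinstance(module, torch.nn.Linear):
        hooks.add(module.register_forward_pre_hook(calibrate_input_hook))

model(torch.randn(2, 4))   # hooks fire during the calibration forward pass
for handle in hooks:       # remove them once calibration is finished
    handle.remove()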

src/llmcompressor/modifiers/transform/spinquant/base.py
Lines changed: 41 additions & 35 deletions

@@ -109,7 +109,7 @@ def on_initialize(self, state: State, **kwargs) -> bool:
             config_groups["R2"] = self._create_r2_scheme(state.model)

         if SpinquantRotation.R3 in self.rotations:
-            config_groups["R3"] = self._create_r3_scheme()
+            config_groups["R3"] = self._create_r3_scheme(state.model)

         if SpinquantRotation.R4 in self.rotations:
             config_groups["R4"] = self._create_r4_scheme()
@@ -214,41 +214,47 @@ def _create_r2_scheme(self, model: PreTrainedModel) -> TransformScheme:
             ],
         )

-    def _create_r3_scheme(self) -> TransformScheme:
-        return (
-            TransformScheme(
-                type=self.transform_type,
-                randomize=self.randomize,
-                requires_grad=self.learnable,
-                apply=[
-                    TransformArgs(
-                        targets=[self.mappings.attn],
-                        location="attn_q",
-                    ),
-                    TransformArgs(
-                        targets=[self.mappings.attn],
-                        location="attn_k",
-                    ),
-                ],
-            ),
+    def _create_r3_scheme(self, model: PreTrainedModel) -> TransformScheme:
+        config = model.config
+
+        if hasattr(config, "head_dim"):
+            head_dim = config.head_dim
+        elif hasattr(config, "hidden_size") and hasattr(config, "num_attention_heads"):
+            head_dim = config.hidden_size // config.num_attention_heads
+        else:
+            raise NotImplementedError()
+
+        return TransformScheme(
+            type=self.transform_type,
+            randomize=self.randomize,
+            requires_grad=self.learnable,
+            head_dim=head_dim,
+            apply=[
+                TransformArgs(
+                    targets=[self.mappings.attn],
+                    location="attn_q",
+                ),
+                TransformArgs(
+                    targets=[self.mappings.attn],
+                    location="attn_k",
+                ),
+            ],
         )

     def _create_r4_scheme(self) -> TransformScheme:
-        return (
-            TransformScheme(
-                type=self.transform_type,
-                randomize=self.randomize,
-                requires_grad=self.learnable,
-                apply=[
-                    TransformArgs(
-                        targets=[*self.mappings.mlp_out],
-                        location="input",
-                    ),
-                    TransformArgs(
-                        targets=[*self.mappings.mlp_out],
-                        location="weight_input",
-                        inverse=True,
-                    ),
-                ],
-            ),
+        return TransformScheme(
+            type=self.transform_type,
+            randomize=self.randomize,
+            requires_grad=self.learnable,
+            apply=[
+                TransformArgs(
+                    targets=[*self.mappings.mlp_out],
+                    location="input",
+                ),
+                TransformArgs(
+                    targets=[*self.mappings.mlp_out],
+                    location="weight_input",
+                    inverse=True,
+                ),
+            ],
         )