
Commit e1ca4fd

fix shapes
Signed-off-by: Kyle Sayers <[email protected]>
1 parent c4a5cf4 commit e1ca4fd


4 files changed (+19, -16 lines)


src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 1 addition & 1 deletion
@@ -330,7 +330,7 @@ def _process_quantization(
             inv_perm = torch.argsort(perm)
             output = output.index_select(-1, inv_perm)
 
-    else:  # covers channel, token and tensor strategies
+    else:  # covers tensor, channel, token, and attn_head strategies
         if do_quantize:
             output = _quantize(
                 x=x,
src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 7 additions & 6 deletions
@@ -14,7 +14,7 @@
 
 
 import logging
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 import torch
 from compressed_tensors.quantization import (
@@ -152,7 +152,7 @@ def initialize_qparams(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    observed_shape: Tuple[int],
+    observed_shape: Tuple[Union[int, None]],
     observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
@@ -199,7 +199,7 @@ def initialize_qparams(
         expected_shape = (1,)
 
     elif strategy == QuantizationStrategy.TOKEN:
-        expected_shape = (1, 1)
+        raise ValueError("Cannot perform static token quantization")
 
     elif strategy == QuantizationStrategy.CHANNEL:
         if len(observed_shape) < 2:
@@ -235,10 +235,11 @@ def initialize_qparams(
         expected_shape = (num_rows, num_cols)
 
     elif strategy == QuantizationStrategy.ATTN_HEAD:
-        if len(observed_shape) < 2:
-            raise ValueError("Attention quant requires at least 2 observed dimensions")
+        # (batch_size, num_attention_heads, seq_len, head_dim)
+        if len(observed_shape) < 3:
+            raise ValueError("Attention quant requires at least 3 observed dimensions")
 
-        expected_shape = (observed_shape[-2], 1)
+        expected_shape = (observed_shape[-3], 1, 1)
 
     else:
         assert False, f"Unknown strategy {strategy}"
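
A minimal sketch (an illustration, not the library's actual code path) of what the new ATTN_HEAD branch computes: with activations observed as (batch_size, num_attention_heads, seq_len, head_dim), and seq_len possibly unknown (None) at initialization time, the qparams get one entry per head that broadcasts over seq_len and head_dim.

from typing import Tuple, Union

def attn_head_expected_shape(observed_shape: Tuple[Union[int, None], ...]):
    # hypothetical helper mirroring the new branch above
    if len(observed_shape) < 3:
        raise ValueError("Attention quant requires at least 3 observed dimensions")
    # one scale/zero-point per attention head, broadcast over seq_len and head_dim
    return (observed_shape[-3], 1, 1)

assert attn_head_expected_shape((8, None, 64)) == (8, 1, 1)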

tests/mock_observer.py

Lines changed: 2 additions & 2 deletions
@@ -162,7 +162,7 @@ def flatten_attention_for_quantization(value: torch.Tensor, args: QuantizationArgs):
         raise ValueError("Block quantization cannot be applied to attention")
 
     if args.strategy == QuantizationStrategy.ATTN_HEAD:
-        # (batch_size * seq_len, num_heads, 1, head_dim)
-        return value.flatten(0, 1).unsqueeze(-2)
+        # (batch_size * seq_len, num_heads, 1, 1, head_dim)
+        return value.transpose(1, 2).flatten(0, 1).unsqueeze(-2).unsqueeze(-2)
 
     assert False, f"Unknown strategy {args.strategy}"
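
A quick shape check (an illustration, not part of the commit), assuming value arrives as (batch_size, num_heads, seq_len, head_dim), showing that the new expression produces the shape promised by the updated comment:

import torch

value = torch.arange(12, dtype=torch.float32).reshape(1, 2, 2, 3)  # (batch=1, heads=2, seq=2, dim=3)
flat = value.transpose(1, 2).flatten(0, 1).unsqueeze(-2).unsqueeze(-2)
assert flat.shape == (2, 2, 1, 1, 3)  # (batch_size * seq_len, num_heads, 1, 1, head_dim)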

tests/test_quantization/lifecycle/test_static_lifecycle.py

Lines changed: 9 additions & 7 deletions
@@ -310,17 +310,17 @@ class MockAttention(torch.nn.Module):
                 symmetric=True,
                 strategy="attn_head",
             ),
-            torch.tensor([[0], [3]]),
-            torch.tensor([[8], [11]]),
+            torch.tensor([[[0.0]], [[6.0]]]),
+            torch.tensor([[[5.0]], [[11.0]]]),
             torch.tensor(
                 [
                     [
-                        [[0.0000, 1.0703, 2.1406], [2.9375, 4.4062, 4.4062]],
-                        [[6.4375, 7.5000, 7.5000], [8.8125, 10.2500, 10.2500]],
+                        [[0.0000, 1.3359, 2.0000], [2.6719, 4.0000, 4.6875]],
+                        [[5.8750, 7.3438, 7.3438], [8.8125, 10.2500, 10.2500]],
                     ]
                 ]
             ),
-            0.16,
+            0.13,
         ),
     ],
 )
@@ -335,7 +335,7 @@ def test_static_attention_quantization(
           [ 9., 10., 11.]]]])
     """
     # set up activation (and identity weight)
-    batch_size, seq_len, num_heads, head_dim = 1, 2, 2, 3
+    batch_size, num_heads, seq_len, head_dim = 1, 2, 2, 3
     input = torch.arange(
         (batch_size * seq_len * num_heads * head_dim), dtype=torch.bfloat16
     ).reshape((batch_size, seq_len, num_heads, head_dim))
@@ -344,7 +344,7 @@ def test_static_attention_quantization(
     # initialize quantization parameters
     scheme = QuantizationScheme(targets=[], input_activations=args)
     initialize_qparams(
-        attention, "k", args, (num_heads, head_dim), observed_dtype=torch.bfloat16
+        attention, "k", args, (num_heads, None, head_dim), observed_dtype=torch.bfloat16
     )
     attention.quantization_scheme = scheme
     attention.quantization_status = QuantizationStatus.INITIALIZED
@@ -366,5 +366,7 @@ def test_static_attention_quantization(
     assert torch.equal(attention.k_observer.max_vals, exp_max_val)
 
     # check forward pass
+    print(output)
+    print(torch.nn.functional.mse_loss(output, input))
     assert torch.allclose(output, exp_quant.to(output.dtype))
     assert torch.nn.functional.mse_loss(output, input) <= exp_loss
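
A rough derivation (assuming the mock treats dim 1 of k as the head dimension, which is not shown in this diff) of where the new expected min/max values come from: torch.arange(12) reshaped to (1, 2, 2, 3) gives head 0 the values 0-5 and head 1 the values 6-11.

import torch

k = torch.arange(12, dtype=torch.float32).reshape(1, 2, 2, 3)
min_vals = k.amin(dim=(0, 2, 3))  # tensor([0., 6.])
max_vals = k.amax(dim=(0, 2, 3))  # tensor([5., 11.])

The test's expected tensors [[[0.0]], [[6.0]]] and [[[5.0]], [[11.0]]] are these per-head extrema carrying the trailing singleton dims of the new (num_heads, 1, 1) qparam shape.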
