Commit 199f274

activations have one row

Signed-off-by: Kyle Sayers <[email protected]>
1 parent 6672617

4 files changed: +41 −41 lines changed


src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 7 additions & 14 deletions
```diff
@@ -280,17 +280,8 @@ def _process_quantization(
                 f"by the given group_size {group_size}"
             )
 
-        # support column-order (default) quantization as well as other orderings
-        # such as activation ordering. Below checks if g_idx has been initialized
-        is_column_order = g_idx is None or -1 in g_idx
-        if is_column_order:
-            num_groups = int(ceil(columns / group_size))
-            group_sizes = torch.full((num_groups,), group_size, dtype=torch.int)
-
-        else:
-            group_indices, group_sizes = torch.unique(g_idx, return_counts=True)
-            group_sizes = group_sizes[torch.argsort(group_indices)]
-
+        # permute groups
+        if g_idx is not None:
             perm = torch.argsort(g_idx)
             x = x.index_select(-1, perm)
 
@@ -299,6 +290,8 @@ def _process_quantization(
             ceil(x.shape[-1] / group_size),
             group_size,
         )
+        # we should potentially be folding reshaped_dims[0] into x.shape[-2]
+        # in order to allow for multi-headed activations
         x = x.unflatten(-1, reshaped_dims)
 
         if do_quantize:
@@ -325,9 +318,9 @@ def _process_quantization(
             output = output.flatten(-2, -1)
             output = output.to(output_dtype)
 
-        if not is_column_order:
-            inv_perm = torch.argsort(perm)
-            output = output.index_select(-1, inv_perm)
+        # unpermute groups
+        if g_idx is not None:
+            x = x.index_select(-1, g_idx)
 
     else:  # covers channel, token and tensor strategies
         if do_quantize:
```
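For context on the pattern these hunks simplify: `g_idx` assigns each column to a quantization group, `torch.argsort(g_idx)` yields a permutation that makes each group's columns contiguous, and applying `argsort` to that permutation gives its inverse. Below is a minimal, self-contained sketch of the permute → group-reshape → unpermute round trip; it is illustrative only and not the library's implementation (the function name and shapes are invented for the example).

```python
import torch

def group_permute_roundtrip(x: torch.Tensor, g_idx: torch.Tensor, group_size: int):
    """Sketch: permute columns into contiguous groups, reshape, then undo.

    Assumes x.shape[-1] is divisible by group_size and that g_idx assigns
    each column a group index.
    """
    # columns belonging to the same group become adjacent
    perm = torch.argsort(g_idx)
    x = x.index_select(-1, perm)

    # view as (..., num_groups, group_size) so per-group scales broadcast
    x = x.unflatten(-1, (x.shape[-1] // group_size, group_size))

    # ... quantize/dequantize per group here ...

    # flatten back, then invert the permutation: argsort(perm) is perm's inverse
    x = x.flatten(-2, -1)
    inv_perm = torch.argsort(perm)
    return x.index_select(-1, inv_perm)

# round trip with no quantization applied returns the input unchanged
x = torch.randn(2, 8)
g_idx = torch.tensor([1, 0, 1, 0, 1, 0, 1, 0])
assert torch.equal(group_permute_roundtrip(x, g_idx, group_size=4), x)
```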

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 12 additions & 23 deletions
```diff
@@ -14,10 +14,8 @@
 
 
 import logging
-import math
-import warnings
 from enum import Enum
-from typing import Optional
+from typing import Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.lifecycle.forward import (
@@ -32,7 +30,11 @@
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
-from compressed_tensors.quantization.utils import is_fp4, is_kv_cache_quant_scheme
+from compressed_tensors.quantization.utils import (
+    is_fp4,
+    is_kv_cache_quant_scheme,
+    strict_divide,
+)
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
@@ -102,7 +104,7 @@ def initialize_module_for_quantization(
     if scheme.input_activations is not None:
         base_name = "input"
         args = scheme.input_activations
-        observed_shape = weight.shape[-1:]
+        observed_shape = (1, weight.size(-1))
         observed_dtype = weight.dtype
 
     if scheme.weights is not None:
@@ -148,7 +150,7 @@ def _initialize_scale_zero_point(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    observed_shape: torch.Size,
+    observed_shape: Tuple[int],
     observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
@@ -191,8 +193,8 @@ def _initialize_scale_zero_point(
             raise ValueError("Group quant requires at least 1 observed dimension")
 
         group_size = quantization_args.group_size
-        num_groups = _strict_divide(observed_shape[-1], group_size, strategy)
-        expected_shape = (num_groups, group_size)
+        num_groups = strict_divide(observed_shape[-1], group_size, strategy)
+        expected_shape = (*observed_shape[:-1], num_groups)
 
         # initialize activation ordering if applicable
         if actorder == ActivationOrdering.GROUP:
@@ -208,8 +210,8 @@ def _initialize_scale_zero_point(
            raise ValueError("Block quant requires at least 2 observed dimensions")
 
         block_structure = quantization_args.block_structure
-        num_rows = _strict_divide(observed_shape[-2], block_structure[-2], strategy)
-        num_cols = _strict_divide(observed_shape[-1], block_structure[-1], strategy)
+        num_rows = strict_divide(observed_shape[-2], block_structure[-2], strategy)
+        num_cols = strict_divide(observed_shape[-1], block_structure[-1], strategy)
         expected_shape = (num_rows, num_cols)
 
     # 2. Identify quantization scale and zp dtype
@@ -264,16 +266,3 @@ def _initialize_attn_scales(module: Module) -> None:
         requires_grad=False,
     )
     register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
-
-
-def _strict_divide(observed: int, divisor: int, strategy: QuantizationStrategy) -> int:
-    out = observed // divisor
-    if out * divisor != observed:
-        raise ValueError(
-            f"{strategy} quantization strategy requires strict division of "
-            f"weight/activation size {observed} and group/block size {divisor}. "
-            "consider reducing the group/block size or ignoring modules with weights "
-            f"not divisible by {divisor}"
-        )
-
-    return out
```
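The commit title refers to the `observed_shape = (1, weight.size(-1))` change: input activations are now observed as having one row, so the group-scale shape `(*observed_shape[:-1], num_groups)` carries that leading 1, where the old code produced `(num_groups, group_size)`. A small sketch of the shape arithmetic (the helper name here is invented; `strict_divide` itself appears in helpers.py below):

```python
from typing import Tuple

def group_scale_shape(observed_shape: Tuple[int, ...], group_size: int) -> Tuple[int, ...]:
    """Sketch of the new expected_shape logic for group quantization:
    one scale per group along the last dimension, leading dims preserved."""
    num_groups = observed_shape[-1] // group_size  # strict_divide minus the check
    return (*observed_shape[:-1], num_groups)

# weights of shape (out_features, in_features): one row of scales per output channel
assert group_scale_shape((4096, 4096), 128) == (4096, 32)
# input activations now observe shape (1, hidden), so their scales have one row
assert group_scale_shape((1, 4096), 128) == (1, 32)
```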

src/compressed_tensors/quantization/quant_args.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -283,8 +283,9 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
         has_block_structure = block_structure is not None
         if has_block_strategy != has_block_structure:
             raise ValueError(
-                "Block strategy requires `block_structure`, and vice versa. "
-                f"Instead got ({strategy}, {block_structure})"
+                "`strategy = block` requires `block_structure != None`, and vice versa."
+                f" Instead got `strategy={strategy}` and "
+                f"`block_structure={block_structure}`"
             )
 
         # validate group strategy
@@ -296,8 +297,8 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
         has_actorder = actorder is not None
         if has_group_strategy != has_group_size:
             raise ValueError(
-                "Group strategies require `group_size`, and vice versa. "
-                f"Instead got ({strategy}, {group_size})"
+                "`strategy = group` requires `group_size != None`, and vice versa. "
+                f"Instead got `strategy={strategy}` and `group_size={group_size}`"
             )
         if has_actorder and not has_group_strategy:
             raise ValueError(
```
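The `has_group_strategy != has_group_size` comparison is a boolean XOR: the error fires exactly when one of the two settings is provided without the other. A standalone sketch of the pattern with the reworded message (function name invented for illustration):

```python
def validate_group_settings(strategy: str, group_size=None):
    """XOR check: strategy "group" and group_size must be given together
    or not at all."""
    has_group_strategy = strategy == "group"
    has_group_size = group_size is not None
    if has_group_strategy != has_group_size:  # exactly one was provided
        raise ValueError(
            "`strategy = group` requires `group_size != None`, and vice versa. "
            f"Instead got `strategy={strategy}` and `group_size={group_size}`"
        )

validate_group_settings("group", 128)  # ok: both provided
validate_group_settings("tensor")      # ok: neither provided
# validate_group_settings("group")     # would raise ValueError
```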

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -48,6 +48,7 @@
     "calculate_qparams",
     "generate_gparam",
     "is_fp4",
+    "strict_divide",
 ]
 
 # target the self_attn layer
@@ -477,3 +478,19 @@ def generate_gparam(
     max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
     global_scale = scale_data.max * quant_data.max / max_val_pos
     return global_scale.to(dtype).reshape([1])
+
+
+def strict_divide(
+    observed: int, divisor: int, strategy: Optional[QuantizationStrategy] = None
+) -> int:
+    out = observed // divisor
+    if out * divisor != observed:
+        if strategy is not None:
+            raise ValueError(
+                f"{strategy} quantization strategy requires strict division of "
+                f"weight/activation size {observed} and group/block size {divisor}. "
+                "consider reducing the group/block size or ignoring modules with "
+                f"weights not divisible by {divisor}"
+            )
+
+    return out
```
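A usage note on the now-public helper, based on the definition above: passing a strategy turns an inexact division into a descriptive error, while omitting it makes `strict_divide` fall back to silent floor division. The import paths below are assumptions inferred from this diff's own imports.

```python
from compressed_tensors.quantization.quant_args import QuantizationStrategy
from compressed_tensors.quantization.utils import strict_divide

assert strict_divide(4096, 128) == 32  # exact division passes through

# with a strategy, an inexact division raises the descriptive ValueError
try:
    strict_divide(100, 64, QuantizationStrategy.GROUP)
except ValueError as err:
    print(err)

# without a strategy, the same inexact division silently floor-divides
assert strict_divide(100, 64) == 1
```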
