 from compressed_tensors.quantization.quant_args import (
     FP8_E4M3_DATA,
     ActivationOrdering,
+    DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
 )
@@ -58,8 +59,8 @@ class KVCacheScaleType(Enum):
 def initialize_module_for_quantization(
     module: Module,
     scheme: Optional[QuantizationScheme] = None,
+    force_scale_dtype: Optional[torch.dtype] = None,
     force_zero_point: bool = True,
-    scale_dtype: Optional[torch.dtype] = None,
 ):
     """
     attaches appropriate scales, zero points, and observers to a layer
@@ -76,51 +77,58 @@ def initialize_module_for_quantization(
     :param scale_dtype: dtype to used for the scales, if overriding the
         weight dtype as the scale dtype
     """
-    # TODO: don't initialize parameters when running decompression
     scheme = scheme or getattr(module, "quantization_scheme", None)
     if scheme is None:
-        # no scheme passed and layer not targeted for quantization - skip
         return
 
     if is_attention_module(module):
         # quantized actions based on calltime status
         _initialize_attn_scales(module)
 
     else:
-        if scheme.input_activations is not None:
-            _initialize_scale_zero_point(
-                module,
-                "input",
-                scheme.input_activations,
-                force_zero_point=force_zero_point,
-                scale_dtype=scale_dtype,
+        if not isinstance(module, torch.nn.Linear):
+            _LOGGER.warning(f"Attempting to quantize module of type {type(module)}")
+
+        # use weight to determine observed shapes and dtype
+        if hasattr(module, "weight"):
+            weight = module.weight
+            assert isinstance(weight, torch.Tensor)
+        else:
+            # Note that a weight is required for both weight and activation
+            # quantization in order to know the dtype of activation scales
+            _LOGGER.warning(
+                f"module type {type(module)} targeted for quantization but "
+                f"has no attribute weight, skipping quantization for {type(module)}"
             )
+            return
+
+        if scheme.input_activations is not None:
+            base_name = "input"
+            args = scheme.input_activations
+            observed_shape = weight.shape[-1:]
+            observed_dtype = force_scale_dtype or weight.dtype
 
         if scheme.weights is not None:
-            if hasattr(module, "weight"):
-                weight_shape = None
-                if isinstance(module, torch.nn.Linear):
-                    weight_shape = module.weight.shape
-                _initialize_scale_zero_point(
-                    module,
-                    "weight",
-                    scheme.weights,
-                    weight_shape=weight_shape,
-                    force_zero_point=force_zero_point,
-                    scale_dtype=scale_dtype,
-                )
-            else:
-                _LOGGER.warning(
-                    f"module type {type(module)} targeted for weight quantization but "
-                    "has no attribute weight, skipping weight quantization "
-                    f"for {type(module)}"
-                )
+            base_name = "weight"
+            args = scheme.weights
+            observed_shape = weight.shape
+            observed_dtype = force_scale_dtype or weight.dtype
 
         if scheme.output_activations is not None:
-            if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point(
-                    module, "output", scheme.output_activations, scale_dtype=scale_dtype
-                )
+            base_name = "output"
+            args = scheme.output_activations
+            observed_shape = weight.shape[:-1]
+            observed_dtype = force_scale_dtype or weight.dtype
+
+        if not is_kv_cache_quant_scheme(scheme):
+            _initialize_scale_zero_point(
+                module,
+                base_name,
+                args,
+                observed_shape=observed_shape,
+                observed_dtype=observed_dtype,
+                force_zero_point=force_zero_point,
+            )
 
     module.quantization_scheme = scheme
     module.quantization_status = QuantizationStatus.INITIALIZED
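As a quick orientation, here is a hedged usage sketch of the updated entry point (editor's illustration, not part of the diff). The import paths, the `QuantizationScheme`/`QuantizationArgs` fields, and the `"group"` strategy string are assumptions drawn from the existing compressed-tensors public API rather than from this change:

```python
# Editor's sketch: exercising the new initialize_module_for_quantization signature.
# Assumes the compressed-tensors public imports below; adjust if the paths differ.
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
from compressed_tensors.quantization.lifecycle import initialize_module_for_quantization

layer = torch.nn.Linear(1024, 512)
scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(num_bits=4, type="int", strategy="group", group_size=128),
)

# force_scale_dtype replaces the old scale_dtype keyword; when omitted, the scale
# dtype falls back to the module's weight dtype
initialize_module_for_quantization(layer, scheme, force_scale_dtype=torch.bfloat16)

# scales are registered on the module as offloadable parameters
print(layer.weight_scale.shape, layer.weight_scale.dtype)
```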
@@ -143,19 +151,21 @@ def _initialize_scale_zero_point(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    weight_shape: Optional[torch.Size] = None,
+    observed_shape: torch.Size,
+    observed_dtype: torch.dtype,
     force_zero_point: bool = True,
-    scale_dtype: Optional[torch.dtype] = None,
 ):
-    if quantization_args.dynamic is True:
-        return
+    strategy = quantization_args.strategy
+    dynamic = quantization_args.dynamic
+    actorder = quantization_args.actorder
+    device = get_execution_device(module)  # avoid performing initialization ops on cpu
 
-    # initialize on execution device to avoid performing quantized ops on cpu
-    device = get_execution_device(module)
+    # Skip all initialization for fully dynamic quantization
+    if dynamic is True:
+        return
 
-    # 1. Create global_scales for tensor_group - generates
-    # a per tensor scale
-    if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
+    # 0. Create global scale for tensor-group quantization
+    if strategy == QuantizationStrategy.TENSOR_GROUP:
         init_global_scale = Parameter(
             torch.empty(1, dtype=torch.float32, device=device),
             requires_grad=False,
@@ -164,56 +174,49 @@ def _initialize_scale_zero_point(
             module, f"{base_name}_global_scale", init_global_scale
         )
 
-    # 2. Infer expected scale/zero point shape
-    if quantization_args.strategy == QuantizationStrategy.TOKEN:
-        expected_shape = (1, 1)
-    else:
-        expected_shape = 1
-
-    if base_name == "weight" and weight_shape is not None:
-        if quantization_args.strategy == QuantizationStrategy.CHANNEL:
-            # (output_channels, 1) - only for weights
-            expected_shape = (weight_shape[0], 1)
-        elif quantization_args.strategy in (
-            QuantizationStrategy.TENSOR_GROUP,
-            QuantizationStrategy.GROUP,
-        ):
-            # GROUP/TENSOR_GROUP for both weights and activations
-            num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
-            expected_shape = (weight_shape[0], max(num_groups, 1))
-        elif quantization_args.strategy == QuantizationStrategy.BLOCK:
-            # For block quantization, scale shape should match number of blocks - only
-            # for weights
-            if quantization_args.block_structure is None:
-                raise ValueError(
-                    "Block quantization requires block_structure to be specified"
-                )
-            block_height, block_width = quantization_args.block_structure
-            rows, cols = weight_shape[-2], weight_shape[-1]
-            num_rows_blocks = math.ceil(rows / block_height)
-            num_cols_blocks = math.ceil(cols / block_width)
-
-            # Warn if dimensions don't divide evenly
-            if rows % block_height != 0 or cols % block_width != 0:
-                warnings.warn(
-                    f"Block quantization: tensor shape {weight_shape} does not divide"
-                    f"evenly by block structure {quantization_args.block_structure}. "
-                    f"Some blocks will be incomplete which may affect quantization"
-                    "quality.",
-                    UserWarning,
-                )
-
-            expected_shape = (num_rows_blocks, num_cols_blocks)
-    elif quantization_args.strategy == QuantizationStrategy.BLOCK:
-        warnings.warn(
-            f"BLOCK quantization not supported for {base_name} activations. "
-            f"Falling back to tensor-level quantization.",
-            UserWarning,
-        )
-        expected_shape = 1
+    # Skip scale/zp initialization for locally dynamic quantization
+    if dynamic == DynamicType.LOCAL:
+        return
+
+    # 1. Infer expected scale/zp shape
+    if strategy in (QuantizationStrategy.TENSOR, QuantizationStrategy.TOKEN):
+        expected_shape = (1,)
+
+    elif strategy == QuantizationStrategy.CHANNEL:
+        if len(observed_shape) < 1:
+            raise ValueError("Channel quant requires at least 1 observed dimension")
+
+        expected_shape = (observed_shape[-1], 1)
 
-    # 3. Identify quantization scale and zp dtype
-    scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
+    elif strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+        assert quantization_args.group_size is not None
+        if len(observed_shape) < 1:
+            raise ValueError("Group quant requires at least 1 observed dimension")
+
+        group_size = quantization_args.group_size
+        num_groups = _strict_divide(observed_shape[-1], group_size, strategy)
+        expected_shape = (num_groups, group_size)
+
+        # initialize activation ordering if applicable
+        if actorder == ActivationOrdering.GROUP:
+            init_g_idx = Parameter(
+                torch.full((observed_shape[-1],), -1, device=device, dtype=torch.int),
+                requires_grad=False,
+            )
+            register_offload_parameter(module, f"{base_name}_g_idx", init_g_idx)
+
+    elif strategy == QuantizationStrategy.BLOCK:
+        assert quantization_args.block_structure is not None
+        if len(observed_shape) < 2:
+            raise ValueError("Block quant requires at least 2 observed dimensions")
+
+        block_structure = quantization_args.block_structure
+        num_rows = _strict_divide(observed_shape[-2], block_structure[-2], strategy)
+        num_cols = _strict_divide(observed_shape[-1], block_structure[-1], strategy)
+        expected_shape = (num_rows, num_cols)
+
+    # 2. Identify quantization scale and zp dtype
+    scale_dtype = observed_dtype
 
     if is_fp4(quantization_args=quantization_args):
         scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype
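To make the new shape inference concrete, here is a small standalone sketch (editor's illustration, not part of the diff) that mirrors the branches above for a hypothetical 2D weight; the sizes, group size, and block structure are invented for illustration:

```python
# Editor's illustration of the scale/zp shapes produced by the branches above,
# for a hypothetical (out_features, in_features) = (512, 1024) observed weight.
observed_shape = (512, 1024)
group_size = 128              # assumed quantization_args.group_size
block_structure = (128, 128)  # assumed quantization_args.block_structure

expected_shapes = {
    "tensor/token": (1,),
    "channel": (observed_shape[-1], 1),                                    # (1024, 1)
    "group/tensor_group": (observed_shape[-1] // group_size, group_size),  # (8, 128)
    "block": (
        observed_shape[-2] // block_structure[-2],  # 512 / 128 = 4
        observed_shape[-1] // block_structure[-1],  # 1024 / 128 = 8
    ),
}
print(expected_shapes)
```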
@@ -229,14 +232,12 @@ def _initialize_scale_zero_point(
             scale_dtype = torch.float16
         zp_dtype = quantization_args.pytorch_dtype()
 
-    # 4. Initializes empty scale, zero point, and g_idx parameters for the module
-    # do not init scales for quantzation_args.dynamic == DynamicType.local
-    if not quantization_args.dynamic:
-        init_scale = Parameter(
-            torch.empty(expected_shape, dtype=scale_dtype, device=device),
-            requires_grad=False,
-        )
-        register_offload_parameter(module, f"{base_name}_scale", init_scale)
+    # 3. Initialize scale/zp for the module
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=scale_dtype, device=device),
+        requires_grad=False,
+    )
+    register_offload_parameter(module, f"{base_name}_scale", init_scale)
 
     if force_zero_point or not quantization_args.symmetric:
         init_zero_point = Parameter(
@@ -245,16 +246,6 @@ def _initialize_scale_zero_point(
         )
         register_offload_parameter(module, f"{base_name}_zero_point", init_zero_point)
 
-    # only grouped activation ordering has g_idx
-    if quantization_args.actorder == ActivationOrdering.GROUP:
-        g_idx_shape = (weight_shape[1],)
-        g_idx_dtype = torch.int
-        init_g_idx = Parameter(
-            torch.full(g_idx_shape, -1, device=device, dtype=g_idx_dtype),
-            requires_grad=False,
-        )
-        register_offload_parameter(module, f"{base_name}_g_idx", init_g_idx)
-
 
 def _initialize_attn_scales(module: Module) -> None:
     """Initlaize k_scale, v_scale for self_attn"""
@@ -276,3 +267,16 @@ def _initialize_attn_scales(module: Module) -> None:
         requires_grad=False,
     )
     register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
+
+
+def _strict_divide(observed: int, divisor: int, strategy: QuantizationStrategy) -> int:
+    out = observed // divisor
+    if out * divisor != observed:
+        raise ValueError(
+            f"{strategy} quantization strategy requires strict division of "
+            f"weight/activation size {observed} by group/block size {divisor}. "
+            "Consider reducing the group/block size or ignoring modules with weights "
+            f"not divisible by {divisor}"
+        )
+
+    return out
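For reviewers, a brief illustration of the helper's behavior (editor's note, not part of the diff): it returns the exact quotient when the observed size divides evenly and raises otherwise, so mis-sized groups or blocks fail loudly at initialization instead of silently producing partial groups. The snippet assumes `_strict_divide` and `QuantizationStrategy` are in scope (e.g. when run inside this module):

```python
# Editor's illustration of the intended behavior of _strict_divide.
assert _strict_divide(4096, 128, QuantizationStrategy.GROUP) == 32

try:
    _strict_divide(4000, 128, QuantizationStrategy.GROUP)  # 4000 % 128 != 0
except ValueError as err:
    print(err)  # suggests reducing the group/block size or ignoring the module
```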