from compressed_tensors.quantization.quant_args import (
    FP8_E4M3_DATA,
    ActivationOrdering,
+    DynamicType,
    QuantizationArgs,
    QuantizationStrategy,
)
@@ -73,49 +74,58 @@ def initialize_module_for_quantization(
    :param force_zero_point: whether to force initialization of a zero point for
        symmetric quantization
    """
-    # TODO: don't initialize parameters when running decompression
    scheme = scheme or getattr(module, "quantization_scheme", None)
    if scheme is None:
-        # no scheme passed and layer not targeted for quantization - skip
        return

    if is_attention_module(module):
        # quantized actions based on calltime status
        _initialize_attn_scales(module)

    else:
-        if scheme.input_activations is not None:
-            _initialize_scale_zero_point(
-                module,
-                "input",
-                scheme.input_activations,
-                force_zero_point=force_zero_point,
+        if not isinstance(module, torch.nn.Linear):
+            _LOGGER.warning(f"Attempting to quantize module of type {type(module)}")
+
+        # use weight to determine observed shapes and dtype
+        if hasattr(module, "weight"):
+            weight = module.weight
+            assert isinstance(weight, torch.Tensor)
+        else:
+            # Note that a weight is required for both weight and activation
+            # quantization in order to know the dtype of activation scales
+            _LOGGER.warning(
+                f"module type {type(module)} targeted for quantization but "
+                f"has no attribute weight, skipping quantization for {type(module)}"
            )
+            return
+
+        if scheme.input_activations is not None:
+            base_name = "input"
+            args = scheme.input_activations
+            observed_shape = weight.shape[-1:]
+            observed_dtype = weight.dtype

        if scheme.weights is not None:
-            if hasattr(module, "weight"):
-                weight_shape = None
-                if isinstance(module, torch.nn.Linear):
-                    weight_shape = module.weight.shape
-                _initialize_scale_zero_point(
-                    module,
-                    "weight",
-                    scheme.weights,
-                    weight_shape=weight_shape,
-                    force_zero_point=force_zero_point,
-                )
-            else:
-                _LOGGER.warning(
-                    f"module type {type(module)} targeted for weight quantization but "
-                    "has no attribute weight, skipping weight quantization "
-                    f"for {type(module)}"
-                )
+            base_name = "weight"
+            args = scheme.weights
+            observed_shape = weight.shape
+            observed_dtype = weight.dtype

        if scheme.output_activations is not None:
-            if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point(
-                    module, "output", scheme.output_activations
-                )
+            base_name = "output"
+            args = scheme.output_activations
+            observed_shape = weight.shape[:-1]
+            observed_dtype = weight.dtype
+
+        if not is_kv_cache_quant_scheme(scheme):
+            _initialize_scale_zero_point(
+                module,
+                base_name,
+                args,
+                observed_shape=observed_shape,
+                observed_dtype=observed_dtype,
+                force_zero_point=force_zero_point,
+            )

    module.quantization_scheme = scheme
    module.quantization_status = QuantizationStatus.INITIALIZED
@@ -138,18 +148,21 @@ def _initialize_scale_zero_point(
    module: Module,
    base_name: str,
    quantization_args: QuantizationArgs,
-    weight_shape: Optional[torch.Size] = None,
+    observed_shape: torch.Size,
+    observed_dtype: torch.dtype,
    force_zero_point: bool = True,
):
-    if quantization_args.dynamic is True:
-        return
+    strategy = quantization_args.strategy
+    dynamic = quantization_args.dynamic
+    actorder = quantization_args.actorder
+    device = get_execution_device(module)  # avoid performing initialization ops on cpu

-    # initialize on execution device to avoid performing quantized ops on cpu
-    device = get_execution_device(module)
+    # Skip all initialization for fully dynamic quantization
+    if dynamic is True:
+        return

-    # 1. Create global_scales for tensor_group - generates
-    # a per tensor scale
-    if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
+    # 0. Create global scale for tensor-group quantization
+    if strategy == QuantizationStrategy.TENSOR_GROUP:
        init_global_scale = Parameter(
            torch.empty(1, dtype=torch.float32, device=device),
            requires_grad=False,
@@ -158,56 +171,54 @@ def _initialize_scale_zero_point(
            module, f"{base_name}_global_scale", init_global_scale
        )

-    # 2. Infer expected scale/zero point shape
-    if quantization_args.strategy == QuantizationStrategy.TOKEN:
-        expected_shape = (1, 1)
-    else:
-        expected_shape = 1
-
-    if base_name == "weight" and weight_shape is not None:
-        if quantization_args.strategy == QuantizationStrategy.CHANNEL:
-            # (output_channels, 1) - only for weights
-            expected_shape = (weight_shape[0], 1)
-        elif quantization_args.strategy in (
-            QuantizationStrategy.TENSOR_GROUP,
-            QuantizationStrategy.GROUP,
-        ):
-            # GROUP/TENSOR_GROUP for both weights and activations
-            num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
-            expected_shape = (weight_shape[0], max(num_groups, 1))
-        elif quantization_args.strategy == QuantizationStrategy.BLOCK:
-            # For block quantization, scale shape should match number of blocks - only
-            # for weights
-            if quantization_args.block_structure is None:
-                raise ValueError(
-                    "Block quantization requires block_structure to be specified"
-                )
-            block_height, block_width = quantization_args.block_structure
-            rows, cols = weight_shape[-2], weight_shape[-1]
-            num_rows_blocks = math.ceil(rows / block_height)
-            num_cols_blocks = math.ceil(cols / block_width)
-
-            # Warn if dimensions don't divide evenly
-            if rows % block_height != 0 or cols % block_width != 0:
-                warnings.warn(
-                    f"Block quantization: tensor shape {weight_shape} does not divide"
-                    f"evenly by block structure {quantization_args.block_structure}. "
-                    f"Some blocks will be incomplete which may affect quantization"
-                    "quality.",
-                    UserWarning,
-                )
-
-            expected_shape = (num_rows_blocks, num_cols_blocks)
-    elif quantization_args.strategy == QuantizationStrategy.BLOCK:
-        warnings.warn(
-            f"BLOCK quantization not supported for {base_name} activations. "
-            f"Falling back to tensor-level quantization.",
-            UserWarning,
-        )
-        expected_shape = 1
+    # Skip scale/zp initialization for locally dynamic quantization
+    if dynamic == DynamicType.LOCAL:
+        return
+
+    # 1. Infer expected scale/zp shape
+    if strategy in (QuantizationStrategy.TENSOR, QuantizationStrategy.TOKEN):
+        expected_shape = (1,)
+
+    elif strategy == QuantizationStrategy.CHANNEL:
+        if len(observed_shape) < 1:
+            raise ValueError("Channel quant requires at least 1 observed dimension")

+        expected_shape = (observed_shape[-1], 1)
+
-    # 3. Identify quantization scale and zp dtype
-    scale_dtype = module.weight.dtype
+    elif strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+        assert quantization_args.group_size is not None
+        if len(observed_shape) < 1:
+            raise ValueError("Group quant requires at least 1 observed dimension")
+
+        group_size = quantization_args.group_size
+        num_groups = _strict_divide(observed_shape[-1], group_size, strategy)
+        expected_shape = (num_groups, group_size)
+
+        # initialize activation ordering if applicable
+        if actorder == ActivationOrdering.GROUP:
+            init_g_idx = Parameter(
+                torch.full((observed_shape[-1],), -1, device=device, dtype=torch.int),
+                requires_grad=False,
+            )
+            register_offload_parameter(module, f"{base_name}_g_idx", init_g_idx)
+
+    elif strategy == QuantizationStrategy.BLOCK:
+        assert quantization_args.block_structure is not None
+        if len(observed_shape) < 2:
+            raise ValueError("Block quant requires at least 2 observed dimensions")
+
+        block_structure = quantization_args.block_structure
+        num_rows = _strict_divide(observed_shape[-2], block_structure[-2], strategy)
+        num_cols = _strict_divide(observed_shape[-1], block_structure[-1], strategy)
+        expected_shape = (num_rows, num_cols)
+
+    # 2. Identify quantization scale and zp dtype
+    scale_dtype = observed_dtype

    if is_fp4(quantization_args=quantization_args):
        scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype
@@ -223,14 +234,12 @@ def _initialize_scale_zero_point(
            scale_dtype = torch.bfloat16
        zp_dtype = quantization_args.pytorch_dtype()

-    # 4. Initializes empty scale, zero point, and g_idx parameters for the module
-    # do not init scales for quantzation_args.dynamic == DynamicType.local
-    if not quantization_args.dynamic:
-        init_scale = Parameter(
-            torch.empty(expected_shape, dtype=scale_dtype, device=device),
-            requires_grad=False,
-        )
-        register_offload_parameter(module, f"{base_name}_scale", init_scale)
+    # 3. Initializes scale/zp for the module
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=scale_dtype, device=device),
+        requires_grad=False,
+    )
+    register_offload_parameter(module, f"{base_name}_scale", init_scale)

    if force_zero_point or not quantization_args.symmetric:
        init_zero_point = Parameter(
@@ -239,16 +248,6 @@ def _initialize_scale_zero_point(
        )
        register_offload_parameter(module, f"{base_name}_zero_point", init_zero_point)

-    # only grouped activation ordering has g_idx
-    if quantization_args.actorder == ActivationOrdering.GROUP:
-        g_idx_shape = (weight_shape[1],)
-        g_idx_dtype = torch.int
-        init_g_idx = Parameter(
-            torch.full(g_idx_shape, -1, device=device, dtype=g_idx_dtype),
-            requires_grad=False,
-        )
-        register_offload_parameter(module, f"{base_name}_g_idx", init_g_idx)
-

def _initialize_attn_scales(module: Module) -> None:
    """Initlaize k_scale, v_scale for self_attn"""
@@ -270,3 +269,16 @@ def _initialize_attn_scales(module: Module) -> None:
        requires_grad=False,
    )
    register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
+
+
+def _strict_divide(observed: int, divisor: int, strategy: QuantizationStrategy) -> int:
+    out = observed // divisor
+    if out * divisor != observed:
+        raise ValueError(
+            f"{strategy} quantization strategy requires strict division of "
+            f"weight/activation size {observed} and group/block size {divisor}. "
+            "consider reducing the group/block size or ignoring modules with weights "
+            f"not divisible by {divisor}"
+        )
+
+    return out
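
For context, the helper added at the bottom of the diff replaces the earlier math.ceil-based rounding: group and block sizes must now divide the observed dimension exactly, otherwise initialization raises. The standalone sketch below mirrors that behavior so the new failure mode can be tried in isolation; the plain string passed as strategy and the example sizes are illustrative stand-ins, not compressed_tensors API.

# Standalone sketch of the divisibility check introduced by this diff.
# A plain string stands in for QuantizationStrategy so the example runs
# without importing compressed_tensors.
def strict_divide_sketch(observed: int, divisor: int, strategy: str) -> int:
    out = observed // divisor
    if out * divisor != observed:
        raise ValueError(
            f"{strategy} quantization strategy requires strict division of "
            f"weight/activation size {observed} and group/block size {divisor}."
        )
    return out


if __name__ == "__main__":
    # 11008 input channels with group_size=128 divide cleanly into 86 groups.
    print(strict_divide_sketch(11008, 128, "group"))

    # The pre-refactor code would have rounded a non-divisible block count up
    # with math.ceil and warned; the refactored check raises instead.
    try:
        strict_divide_sketch(11008, 96, "block")
    except ValueError as err:
        print(f"raised: {err}")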