@@ -14,23 +14,22 @@
 
 
 import logging
+from enum import Enum
 from typing import Optional, Tuple
 
 import torch
-from compressed_tensors.quantization import (
+from compressed_tensors.quantization.lifecycle.forward import (
+    wrap_module_forward_quantized,
+)
+from compressed_tensors.quantization.quant_args import (
     FP8_E4M3_DATA,
     ActivationOrdering,
     DynamicType,
-    KVCacheScaleType,
     QuantizationArgs,
-    QuantizationMetadata,
-    QuantizationScheme,
-    QuantizationStatus,
     QuantizationStrategy,
 )
-from compressed_tensors.quantization.lifecycle.forward import (
-    wrap_module_forward_quantized,
-)
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.quantization.utils import (
     is_fp4,
     is_kv_cache_quant_scheme,
@@ -54,17 +53,21 @@
 _LOGGER = logging.getLogger(__name__)
 
 
+class KVCacheScaleType(Enum):
+    KEY = "k_scale"
+    VALUE = "v_scale"
+
+
 def initialize_module_for_quantization(
     module: Module,
     scheme: Optional[QuantizationScheme] = None,
     force_zero_point: bool = True,
 ):
     """
-    Attaches appropriate scales, zero points, and observers to a layer
-    given its target quantization scheme.
+    attaches appropriate scales, zero points, and observers to a layer
+    given its target quantization scheme
 
-    Previously initialized scales and zero points will be removed from
-    module if they no longer apply to the scheme
+    apply to full model with `model.apply(initialize_module_for_quantization)`
 
     :param module: module to set for calibration
     :param scheme: scheme to use for quantization. if None is provided,
@@ -77,8 +80,6 @@ def initialize_module_for_quantization(
     if scheme is None:
         return
 
-    QuantizationMetadata.clear_all_qparams(module)
-
     if is_attention_module(module):
         # quantized actions based on calltime status
         _initialize_attn_scales(module)
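For context on how the pieces touched by this diff fit together, here is a minimal usage sketch of the pattern the new docstring describes. The top-level import paths and the exact `QuantizationScheme`/`QuantizationArgs` constructor fields are assumptions based on compressed-tensors' usual layout, not something this diff shows; treat it as illustrative, not as the library's definitive API.

import torch.nn as nn

# Import paths assumed from the library's usual re-exports; adjust to
# your installed compressed-tensors version.
from compressed_tensors.quantization import (
    KVCacheScaleType,
    QuantizationArgs,
    QuantizationScheme,
    initialize_module_for_quantization,
)

# The re-added enum carries the serialized parameter names used for
# attention KV-cache scales.
assert KVCacheScaleType.KEY.value == "k_scale"
assert KVCacheScaleType.VALUE.value == "v_scale"

# Illustrative scheme: 8-bit symmetric weight quantization targeting
# Linear layers (field names assumed; check your version).
scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(num_bits=8, symmetric=True),
)

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 4))

# Attach a scheme to each target layer, then initialize qparams
# model-wide via nn.Module.apply; modules without a scheme hit the
# early `return` shown in the diff above.
for layer in model:
    layer.quantization_scheme = scheme
model.apply(initialize_module_for_quantization)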