
Commit 6ba47e5

clear previously initialized qparams
Signed-off-by: Brian Dellabetta <[email protected]>
1 parent fb32778 commit 6ba47e5

3 files changed: 31 additions, 12 deletions

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 0 additions & 8 deletions
@@ -243,14 +243,6 @@ def find_name_or_class_matches(
     return match_targets(name, module, targets)
 
 
-def _infer_status(model: Module) -> Optional[QuantizationStatus]:
-    for module in model.modules():
-        status = getattr(module, "quantization_status", None)
-        if status is not None:
-            return status
-    return None
-
-
 def _load_quant_args_from_mapping(
     base_name: str, module_name: str, module: Module, mapping: Dict
 ):

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 30 additions & 3 deletions
@@ -33,6 +33,7 @@
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.quantization.utils import is_fp4, is_kv_cache_quant_scheme
 from compressed_tensors.utils import (
+    delete_offload_parameter,
     disable_hf_hook,
     get_execution_device,
     register_offload_parameter,
@@ -61,10 +62,11 @@ def initialize_module_for_quantization(
     force_zero_point: bool = True,
 ):
     """
-    attaches appropriate scales, zero points, and observers to a layer
-    given its target quantization scheme
+    Attaches appropriate scales, zero points, and observers to a layer
+    given its target quantization scheme.
 
-    apply to full model with `model.apply(initialize_module_for_quantization)`
+    Previously initialized scales and zero points will be removed from
+    module if they no longer apply to the scheme
 
     :param module: module to set for calibration
     :param scheme: scheme to use for quantization. if None is provided,
@@ -73,6 +75,8 @@ def initialize_module_for_quantization(
     :param force_zero_point: whether to force initialization of a zero point for
         symmetric quantization
     """
+    _clear_all_qparams(module)
+
     # TODO: don't initialize parameters when running decompression
     scheme = scheme or getattr(module, "quantization_scheme", None)
     if scheme is None:
@@ -134,6 +138,29 @@ def is_attention_module(module: Module):
     )
 
 
+def _clear_all_qparams(
+    module: Module,
+):
+    """
+    Clear all previously registered quantization parameters from module
+
+    :param module: module to clear qparams from
+    """
+    keys = [KVCacheScaleType.KEY.value, KVCacheScaleType.VALUE.value] + [
+        f"{base_name}_{suffix}"
+        for base_name in ("input", "weight", "output")
+        for suffix in (
+            "global_scale",
+            "scale",
+            "zero_point",
+            "g_idx",
+        )
+    ]
+    for key in keys:
+        if hasattr(module, key):
+            delete_offload_parameter(module, key)
+
+
 def _initialize_scale_zero_point(
     module: Module,
     base_name: str,
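
To illustrate the new behavior, here is a minimal, self-contained sketch of the clearing step that _clear_all_qparams performs. It uses plain torch and a local clear_all_qparams helper, with delattr standing in for delete_offload_parameter and the literal "k_scale"/"v_scale" strings standing in for the KVCacheScaleType values; those substitutions are assumptions made for the sketch, not part of this commit.

import torch
from torch.nn import Linear, Parameter


def clear_all_qparams(module: torch.nn.Module) -> None:
    # mirror of the key enumeration above: kv-cache scales plus every
    # (base_name, suffix) quantization-parameter combination
    keys = ["k_scale", "v_scale"] + [  # assumed KVCacheScaleType values
        f"{base_name}_{suffix}"
        for base_name in ("input", "weight", "output")
        for suffix in ("global_scale", "scale", "zero_point", "g_idx")
    ]
    for key in keys:
        if hasattr(module, key):
            # the commit calls delete_offload_parameter(module, key);
            # plain attribute deletion is enough for this in-memory sketch
            delattr(module, key)


layer = Linear(16, 16)

# first initialization: an asymmetric scheme registers a scale and a zero point
layer.register_parameter("weight_scale", Parameter(torch.ones(1)))
layer.register_parameter("weight_zero_point", Parameter(torch.zeros(1)))

# re-initializing for a symmetric scheme: without the clearing step, the stale
# weight_zero_point from the previous scheme would remain on the module
clear_all_qparams(layer)
layer.register_parameter("weight_scale", Parameter(torch.ones(1)))

assert not hasattr(layer, "weight_zero_point")
print([name for name, _ in layer.named_parameters()])
# ['weight', 'bias', 'weight_scale']

The real code reaches for delete_offload_parameter rather than a plain delete, presumably so that parameters registered on offloaded modules are removed from their offloaded storage as well.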

tests/test_quantization/lifecycle/test_apply.py

Lines changed: 1 addition & 1 deletion
@@ -265,7 +265,7 @@ def get_sample_tinyllama_quant_config(
         [("Linear", "re:.*foobarbaz"), True],
     ],
 )
-def test_apply_quantization_status(caplog, target, should_raise_warning):
+def test_apply_quantization_config(caplog, target, should_raise_warning):
     import logging
 
     # load a dense, unquantized tiny llama model
