remove compress_quantized_weights, test fixes, remove sparseml references

brian-dellabetta · brian-dellabetta · commit 14a359f78a85 · 2025-09-10T22:51:12.000Z
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
diff --git a/examples/quantize_and_pack_int4.ipynb b/examples/quantize_and_pack_int4.ipynb
@@ -144,7 +144,7 @@
    "outputs": [],
    "source": [
     "quantization_config_dict = {\n",
-    "\t\"quant_method\": \"sparseml\",\n",
+    "\t\"quant_method\": \"compressed-tensors\",\n",
     "\t\"format\": \"pack-quantized\",\n",
     "\t\"global_compression_ratio\": None,\n",
     "\t\"config_groups\": {\n",
diff --git a/src/compressed_tensors/quantization/lifecycle/apply.py b/src/compressed_tensors/quantization/lifecycle/apply.py
@@ -21,9 +21,6 @@
 
 import torch
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization.lifecycle.compressed import (
-    compress_quantized_weights,
-)
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
@@ -219,20 +216,17 @@ def apply_quantization_status(module: Module, status: QuantizationStatus):
     # When decompressing, we set the scale_dtype as the model's dtype
     # This is because the normal workflow of using the weight's dtype
     # will be incorrect as the model weight will be compressed
-    # Therfore, use the dtype set by the user using the PretrainedModel
+    # Therefore, use the dtype set by the user using the PretrainedModel
     scale_dtype = None
     if status == QuantizationStatus.FROZEN:
         if hasattr(module, "dtype"):
             scale_dtype = module.dtype
 
-    module.apply(
-        lambda module: initialize_module_for_quantization(
-            module, force_zero_point=force_zero_point_init, scale_dtype=scale_dtype
-        )
+    initialize_module_for_quantization(
+        module, force_zero_point=force_zero_point_init, scale_dtype=scale_dtype
     )
 
-    if status >= QuantizationStatus.COMPRESSED:
-        module.apply(compress_quantized_weights)
+    module.quantization_status = status
 
 
 @deprecated(
diff --git a/src/compressed_tensors/quantization/quant_config.py b/src/compressed_tensors/quantization/quant_config.py
@@ -113,8 +113,8 @@ class QuantizationConfig(BaseModel):
     :param config_groups: dict of QuantizationSchemes specifying the quantization
     settings for each quantized layer. A group could also be a reference to
     a predefined scheme name, mapped to a list of its target layers/classes
-    :param quant_method: a constant used to differentiate sparseML quantization from
-    other quantization configs
+    :param quant_method: a constant used to differentiate compressed-tensors
+    quantization from other quantization configs
     :param format: specifies how the quantized model is stored on disk
     :quantization_status: specifies the current status of all quantized layers. It is
         assumed all layers are in the same state.
diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py
@@ -71,9 +71,6 @@ def infer_compressor_from_model_config(
     return compressor
 
 
-# TODO: There is already the same function in
-# SparseML, should be moved to a shared location
-# in the future
 def fix_fsdp_module_name(name: str) -> str:
     """
     Remove FSDP wrapper prefixes from a module name
diff --git a/tests/test_quantization/lifecycle/test_apply.py b/tests/test_quantization/lifecycle/test_apply.py
@@ -25,10 +25,7 @@
     QuantizationConfig,
     QuantizationStatus,
 )
-from compressed_tensors.quantization.lifecycle import (
-    apply_quantization_config,
-    apply_quantization_status,
-)
+from compressed_tensors.quantization.lifecycle import apply_quantization_config
 from tests.testing_utils import requires_accelerate
 from transformers import AutoModelForCausalLM
 
@@ -105,7 +102,9 @@ def test_target_prioritization(mock_frozen):
 
 
 def test_apply_quantization_config_tinyllama():
-    quant_config = get_sample_tinyllama_quant_config(status="calibration")
+    quant_config = get_sample_tinyllama_quant_config(
+        status=QuantizationStatus.CALIBRATION
+    )
     model = get_tinyllama_model()
 
     # check that model is not already quantized
@@ -146,7 +145,8 @@ def test_apply_quantization_config_tinyllama():
     # test quantization compression
     # sample forward pass to fill scales, zps
     model(torch.zeros((1, 1), dtype=int), torch.zeros((1, 1), dtype=int))
-    apply_quantization_status(model, QuantizationStatus.COMPRESSED)
+    quant_config.quantization_status = QuantizationStatus.COMPRESSED
+    apply_quantization_config(model, quant_config)
     for name, module in model.named_modules():
         if name in quant_config.ignore:
             continue
@@ -157,7 +157,6 @@ def test_apply_quantization_config_tinyllama():
                 inputs=True,
                 weights=True,
                 expected_status=QuantizationStatus.COMPRESSED,
-                expected_dtype=torch.int8,
             )
 
 
@@ -218,7 +217,9 @@ def get_tinyllama_model():
     )
 
 
-def get_sample_tinyllama_quant_config(status: str = "frozen"):
+def get_sample_tinyllama_quant_config(
+    status: QuantizationStatus = QuantizationStatus.FROZEN,
+):
     config_dict = {
         "quant_method": "compressed-tensors",
         "format": "fakequant",
@@ -270,7 +271,7 @@ def test_apply_quantization_status(caplog, target, should_raise_warning):
     # load a dense, unquantized tiny llama model
     model = get_tinyllama_model()
     quantization_config_dict = {
-        "quant_method": "sparseml",
+        "quant_method": "compressed-tensors",
         "format": "pack-quantized",
         "global_compression_ratio": None,
         "config_groups": {