Commit 9170fb3

[Decompression] Clean-up and some fixes (#461)
* clean-up
* update
* update
* update
* remove import
* Update initialize.py
1 parent 0b686cd commit 9170fb3

3 files changed: +22 -32 lines changed

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 14 additions & 9 deletions

@@ -224,7 +224,8 @@ def parse_sparsity_config(
             s_config = compression_config.sparsity_config
             return s_config.model_dump() if s_config is not None else None
 
-        return compression_config.get(SPARSITY_CONFIG_NAME, None)
+        # explicitly return None if {} in config
+        return compression_config.get(SPARSITY_CONFIG_NAME, None) or None
 
     @staticmethod
     def parse_quantization_config(
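
The `or None` guard matters because a serialized config can carry an empty dict under the sparsity key; a plain dict lookup would hand that `{}` on to callers that only check `is not None`. A minimal standalone sketch of the normalization, with the key name written out in place of SPARSITY_CONFIG_NAME:

    compression_config = {"sparsity_config": {}}

    plain = compression_config.get("sparsity_config", None)               # -> {} (falsy, but not None)
    normalized = compression_config.get("sparsity_config", None) or None  # -> None

    assert plain == {} and normalized is None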
@@ -712,17 +713,16 @@ def decompress(self, model_path: str, model: Module):
             # Load activation scales/zp or any other quantization parameters
             # Conditionally load the weight quantization parameters if we have a
             # dense compressor or if a sparsity compressor has already been applied
+            load_weight_qparams = sparse_decompressed or isinstance(
+                quant_compressor, DenseCompressor
+            )
             load_pretrained_quantization_parameters(
                 model,
                 model_path,
                 # TODO: all weight quantization params will be moved to the
                 # compressor in a follow-up including initialization
-                load_weight_quantization=(
-                    sparse_decompressed
-                    or isinstance(quant_compressor, DenseCompressor)
-                ),
+                load_weight_qparams=load_weight_qparams,
             )
-
             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
             )
@@ -732,7 +732,9 @@ def decompress(self, model_path: str, model: Module):
             )
             # TODO: all weight quantization params will be moved to the compressor
             # to prevent duplicate parameter updates in update_parameter_data
-            self._replace_weights(dense_gen, model)
+            self._replace_weights(
+                dense_gen, model, load_weight_qparams=not load_weight_qparams
+            )
 
         def freeze_quantization_status(module):
             module.quantization_status = QuantizationStatus.FROZEN
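
Taken together, the two hunks above make the weight-qparam loading paths mutually exclusive: either load_pretrained_quantization_parameters writes the weight scales/zero-points up front (dense compressor, or sparsity already decompressed), or _replace_weights writes them later from the decompressed generator, never both. A tiny standalone truth-table check of that invariant, with plain booleans standing in for the real sparse_decompressed flag and the DenseCompressor isinstance test:

    for sparse_decompressed in (False, True):
        for is_dense_compressor in (False, True):
            load_weight_qparams = sparse_decompressed or is_dense_compressor
            # load_pretrained_quantization_parameters writes weight qparams iff the flag is True;
            # _replace_weights(..., load_weight_qparams=not load_weight_qparams) writes iff it is False.
            writers = int(load_weight_qparams) + int(not load_weight_qparams)
            assert writers == 1  # exactly one path updates the weight quantization parameters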
@@ -819,7 +821,9 @@ def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
                 param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
                 register_offload_parameter(module, param_name, param)
 
-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_weights(
+        self, dense_weight_generator, model: Module, load_weight_qparams: bool = True
+    ):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -847,6 +851,7 @@ def _replace_weights(self, dense_weight_generator, model: Module):
                 # decompression in init to be consistent with loading which happens
                 # later as well however, update_data does a good shape check -
                 # should be moved to the compressor
+
                 if param_name == "weight":
                     delattr(module, param_name)
                     requires_grad = param_data.dtype in (
@@ -858,7 +863,7 @@ def _replace_weights(self, dense_weight_generator, model: Module):
                         param_data.to(device), requires_grad=requires_grad
                     )
                     register_offload_parameter(module, param_name, param)
-                else:
+                elif load_weight_qparams:
                     # Should already be registered to the correct device for
                     # for scales/zero-points
                     update_parameter_data(module, param_data, param_name)

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 5 additions & 14 deletions

@@ -65,19 +65,19 @@
 def load_pretrained_quantization_parameters(
     model: Module,
     model_name_or_path: Optional[str] = None,
-    load_weight_quantization: Optional[bool] = False,
+    load_weight_qparams: Optional[bool] = False,
 ):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
     a model that has already been initialized with a quantization config.
 
     NOTE: Will always load inputs/output parameters. Will conditioanlly load weight
-    parameters, if load_weight_quantization is set to True.
+    parameters, if load_weight_qparams is set to True.
 
     :param model: model to load pretrained quantization parameters to
     :param model_name_or_path: Hugging Face stub or local folder containing a quantized
         model, which is used to load quantization parameters
-    :param load_weight_quantization: whether or not the weight quantization parameters
+    :param load_weight_qparams: whether or not the weight quantization parameters
         should be loaded
     """
     model_path = get_safetensors_folder(model_name_or_path)
@@ -103,7 +103,7 @@ def load_pretrained_quantization_parameters(
             mapping=mapping,
         )
 
-        if load_weight_quantization and submodule.quantization_scheme.weights:
+        if load_weight_qparams and submodule.quantization_scheme.weights:
             base_name = "weight"
             _load_quant_args_from_mapping(
                 base_name=base_name,
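
For callers outside this repository the rename is a small API break: the keyword is now load_weight_qparams, and passing the old load_weight_quantization keyword would raise a TypeError. A hedged usage sketch, where the model object and checkpoint path are placeholders and the import path simply follows the file location above:

    from compressed_tensors.quantization.lifecycle.apply import (
        load_pretrained_quantization_parameters,
    )

    load_pretrained_quantization_parameters(
        model,                        # an already-initialized quantized nn.Module (placeholder)
        "path/to/quantized-model",    # Hugging Face stub or local safetensors folder (placeholder)
        load_weight_qparams=True,     # renamed from load_weight_quantization
    )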
@@ -219,18 +219,9 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
     if status >= QuantizationStatus.INITIALIZED > current_status:
         force_zero_point_init = status != QuantizationStatus.COMPRESSED
 
-        # When decompressing, we set the scale_dtype as the model's dtype
-        # This is because the normal workflow of using the weight's dtype
-        # will be incorrect as the model weight will be compressed
-        # Therfore, use the dtype set by the user using the PretrainedModel
-        scale_dtype = None
-        if status == QuantizationStatus.FROZEN:
-            if hasattr(model, "dtype"):
-                scale_dtype = model.dtype
-
         model.apply(
             lambda module: initialize_module_for_quantization(
-                module, force_zero_point=force_zero_point_init, scale_dtype=scale_dtype
+                module, force_zero_point=force_zero_point_init
             )
         )
 

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 3 additions & 9 deletions

@@ -59,7 +59,6 @@ def initialize_module_for_quantization(
     module: Module,
     scheme: Optional[QuantizationScheme] = None,
     force_zero_point: bool = True,
-    scale_dtype: Optional[torch.dtype] = None,
 ):
     """
     attaches appropriate scales, zero points, and observers to a layer
@@ -73,8 +72,6 @@ def initialize_module_for_quantization(
         if not provided, the layer will be skipped
     :param force_zero_point: whether to force initialization of a zero point for
         symmetric quantization
-    :param scale_dtype: dtype to used for the scales, if overriding the
-        weight dtype as the scale dtype
     """
     # TODO: don't initialize parameters when running decompression
     scheme = scheme or getattr(module, "quantization_scheme", None)
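
The same caution applies here: scale_dtype is gone from the public signature, so code that passed it explicitly must drop the keyword; the dtype is now derived inside _initialize_scale_zero_point (see the last hunk of this file). A minimal usage sketch under that assumption, where module is a placeholder layer that already carries a quantization_scheme:

    from compressed_tensors.quantization.lifecycle.initialize import (
        initialize_module_for_quantization,
    )

    # scale_dtype=... is no longer accepted; only the remaining keywords are valid.
    initialize_module_for_quantization(module, force_zero_point=True)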
@@ -93,7 +90,6 @@ def initialize_module_for_quantization(
             "input",
             scheme.input_activations,
             force_zero_point=force_zero_point,
-            scale_dtype=scale_dtype,
         )
 
     if scheme.weights is not None:
@@ -107,7 +103,6 @@ def initialize_module_for_quantization(
             scheme.weights,
             weight_shape=weight_shape,
             force_zero_point=force_zero_point,
-            scale_dtype=scale_dtype,
         )
     else:
         _LOGGER.warning(
@@ -119,7 +114,7 @@ def initialize_module_for_quantization(
     if scheme.output_activations is not None:
         if not is_kv_cache_quant_scheme(scheme):
             _initialize_scale_zero_point(
-                module, "output", scheme.output_activations, scale_dtype=scale_dtype
+                module, "output", scheme.output_activations
             )
 
     module.quantization_scheme = scheme
@@ -145,7 +140,6 @@ def _initialize_scale_zero_point(
     quantization_args: QuantizationArgs,
     weight_shape: Optional[torch.Size] = None,
     force_zero_point: bool = True,
-    scale_dtype: Optional[torch.dtype] = None,
 ):
     if quantization_args.dynamic is True:
         return
@@ -213,7 +207,7 @@ def _initialize_scale_zero_point(
     expected_shape = 1
 
     # 3. Identify quantization scale and zp dtype
-    scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
+    scale_dtype = module.weight.dtype
 
     if is_fp4(quantization_args=quantization_args):
         scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype
@@ -226,7 +220,7 @@ def _initialize_scale_zero_point(
         torch.float32,
         torch.float64,
     ]:
-        scale_dtype = torch.float16
+        scale_dtype = torch.bfloat16
     zp_dtype = quantization_args.pytorch_dtype()
 
     # 4. Initializes empty scale, zero point, and g_idx parameters for the module
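
This is the change that makes the removed scale_dtype plumbing in apply.py unnecessary: scales now always start from the weight dtype, and weights stored in a non-float dtype (for example already-compressed integer weights encountered during decompression) fall back to bfloat16 rather than float16. A standalone sketch of that selection logic, with the FP4/FP8 branch omitted since FP8_E4M3_DATA lives elsewhere in the package:

    import torch

    def pick_scale_dtype(weight_dtype: torch.dtype) -> torch.dtype:
        # Scales now always derive from the weight dtype ...
        scale_dtype = weight_dtype
        # ... unless the weight is not a standard float dtype, in which case the
        # fallback is now bfloat16 (previously float16).
        if scale_dtype not in (torch.float16, torch.bfloat16, torch.float32, torch.float64):
            scale_dtype = torch.bfloat16
        return scale_dtype

    assert pick_scale_dtype(torch.float32) is torch.float32
    assert pick_scale_dtype(torch.int8) is torch.bfloat16  # compressed int weights -> bf16 scales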
