
Commit dd87f23

merge main
Signed-off-by: Brian Dellabetta <[email protected]>
2 parents 14a359f + d2daa9a commit dd87f23

12 files changed: +110 additions, -249 deletions


src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 18 additions & 120 deletions
@@ -49,7 +49,6 @@
     get_offloaded_device,
     get_safetensors_folder,
     has_offloaded_params,
-    merge_names,
     patch_attr,
     register_offload_parameter,
     update_parameter_data,
@@ -226,7 +225,8 @@ def parse_sparsity_config(
             s_config = compression_config.sparsity_config
             return s_config.model_dump() if s_config is not None else None
 
-        return compression_config.get(SPARSITY_CONFIG_NAME, None)
+        # explicitly return None if {} in config
+        return compression_config.get(SPARSITY_CONFIG_NAME, None) or None
 
     @staticmethod
     def parse_quantization_config(
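For context, a minimal standalone sketch (illustrative only, not part of the commit) of why the trailing `or None` matters: an empty dict is not None, so callers checking `is not None` would treat a serialized `"sparsity_config": {}` entry as a real sparsity config; `or None` collapses the falsy empty dict to an explicit None.

    # Illustrative only; SPARSITY_CONFIG_NAME mirrors the constant used in the diff.
    SPARSITY_CONFIG_NAME = "sparsity_config"

    compression_config = {SPARSITY_CONFIG_NAME: {}}

    without_or = compression_config.get(SPARSITY_CONFIG_NAME, None)       # -> {}
    with_or = compression_config.get(SPARSITY_CONFIG_NAME, None) or None  # -> None
    print(without_or, with_or)  # {} None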
@@ -316,117 +316,11 @@ def __init__(
 
         self.quantization_compressor = {}
         for format in self.compression_formats:
-            self.quantization_compressor[
-                format
-            ] = BaseCompressor.load_from_registry(
-                format, config=quantization_config
-            )
-
-    # ----- used by hf quantizer ----- #
-
-    def get_missing_module_keys(self, model: Module) -> List[str]:
-        """
-        Identifies the expected missing weight keys in the compressed state_dict.
-
-        When a model undergoes sparsity or quantization compression, certain
-        weight tensors may be absent from the checkpoint by virtue of compression.
-        This function determines which weight keys are missing based on the
-        applied compression techniques.
-
-        :param model: The PyTorch model to check for missing keys.
-        :return: A list of missing keys expected in the compressed state_dict.
-        """
-        missing_keys = set()
-
-        # Determine missing keys due to sparsity compression
-        if (
-            self.sparsity_compressor
-            and self.sparsity_config.format != CompressionFormat.dense.value
-        ):
-            sparse_targets = match_named_modules(
-                model=model,
-                targets=self.sparsity_config.targets,
-                ignore=self.sparsity_config.ignore,
-            )
-
-            missing_keys.update(
-                merge_names(target_name, "weight")
-                for target_name, _module in sparse_targets
-            )
-
-        # Determine missing keys due to pack quantization
-        if (
-            self.quantization_compressor
-            and self.quantization_config.format
-            == CompressionFormat.pack_quantized.value
-        ):
-            for scheme in self.quantization_config.config_groups.values():
-                quant_targets = match_named_modules(
-                    model=model,
-                    targets=scheme.targets,
-                    ignore=self.quantization_config.ignore,
-                )
-                missing_keys.update(
-                    merge_names(target_name, "weight")
-                    for target_name, _module in quant_targets
-                )
-
-        return list(missing_keys)
-
-    def get_unexpected_file_keys(self, model: Module) -> List[str]:
-        """
-        Identifies extra keys introduced by the compression process in the
-        compressed state_dict that are not expected by the model graph.
-
-        During sparsity or quantization compression, additional metadata or
-        auxiliary parameters may be stored in the checkpoint, which do not
-        correspond to any parameter in the original model. These keys are
-        typically introduced to support the reconstruction of compressed weights.
-
-        For example, Sparse24Bitmask compression may introduce keys such as
-        'compressed', 'bitmask', and 'shape' in the checkpoint, which are
-        not part of the original model parameters.
-
-        :param model: The PyTorch model to check for unexpected keys.
-        :return: A list of extra keys introduced by the compression process
-            that are not expected by the model.
-        """
-
-        unexpected_keys = set()
-
-        # Identify unexpected keys from sparsity compression
-        if (
-            self.sparsity_compressor
-            and self.sparsity_config.format != CompressionFormat.dense.value
-        ):
-            sparse_targets = match_named_modules(
-                model=model,
-                targets=self.sparsity_config.targets,
-                ignore=self.sparsity_config.ignore,
-            )
-            unexpected_keys.update(
-                merge_names(target_name, param)
-                for target_name, _module in sparse_targets
-                for param in self.sparsity_compressor.compression_param_names
-            )
-
-        # Identify unexpected keys from quantization compression
-        if self.quantization_compressor:
-            for scheme in self.quantization_config.config_groups.values():
-                quant_targets = match_named_modules(
-                    model=model,
-                    targets=scheme.targets,
-                    ignore=self.quantization_config.ignore,
-                )
-                for quant_compressor in self.quantization_compressor.values():
-                    unexpected_keys.update(
-                        merge_names(target_name, param)
-                        for target_name, _module in quant_targets
-                        for param in quant_compressor.compression_param_names
-                        if param != "weight"
+            self.quantization_compressor[format] = (
+                BaseCompressor.load_from_registry(
+                    format, config=quantization_config
                 )
-
-        return list(unexpected_keys)
+            )
 
     # ----- model memory compression/decompression pathways ----- #
 
@@ -716,17 +610,16 @@ def decompress(self, model_path: str, model: Module):
             # Load activation scales/zp or any other quantization parameters
             # Conditionally load the weight quantization parameters if we have a
             # dense compressor or if a sparsity compressor has already been applied
+            load_weight_qparams = sparse_decompressed or isinstance(
+                quant_compressor, DenseCompressor
+            )
             load_pretrained_quantization_parameters(
                 model,
                 model_path,
                 # TODO: all weight quantization params will be moved to the
                 # compressor in a follow-up including initialization
-                load_weight_quantization=(
-                    sparse_decompressed
-                    or isinstance(quant_compressor, DenseCompressor)
-                ),
+                load_weight_qparams=load_weight_qparams,
             )
-
             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
             )

@@ -736,7 +629,9 @@ def decompress(self, model_path: str, model: Module):
             )
             # TODO: all weight quantization params will be moved to the compressor
             # to prevent duplicate parameter updates in update_parameter_data
-            self._replace_weights(dense_gen, model)
+            self._replace_weights(
+                dense_gen, model, load_weight_qparams=not load_weight_qparams
+            )
 
             def freeze_quantization_status(module):
                 module.quantization_status = QuantizationStatus.FROZEN
@@ -823,7 +718,9 @@ def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
             param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
             register_offload_parameter(module, param_name, param)
 
-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_weights(
+        self, dense_weight_generator, model: Module, load_weight_qparams: bool = True
+    ):
         """
         Replace the weights of the model with the
         provided dense weights.

@@ -851,6 +748,7 @@ def _replace_weights(self, dense_weight_generator, model: Module):
                 # decompression in init to be consistent with loading which happens
                 # later as well however, update_data does a good shape check -
                 # should be moved to the compressor
+
                 if param_name == "weight":
                     delattr(module, param_name)
                     requires_grad = param_data.dtype in (

@@ -862,7 +760,7 @@ def _replace_weights(self, dense_weight_generator, model: Module):
                         param_data.to(device), requires_grad=requires_grad
                     )
                     register_offload_parameter(module, param_name, param)
-                else:
+                elif load_weight_qparams:
                     # Should already be registered to the correct device for
                     # for scales/zero-points
                     update_parameter_data(module, param_data, param_name)
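Taken together, the decompress changes route weight scales and zero-points through exactly one of two sources: the checkpoint (when a dense compressor is used or sparsity decompression already ran) or the decompressed weight generator. A standalone sketch of that decision, using placeholder booleans instead of the real compressor objects:

    # Illustrative helper, not library code: which source provides weight qparams?
    def weight_qparam_source(sparse_decompressed: bool, is_dense_compressor: bool) -> str:
        load_weight_qparams = sparse_decompressed or is_dense_compressor
        # load_pretrained_quantization_parameters(..., load_weight_qparams=load_weight_qparams)
        # self._replace_weights(..., load_weight_qparams=not load_weight_qparams)
        return "checkpoint" if load_weight_qparams else "decompressed weight generator"

    print(weight_qparam_source(sparse_decompressed=False, is_dense_compressor=False))
    # -> decompressed weight generator
    print(weight_qparam_source(sparse_decompressed=True, is_dense_compressor=False))
    # -> checkpoint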

src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py

Lines changed: 5 additions & 4 deletions
@@ -140,6 +140,11 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
     m, n = x.shape
     device = x.device
 
+    if n % 2 != 0:
+        raise ValueError(
+            "tensor must have an even number of columns for nvfp4 compression"
+        )
+
     # Create lookup table for FP4 values to indices
     # Map the absolute values to 0-7 indices
     kE2M1 = torch.tensor(FLOAT_TO_E2M1, device=device, dtype=x.dtype)
@@ -155,10 +160,6 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
     # Reshape to prepare for packing pairs of values
     indices = indices.reshape(-1)
 
-    # Handle odd length by padding if necessary
-    if indices.numel() % 2 != 0:
-        indices = torch.cat([indices, torch.zeros(1, dtype=torch.long, device=device)])
-
     # Reshape to pair consecutive elements
     indices = indices.reshape(-1, 2)
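Each packed byte stores two 4-bit E2M1 codes, so an odd number of columns now fails fast instead of being silently padded. A standalone toy packer (not the library routine; the nibble order here is an assumption) shows the mechanics:

    import torch

    def pack_pairs(codes: torch.Tensor) -> torch.Tensor:
        # codes: integer tensor of 4-bit values (0..15); the last dim must be even
        if codes.shape[-1] % 2 != 0:
            raise ValueError("even number of columns required: each byte holds two 4-bit codes")
        pairs = codes.reshape(-1, 2).to(torch.uint8)
        # second code goes in the high nibble, first code in the low nibble
        return (pairs[:, 1] << 4) | pairs[:, 0]

    print(pack_pairs(torch.tensor([[1, 7, 3, 12]])))  # tensor([113, 195], dtype=torch.uint8)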

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 4 additions & 4 deletions
@@ -61,19 +61,19 @@
 def load_pretrained_quantization_parameters(
     model: Module,
     model_name_or_path: Optional[str] = None,
-    load_weight_quantization: Optional[bool] = False,
+    load_weight_qparams: Optional[bool] = False,
 ):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
     a model that has already been initialized with a quantization config.
 
     NOTE: Will always load inputs/output parameters. Will conditioanlly load weight
-    parameters, if load_weight_quantization is set to True.
+    parameters, if load_weight_qparams is set to True.
 
     :param model: model to load pretrained quantization parameters to
     :param model_name_or_path: Hugging Face stub or local folder containing a quantized
         model, which is used to load quantization parameters
-    :param load_weight_quantization: whether or not the weight quantization parameters
+    :param load_weight_qparams: whether or not the weight quantization parameters
         should be loaded
     """
     model_path = get_safetensors_folder(model_name_or_path)
@@ -99,7 +99,7 @@ def load_pretrained_quantization_parameters(
                 mapping=mapping,
             )
 
-            if load_weight_quantization and submodule.quantization_scheme.weights:
+            if load_weight_qparams and submodule.quantization_scheme.weights:
                 base_name = "weight"
                 _load_quant_args_from_mapping(
                     base_name=base_name,

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 3 additions & 9 deletions
@@ -59,7 +59,6 @@ def initialize_module_for_quantization(
     module: Module,
     scheme: Optional[QuantizationScheme] = None,
     force_zero_point: bool = True,
-    scale_dtype: Optional[torch.dtype] = None,
 ):
     """
     attaches appropriate scales, zero points, and observers to a layer

@@ -73,8 +72,6 @@ def initialize_module_for_quantization(
         if not provided, the layer will be skipped
     :param force_zero_point: whether to force initialization of a zero point for
         symmetric quantization
-    :param scale_dtype: dtype to used for the scales, if overriding the
-        weight dtype as the scale dtype
     """
     # TODO: don't initialize parameters when running decompression
     scheme = scheme or getattr(module, "quantization_scheme", None)

@@ -93,7 +90,6 @@ def initialize_module_for_quantization(
            "input",
            scheme.input_activations,
            force_zero_point=force_zero_point,
-           scale_dtype=scale_dtype,
        )
 
    if scheme.weights is not None:

@@ -107,7 +103,6 @@ def initialize_module_for_quantization(
                scheme.weights,
                weight_shape=weight_shape,
                force_zero_point=force_zero_point,
-               scale_dtype=scale_dtype,
            )
        else:
            _LOGGER.warning(

@@ -119,7 +114,7 @@ def initialize_module_for_quantization(
    if scheme.output_activations is not None:
        if not is_kv_cache_quant_scheme(scheme):
            _initialize_scale_zero_point(
-               module, "output", scheme.output_activations, scale_dtype=scale_dtype
+               module, "output", scheme.output_activations
            )
 
    module.quantization_scheme = scheme

@@ -145,7 +140,6 @@ def _initialize_scale_zero_point(
    quantization_args: QuantizationArgs,
    weight_shape: Optional[torch.Size] = None,
    force_zero_point: bool = True,
-   scale_dtype: Optional[torch.dtype] = None,
 ):
    if quantization_args.dynamic is True:
        return

@@ -213,7 +207,7 @@ def _initialize_scale_zero_point(
    expected_shape = 1
 
    # 3. Identify quantization scale and zp dtype
-   scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
+   scale_dtype = module.weight.dtype
 
    if is_fp4(quantization_args=quantization_args):
        scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype

@@ -226,7 +220,7 @@ def _initialize_scale_zero_point(
        torch.float32,
        torch.float64,
    ]:
-       scale_dtype = torch.float16
+       scale_dtype = torch.bfloat16
        zp_dtype = quantization_args.pytorch_dtype()
 
    # 4. Initializes empty scale, zero point, and g_idx parameters for the module
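With the scale_dtype override removed, the scale dtype is derived from the weight dtype, with FP4 schemes pinned to FP8-E4M3 scales and otherwise-unsupported weight dtypes promoted to bfloat16 rather than float16. A simplified standalone sketch of that selection (the dtype allow-list is only partially visible in the hunk above, so the tuple below is an assumption):

    import torch

    def select_scale_dtype(weight_dtype: torch.dtype, is_fp4: bool) -> torch.dtype:
        if is_fp4:
            return torch.float8_e4m3fn  # FP4 schemes keep their scales in FP8-E4M3
        if weight_dtype not in (torch.float16, torch.bfloat16, torch.float32, torch.float64):
            return torch.bfloat16  # promoted fallback dtype (previously float16)
        return weight_dtype

    print(select_scale_dtype(torch.bfloat16, is_fp4=False))  # torch.bfloat16
    print(select_scale_dtype(torch.int8, is_fp4=False))      # torch.bfloat16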

src/compressed_tensors/quantization/quant_scheme.py

Lines changed: 20 additions & 0 deletions
@@ -60,6 +60,26 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
         format = model.format
 
         if inputs is not None:
+            if inputs.strategy not in (
+                QuantizationStrategy.TOKEN,
+                QuantizationStrategy.TENSOR,
+                QuantizationStrategy.GROUP,
+                QuantizationStrategy.TENSOR_GROUP,
+            ):
+                if (
+                    inputs.strategy == QuantizationStrategy.GROUP
+                    and inputs.dynamic is True
+                ):
+                    raise NotImplementedError(
+                        "Static and local group-wise activation "
+                        "quantization is not supported"
+                    )
+
+                raise NotImplementedError(
+                    f"Using {inputs.strategy} strategy is not supported for "
+                    "activation quantization"
+                )
+
             if inputs.actorder is not None:
                 raise ValueError("Cannot apply actorder to input activations")
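The new check restricts input-activation quantization to the token, tensor, group, and tensor-group strategies and raises NotImplementedError for anything else. A standalone sketch of the accept/reject rule, using a stand-in enum rather than the library's QuantizationStrategy:

    from enum import Enum

    class Strategy(str, Enum):
        TENSOR = "tensor"
        TOKEN = "token"
        GROUP = "group"
        TENSOR_GROUP = "tensor_group"
        CHANNEL = "channel"
        BLOCK = "block"

    SUPPORTED_ACTIVATION_STRATEGIES = {
        Strategy.TOKEN, Strategy.TENSOR, Strategy.GROUP, Strategy.TENSOR_GROUP
    }

    def validate_activation_strategy(strategy: Strategy) -> None:
        if strategy not in SUPPORTED_ACTIVATION_STRATEGIES:
            raise NotImplementedError(
                f"Using {strategy} strategy is not supported for activation quantization"
            )

    validate_activation_strategy(Strategy.TOKEN)    # passes
    # validate_activation_strategy(Strategy.BLOCK)  # would raise NotImplementedError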

src/compressed_tensors/transform/factory/base.py

Lines changed: 11 additions & 4 deletions
@@ -18,6 +18,7 @@
 
 import torch
 import torch.nn.utils.parametrize as P
+import tqdm
 from compressed_tensors.registry.registry import RegistryMixin, T
 from compressed_tensors.transform import (
     TransformArgs,
@@ -84,15 +85,21 @@ def create_transform(self, module: Module, args: TransformArgs) -> "TransformBas
         """
         raise NotImplementedError()
 
-    def apply_to_model(self, model: Module):
+    def apply_to_model(self, model: Module, use_tqdm=True):
         """
         Create transforms and apply them to the model
 
         :param model: module to apply transforms to
         """
-        for arg in self.scheme.apply:
-            for _, module in match_named_modules(model, arg.targets, arg.ignore):
-                self._apply_to_module(module, arg)
+        modules_args = [
+            (module, arg)
+            for arg in self.scheme.apply
+            for _, module in match_named_modules(model, arg.targets, arg.ignore)
+        ]
+
+        desc = f"Applying {self.name} transforms"
+        for module, arg in tqdm.tqdm(modules_args, desc=desc, disable=(not use_tqdm)):
+            self._apply_to_module(module, arg)
 
         self._update_tied_weights()
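Materializing the (module, arg) pairs into a list before iterating lets tqdm report a total and percentage instead of an open-ended count, and the use_tqdm flag silences the bar entirely. A self-contained sketch of the same pattern with generic work items in place of modules and transform args:

    import tqdm

    def apply_all(items, apply_fn, desc="Applying transforms", use_progress_bar=True):
        work = list(items)  # collect first so the progress bar knows the total up front
        for item in tqdm.tqdm(work, desc=desc, disable=(not use_progress_bar)):
            apply_fn(item)

    apply_all(range(5), lambda i: None, desc="Applying example transforms")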
