
Commit 5e5ffb5

Merge branch 'main' into bdellabe/scoped-quant-status
2 parents 5776c86 + 0e5df88 commit 5e5ffb5

File tree

18 files changed: +385 −117 lines


setup.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def _setup_packages() -> List:
     )

 def _setup_install_requires() -> List:
-    return ["torch>=1.7.0", "transformers", "pydantic>=2.0", "frozendict", "loguru"]
+    return ["torch>=1.7.0", "transformers", "pydantic>=2.0", "loguru"]

 def _setup_extras() -> Dict:
     return {

src/compressed_tensors/config/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -15,5 +15,6 @@
 # flake8: noqa
 from .base import *
 from .dense import *
+from .format import *
 from .sparse_24_bitmask import *
 from .sparse_bitmask import *
Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional
+
+import torch
+from compressed_tensors.config import CompressionFormat, SparsityStructure
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationStrategy,
+    QuantizationType,
+)
+from compressed_tensors.quantization.utils import is_module_quantized
+from loguru import logger
+
+
+__all__ = ["infer_and_set_per_module_quantization_format"]
+
+
+def _get_quant_compression_format(
+    input_args: Optional[QuantizationArgs],
+    weight_args: Optional[QuantizationArgs],
+    sparsity_structure: Optional[str] = None,
+) -> CompressionFormat:
+    """
+    Using the weight and input quantization args as well as an optional
+    sparsity structure, determine the compression format that should be
+    applied to a given module
+
+    :param input_args: input quantization parameters
+    :param weight_args: weight quantization parameters
+    :param sparsity_structure: optional (global) model sparsity
+        structure
+    :return: CompressionFormat for the module
+    """
+    is_24_structure = (
+        SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
+    )
+    is_weight_only = weight_args is not None and input_args is None
+
+    if weight_args.num_bits == 4 and weight_args.type == QuantizationType.FLOAT.value:
+        return CompressionFormat.nvfp4_pack_quantized
+
+    if is_weight_only:  # w4a16 and w8a16
+        is_valid_pack = (
+            weight_args.num_bits in [4, 8]
+            and weight_args.type == QuantizationType.INT.value
+        )
+        if not is_valid_pack:  # packing only valid for int4 and int8
+            return CompressionFormat.naive_quantized
+
+        if is_24_structure and weight_args.strategy in (
+            QuantizationStrategy.CHANNEL.value,
+            QuantizationStrategy.GROUP.value,
+        ):
+            # marlin24 kernel only applicable for channel/group quantization
+            # Note: vLLM may only support group quant for marlin24
+            return CompressionFormat.marlin_24
+        return CompressionFormat.pack_quantized
+
+    else:  # w8a8 float and int
+        if (
+            weight_args.type == QuantizationType.FLOAT.value
+            and weight_args.num_bits == 8
+        ):
+            return CompressionFormat.float_quantized
+        if weight_args.type == QuantizationType.INT.value:
+            return CompressionFormat.int_quantized
+
+        return CompressionFormat.naive_quantized
+
+
+def set_per_module_format(
+    module: torch.nn.Module, sparsity_structure: Optional[str] = None
+):
+    """
+    Determine and set the per-module quantization format given quantization args
+    and sparsity structure.
+
+    :param module: module which has its format inferred
+    :param sparsity_structure: optional sparsity applied to the module
+    """
+    weight_scheme = module.quantization_scheme.weights
+    input_scheme = module.quantization_scheme.input_activations
+    if weight_scheme is None:
+        return  # no weight quant - nothing to compress
+    compression_format = _get_quant_compression_format(
+        input_scheme, weight_scheme, sparsity_structure
+    )
+
+    # If a format is already set, check whether it matches the inferred one
+    if module.quantization_scheme.format is not None:
+        # If it does not, warn the user
+        if module.quantization_scheme.format != compression_format.value:
+            logger.warning(
+                "The provided format for the module does not match the "
+                "inferred format. Compression may fail "
+            )
+    else:
+        # If not set, use the inferred format
+        module.quantization_scheme.format = compression_format.value
+
+
+def infer_and_set_per_module_quantization_format(
+    model: torch.nn.Module,
+    sparsity_structure: Optional[str] = None,
+) -> List[str]:
+    """
+    Infers the quantization format for a model based on its state and provided
+    compression arguments. Updates the quantization_scheme.format value
+    based on the inferred format. Returns the unique list of formats in the model,
+    or the dense format if no quantized modules are found.
+
+    For a summary of the formats, see `docs/guides/compression_formats.md`.
+
+    :param model: model to check for quantization
+    :param sparsity_structure: optional sparsity applied to the model
+    :return: compression formats appropriate for the model
+    """
+    unique_formats = []
+    for submodule in model.modules():
+        if is_module_quantized(submodule):
+            assert hasattr(submodule, "quantization_scheme")
+            set_per_module_format(submodule, sparsity_structure)
+            if submodule.quantization_scheme.format not in unique_formats:
+                unique_formats.append(submodule.quantization_scheme.format)
+
+    if len(unique_formats) > 0:
+        return unique_formats
+    return [CompressionFormat.dense.value]
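
A rough usage sketch of the format-inference helper added above. The QuantizationArgs keyword arguments are assumptions based on the fields referenced in this diff, not verified against the full class definition:

from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)

# weight-only int4 group quantization (w4a16): no input activation quantization.
# Field names (num_bits, type, strategy, group_size) are assumed from the checks above.
w4a16 = QuantizationArgs(
    num_bits=4,
    type=QuantizationType.INT,
    strategy=QuantizationStrategy.GROUP,
    group_size=128,
)

# _get_quant_compression_format is the private helper defined in this new file
fmt = _get_quant_compression_format(input_args=None, weight_args=w4a16)
# with no sparsity structure this resolves to pack_quantized; passing
# sparsity_structure="2:4" with channel/group strategy would resolve to marlin_24
print(fmt)  # CompressionFormat.pack_quantized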

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 3 additions & 3 deletions
@@ -29,7 +29,6 @@
     calculate_range,
     compute_dynamic_scales_and_zp,
 )
-from compressed_tensors.utils import safe_permute
 from torch.nn import Module


@@ -294,7 +293,7 @@ def _process_quantization(
            group_sizes = group_sizes[torch.argsort(group_indices)]

            perm = torch.argsort(g_idx)
-            x = safe_permute(x, perm, dim=1)
+            x = x.index_select(-1, perm)

        # Maintain all dimensions except the last dim, which is divided by group_size
        reshaped_dims = (
@@ -328,7 +327,8 @@ def _process_quantization(
        output = output.to(output_dtype)

        if not is_column_order:
-            output = safe_permute(output, torch.argsort(perm), dim=1)
+            inv_perm = torch.argsort(perm)
+            output = output.index_select(-1, inv_perm)

    else:  # covers channel, token and tensor strategies
        if do_quantize:
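
The safe_permute helper is dropped in favor of plain index_select along the last dimension; a small self-contained check of the permute/unpermute round trip used here:

import torch

x = torch.arange(12.0).reshape(3, 4)
perm = torch.randperm(4)

permuted = x.index_select(-1, perm)                        # reorder last-dim entries by perm
restored = permuted.index_select(-1, torch.argsort(perm))  # argsort(perm) is the inverse permutation

assert torch.equal(restored, x)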

src/compressed_tensors/transform/apply.py

Lines changed: 35 additions & 0 deletions
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import Dict
+
 import torch
+from accelerate.utils import has_offloaded_params
 from compressed_tensors import TRANSFORM_CONFIG_NAME
 from compressed_tensors.transform import TransformConfig, TransformFactory

@@ -34,3 +37,35 @@ def apply_transform_config(model: torch.nn.Module, config: TransformConfig):

     # attach config to model for compression/serialization
     setattr(model, TRANSFORM_CONFIG_NAME, config)
+
+    # ensure that tied weight transforms can be serialized without aliases
+    # In the future, this could be done by transformers or model compressor,
+    # which would make this more robust to changing dispatches after transforms
+    _tie_offloaded_tensors(model)
+
+
+def _tie_offloaded_tensors(model: torch.nn.Module):
+    """
+    When accelerate replaces tensors with meta tensors during offloading, the meta
+    tensors may not be identical, even if the offloaded values are identical.
+
+    However, transformers can only serialize correctly if meta tensors are identical
+    (see transformers#39263).
+
+    This function collects all meta tensors which have shared offloaded values and sets
+    those tensors to be identical so that they can be removed during serialization
+
+    :param model: model potentially containing offloaded meta tensors to fix
+    """
+
+    # ensure that if modules share an offloaded tensor pointer, the
+    # meta tensor is also identical (assigned to the first instance of the parameter)
+    ptr_to_meta: Dict[int, torch.nn.Parameter] = dict()
+    for module in model.modules():
+        if has_offloaded_params(module):
+            for key, _ in module.named_parameters(recurse=False):
+                offloaded_ptr = module._hf_hook.weights_map[key].data_ptr()
+
+                if offloaded_ptr not in ptr_to_meta:
+                    ptr_to_meta[offloaded_ptr] = getattr(module, key)
+                setattr(module, key, ptr_to_meta[offloaded_ptr])
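
A toy illustration (hypothetical modules, not this repo's API) of the invariant that _tie_offloaded_tensors restores: tied parameters must be literally the same object, so serialization can detect and drop the aliases by storage pointer:

import torch

shared = torch.nn.Parameter(torch.zeros(8, 8))
a = torch.nn.Linear(8, 8, bias=False)
b = torch.nn.Linear(8, 8, bias=False)
a.weight = shared
b.weight = shared  # tie: both modules now reference the same Parameter

# identical objects imply identical storage pointers, which is what
# tied-weight detection relies on when saving
assert a.weight is b.weight
assert a.weight.data_ptr() == b.weight.data_ptr()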

src/compressed_tensors/transform/factory/base.py

Lines changed: 2 additions & 36 deletions
@@ -13,8 +13,7 @@
 # limitations under the License.

 from abc import ABC, abstractmethod
-from collections import defaultdict
-from typing import List, Optional, Set, Tuple
+from typing import List, Optional

 import torch
 import torch.nn.utils.parametrize as P
@@ -57,7 +56,6 @@ def __init__(self, name: str, scheme: TransformScheme, seed: Optional[int] = Non
         self.name = name
         self.scheme = scheme
         self.generator = torch.Generator()
-        self.transforms = list()
         if seed is not None:
             self.generator.manual_seed(seed)

@@ -101,8 +99,6 @@ def apply_to_model(self, model: Module, use_tqdm=True):
         for module, arg in tqdm.tqdm(modules_args, desc=desc, disable=(not use_tqdm)):
             self._apply_to_module(module, arg)

-        self._update_tied_weights()
-
     def _apply_to_module(self, module: Module, args: TransformArgs):
         """
         Create transforms and apply them to the module
@@ -120,7 +116,6 @@ def _apply_to_module(self, module: Module, args: TransformArgs):
         # create transform as submodule
         transform_name = f"{self.name}_{args.location}"
         transform = self.create_transform(module, args)
-        self.transforms.append(transform)
         register_offload_module(module, transform_name, transform)

         # register input transformation hook
@@ -165,31 +160,6 @@ def output_hook(_, _input, output):
         else:
             raise NotImplementedError()

-    def _update_tied_weights(self):
-        """
-        Populate the `_dynamic_tied_weights_keys` attribute of transforms,
-        which is used by transformers to detect and remove shared pointers
-        during saving
-        """
-        # map from data_ptrs to keys
-        ptr_to_keys: dict[int, List[Tuple[TransformBase, str]]] = defaultdict(list)
-        for transform in self.transforms:
-            for name, param in transform.named_parameters(recurse=False):
-                # NOTE: previously asserted that parent._hf_hook.place_submodules=False
-                if has_offloaded_params(transform):
-                    param = transform._hf_hook.weights_map[name]
-                ptr_to_keys[param.data_ptr()].append((transform, name))
-
-        # populate `_dynamic_tied_weights_keys` if there is more than one key
-        # and ensure that they share tensors
-        for shared_keys in ptr_to_keys.values():
-            if len(shared_keys) > 1:
-                tensor = getattr(shared_keys[0][0], shared_keys[0][1])
-
-                for transform, name in shared_keys:
-                    transform._dynamic_tied_weights_keys.add(name)
-                    setattr(transform, name, tensor)
-

 class TransformBase(InternalModule, ABC):
     """
@@ -198,11 +168,7 @@ class TransformBase(InternalModule, ABC):

     args: TransformArgs
     weight: Parameter
-    _dynamic_tied_weights_keys: Set[str]
-
-    def __init__(self):
-        super().__init__()
-        self._dynamic_tied_weights_keys = set()
+    _dynamic_tied_weights_keys: List[str] = ["weight"]

     @abstractmethod
     def forward(self, value: Tensor) -> Tensor:

src/compressed_tensors/transform/factory/hadamard.py

Lines changed: 4 additions & 2 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

 import torch
 from compressed_tensors.transform import TransformArgs, TransformScheme
@@ -52,7 +52,7 @@ def create_transform(self, module: Module, args: TransformArgs):
         :param args: defines how the transform will be applied to the module
         """
         assert hasattr(module, "weight")
-        size = get_transform_size(module, args.location, self.scheme.head_dim)
+        size = get_transform_size(module, args.location, self.scheme.block_size)
         exec_device = get_execution_device(module)
         device = get_offloaded_device(module)
         precision = self.scheme.precision if args.is_online() else torch.float64
@@ -84,6 +84,8 @@ def _create_permutation(self, weight: Parameter) -> Parameter:


 class HadamardTransform(TransformBase):
+    _dynamic_tied_weights_keys: List[str] = ["weight", "perm"]
+
     def __init__(
         self,
         weight: Parameter,
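
For intuition on the block_size parameter these factories now read (per the TransformScheme docstring in this commit, the transform becomes block diagonal with square blocks of that size), a conceptual sketch rather than this factory's actual code path:

import torch

# a normalized 2x2 Hadamard block; block_size=2 over a size-4 dimension
# corresponds to a block-diagonal transform built from two such blocks
block = torch.tensor([[1.0, 1.0], [1.0, -1.0]]) / (2 ** 0.5)
full = torch.block_diag(block, block)

# the block-diagonal matrix stays orthogonal, so the rotation remains invertible
assert torch.allclose(full @ full.T, torch.eye(4))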

src/compressed_tensors/transform/factory/matrix_multiply.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def create_transform(self, module: Module, args: TransformArgs):
         :param args: defines how the transform will be applied to the module
         """
         assert hasattr(module, "weight")
-        size = get_transform_size(module, args.location, self.scheme.head_dim)
+        size = get_transform_size(module, args.location, self.scheme.block_size)
         device = get_offloaded_device(module)
         precision = self.scheme.precision if args.is_online() else torch.float64

src/compressed_tensors/transform/transform_scheme.py

Lines changed: 18 additions & 2 deletions
@@ -17,7 +17,7 @@
 import torch
 from compressed_tensors.transform import TransformArgs
 from compressed_tensors.utils import TorchDtype
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, model_validator


 __all__ = ["TransformScheme"]
@@ -36,6 +36,8 @@ class TransformScheme(BaseModel):
     :param randomize: True if uniquely randomized transform weights should be used,
         otherwise use identical transform weights where applicable
     :param requires_grad: True if weights include gradients for training
+    :param block_size: If set, the transform matrix will be block diagonal, with each
+        block being a square matrix of this size.
     :param precision: Precision at which this transform should be applied during online
         rotations. Fused (offline) rotations are always performed in float64
     """
@@ -44,7 +46,21 @@ class TransformScheme(BaseModel):
     apply: List[TransformArgs] = Field(default_factory=list)
     randomize: bool = Field(default=False)
     requires_grad: bool = Field(default=False)
-    head_dim: Optional[int] = Field(default=None)
+    block_size: Optional[int] = Field(default=None)
+    head_dim: Optional[int] = Field(
+        default=None, deprecated="head_dim is deprecated, use block_size instead"
+    )
     precision: TorchDtype = Field(default=torch.float32)

+    @model_validator(mode="after")
+    def validate_model_after(model: "TransformScheme") -> "TransformScheme":
+        """
+        If head_dim is used instead of block_size, set block_size to head_dim
+        and remove head_dim
+        """
+        if model.block_size is None and model.head_dim is not None:
+            model.block_size = model.head_dim
+            model.head_dim = None
+        return model
+
     model_config = ConfigDict(extra="forbid")
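
A minimal sketch of the backward-compatibility behavior added by the validator; it assumes TransformScheme's existing required `type` field (e.g. "hadamard"), which is not shown in this hunk:

from compressed_tensors.transform import TransformScheme

# legacy configs that still pass head_dim are remapped to block_size by the
# model_validator above; type="hadamard" is an assumed valid value
scheme = TransformScheme(type="hadamard", head_dim=128)
assert scheme.block_size == 128
assert scheme.head_dim is None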
