
Commit ee40ddf

Merge branch 'main' into kylesayrs/transform-merge
2 parents: 966b50e + 180226b

File tree

4 files changed: +125, -80 lines changed


src/compressed_tensors/compressors/model_compressors/model_compressor.py
Lines changed: 8 additions & 4 deletions

@@ -392,8 +392,8 @@ def compress_model(self, model: Module):
         for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):

             if prefix in module_to_scheme or prefix in sparse_compression_targets:
-                module_device = get_execution_device(module).type
-                is_meta = module_device == "meta"
+                module_device = get_execution_device(module)
+                is_meta = (module_device.type == "meta")

                 exec_device = "meta" if is_meta else "cpu"
                 onloading_device = "meta" if is_meta else module_device
@@ -747,12 +747,16 @@ def _replace_weights(self, dense_weight_generator, model: Module):

 def map_module_to_scheme(model: Module) -> Dict[str, QuantizationScheme]:
     """
-    Returns a dictionary which maps quantized module names to their quantization schemes
+    Returns a dictionary which maps quantized module names to their quantization
+    schemes. Only includes modules with weight quantization
     """
     return {
         fix_fsdp_module_name(name): module.quantization_scheme
         for name, module in model.named_modules()
-        if is_module_quantized(module)
+        if (
+            hasattr(module, "quantization_scheme") and
+            module.quantization_scheme.weights is not None
+        )
     }


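For readers skimming this hunk: the comprehension now keys off the scheme's weights field instead of is_module_quantized, so activation-only schemes are excluded from compression. A minimal sketch of that predicate on toy modules (SimpleNamespace stands in for a real QuantizationScheme; the module names and schemes below are hypothetical, not from the commit):

from types import SimpleNamespace

import torch

# Hypothetical schemes: only the weight-quantized one should survive the filter
weights_only = SimpleNamespace(weights=object(), input_activations=None)
acts_only = SimpleNamespace(weights=None, input_activations=object())

model = torch.nn.ModuleDict({"a": torch.nn.Linear(4, 4), "b": torch.nn.Linear(4, 4)})
model["a"].quantization_scheme = weights_only
model["b"].quantization_scheme = acts_only

selected = {
    name: module.quantization_scheme
    for name, module in model.named_modules()
    if (
        hasattr(module, "quantization_scheme")
        and module.quantization_scheme.weights is not None
    )
}
assert set(selected) == {"a"}  # the activation-only module "b" is skipped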
src/compressed_tensors/transform/factory/hadamard.py
Lines changed: 5 additions & 6 deletions

@@ -15,6 +15,7 @@
 import math
 from typing import Optional

+import math
 import torch
 from compressed_tensors.transform import TransformArgs, TransformScheme
 from compressed_tensors.transform.factory.base import TransformBase, TransformFactory
@@ -26,7 +27,6 @@
 from compressed_tensors.utils import (
     get_execution_device,
     get_offloaded_device,
-    match_modules_set,
 )
 from compressed_tensors.utils.helpers import ParameterizedDefaultDict
 from torch import Tensor, device, dtype
@@ -107,8 +107,7 @@ def forward(self, value: Tensor) -> Tensor:

         if self.args.inverse:
             weight = weight.T
-
-        return (
-            apply_transform_weight(weight, value, self.args.location, self.module_type)
-            / self._scale
-        )
+
+        return apply_transform_weight(
+            weight, value, self.args.location, self.module_type
+        ) / self._scale

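Background for the division by self._scale that the refactored forward keeps (a sketch for context, not the factory's code; the exact value of _scale is not shown in this diff): a ±1 Hadamard matrix H of size n satisfies H @ H.T == n * I, so scaling by sqrt(n) yields an orthonormal transform whose inverse is its transpose, consistent with the inverse path above only transposing the weight.

import math

import torch

H2 = torch.tensor([[1.0, 1.0], [1.0, -1.0]])
H4 = torch.kron(H2, H2)                              # 4x4 Hadamard via the Sylvester construction
n = H4.shape[0]

assert torch.allclose(H4 @ H4.T, n * torch.eye(n))   # rows are mutually orthogonal
Hn = H4 / math.sqrt(n)                               # normalized transform
assert torch.allclose(Hn @ Hn.T, torch.eye(n))       # orthonormal
assert torch.allclose(torch.linalg.inv(Hn), Hn.T)    # inverse equals transpose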
src/compressed_tensors/transform/utils/matrix.py
Lines changed: 62 additions & 62 deletions

@@ -59,47 +59,13 @@ def get_transform_size(


 def apply_transform_weight(
-    weight: torch.Tensor,
+    transform_weight: torch.Tensor,
     value: torch.Tensor,
     location: TransformLocation,
     module_type: type[torch.nn.Module],
 ) -> torch.Tensor:
     """
-    :param weight: transform weight to apply
-    :param value: value to apply weight to
-    :param location: determines how weight should be applied
-    :param model_type: result of type(module), passed in to determine application of
-        weight transform. This is needed because torch uses convention:
-        - torch.nn.Linear(in_features,out_features) has weight shape
-            (out_features, in_features)
-        - torch.nn.Embedding(num_embeddings, embedding_dim) has weight shape
-            (num_embeddings, embedding_dim)
-        The transform has to account for Linear's transposed weights
-    :return: value after weight has been applied
-    """
-    # get function used to apply transform
-    fn, axis = _get_transform_method(module_type, location)
-
-    # reshape for head_dim
-    head_dim = weight.shape[0]
-    num_heads = value.shape[axis] // head_dim
-    value = value.unflatten(axis, (num_heads, head_dim))
-
-    # apply transform
-    value = fn(weight, value)
-
-    # [undo] reshape for head_dim
-    value = value.flatten(axis - 1, axis)
-
-    return value
-
-
-def _get_transform_method(
-    module_type: type[torch.nn.Module],
-    location: TransformLocation,
-) -> Tuple[Callable[[torch.Tensor, torch.Tensor], torch.Tensor], int]:
-    """
-    Using the transform location, determine how to apply the transform weight to the
+    Using the transform location, apply the transform_weight to the
     given value wrt linear weights. For more info on input and output transforms,
     see `TransformLocation`

@@ -129,51 +95,85 @@ def _get_transform_method(
         = y U
         = yh

-    :param weight: transform weight to apply
-    :param value: value to apply weight to
+    :param transform_weight: transform weight to apply
+    :param value: value to apply transform_weight to
     :param location: determines how weight should be applied
-    :return: value after transform weight has been applied
+    :param model_type: result of type(module), passed in to determine application of
+        weight transform
+    :return: value after transform_weight has been applied
     """
-    fn = axis = None
+
+    assert transform_weight.shape[0] == transform_weight.shape[1]

     if module_type == torch.nn.Linear:
         if location == TransformLocation.INPUT:
-            fn = lambda weight, value: value @ weight
-            axis = -1
+            return _multihead_matmul(value, transform_weight)

         elif location == TransformLocation.WEIGHT_INPUT:
-            fn = lambda weight, value: value @ weight.T
-            axis = -1
+            # equivalent to (transform_weight @ value.T).T
+            return _multihead_matmul(value, transform_weight.T)

         elif location == TransformLocation.WEIGHT_OUTPUT:
-            fn = lambda weight, value: weight.T @ value
-            axis = -2
+            # equivalent to (value.T @ transform_weight).T
+            return _multihead_matmul(transform_weight.T, value)

         elif location == TransformLocation.OUTPUT:
-            fn = lambda weight, value: value @ weight
-            axis = -1
+            return _multihead_matmul(value, transform_weight)

     # similar derivation to torch.nn.Linear, but `y = (x W)`
-    if module_type == torch.nn.Embedding:
+    elif module_type == torch.nn.Embedding:
         if location == TransformLocation.INPUT:
-            fn = lambda weight, value: value @ weight
-            axis = -1
+            return _multihead_matmul(value, transform_weight)

         elif location == TransformLocation.WEIGHT_INPUT:
-            fn = lambda weight, value: weight @ value
-            axis = -1
+            return _multihead_matmul(
+                transform_weight,
+                value,
+            )

         elif location == TransformLocation.WEIGHT_OUTPUT:
-            fn = lambda weight, value: value @ weight
-            axis = -1
+            return _multihead_matmul(value, transform_weight)

         elif location == TransformLocation.OUTPUT:
-            fn = lambda weight, value: value @ weight
-            axis = -1
+            return _multihead_matmul(value, transform_weight)

-    if fn is None:
-        raise NotImplementedError(
-            f"Applying transforms to {module_type} {location} is not supported"
-        )
+    raise NotImplementedError(
+        f"Applying transforms to {module_type} {location} is not supported"
+    )

-    return fn, axis
+
+def _multihead_matmul(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+    """
+    Performs A @ B for last two dims of two matrices A and B that possibly
+    have different shapes, as is the case in multi-headed dimension. If
+    shapes are different, this is equivalent to converting the last two dims
+    of the smaller matrix into a block-diagonal matrix with the same shape as
+    the last two dims of the larger matrix.
+
+    E.g. if A is half the size of B, this function will perform
+        [[A ]  @ B
+         [ A]]
+
+    If B is a third of the size of A, this function will perform
+        A @ [[B  ]
+             [ B ]
+             [  B]]
+
+    This function will error out if the shapes are not evenly divisble
+
+    :param A: left-hand tensor
+    :param B: right-hand tensor
+    :return: result
+    """
+    if A.shape[-1] > B.shape[-2]:
+        head_dim = B.shape[-2]
+        num_heads = A.shape[-1] // head_dim
+        A = A.unflatten(-1, (num_heads, head_dim))
+        return (A @ B).flatten(-2, -1)
+    elif A.shape[-1] < B.shape[-2]:
+        head_dim = A.shape[-1]
+        num_heads = B.shape[-2] // head_dim
+        B = B.unflatten(-2, (num_heads, head_dim))
+        return (A @ B).flatten(-3, -2)
+    else:
+        return A @ B

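A quick check of the block-diagonal claim in the new _multihead_matmul docstring (a sketch outside the commit, exercising the branch where A's last dimension is wider than B):

import torch

torch.manual_seed(0)
num_heads, head_dim, rows = 2, 4, 3
A = torch.rand(rows, num_heads * head_dim)   # larger operand (last dim = 8)
B = torch.rand(head_dim, head_dim)           # per-head transform (4 x 4)

# multi-head path, as in _multihead_matmul: split A into heads, matmul, re-flatten
out_heads = (A.unflatten(-1, (num_heads, head_dim)) @ B).flatten(-2, -1)

# equivalent dense path: multiply by a block-diagonal matrix built from copies of B
out_block = A @ torch.block_diag(*([B] * num_heads))

assert torch.allclose(out_heads, out_block, atol=1e-6)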
tests/test_transform/factory/test_correctness.py
Lines changed: 50 additions & 8 deletions

@@ -29,7 +29,8 @@
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
 @pytest.mark.parametrize("randomized", (True, False))
 @pytest.mark.parametrize("head_dim", (None, 2, 4))
-def test_correctness_linear(type, randomized, head_dim):
+@pytest.mark.parametrize("input_batch_size", (1, 5, 17))
+def test_correctness_linear(type, randomized, head_dim, input_batch_size):
     size = (4, 8)
     module = torch.nn.Linear(*size, bias=False)
     scheme = TransformScheme(type=type, randomized=randomized, head_dim=head_dim)
@@ -48,7 +49,7 @@ def test_correctness_linear(type, randomized, head_dim):
         module, TransformArgs(targets="Linear", location="output", inverse=True)
     )

-    input = torch.rand((17, 5, size[0]))
+    input = torch.rand((input_batch_size, 5, size[0]))
     true_output = input @ module.weight.T
     input_transformed = input_tfm(input)
     weight_transformed = w_out_tfm(w_in_tfm(module.weight))
@@ -57,10 +58,10 @@ def test_correctness_linear(type, randomized, head_dim):


 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomize", (True, False))
+@pytest.mark.parametrize("randomized", (True, False))
 @pytest.mark.parametrize("embed_loc", ("weight_output", "output"))
 @pytest.mark.parametrize("linear_loc", ("input", "weight_input"))
-def test_correctness_embedding(type, randomize, embed_loc, linear_loc):
+def test_correctness_embedding(type, randomized, embed_loc, linear_loc):
     model = torch.nn.Sequential(
         torch.nn.Embedding(2, 4),
         torch.nn.Linear(4, 8, bias=False),
@@ -73,7 +74,7 @@ def test_correctness_embedding(type, randomize, embed_loc, linear_loc):
         config_groups={
             "": TransformScheme(
                 type=type,
-                randomize=randomize,
+                randomized=randomized,
                 apply=[
                     TransformArgs(targets="Embedding", location=embed_loc),
                     TransformArgs(targets="Linear", location=linear_loc, inverse=True),
@@ -155,6 +156,47 @@ def test_correctness_attention_heads(type, randomize, head_dim):
 @requires_gpu
 @requires_accelerate()
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomize", (True, False))
-def test_correctness_model_offload(type, randomize, model_apply):
-    test_correctness_model(type, randomize, model_apply, offload=True)
+@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("input_batch_size", (1, 5, 17))
+def test_correctness_model_offload(type, randomized, input_batch_size, model_apply):
+    test_correctness_model(
+        type, randomized, input_batch_size, model_apply, offload=True
+    )
+
+
+@pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
+@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("head_dim", (4, 8))
+@pytest.mark.parametrize("input_batch_size", (1, 5, 17))
+def test_correctness_attention_heads(type, randomized, head_dim, input_batch_size):
+    hidden_size = 64
+    num_attention_heads = 8
+
+    attention = MockAttention(
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        num_key_value_heads=head_dim,
+    )
+
+    input = torch.rand(input_batch_size, 5, hidden_size)
+    true_output = attention(input)
+
+    config = TransformConfig(
+        config_groups={
+            "": TransformScheme(
+                type=type,
+                randomized=randomized,
+                head_dim=head_dim,
+                apply=[
+                    TransformArgs(targets="v_proj", location="weight_output"),
+                    TransformArgs(
                        targets="o_proj", location="weight_input", inverse=True
+                    ),
+                ],
+            )
+        }
+    )
+    apply_transform_config(attention, config)
+
+    output = attention(input)
+    assert torch.allclose(true_output, output, atol=1e-5, rtol=0.0)

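The new test_correctness_attention_heads pairs a weight_output transform on v_proj with its inverse on o_proj and expects the attention output to be unchanged. Below is a minimal pure-torch sketch of that cancellation, independent of MockAttention and the factory APIs; the fusion formulas are illustrative assumptions, not necessarily the library's exact location conventions.

import torch

torch.manual_seed(0)
H2 = torch.tensor([[1.0, 1.0], [1.0, -1.0]])
Hn = torch.kron(H2, H2) / 2.0     # 4x4 orthonormal Hadamard: Hn @ Hn.T == I

x = torch.rand(3, 4)
W_v = torch.rand(4, 4)            # stand-in for v_proj.weight
W_o = torch.rand(4, 4)            # stand-in for o_proj.weight
baseline = (x @ W_v.T) @ W_o.T

W_v_fused = Hn @ W_v              # transform fused into the first weight's output
W_o_fused = W_o @ Hn.T            # inverse fused into the next weight's input
transformed = (x @ W_v_fused.T) @ W_o_fused.T

# the transform and its inverse cancel, so the composed output is unchanged
assert torch.allclose(baseline, transformed, atol=1e-6)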