 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+
 import pytest
 import torch
 from compressed_tensors.transform import (
     apply_transform_config,
 )
 from compressed_tensors.utils import offloaded_dispatch
+from safetensors import safe_open
 from tests.testing_utils import requires_accelerate, requires_gpu
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
@@ -38,15 +42,57 @@ def test_serialization(type, randomize, model_apply, tmp_path, offload=False):
     apply_transform_config(model, config)
 
     # save model
-    model.save_pretrained(tmp_path)
+    model_path = os.path.join(tmp_path, "test_model_path")
+    model.save_pretrained(model_path)
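+    # (save_pretrained writes model.safetensors by default in recent transformers,
+    # which is why the hard-coded filename below is expected to exist)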
+
+    # check that saved values match model values
+    # note that shared weights are only serialized once
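+    # (hence the subset check "<=" below, rather than exact key equality)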
+    safetensors_path = os.path.join(model_path, "model.safetensors")
+    with safe_open(safetensors_path, framework="pt", device="cpu") as file:
+        saved_keys = set(file.keys())
+        assert {
+            "fcs.0.weight",
+            "fcs.1.weight",
+            "fcs.2.weight",
+            "fcs.3.weight",
+            "fcs.4.weight",
+        } <= saved_keys
+        for key in saved_keys:
+            param = model.get_parameter(key)
+            saved_param = file.get_tensor(key)
 
-    # TODO: reload model
+            if param.device.type != "meta":  # skip testing values in offload case
+                assert torch.equal(param, saved_param)
 
 
-@pytest.mark.skip(reason="Requires changes in upstream transformers")
 @requires_gpu
 @requires_accelerate()
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
 @pytest.mark.parametrize("randomize", (True, False))
 def test_serialization_offload(type, randomize, model_apply, tmp_path):
     test_serialization(type, randomize, model_apply, tmp_path, offload=True)
+
+
+@pytest.mark.skip("Requires transformers#40673")
+@requires_gpu
+@pytest.mark.parametrize(
+    "model_stub,exp_perplexity",
+    [
+        ("nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16", 10.0),
+        ("nm-testing/Llama-3.2-1B-Instruct-quip-w4a16", 10.0),
+    ],
+)
+def test_load_perplexity(model_stub, exp_perplexity):
+    model = AutoModelForCausalLM.from_pretrained(model_stub, device_map="cuda")
+    tokenizer = AutoTokenizer.from_pretrained(model_stub)
+
+    prompt = "The capital of France is Paris, the capital of Germany is Berlin"
+    inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = {key: value.to(model.device) for key, value in inputs.items()}
+    labels = inputs["input_ids"]
+
+    with torch.no_grad():
+        outputs = model(**inputs, labels=labels)
+
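+    # with labels passed, outputs.loss is the mean next-token cross-entropy
+    # (the model shifts labels internally), so exp(loss) is the perplexity;
+    # the parametrized 10.0 serves as a coarse upper bound for these checkpoints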
+    perplexity = torch.exp(outputs.loss)
+    assert perplexity <= exp_perplexity