
Commit dc7ac1a

use untie_word_embeddings

Signed-off-by: Kyle Sayers <[email protected]>

1 parent 4b4257f

File tree

3 files changed: 43 additions & 56 deletions

src/llmcompressor/entrypoints/utils.py

Lines changed: 3 additions & 4 deletions
@@ -20,7 +20,7 @@
 from llmcompressor.pytorch.model_load.helpers import parse_dtype
 from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
     modify_save_pretrained,
-    patch_tied_tensors_bug,
+    untie_word_embeddings,
 )
 from llmcompressor.transformers.utils.helpers import (
     detect_last_checkpoint,
@@ -61,7 +61,8 @@ def pre_process(model_args: "ModelArguments"):
     )

     # untie tie_word_embeddings weights
-    patch_tied_tensors_bug(model_args.model)
+    if not model_args.tie_word_embeddings:
+        untie_word_embeddings(model_args.model)

     # wrap model.save_pretrained
     modify_save_pretrained(model_args.model)
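Note: with this guard, pre_process only unties weights when the caller explicitly requests untied embeddings, rather than patching every model unconditionally. A minimal sketch of the resulting flow using the helper directly (model name borrowed from the tests below; the local tie_word_embeddings variable stands in for model_args.tie_word_embeddings):

from transformers import AutoModelForCausalLM

from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    modify_save_pretrained,
    untie_word_embeddings,
)

model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")

tie_word_embeddings = False  # stand-in for model_args.tie_word_embeddings
if not tie_word_embeddings:
    untie_word_embeddings(model)  # give each embedding its own storage

modify_save_pretrained(model)  # wrap save_pretrained for compressed saving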
@@ -143,7 +144,6 @@ def initialize_model_from_path(
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
-        tie_word_embeddings=model_args.tie_word_embeddings,
         trust_remote_code=model_args.trust_remote_code_model,
     )

@@ -156,7 +156,6 @@
         AutoConfig.from_pretrained(
             model_args.distill_teacher,
             use_auth_token=True if model_args.use_auth_token else None,
-            tie_word_embeddings=model_args.tie_word_embeddings,
             trust_remote_code=model_args.trust_remote_code_model,
         )
         if model_args.distill_teacher

src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py

Lines changed: 25 additions & 25 deletions
@@ -9,8 +9,9 @@
     CompressionFormat,
     ModelCompressor,
     SparsityCompressionConfig,
+    delete_offload_parameter,
     is_module_offloaded,
-    update_offload_parameter,
+    register_offload_parameter,
 )
 from loguru import logger
 from safetensors.torch import storage_ptr
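Note: the import swap above reflects the new untying strategy: update_offload_parameter writes into a parameter's existing storage, while deleting and re-registering the parameter replaces the storage outright. A torch-only sketch of why that distinction matters for tied weights (no llmcompressor APIs involved):

import torch
from safetensors.torch import storage_ptr

# two modules whose weights share one storage, as tied embeddings do
embed = torch.nn.Linear(4, 8, bias=False)
head = torch.nn.Linear(4, 8, bias=False)
head.weight = embed.weight
assert storage_ptr(embed.weight) == storage_ptr(head.weight)

# replacing the Parameter with a clone is what actually breaks the tie;
# in-place updates would keep writing into the shared storage
head.weight = torch.nn.Parameter(
    embed.weight.data.clone(), requires_grad=embed.weight.requires_grad
)
assert storage_ptr(embed.weight) != storage_ptr(head.weight)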
@@ -27,7 +28,7 @@
 from llmcompressor.transformers.utils import RECIPE_FILE_NAME
 from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path

-__all__ = ["modify_save_pretrained"]
+__all__ = ["modify_save_pretrained", "untie_word_embeddings"]


 def modify_save_pretrained(model: PreTrainedModel):
@@ -120,7 +121,7 @@ def save_pretrained_wrapper(
     model.save_pretrained = save_pretrained_compressed(model.save_pretrained)


-def patch_tied_tensors_bug(model: torch.nn.Module):
+def untie_word_embeddings(model: PreTrainedModel):
     """
     Patches bug where HF transformers will fail to untie weights under specific
     circumstances (https://github.com/huggingface/transformers/issues/33689).
@@ -129,28 +130,27 @@ def patch_tied_tensors_bug(model: torch.nn.Module):

     :param model: model to fix
     """
-    if (
-        hasattr(model.config, "tie_word_embeddings")
-        and not model.config.tie_word_embeddings
-    ):
-        input_embed = model.get_input_embeddings()
-        output_embed = model.get_output_embeddings()
-
-        if input_embed is None or output_embed is None:
-            # some models fail to properly override the abstract methods
-            return
-
-        if storage_ptr(input_embed.weight) == storage_ptr(output_embed.weight):
-            for module in (input_embed, output_embed):
-                if not is_module_offloaded(module):
-                    # create new storage ptr for onloaded weight
-                    untied_data = module.weight.data.clone()
-                    module.weight.data = untied_data
-                else:
-                    # create new storage ptr for offloaded weight
-                    # note `update_offload_parameter` does not create a new storage ptr
-                    untied_data = module._hf_hook.weights_map["weight"].clone()
-                    update_offload_parameter(module, "weight", untied_data)
+    input_embed = model.get_input_embeddings()
+    output_embed = model.get_output_embeddings()
+
+    for module in (input_embed, output_embed):
+        if module is None or not hasattr(module, "weight"):
+            logger.warning(f"Cannot untie {module} which does not have weight param")
+            continue
+
+        # this could be replaced by a `get_offloaded_parameter` util
+        if not is_module_offloaded(module):
+            untied_data = module.weight.data.clone()
+        else:
+            untied_data = module._hf_hook.weights_map["weight"].clone()
+
+        requires_grad = module.weight.requires_grad
+        new_parameter = torch.nn.Parameter(untied_data, requires_grad=requires_grad)
+        delete_offload_parameter(module, "weight")
+        register_offload_parameter(module, "weight", new_parameter)
+
+    if hasattr(model.config, "tie_word_embeddings"):
+        model.config.tie_word_embeddings = False


 def get_model_compressor(
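Note: a short usage sketch of the newly exported helper (model name borrowed from the tests below). After the call, input and output embeddings own separate storage and the config records the untie:

from safetensors.torch import storage_ptr
from transformers import AutoModelForCausalLM

from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    untie_word_embeddings,
)

model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
untie_word_embeddings(model)

in_w = model.get_input_embeddings().weight
out_w = model.get_output_embeddings().weight
assert storage_ptr(in_w) != storage_ptr(out_w)  # fresh storage per module
assert model.config.tie_word_embeddings is False  # config updated too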

tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py

Lines changed: 15 additions & 27 deletions
@@ -28,7 +28,7 @@
 from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
     get_model_compressor,
     modify_save_pretrained,
-    patch_tied_tensors_bug,
+    untie_word_embeddings,
 )
 from tests.testing_utils import requires_gpu

@@ -224,8 +224,6 @@ def test_quant_model_reload(format, dtype, tmp_path):
     shutil.rmtree(tmp_path)


-# technically only tie_word_embeddings=False is supported right now
-# setting to True is discouraged
 @pytest.mark.parametrize(
     "offload,torch_dtype,tie_word_embeddings,device",
     [
@@ -237,25 +235,23 @@ def test_quant_model_reload(format, dtype, tmp_path):
         # offloading
         (True, torch.float16, False, "cpu"),
         (True, torch.float32, False, "cpu"),
-        # (True, torch.float16, True, "cpu"),  # TODO: fails
-        # (True, torch.float32, True, "cpu"),  # TODO: fails
+        (True, torch.float16, True, "cpu"),
+        (True, torch.float32, True, "cpu"),
     ],
 )
 def test_model_reload(offload, torch_dtype, tie_word_embeddings, device, tmp_path):
     model_path = "nm-testing/llama2.c-stories15M"
     save_path = tmp_path / "save_path"

-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        tie_word_embeddings=tie_word_embeddings,
-        torch_dtype=torch_dtype,
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
     if offload:
         model = dispatch_model(model, {"": device}, force_hooks=True)
     else:
         model = model.to(device)

-    patch_tied_tensors_bug(model)
+    if not tie_word_embeddings:
+        untie_word_embeddings(model)
+
     modify_save_pretrained(model)
     model.save_pretrained(save_path, safe_serialization=True)

@@ -294,22 +290,18 @@ def test_model_reload_gpu(offload, torch_dtype, tie_word_embeddings, device, tmp
         (True, torch.float32, True, "cpu"),
     ],
 )
-def test_model_shared_tensors(
-    offload, torch_dtype, tie_word_embeddings, device, tmp_path
-):
+def test_model_shared_tensors(offload, torch_dtype, tie_word_embeddings, device):
     # load model
-    model = AutoModelForCausalLM.from_pretrained(
-        "nm-testing/llama2.c-stories15M",
-        torch_dtype=torch_dtype,
-        tie_word_embeddings=tie_word_embeddings,
-    )
-    patch_tied_tensors_bug(model)
-
+    model_path = "nm-testing/llama2.c-stories15M"
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
     if offload:
         model = dispatch_model(model, {"": device}, force_hooks=True)
     else:
         model = model.to(device)

+    if not tie_word_embeddings:
+        untie_word_embeddings(model)
+
     # modify lm head
     with torch.no_grad(), align_module_device(model.lm_head):
         update_offload_parameter(model.lm_head, "weight", model.lm_head.weight + 1)
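Note: the remainder of this test (outside the hunk) checks whether the lm_head edit leaks into the input embeddings. A torch-only sketch of the tied-versus-untied behavior being exercised, with assertions assumed for illustration:

import torch

# tied: one storage behind both modules, so the edit reaches both
embed = torch.nn.Embedding(32, 8)
head = torch.nn.Linear(8, 32, bias=False)
head.weight = embed.weight
with torch.no_grad():
    head.weight += 1
assert torch.equal(embed.weight, head.weight)

# untied: a cloned Parameter isolates subsequent edits
head.weight = torch.nn.Parameter(embed.weight.data.clone())
with torch.no_grad():
    head.weight += 1
assert not torch.equal(embed.weight, head.weight)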
@@ -332,12 +324,8 @@ def test_model_shared_tensors(
         (False, torch.float32, True, "cuda:0"),
     ],
 )
-def test_model_shared_tensors_gpu(
-    offload, torch_dtype, tie_word_embeddings, device, tmp_path
-):
-    test_model_shared_tensors(
-        offload, torch_dtype, tie_word_embeddings, device, tmp_path
-    )
+def test_model_shared_tensors_gpu(offload, torch_dtype, tie_word_embeddings, device):
+    test_model_shared_tensors(offload, torch_dtype, tie_word_embeddings, device)


 @requires_gpu
