
Commit e3398fc

Patch release 0.18.1 (#2983)
* FIX Transformers v5 fixes (#2934)

  With the v5 rc being out, we should now ensure that the PEFT tests pass. This PR contains fixes to achieve that.

  1. hub_online_once was failing because transformers.utils.hub._is_offline_mode no longer exists. We now use the new function instead when transformers v5 is detected.
  2. tests/test_encoder_decoder_models.py::TestEncoderDecoderModels::test_merge_layers[LoraConfig-config_kwargs10-peft-internal-testing/tiny-random-BartForConditionalGeneration] was failing because TrainableTokensWrapper was not applied to all layers, owing to changes to _tied_weights_keys.
  3. While working on this, I discovered a tangential bug in TrainableTokensLayer.get_merged_weights. This method returns a torch.Tensor, but the expected type is nn.Parameter (since foo.bar.weight is supposed to be an nn.Parameter). This type mismatch would cause torch's model.get_parameter, which I used in _get_module_names_tied_with_embedding, to fail. At first, I wanted to change the return type to nn.Parameter, but this causes all kinds of issues, so I left this bug as is. Instead, in _get_module_names_tied_with_embedding, I opted to use attrgetter instead of model.get_parameter.

* FIX Detect if torch.distributed is available (#2963)

  E.g. it's not available for the torch rocm build.

  Signed-off-by: vladmandic <[email protected]>

* FIX Don't implicitly require transformers v4.52 (#2976)

  Resolves #2975

  In #2826, we inadvertently added a dependency on transformers v4.52 to PEFT. However, this is really only needed under very specific circumstances (aLoRA + gradient checkpointing). With this PR, unless we're in these circumstances, this requirement is no longer there.

* Release: v0.18.1

  Contains the following changes:
  - #2934
  - #2963
  - #2976

---------

Signed-off-by: vladmandic <[email protected]>
Co-authored-by: Vladimir Mandic <[email protected]>
1 parent 77daa8d commit e3398fc

File tree: 10 files changed, +57 -18 lines

setup.py (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 from setuptools import find_packages, setup
 
 
-VERSION = "0.18.0"
+VERSION = "0.18.1"
 
 extras = {}
 extras["quality"] = [

src/peft/__init__.py (1 addition, 1 deletion)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.18.0"
+__version__ = "0.18.1"
 
 from .auto import (
     MODEL_TYPE_TO_PEFT_MODEL_MAPPING,

src/peft/tuners/lora/model.py (12 additions, 1 deletion)

@@ -21,9 +21,10 @@
 from functools import partial, reduce
 from typing import Literal, Optional
 
+import packaging.version
 import torch
+import transformers
 from torch import nn
-from transformers.modeling_layers import GradientCheckpointingLayer
 
 from peft.import_utils import is_bnb_4bit_available, is_bnb_available
 from peft.tuners.tuners_utils import (
@@ -360,6 +361,16 @@ def _enable_peft_forward_hooks(self, *args, **kwargs):
         hook_handles = []
 
         if alora_offsets is not None:
+            # TODO: remove once transformers 4.52 is no longer supported. Note that 4.52.0 is yanked, so 4.52.1
+            # is the first 4.52 release.
+            transformers_lt_4_52 = packaging.version.parse(transformers.__version__) < packaging.version.parse(
+                "4.52.1"
+            )
+            if transformers_lt_4_52:
+                raise ValueError("Using aLoRA requires transformers >= 4.52.1.")
+
+            from transformers.modeling_layers import GradientCheckpointingLayer
+
             for n, layer in self.named_modules():
                 # gradient checkpointing layer are executed concurrently to the 'normal' forward call
                 # (in the backward step the gradient checkpointing layer's forward will be executed again).
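
A minimal, standalone sketch of the version gate added above (the helper name alora_gc_supported is made up for illustration; PEFT performs the check inline):

    import packaging.version

    def alora_gc_supported(transformers_version: str) -> bool:
        # 4.52.0 was yanked, so 4.52.1 is treated as the first usable 4.52 release
        return packaging.version.parse(transformers_version) >= packaging.version.parse("4.52.1")

    assert alora_gc_supported("4.52.1")
    assert alora_gc_supported("5.0.0rc1")
    assert not alora_gc_supported("4.51.3")

Deferring the GradientCheckpointingLayer import into the aLoRA branch keeps importing peft.tuners.lora.model working on transformers versions that don't provide that symbol; only the aLoRA code path enforces the newer requirement (#2976).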

src/peft/tuners/osf/utils.py (2 additions, 2 deletions)

@@ -101,7 +101,7 @@ def project_gradient_to_orthogonal_space(svd_dict: dict[str, Any]) -> None:
         # Use addmm_ for efficient in-place operation
         # Compute local contribution to (U_high^T @ dU); all-reduce to get global projection
         proj_coeff = torch.mm(local_U_high.transpose(0, 1), local_dU)
-        if dist.is_initialized() and dist.get_world_size() > 1:
+        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
             dist.all_reduce(proj_coeff, op=dist.ReduceOp.SUM)
         # Apply projection using only local rows of U_high
         local_dU.addmm_(local_U_high, proj_coeff, alpha=-1.0)
@@ -120,7 +120,7 @@ def project_gradient_to_orthogonal_space(svd_dict: dict[str, Any]) -> None:
         # Compute Gram matrix G = V_high^T @ V_high for global projection across row-sharded V_high
         # Assumes column dimension is consistent across ranks (row sharding over singular vectors)
         G_local = torch.mm(local_V_high.transpose(0, 1), local_V_high)
-        if dist.is_initialized() and dist.get_world_size() > 1:
+        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
             dist.all_reduce(G_local, op=dist.ReduceOp.SUM)
 
         # Apply projection: dV = dV - dV @ G (use local shard of dV)
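
The extra dist.is_available() check matters because on torch builds without distributed support (the motivation behind #2963) much of the torch.distributed API may not even be defined, so is_initialized() cannot be called safely. A small sketch of the guard factored into a helper (the helper name is hypothetical; the PEFT code keeps the checks inline):

    import torch.distributed as dist

    def _multi_rank_run() -> bool:
        # Check availability first: without it, is_initialized()/get_world_size() may not exist.
        return dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1

    # usage, e.g.: if _multi_rank_run(): dist.all_reduce(tensor, op=dist.ReduceOp.SUM)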

src/peft/tuners/trainable_tokens/layer.py (5 additions, 2 deletions)

@@ -90,7 +90,7 @@ def _collect_token_weights(self, weight: torch.Tensor, rows: torch.Tensor, embed
             device = torch.device("cuda", torch.cuda.current_device())
 
         with gather_params_ctx([weight], modifier_rank=None):
-            if dist.get_rank() == src_rank:
+            if dist.is_available() and dist.is_initialized() and dist.get_rank() == src_rank:
                 token_weights = weight[rows].clone()
             else:
                 # build an empty tensor with correct shape/type/device
@@ -199,14 +199,17 @@ def unmerge(self) -> None:
             originals = self.trainable_tokens_original[adapter_name].to(self.base_layer.weight)
             self.base_layer.weight.data.index_copy_(dim=0, index=index, source=originals)
 
-    def get_merged_weights(self, active_adapters):
+    def get_merged_weights(self, active_adapters) -> torch.Tensor:
         W = self.base_layer.weight
 
         for adapter_name in active_adapters:
             index = torch.tensor(self.token_indices[adapter_name]).to(W.device)
             deltas = self.trainable_tokens_delta[adapter_name].to(W)
             W = W.index_copy(dim=0, index=index, source=deltas)
 
+        # Note: the return type is a Tensor, not an nn.Parameter. This can lead to some errors, e.g. torch's
+        # model.get_parameter fails as it does a type check. But we cannot return an nn.Parameter here, as it can lead
+        # to other failures, as this is not a true nn.Parameter of the model.
         return W
 
     def forward_adapters(self, x: torch.Tensor, active_adapters, *args, **kwargs) -> torch.Tensor:
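
The note added above is what led to using operator.attrgetter instead of model.get_parameter in _get_module_names_tied_with_embedding (see the commit message): get_parameter type-checks for nn.Parameter, while attrgetter just resolves the dotted attribute path. A minimal sketch with a toy module (Toy is made up, not a PEFT class):

    from operator import attrgetter

    import torch
    from torch import nn

    class Toy(nn.Module):
        def __init__(self):
            super().__init__()
            self.weight = torch.zeros(2, 2)  # a plain Tensor attribute, not an nn.Parameter

    toy = Toy()
    print(attrgetter("weight")(toy).shape)  # works: returns the Tensor as-is
    try:
        toy.get_parameter("weight")  # fails get_parameter's isinstance(nn.Parameter) check
    except AttributeError as exc:
        print(exc)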

src/peft/tuners/tuners_utils.py (5 additions, 2 deletions)

@@ -58,6 +58,10 @@
 from ._buffer_dict import BufferDict
 
 
+_torch_supports_dtensor = version.parse(torch.__version__) >= version.parse("2.5.0")
+_torch_supports_distributed = _torch_supports_dtensor and torch.distributed.is_available()
+
+
 @contextmanager
 def onload_layer(layer):
     r"""
@@ -157,8 +161,7 @@ def _get_in_out_features(module: nn.Module) -> tuple[int, int] | tuple[None, Non
     this function returns a valid result does not imply that the layer type is supported.
     """
     if isinstance(module, nn.Linear):
-        torch_supports_dtensor = version.parse(torch.__version__) >= version.parse("2.5.0")
-        if torch_supports_dtensor and isinstance(module.weight, torch.distributed.tensor.DTensor):
+        if _torch_supports_distributed and isinstance(module.weight, torch.distributed.tensor.DTensor):
             # If Tensor Parallel is used, the weight is sharded, so we need to get the local shape
             out_features, in_features = module.weight.to_local().shape
         else:

src/peft/utils/__init__.py (2 additions, 1 deletion)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .integrations import map_cache_to_layer_device_map
+from .integrations import is_transformers_ge_v5, map_cache_to_layer_device_map
 from .loftq_utils import replace_lora_weights_loftq
 from .other import (
     CONFIG_NAME,
@@ -120,6 +120,7 @@
     "get_quantization_config",
     "id_tensor_storage",
     "infer_device",
+    "is_transformers_ge_v5",
     "load_peft_weights",
     "map_cache_to_layer_device_map",
     "prepare_model_for_kbit_training",

src/peft/utils/integrations.py (3 additions, 0 deletions)

@@ -24,6 +24,9 @@
 from torch import nn
 
 
+is_transformers_ge_v5 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("5.0.0.dev0")
+
+
 def check_deepspeed_zero3_enabled() -> bool:
     if packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.33.0"):
         from transformers.integrations import is_deepspeed_zero3_enabled
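
Comparing against "5.0.0.dev0" rather than "5.0.0" means the flag is already True for v5 dev and release-candidate builds; a quick illustration of how packaging orders these versions (values illustrative):

    from packaging import version

    threshold = version.parse("5.0.0.dev0")
    assert version.parse("4.57.1") < threshold       # any v4 release stays below
    assert version.parse("5.0.0.dev0") >= threshold  # dev builds of v5 count as v5
    assert version.parse("5.0.0rc1") >= threshold    # so do release candidates
    assert version.parse("5.0.0") >= threshold       # and the final release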

src/peft/utils/other.py (14 additions, 2 deletions)

@@ -1573,7 +1573,7 @@ def _get_module_names_tied_with_embedding(model) -> list[str]:
     that the weight tying definition is present but the tying is disabled via `model_config.tie_word_embeddings=False`.
     You have to check that yourself.
     """
-    tied_weights = []
+    tied_weights: list[str] = []
 
     if hasattr(model, "get_base_model"):
         # unpack PeftModel
@@ -1595,6 +1595,17 @@ def _get_module_names_tied_with_embedding(model) -> list[str]:
             "'get_input_embeddings' so we can't determine which weights are tied to embeddings."
         )
 
+    # collect all _tied_weights_keys, as sub-modules may have additional entries
+    tied_weights_keys: dict[str, str] = {}
+    for module_name, module in model.named_modules():
+        module_tied_weights_keys = getattr(module, "_tied_weights_keys", None)
+        if module_tied_weights_keys and not module_name:
+            tied_weights_keys.update(module_tied_weights_keys)
+        elif module_tied_weights_keys:
+            tied_weights_keys.update(
+                {f"{module_name}.{k}": f"{module_name}.{v}" for k, v in module_tied_weights_keys.items()}
+            )
+
     # technically it would be sufficient to just return candidates since that contains all the keys of
     # all models that are tied (not just equal!) to the input embeddings. the only reason why we aren't
     # doing that is because we need to filter out the original embedding name since we promise to just
@@ -1613,12 +1624,13 @@ def _get_module_names_tied_with_embedding(model) -> list[str]:
 
         tied_weights.extend(
             peft_reverse_mapping.get(k, k)
-            for k, v in model._tied_weights_keys.items()
+            for k, v in tied_weights_keys.items()
            if peft_reverse_mapping.get(v, v) in candidates
         )
 
     elif model._tied_weights_keys is not None:
         # TODO remove this when transformers <v5 is no longer supported
         tied_weights.extend(model._tied_weights_keys)
 
+    # get module names from parameter names
     return sorted({name.rpartition(".")[0] for name in tied_weights})
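
A toy illustration of the key collection above, showing how entries found on sub-modules get prefixed with the sub-module name (all module and parameter names below are made up):

    # Keys from the root module (empty name) are taken as-is; keys from a sub-module
    # are prefixed so they become fully qualified names on the top-level model.
    collected = {
        "": {"lm_head.weight": "model.embed_tokens.weight"},
        "decoder": {"output_projection.weight": "embed_tokens.weight"},
    }

    tied_weights_keys: dict[str, str] = {}
    for module_name, module_keys in collected.items():
        if not module_name:
            tied_weights_keys.update(module_keys)
        else:
            tied_weights_keys.update({f"{module_name}.{k}": f"{module_name}.{v}" for k, v in module_keys.items()})

    print(tied_weights_keys)
    # {'lm_head.weight': 'model.embed_tokens.weight',
    #  'decoder.output_projection.weight': 'decoder.embed_tokens.weight'}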

tests/testing_utils.py (12 additions, 6 deletions)

@@ -40,6 +40,7 @@
     is_optimum_available,
     is_torchao_available,
 )
+from peft.utils import is_transformers_ge_v5
 
 
 # Globally shared model cache used by `hub_online_once`.
@@ -279,18 +280,23 @@ def test_something(model_id, config_kwargs):
         if model_id in _HUB_MODEL_ACCESSES:
             override = {"HF_HUB_OFFLINE": "1"}
             _HUB_MODEL_ACCESSES[model_id] += 1
-        else:
-            if model_id not in _HUB_MODEL_ACCESSES:
-                _HUB_MODEL_ACCESSES[model_id] = 0
+        elif model_id not in _HUB_MODEL_ACCESSES:
+            _HUB_MODEL_ACCESSES[model_id] = 0
+        is_offline = override.get("HF_HUB_OFFLINE", False) == "1"
+
         with (
             # strictly speaking it is not necessary to set the environment variable since most code that's out there
             # is evaluating it at import time and we'd have to reload the modules for it to take effect. It's
             # probably still a good idea to have it if there's some dynamic code that checks it.
             mock.patch.dict(os.environ, override),
-            mock.patch("huggingface_hub.constants.HF_HUB_OFFLINE", override.get("HF_HUB_OFFLINE", False) == "1"),
-            mock.patch("transformers.utils.hub._is_offline_mode", override.get("HF_HUB_OFFLINE", False) == "1"),
+            mock.patch("huggingface_hub.constants.HF_HUB_OFFLINE", is_offline),
         ):
-            yield
+            if is_transformers_ge_v5:
+                with mock.patch("transformers.utils.hub.is_offline_mode", lambda: is_offline):
+                    yield
+            else:  # TODO remove if transformers <= 4 no longer supported
+                with mock.patch("transformers.utils.hub._is_offline_mode", is_offline):
+                    yield
     except Exception:
         # in case of an error we have to assume that we didn't access the model properly from the hub
         # for the first time, so the next call cannot be considered cached.
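
The v5 branch patches a function with a callable, while the legacy branch patches a module-level flag with a plain bool. A self-contained illustration of that difference with unittest.mock (the throwaway module mymod is hypothetical):

    import sys
    import types
    from unittest import mock

    # a toy module exposing both styles of "offline" indicator
    mymod = types.ModuleType("mymod")
    mymod.OFFLINE_FLAG = False        # module-level flag, read as a value
    mymod.is_offline = lambda: False  # function, called at runtime
    sys.modules["mymod"] = mymod

    with (
        mock.patch("mymod.OFFLINE_FLAG", True),        # replace the flag with a value
        mock.patch("mymod.is_offline", lambda: True),  # replace the function with a callable
    ):
        assert mymod.OFFLINE_FLAG is True
        assert mymod.is_offline() is True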
