
Commit e3398fc

Patch release 0.18.1 (#2983)
* FIX Transformers v5 fixes (#2934)

  With the v5 rc being out, we should now ensure that the PEFT tests pass. This PR contains fixes to achieve that.

  1. hub_online_once was failing because transformers.utils.hub._is_offline_mode no longer exists. We now use the new function instead when transformers v5 is detected.
  2. tests/test_encoder_decoder_models.py::TestEncoderDecoderModels::test_merge_layers[LoraConfig-config_kwargs10-peft-internal-testing/tiny-random-BartForConditionalGeneration] was failing because TrainableTokensWrapper was not applied to all layers, owing to changes to _tied_weights_keys.
  3. While working on this, I discovered a tangential bug in TrainableTokensLayer.get_merged_weights. This method returns a torch.Tensor, but the expected type is nn.Parameter (since foo.bar.weight is supposed to be an nn.Parameter). This type mismatch would cause torch's model.get_parameter, which I used in _get_module_names_tied_with_embedding, to fail. At first, I wanted to change the return type to nn.Parameter, but this causes all kinds of issues, so I left this bug as is. Instead, in _get_module_names_tied_with_embedding, I opted to use attrgetter instead of model.get_parameter.

* FIX Detect if torch.distributed is available (#2963)

  E.g. it's not available for the torch rocm build.

  Signed-off-by: vladmandic <[email protected]>

* FIX Don't implicitly require transformers v4.52 (#2976)

  Resolves #2975

  In #2826, we inadvertently added a dependency on transformers v4.52 to PEFT. However, this is really only needed under very specific circumstances (aLoRA + gradient checkpointing). With this PR, unless we're in these circumstances, this requirement is no longer there.

* Release: v0.18.1

  Contains the following changes:
  - #2934
  - #2963
  - #2976

---------

Signed-off-by: vladmandic <[email protected]>
Co-authored-by: Vladimir Mandic <[email protected]>
1 parent 77daa8d commit e3398fc

File tree: 10 files changed, +57 -18 lines

setup.py (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 from setuptools import find_packages, setup
 
 
-VERSION = "0.18.0"
+VERSION = "0.18.1"
 
 extras = {}
 extras["quality"] = [

src/peft/__init__.py (1 addition, 1 deletion)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.18.0"
+__version__ = "0.18.1"
 
 from .auto import (
     MODEL_TYPE_TO_PEFT_MODEL_MAPPING,

src/peft/tuners/lora/model.py (12 additions, 1 deletion)

@@ -21,9 +21,10 @@
 from functools import partial, reduce
 from typing import Literal, Optional
 
+import packaging.version
 import torch
+import transformers
 from torch import nn
-from transformers.modeling_layers import GradientCheckpointingLayer
 
 from peft.import_utils import is_bnb_4bit_available, is_bnb_available
 from peft.tuners.tuners_utils import (
@@ -360,6 +361,16 @@ def _enable_peft_forward_hooks(self, *args, **kwargs):
         hook_handles = []
 
         if alora_offsets is not None:
+            # TODO: remove once transformers 4.52 is no longer supported. Note that 4.52.0 is yanked, so 4.52.1
+            # is the first 4.52 release.
+            transformers_lt_4_52 = packaging.version.parse(transformers.__version__) < packaging.version.parse(
+                "4.52.1"
+            )
+            if transformers_lt_4_52:
+                raise ValueError("Using aLoRA requires transformers >= 4.52.1.")
+
+            from transformers.modeling_layers import GradientCheckpointingLayer
+
             for n, layer in self.named_modules():
                 # gradient checkpointing layer are executed concurrently to the 'normal' forward call
                 # (in the backward step the gradient checkpointing layer's forward will be executed again).
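
A minimal, standalone sketch of the version gate added above (the helper name alora_gc_supported is made up for illustration; PEFT performs the check inline):

    import packaging.version

    def alora_gc_supported(transformers_version: str) -> bool:
        # 4.52.0 was yanked, so 4.52.1 is treated as the first usable 4.52 release
        return packaging.version.parse(transformers_version) >= packaging.version.parse("4.52.1")

    assert alora_gc_supported("4.52.1")
    assert alora_gc_supported("5.0.0rc1")
    assert not alora_gc_supported("4.51.3")

Deferring the GradientCheckpointingLayer import into the aLoRA branch keeps importing peft.tuners.lora.model working on transformers versions that don't provide that symbol; only the aLoRA code path enforces the newer requirement (#2976).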

src/peft/tuners/osf/utils.py (2 additions, 2 deletions)

@@ -101,7 +101,7 @@ def project_gradient_to_orthogonal_space(svd_dict: dict[str, Any]) -> None:
         # Use addmm_ for efficient in-place operation
         # Compute local contribution to (U_high^T @ dU); all-reduce to get global projection
         proj_coeff = torch.mm(local_U_high.transpose(0, 1), local_dU)
-        if dist.is_initialized() and dist.get_world_size() > 1:
+        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
             dist.all_reduce(proj_coeff, op=dist.ReduceOp.SUM)
         # Apply projection using only local rows of U_high
         local_dU.addmm_(local_U_high, proj_coeff, alpha=-1.0)
@@ -120,7 +120,7 @@ def project_gradient_to_orthogonal_space(svd_dict: dict[str, Any]) -> None:
         # Compute Gram matrix G = V_high^T @ V_high for global projection across row-sharded V_high
         # Assumes column dimension is consistent across ranks (row sharding over singular vectors)
         G_local = torch.mm(local_V_high.transpose(0, 1), local_V_high)
-        if dist.is_initialized() and dist.get_world_size() > 1:
+        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
             dist.all_reduce(G_local, op=dist.ReduceOp.SUM)
 
         # Apply projection: dV = dV - dV @ G (use local shard of dV)
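
The extra dist.is_available() check matters because on torch builds without distributed support (the motivation behind #2963) much of the torch.distributed API may not even be defined, so is_initialized() cannot be called safely. A small sketch of the guard factored into a helper (the helper name is hypothetical; the PEFT code keeps the checks inline):

    import torch.distributed as dist

    def _multi_rank_run() -> bool:
        # Check availability first: without it, is_initialized()/get_world_size() may not exist.
        return dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1

    # usage, e.g.: if _multi_rank_run(): dist.all_reduce(tensor, op=dist.ReduceOp.SUM)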

src/peft/tuners/trainable_tokens/layer.py (5 additions, 2 deletions)

@@ -90,7 +90,7 @@ def _collect_token_weights(self, weight: torch.Tensor, rows: torch.Tensor, embed
             device = torch.device("cuda", torch.cuda.current_device())
 
         with gather_params_ctx([weight], modifier_rank=None):
-            if dist.get_rank() == src_rank:
+            if dist.is_available() and dist.is_initialized() and dist.get_rank() == src_rank:
                 token_weights = weight[rows].clone()
             else:
                 # build an empty tensor with correct shape/type/device
@@ -199,14 +199,17 @@ def unmerge(self) -> None:
             originals = self.trainable_tokens_original[adapter_name].to(self.base_layer.weight)
             self.base_layer.weight.data.index_copy_(dim=0, index=index, source=originals)
 
-    def get_merged_weights(self, active_adapters):
+    def get_merged_weights(self, active_adapters) -> torch.Tensor:
         W = self.base_layer.weight
 
         for adapter_name in active_adapters:
             index = torch.tensor(self.token_indices[adapter_name]).to(W.device)
             deltas = self.trainable_tokens_delta[adapter_name].to(W)
             W = W.index_copy(dim=0, index=index, source=deltas)
 
+        # Note: the return type is a Tensor, not an nn.Parameter. This can lead to some errors, e.g. torch's
+        # model.get_parameter fails as it does a type check. But we cannot return an nn.Parameter here, as it can lead
+        # to other failures, as this is not a true nn.Parameter of the model.
         return W
 
     def forward_adapters(self, x: torch.Tensor, active_adapters, *args, **kwargs) -> torch.Tensor:
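
The note added above is what led to using operator.attrgetter instead of model.get_parameter in _get_module_names_tied_with_embedding (see the commit message): get_parameter type-checks for nn.Parameter, while attrgetter just resolves the dotted attribute path. A minimal sketch with a toy module (Toy is made up, not a PEFT class):

    from operator import attrgetter

    import torch
    from torch import nn

    class Toy(nn.Module):
        def __init__(self):
            super().__init__()
            self.weight = torch.zeros(2, 2)  # a plain Tensor attribute, not an nn.Parameter

    toy = Toy()
    print(attrgetter("weight")(toy).shape)  # works: returns the Tensor as-is
    try:
        toy.get_parameter("weight")  # fails get_parameter's isinstance(nn.Parameter) check
    except AttributeError as exc:
        print(exc)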

src/peft/tuners/tuners_utils.py (5 additions, 2 deletions)

@@ -58,6 +58,10 @@
 from ._buffer_dict import BufferDict
 
 
+_torch_supports_dtensor = version.parse(torch.__version__) >= version.parse("2.5.0")
+_torch_supports_distributed = _torch_supports_dtensor and torch.distributed.is_available()
+
+
 @contextmanager
 def onload_layer(layer):
     r"""
@@ -157,8 +161,7 @@ def _get_in_out_features(module: nn.Module) -> tuple[int, int] | tuple[None, Non
     this function returns a valid result does not imply that the layer type is supported.
     """
     if isinstance(module, nn.Linear):
-        torch_supports_dtensor = version.parse(torch.__version__) >= version.parse("2.5.0")
-        if torch_supports_dtensor and isinstance(module.weight, torch.distributed.tensor.DTensor):
+        if _torch_supports_distributed and isinstance(module.weight, torch.distributed.tensor.DTensor):
             # If Tensor Parallel is used, the weight is sharded, so we need to get the local shape
             out_features, in_features = module.weight.to_local().shape
         else:

src/peft/utils/__init__.py (2 additions, 1 deletion)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .integrations import map_cache_to_layer_device_map
+from .integrations import is_transformers_ge_v5, map_cache_to_layer_device_map
 from .loftq_utils import replace_lora_weights_loftq
 from .other import (
     CONFIG_NAME,
@@ -120,6 +120,7 @@
     "get_quantization_config",
     "id_tensor_storage",
     "infer_device",
+    "is_transformers_ge_v5",
     "load_peft_weights",
     "map_cache_to_layer_device_map",
     "prepare_model_for_kbit_training",

src/peft/utils/integrations.py (3 additions, 0 deletions)

@@ -24,6 +24,9 @@
 from torch import nn
 
 
+is_transformers_ge_v5 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("5.0.0.dev0")
+
+
 def check_deepspeed_zero3_enabled() -> bool:
     if packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.33.0"):
         from transformers.integrations import is_deepspeed_zero3_enabled
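
Comparing against "5.0.0.dev0" rather than "5.0.0" means the flag is already True for v5 dev and release-candidate builds; a quick illustration of how packaging orders these versions (values illustrative):

    from packaging import version

    threshold = version.parse("5.0.0.dev0")
    assert version.parse("4.57.1") < threshold       # any v4 release stays below
    assert version.parse("5.0.0.dev0") >= threshold  # dev builds of v5 count as v5
    assert version.parse("5.0.0rc1") >= threshold    # so do release candidates
    assert version.parse("5.0.0") >= threshold       # and the final release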

src/peft/utils/other.py (14 additions, 2 deletions)

@@ -1573,7 +1573,7 @@ def _get_module_names_tied_with_embedding(model) -> list[str]:
     that the weight tying definition is present but the tying is disabled via `model_config.tie_word_embeddings=False`.
     You have to check that yourself.
     """
-    tied_weights = []
+    tied_weights: list[str] = []
 
     if hasattr(model, "get_base_model"):
         # unpack PeftModel
@@ -1595,6 +1595,17 @@ def _get_module_names_tied_with_embedding(model) -> list[str]:
             "'get_input_embeddings' so we can't determine which weights are tied to embeddings."
         )
 
+    # collect all _tied_weights_keys, as sub-modules may have additional entries
+    tied_weights_keys: dict[str, str] = {}
+    for module_name, module in model.named_modules():
+        module_tied_weights_keys = getattr(module, "_tied_weights_keys", None)
+        if module_tied_weights_keys and not module_name:
+            tied_weights_keys.update(module_tied_weights_keys)
+        elif module_tied_weights_keys:
+            tied_weights_keys.update(
+                {f"{module_name}.{k}": f"{module_name}.{v}" for k, v in module_tied_weights_keys.items()}
+            )
+
     # technically it would be sufficient to just return candidates since that contains all the keys of
     # all models that are tied (not just equal!) to the input embeddings. the only reason why we aren't
     # doing that is because we need to filter out the original embedding name since we promise to just
@@ -1613,12 +1624,13 @@ def _get_module_names_tied_with_embedding(model) -> list[str]:
 
         tied_weights.extend(
             peft_reverse_mapping.get(k, k)
-            for k, v in model._tied_weights_keys.items()
+            for k, v in tied_weights_keys.items()
            if peft_reverse_mapping.get(v, v) in candidates
         )
 
     elif model._tied_weights_keys is not None:
         # TODO remove this when transformers <v5 is no longer supported
         tied_weights.extend(model._tied_weights_keys)
 
+    # get module names from parameter names
     return sorted({name.rpartition(".")[0] for name in tied_weights})
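
A toy illustration of the key collection above, showing how entries found on sub-modules get prefixed with the sub-module name (all module and parameter names below are made up):

    # Keys from the root module (empty name) are taken as-is; keys from a sub-module
    # are prefixed so they become fully qualified names on the top-level model.
    collected = {
        "": {"lm_head.weight": "model.embed_tokens.weight"},
        "decoder": {"output_projection.weight": "embed_tokens.weight"},
    }

    tied_weights_keys: dict[str, str] = {}
    for module_name, module_keys in collected.items():
        if not module_name:
            tied_weights_keys.update(module_keys)
        else:
            tied_weights_keys.update({f"{module_name}.{k}": f"{module_name}.{v}" for k, v in module_keys.items()})

    print(tied_weights_keys)
    # {'lm_head.weight': 'model.embed_tokens.weight',
    #  'decoder.output_projection.weight': 'decoder.embed_tokens.weight'}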

tests/testing_utils.py (12 additions, 6 deletions)

@@ -40,6 +40,7 @@
     is_optimum_available,
     is_torchao_available,
 )
+from peft.utils import is_transformers_ge_v5
 
 
 # Globally shared model cache used by `hub_online_once`.
@@ -279,18 +280,23 @@ def test_something(model_id, config_kwargs):
         if model_id in _HUB_MODEL_ACCESSES:
             override = {"HF_HUB_OFFLINE": "1"}
             _HUB_MODEL_ACCESSES[model_id] += 1
-        else:
-            if model_id not in _HUB_MODEL_ACCESSES:
-                _HUB_MODEL_ACCESSES[model_id] = 0
+        elif model_id not in _HUB_MODEL_ACCESSES:
+            _HUB_MODEL_ACCESSES[model_id] = 0
+        is_offline = override.get("HF_HUB_OFFLINE", False) == "1"
+
         with (
             # strictly speaking it is not necessary to set the environment variable since most code that's out there
             # is evaluating it at import time and we'd have to reload the modules for it to take effect. It's
             # probably still a good idea to have it if there's some dynamic code that checks it.
             mock.patch.dict(os.environ, override),
-            mock.patch("huggingface_hub.constants.HF_HUB_OFFLINE", override.get("HF_HUB_OFFLINE", False) == "1"),
-            mock.patch("transformers.utils.hub._is_offline_mode", override.get("HF_HUB_OFFLINE", False) == "1"),
+            mock.patch("huggingface_hub.constants.HF_HUB_OFFLINE", is_offline),
         ):
-            yield
+            if is_transformers_ge_v5:
+                with mock.patch("transformers.utils.hub.is_offline_mode", lambda: is_offline):
+                    yield
+            else:  # TODO remove if transformers <= 4 no longer supported
+                with mock.patch("transformers.utils.hub._is_offline_mode", is_offline):
+                    yield
     except Exception:
         # in case of an error we have to assume that we didn't access the model properly from the hub
         # for the first time, so the next call cannot be considered cached.
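
The v5 branch patches a function with a callable, while the legacy branch patches a module-level flag with a plain bool. A self-contained illustration of that difference with unittest.mock (the throwaway module mymod is hypothetical):

    import sys
    import types
    from unittest import mock

    # a toy module exposing both styles of "offline" indicator
    mymod = types.ModuleType("mymod")
    mymod.OFFLINE_FLAG = False        # module-level flag, read as a value
    mymod.is_offline = lambda: False  # function, called at runtime
    sys.modules["mymod"] = mymod

    with (
        mock.patch("mymod.OFFLINE_FLAG", True),        # replace the flag with a value
        mock.patch("mymod.is_offline", lambda: True),  # replace the function with a callable
    ):
        assert mymod.OFFLINE_FLAG is True
        assert mymod.is_offline() is True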
