
Commit c25744c

tests passing and fixes following feedback
rebase
1 parent: b1aabde

File tree

11 files changed (+202, -192 lines)

src/peft/__init__.py

Lines changed: 4 additions & 2 deletions

@@ -87,6 +87,8 @@
     PromptEncoderReparameterizationType,
     PromptTuningConfig,
     PromptTuningInit,
+    RandLoraConfig,
+    RandLoraModel,
     TrainableTokensConfig,
     TrainableTokensModel,
     VBLoRAConfig,
@@ -95,8 +97,6 @@
     VeraModel,
     XLoraConfig,
     XLoraModel,
-    RandLoraConfig,
-    RandLoraModel,
     get_eva_state_dict,
     initialize_lora_eva_weights,
 )
@@ -180,6 +180,8 @@
     "PromptLearningConfig",
     "PromptTuningConfig",
     "PromptTuningInit",
+    "RandLoraConfig",
+    "RandLoraModel",
     "TaskType",
     "TrainableTokensConfig",
     "TrainableTokensModel",

src/peft/tuners/__init__.py

Lines changed: 4 additions & 3 deletions

@@ -39,11 +39,12 @@
 from .poly import PolyConfig, PolyModel
 from .prefix_tuning import PrefixEncoder, PrefixTuningConfig
 from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit
+from .randlora import RandLoraConfig, RandLoraModel
 from .trainable_tokens import TrainableTokensConfig, TrainableTokensModel
 from .vblora import VBLoRAConfig, VBLoRAModel
 from .vera import VeraConfig, VeraModel
 from .xlora import XLoraConfig, XLoraModel
-from .randlora import RandLoraConfig, RandLoraModel
+
 
 __all__ = [
     "AdaLoraConfig",
@@ -89,6 +90,8 @@
     "PromptEncoderReparameterizationType",
     "PromptTuningConfig",
     "PromptTuningInit",
+    "RandLoraConfig",
+    "RandLoraModel",
     "TrainableTokensConfig",
     "TrainableTokensModel",
     "VBLoRAConfig",
@@ -97,8 +100,6 @@
     "VeraModel",
     "XLoraConfig",
     "XLoraModel",
-    "RandLoraConfig",
-    "RandLoraModel",
     "get_eva_state_dict",
     "initialize_lora_eva_weights",
 ]

src/peft/tuners/randlora/__init__.py

Lines changed: 4 additions & 5 deletions

@@ -1,4 +1,4 @@
-# Copyright 2023-present the HuggingFace Inc. team.
+# Copyright 2025-present the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,11 +20,10 @@
 from .model import RandLoraModel
 
 
-__all__ = ["RandLoraConfig", "RandLoraLayer", "Linear", "RandLoraModel"]
+__all__ = ["Linear", "RandLoraConfig", "RandLoraLayer", "RandLoraModel"]
+
+register_peft_method(name="randlora", config_cls=RandLoraConfig, model_cls=RandLoraModel, prefix="randlora_")
 
-register_peft_method(
-    name="randlora", config_cls=RandLoraConfig, model_cls=RandLoraModel, prefix="randlora_"
-)
 
 def __getattr__(name):
     if (name == "Linear8bitLt") and is_bnb_available():
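Note on usage (not part of this commit): once `register_peft_method` wires the "randlora" name to `RandLoraConfig` and `RandLoraModel` with the `randlora_` parameter prefix, the adapter is driven through the generic PEFT entry points rather than by instantiating `RandLoraModel` directly. The sketch below assumes a standard `transformers` causal LM; the model name and target module names are placeholders.

from transformers import AutoModelForCausalLM

from peft import RandLoraConfig, get_peft_model

# Placeholder base model; any model with linear target modules works the same way.
base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# RandLoraConfig is resolved to RandLoraModel via the registration above,
# and the injected parameters carry the "randlora_" prefix in the state dict.
config = RandLoraConfig(target_modules=["q_proj", "v_proj"])
peft_model = get_peft_model(base_model, config)
peft_model.print_trainable_parameters()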

src/peft/tuners/randlora/bnb.py

Lines changed: 36 additions & 37 deletions

@@ -1,4 +1,4 @@
-# Copyright 2024-present the HuggingFace Inc. team.
+# Copyright 2025-present the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -124,10 +124,11 @@ def unmerge(self) -> None:
                 ).to(weight.device)
                 state.reset_grads()
 
-    def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
+    def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
         """
-        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct order
-        to fit the target layers' dimensions
+        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
+        correct order to fit the target layers' dimensions
+
         Args:
             adapter (str):
                 The name of the adapter for which the delta weight should be computed.
@@ -153,15 +154,15 @@ def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
         randlora_lambda = randlora_lambda.float()
         randlora_gamma = randlora_gamma.float()
 
-        #The trainable paramters are always applied to randlora_A, the smallest basis.
+        # The trainable paramters are always applied to randlora_A, the smallest basis.
         min_dim, max_dim = min(self.out_features, self.in_features), max(self.out_features, self.in_features)
 
         # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
         # we initialize these matrices with the largest required size for each dimension.
         # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-        sliced_A = randlora_A[:, : self.n, : min_dim]
-        sliced_B = randlora_B[: max_dim, : self.n, :]
-        #Flattening the matrices over the rank and number of bases dimensions is more memory efficient
+        sliced_A = randlora_A[:, : self.n, :min_dim]
+        sliced_B = randlora_B[:max_dim, : self.n, :]
+        # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
         update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
         if min_dim == self.in_features:
@@ -188,11 +189,11 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 
         # cast back the weights
         # TODO: why?, taken from the VeRA implementation
-        self.randlora_lambda[adapter].data = randlora_lambda.to(dtype)
-        self.randlora_gamma[adapter].data = randlora_gamma.to(dtype)
+        self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].data.to(dtype)
+        self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].data.to(dtype)
 
         scaling = self.scaling[adapter]
-
+
         return output_tensor * scaling
 
     def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
@@ -206,9 +207,9 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
             torch.Tensor: Output tensor after applying the RandLora adaptation.
 
         Note:
-            This method implements the RandLora-specific forward pass. It applies the shared projections (randlora_A and
-            randlora_B) along with the per-layer trainable parameters (lambda and gamma) to compute the adapter
-            output.
+            This method implements the RandLora-specific forward pass. It applies the shared projections
+            (randlora_A and randlora_B) along with the per-layer trainable parameters (lambda and gamma) to compute
+            the adapter output.
         """
         if self.disable_adapters:
             if self.merged:
@@ -221,7 +222,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
             for active_adapter in self.active_adapters:
                 if active_adapter not in self.randlora_lambda.keys():
                     continue
-
+
                 update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
                 requires_conversion = not torch.is_autocast_enabled()
                 if requires_conversion:
@@ -232,14 +233,12 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 
                 dropout = self.randlora_dropout[active_adapter]
                 x_temp = dropout(x.to(update_A.dtype))
-
-                adapter_output = torch.nn.functional.linear(
-                    torch.nn.functional.linear(x_temp, update_B), update_A
-                )
+
+                adapter_output = torch.nn.functional.linear(torch.nn.functional.linear(x_temp, update_B), update_A)
 
                 if requires_conversion:
                     adapter_output = adapter_output.to(expected_dtype)
-
+
                 scaling = self.scaling[active_adapter]
                 result = result + adapter_output * scaling
 
@@ -337,10 +336,11 @@ def unmerge(self) -> None:
                     weight.device
                 )
 
-    def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
+    def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
         """
-        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct order
-        to fit the target layers' dimensions
+        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
+        correct order to fit the target layers' dimensions
+
         Args:
             adapter (str):
                 The name of the adapter for which the delta weight should be computed.
@@ -366,15 +366,15 @@ def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
         randlora_lambda = randlora_lambda.float()
         randlora_gamma = randlora_gamma.float()
 
-        #The trainable paramters are always applied to randlora_A, the smallest basis.
+        # The trainable paramters are always applied to randlora_A, the smallest basis.
         min_dim, max_dim = min(self.out_features, self.in_features), max(self.out_features, self.in_features)
 
         # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
         # we initialize these matrices with the largest required size for each dimension.
-        # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-        sliced_A = randlora_A[:, : self.n, : min_dim]
-        sliced_B = randlora_B[: max_dim, : self.n, :]
-        #Flattening the matrices over the rank and number of bases dimensions is more memory efficient
+        # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
+        sliced_A = randlora_A[:, : self.n, :min_dim]
+        sliced_B = randlora_B[:max_dim, : self.n, :]
+        # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
         update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
         if min_dim == self.in_features:
@@ -385,6 +385,7 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
     def get_delta_weight(self, adapter) -> torch.Tensor:
         """
         Compute the delta weight for the given adapter.
+
         Args:
             adapter (str):
                 The name of the adapter for which the delta weight should be computed.
@@ -400,13 +401,13 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 
         # cast back the weights
         # TODO: why?, taken from the VeRA implementation
-        self.randlora_lambda[adapter].data = randlora_lambda.to(dtype)
-        self.randlora_gamma[adapter].data = randlora_gamma.to(dtype)
+        self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].to(dtype)
+        self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].to(dtype)
 
         scaling = self.scaling[adapter]
-
+
         return output_tensor * scaling
-
+
     def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         if self.disable_adapters:
             if self.merged:
@@ -419,7 +420,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
             result = result.clone()
             for active_adapter in self.active_adapters:
                 if active_adapter not in self.randlora_lambda.keys():
-                        continue
+                    continue
                 update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
                 requires_conversion = not torch.is_autocast_enabled()
                 if requires_conversion:
@@ -431,16 +432,14 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
                 dropout = self.randlora_dropout[active_adapter]
                 x_temp = dropout(x.to(update_A.dtype))
 
-                adapter_output = torch.nn.functional.linear(
-                    torch.nn.functional.linear(x_temp, update_B), update_A
-                )
+                adapter_output = torch.nn.functional.linear(torch.nn.functional.linear(x_temp, update_B), update_A)
 
                 if requires_conversion:
                     adapter_output = adapter_output.to(expected_dtype)
 
                 scaling = self.scaling[active_adapter]
                 result = result + adapter_output * scaling
-
+
         # Ensure the output tensor has the same dtype as the input tensor
         return result.to(x.dtype)
 
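Aside on the math in `get_scaled_bases` above (illustrative only, not part of the diff): the shared random bases are sliced to the adapted layer's dimensions, the smaller basis is scaled by the per-layer lambda and gamma parameters, the (rank, number-of-bases) dimensions are flattened away, and the adapter output is two chained linear projections. The standalone sketch below mirrors that flow with made-up shapes and a plain element-wise scaling; the real code routes the scaling through the custom `UniqueBaseGrad` autograd function and derives every dimension from the wrapped layer.

import torch
import torch.nn.functional as F

# Illustrative shapes only.
in_features, out_features = 768, 3072
rank, num_bases = 32, 6
min_dim, max_dim = min(in_features, out_features), max(in_features, out_features)

# Frozen random bases, shared across all adapted layers.
basis_A = torch.randn(rank, num_bases, min_dim) / rank   # smallest basis
basis_B = torch.randn(max_dim, num_bases, rank)          # largest basis

# Per-layer trainable scaling parameters (lambda and gamma).
randlora_lambda = torch.randn(rank, num_bases, requires_grad=True)
randlora_gamma = torch.full((num_bases, min_dim), 1.0 / max_dim, requires_grad=True)

# Scale the smallest basis, then flatten (rank, num_bases) so the update
# behaves like an ordinary pair of low-rank matrices.
scaled_A = basis_A * randlora_lambda[:, :, None] * randlora_gamma[None, :, :]
update_A = scaled_A.flatten(end_dim=1)    # (rank * num_bases, min_dim)
update_B = basis_B.flatten(start_dim=1)   # (max_dim, num_bases * rank)

x = torch.randn(4, in_features)
# Here min_dim == in_features, so update_A projects down and update_B projects back up.
adapter_output = F.linear(F.linear(x, update_A), update_B)   # shape (4, out_features)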

src/peft/tuners/randlora/config.py

Lines changed: 31 additions & 20 deletions

@@ -1,4 +1,4 @@
-# Copyright 2023-present the HuggingFace Inc. team.
+# Copyright 2025-present the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,40 +13,44 @@
 # limitations under the License.
 
 import warnings
-import math
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
 
 from peft.config import PeftConfig
 from peft.utils import PeftType
 
+
 @dataclass
 class RandLoraConfig(PeftConfig):
     """
     This is the configuration class to store the configuration of a [`RandLoraModel`].
 
-    Paper: {}.
+    Paper: https://arxiv.org/pdf/2502.00987.
 
     Args:
         r (`int`, *optional*, defaults to `32`):
-            RandLora's random basis rank dimension. This parameter is inversely proportional to the amount of trainable parameters.
+            RandLora's random basis rank dimension. This parameter is inversely proportional to the amount of trainable
+            parameters.
         target_modules (`Union[List[str], str]`):
            The names of the modules to apply RandLora to. Only linear layers are supported.
         projection_prng_key (`int`):
-            RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a checkpoint
-            that did not include these projections. Defaults to `int(math.exp(1)*3.1415*1000)`.
+            RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a
+            checkpoint that did not include these projections. Defaults to `0`.
         save_projection (`bool`):
-            Whether to save the global basis_A / basis_B random basis in the state dict alongside per layer lambda / gamma diagonal matrices.
-            weights. This will increase the size of the checkpoint, but guarantee that we can reload the checkpoint on
-            all system configurations. Defaults to `True`.
+            Whether to save the global basis_A / basis_B random basis in the state dict alongside per layer lambda /
+            gamma diagonal matrices. This will increase the size of the checkpoint, but guarantee that we can
+            reload the checkpoint on all system configurations. Defaults to `True`.
         sparse (`bool`):
-            Whether to use sparse random bases as described in the RandLora paper. The current implementation is a proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
+            Whether to use sparse random bases as described in the RandLora paper. The current implementation is a
+            proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
         very_sparse (`bool`):
-            Whether to use very sparse random bases. The current implementation is a proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
+            Whether to use very sparse random bases. The current implementation is a proof of concept where the
+            sparseness is not used to improve speed or memory usage. Defaults to `False`.
         randlora_dropout (`float`):
            The dropout probability for RandLora layers.
         randlora_alpha (`float`):
-            The scaling coefficient for RandLora layers, this would be typically be the same as LoRA, e.g. 2 times the rank.
+            The scaling coefficient for RandLora layers, this would be typically be the same as LoRA, e.g. 2 times the
+            rank.
         fan_in_fan_out (`bool`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
@@ -57,12 +61,12 @@ class RandLoraConfig(PeftConfig):
         modules_to_save (`List[str]`):
            List of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint.
         init_weights (`bool`):
-            Whether to initialize the weights of the RandLora layers with their default initialization. Don't change this
-            setting, except if you know exactly what you're doing.
+            Whether to initialize the weights of the RandLora layers with their default initialization. Don't change
+            this setting, except if you know exactly what you're doing.
         layers_to_transform (`Union[List[int],int]`):
-            The layer indexes to transform, if this argument is specified, it will apply the RandLora transformations on
-            the layer indexes that are specified in this list. If a single integer is passed, it will apply the RandLora
-            transformations on the layer at this index.
+            The layer indexes to transform, if this argument is specified, it will apply the RandLora transformations
+            on the layer indexes that are specified in this list. If a single integer is passed, it will apply the
+            RandLora transformations on the layer at this index.
         layers_pattern (`str`):
            The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
            pattern is not in the common layers pattern.
@@ -81,7 +85,7 @@ class RandLoraConfig(PeftConfig):
         },
     )
     projection_prng_key: int = field(
-        default=int(math.exp(1)*3.1415*1000),
+        default=0,
         metadata={
             "help": (
                 "RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a "
@@ -124,8 +128,15 @@ class RandLoraConfig(PeftConfig):
         default=False,
         metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
     )
-    randlora_alpha: int = field(default=64, metadata={"help": "Scaling coefficient in the adapter layers, typically 2 times the rank of the random bases."})
-    bias: str = field(default="none", metadata={"help": "Bias type for RandLora. Can be 'none', 'all' or 'randlora_only'"})
+    randlora_alpha: int = field(
+        default=64,
+        metadata={
+            "help": "Scaling coefficient in the adapter layers, typically 2 times the rank of the random bases."
+        },
+    )
+    bias: str = field(
+        default="none", metadata={"help": "Bias type for RandLora. Can be 'none', 'all' or 'randlora_only'"}
+    )
     modules_to_save: Optional[List[str]] = field(
         default=None,
         metadata={
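To make the reworked config fields concrete, a hedged construction sketch follows. It only uses fields documented in the docstring above (`r`, `randlora_alpha`, `target_modules`, `randlora_dropout`, `save_projection`); the chosen values and target module names are placeholders, and the defaults come from the dataclass itself (r=32, randlora_alpha=64, projection_prng_key=0, save_projection=True).

from peft import RandLoraConfig

config = RandLoraConfig(
    r=32,                                 # rank of the shared random bases
    randlora_alpha=64,                    # scaling coefficient, typically 2 * r
    target_modules=["q_proj", "v_proj"],  # placeholder linear layer names
    randlora_dropout=0.05,
    save_projection=True,                 # keep basis_A / basis_B in the checkpoint
)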
