
Commit 7f14c50

Paul Albert (PaulAlbert31) authored and committed
reverting licence change
rebase
1 parent e90b52a commit 7f14c50

File tree

10 files changed, +439 −55 lines changed


src/peft/tuners/randlora/bnb.py

Lines changed: 14 additions & 30 deletions
@@ -124,7 +124,7 @@ def unmerge(self) -> None:
 ).to(weight.device)
 state.reset_grads()

-def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
+def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
 """
 Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
 correct order to fit the target layers' dimensions
@@ -160,15 +160,15 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dt
 # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
 # we initialize these matrices with the largest required size for each dimension.
 # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-sliced_A = randlora_A[:, : self.n, :min_dim]
-sliced_B = randlora_B[:max_dim, : self.n, :]
+sliced_A = randlora_A[:, : self.num_bases, :min_dim]
+sliced_B = randlora_B[:max_dim, : self.num_bases, :]
 # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
 update_B = sliced_B.flatten(start_dim=1)
 update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
 if min_dim == self.in_features:
-return update_A, update_B, dtype
+return update_A, update_B

-return update_B.T, update_A.T, dtype
+return update_B.T, update_A.T

 def get_delta_weight(self, adapter) -> torch.Tensor:
 """
@@ -179,19 +179,11 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 The name of the adapter for which the delta weight should be computed.
 """

-update_B, update_A, dtype = self.get_scaled_bases(adapter)
+update_B, update_A = self.get_scaled_bases(adapter)

 update = update_B @ update_A
 output_tensor = transpose(update, self.fan_in_fan_out)

-if dtype != self.randlora_B[adapter].dtype:
-output_tensor = output_tensor.to(dtype=dtype)
-
-# cast back the weights
-# TODO: why?, taken from the VeRA implementation
-self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].data.to(dtype)
-self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].data.to(dtype)
-
 scaling = self.scaling[adapter]

 return output_tensor * scaling
@@ -223,7 +215,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 if active_adapter not in self.randlora_lambda.keys():
 continue

-update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
+update_B, update_A = self.get_scaled_bases(active_adapter)
 requires_conversion = not torch.is_autocast_enabled()
 if requires_conversion:
 expected_dtype = result.dtype
@@ -336,7 +328,7 @@ def unmerge(self) -> None:
 weight.device
 )

-def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
+def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
 """
 Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
 correct order to fit the target layers' dimensions
@@ -372,15 +364,15 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dt
 # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
 # we initialize these matrices with the largest required size for each dimension.
 # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-sliced_A = randlora_A[:, : self.n, :min_dim]
-sliced_B = randlora_B[:max_dim, : self.n, :]
+sliced_A = randlora_A[:, : self.num_bases, :min_dim]
+sliced_B = randlora_B[:max_dim, : self.num_bases, :]
 # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
 update_B = sliced_B.flatten(start_dim=1)
 update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
 if min_dim == self.in_features:
-return update_A, update_B, dtype
+return update_A, update_B

-return update_B.T, update_A.T, dtype
+return update_B.T, update_A.T

 def get_delta_weight(self, adapter) -> torch.Tensor:
 """
@@ -391,19 +383,11 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 The name of the adapter for which the delta weight should be computed.
 """

-update_B, update_A, dtype = self.get_scaled_bases(adapter)
+update_B, update_A = self.get_scaled_bases(adapter)

 update = update_B @ update_A
 output_tensor = transpose(update, self.fan_in_fan_out)

-if dtype != self.randlora_B[adapter].dtype:
-output_tensor = output_tensor.to(dtype=dtype)
-
-# cast back the weights
-# TODO: why?, taken from the VeRA implementation
-self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].to(dtype)
-self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].to(dtype)
-
 scaling = self.scaling[adapter]

 return output_tensor * scaling
@@ -421,7 +405,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 for active_adapter in self.active_adapters:
 if active_adapter not in self.randlora_lambda.keys():
 continue
-update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
+update_B, update_A = self.get_scaled_bases(active_adapter)
 requires_conversion = not torch.is_autocast_enabled()
 if requires_conversion:
 expected_dtype = result.dtype
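Note on the slicing above: the shared randlora_A / randlora_B pair is stored at the largest size any adapted layer needs, and get_scaled_bases slices out the submatrices for a given layer and orients them so the resulting delta has shape (out_features, in_features). Below is a minimal standalone sketch of that logic; the lambda/gamma scaling applied through UniqueBaseGrad is omitted, and the helper name and shapes are illustrative, not PEFT's internals.

import torch

def sliced_update_sketch(randlora_A, randlora_B, num_bases, in_features, out_features):
    # randlora_A: (rank, max_num_bases, global_min_dim), randlora_B: (global_max_dim, max_num_bases, rank)
    min_dim, max_dim = min(in_features, out_features), max(in_features, out_features)
    # Slice the shared bases down to this layer's dimensions.
    sliced_A = randlora_A[:, :num_bases, :min_dim]   # (rank, num_bases, min_dim)
    sliced_B = randlora_B[:max_dim, :num_bases, :]   # (max_dim, num_bases, rank)
    # Flatten over the rank / number-of-bases dimensions.
    update_A = sliced_A.flatten(end_dim=1)           # (rank * num_bases, min_dim)
    update_B = sliced_B.flatten(start_dim=1)         # (max_dim, num_bases * rank)
    # Return the pair oriented so that "second @ first" has shape (out_features, in_features).
    if min_dim == in_features:
        return update_A, update_B
    return update_B.T, update_A.T

# Shape check for both orientations (random values, purely illustrative):
rank, num_bases = 4, 3
for in_f, out_f in [(16, 32), (32, 16)]:
    A = torch.randn(rank, num_bases, min(in_f, out_f))
    B = torch.randn(max(in_f, out_f), num_bases, rank)
    first, second = sliced_update_sketch(A, B, num_bases, in_f, out_f)
    assert (second @ first).shape == (out_f, in_f)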

src/peft/tuners/randlora/config.py

Lines changed: 12 additions & 9 deletions
@@ -28,9 +28,9 @@ class RandLoraConfig(PeftConfig):
 Paper: https://arxiv.org/pdf/2502.00987.

 Args:
-r (`int`, *optional*, defaults to `32`):
-RandLora's random basis rank dimension. This parameter is inversely proportional to the amount of trainable
-parameters.
+r (`int`, *optional*, defaults to `10`):
+RandLora's random basis rank dimension. Contrary to LoRA, this parameter is inversely proportional to the number of
+trainable parameters: reducing it increases the number of trainable parameters.
 target_modules (`Union[List[str], str]`):
 The names of the modules to apply RandLora to. Only linear layers are supported.
 projection_prng_key (`int`):
@@ -41,11 +41,14 @@ class RandLoraConfig(PeftConfig):
 gamma diagonal matrices. This will increase the size of the checkpoint, but guarantee that we can
 reload the checkpoint on all system configurations. Defaults to `True`.
 sparse (`bool`):
-Whether to use sparse random bases as described in the RandLora paper. The current implementation is a
-proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
+Whether to use sparse random bases as described in the RandLora paper. The bases are ternary sparse bases (containing only -1, 0 and 1) where the attribution probability is 1/6 for -1 and 1 and 2/3 for 0.
+These sparse matrices are intended to enable matmul-free computation in the future, see https://arxiv.org/pdf/2406.02528v1.
+The current implementation is, however, a proof of concept where the sparseness is not used to improve speed or memory usage. Using sparse matrices typically does not reduce performance and can even help reduce overfitting.
+Defaults to `False`.
 very_sparse (`bool`):
-Whether to use very sparse random bases. The current implementation is a proof of concept where the
-sparseness is not used to improve speed or memory usage. Defaults to `False`.
+Whether to use highly sparse random bases as described in the RandLora paper. The very sparse bases are ternary sparse bases (containing only -1, 0 and 1): given a matrix with smallest dimension d, the attribution probability is 1/√d for -1 and 1 and 1 - 2/√d for 0.
+Using these sparse matrices can further reduce overfitting compared to the `sparse` alternative but will most likely decrease performance as a result. Use carefully.
+Defaults to `False`.
 randlora_dropout (`float`):
 The dropout probability for RandLora layers.
 randlora_alpha (`float`):
@@ -72,7 +75,7 @@ class RandLoraConfig(PeftConfig):
 pattern is not in the common layers pattern.
 """

-r: int = field(default=32, metadata={"help": "RandLora random basis rank"})
+r: int = field(default=10, metadata={"help": "RandLora random basis rank"})

 target_modules: Optional[Union[List[str], str]] = field(
 default=None,
@@ -129,7 +132,7 @@ class RandLoraConfig(PeftConfig):
 metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
 )
 randlora_alpha: int = field(
-default=64,
+default=20,
 metadata={
 "help": "Scaling coefficient in the adapter layers, typically 2 times the rank of the random bases."
 },
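For reference, the sampling distribution described in the new `sparse` / `very_sparse` docstrings can be written out as a tiny sampler. This is only an illustration of the stated probabilities; the helper below is hypothetical and is not how PEFT actually builds or seeds the projections (that is driven by `projection_prng_key` in model.py).

import math
import torch

def ternary_basis_sketch(shape, very_sparse=False, generator=None):
    # sparse:      P(-1) = P(+1) = 1/6,        P(0) = 2/3
    # very sparse: P(-1) = P(+1) = 1/sqrt(d),  P(0) = 1 - 2/sqrt(d), d = smallest dimension
    p = 1.0 / math.sqrt(min(shape)) if very_sparse else 1.0 / 6.0
    u = torch.rand(shape, generator=generator)
    basis = torch.zeros(shape)
    basis[u < p] = 1.0          # probability p of +1
    basis[u > 1.0 - p] = -1.0   # probability p of -1
    return basis

gen = torch.Generator().manual_seed(0)
base = ternary_basis_sketch((512, 768), very_sparse=True, generator=gen)
print(base.unique())            # tensor([-1., 0., 1.])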

src/peft/tuners/randlora/layer.py

Lines changed: 5 additions & 5 deletions
@@ -246,7 +246,7 @@ def unmerge(self) -> None:
 if active_adapter in self.randlora_lambda.keys():
 self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter)

-def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor, torch.dtype]:
+def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
 """
 Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct
 order to fit the target layers' dimensions
@@ -291,8 +291,8 @@ def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor, torch.d

 # Since update_A is applied on the smallest dimension, test whether update_A or update_B should applied first. This is done to reduce trainable parameters.
 if min_dim == self.in_features:
-return update_A, update_B, dtype
-return update_B.T, update_A.T, dtype
+return update_A, update_B
+return update_B.T, update_A.T

 def get_delta_weight(self, adapter) -> torch.Tensor:
 """
@@ -303,7 +303,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 The name of the adapter for which the delta weight should be computed.
 """

-update_B, update_A, dtype = self.get_scaled_bases(adapter)
+update_B, update_A = self.get_scaled_bases(adapter)

 update = (update_B.T @ update_A.T).T
 output_tensor = transpose(update, self.fan_in_fan_out)
@@ -326,7 +326,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 if active_adapter not in self.randlora_lambda.keys():
 continue
 dropout = self.randlora_dropout[active_adapter]
-update_B, update_A, _ = self.get_scaled_bases(active_adapter)
+update_B, update_A = self.get_scaled_bases(active_adapter)
 x = x.to(update_A.dtype)
 scaling = self.scaling[active_adapter]
 result = result + F.linear(F.linear(dropout(x), update_B), update_A) * scaling
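The forward line above applies the two low-rank factors in sequence instead of materializing the full delta weight. A quick standalone check (shapes and names here are illustrative, dropout omitted) that the factored form F.linear(F.linear(x, update_B), update_A) matches applying the dense update, mirroring what get_delta_weight builds:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
in_features, out_features, k = 16, 32, 12
x = torch.randn(5, in_features)
update_B = torch.randn(k, in_features)    # applied to the input first
update_A = torch.randn(out_features, k)   # projects up to the output dimension
scaling = 0.5

factored = F.linear(F.linear(x, update_B), update_A) * scaling   # factored application, as in forward
dense = F.linear(x, update_A @ update_B) * scaling               # full (out, in) delta weight
assert torch.allclose(factored, dense, rtol=1e-4, atol=1e-5)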

src/peft/tuners/randlora/model.py

Lines changed: 1 addition & 1 deletion
@@ -151,6 +151,7 @@ def _init_randlora_A_randlora_B_sparse(self, config: RandLoraConfig, adapter_nam

 # deterministic init of randlora_A and randlora_B if we know the key
 generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key)
+
 # The gamma matrix is applied on A meaning it can be unique (shared) accross the n scaling matrices.
 # We also set randlora_A as the smallest matrix to reduce trainable parameters.
 randlora_A = torch.rand((config.r, 1, min_dim), generator=generator)
@@ -369,7 +370,6 @@ def _create_new_module(randlora_config, randlora_A, randlora_B, adapter_name, ta
 eightbit_kwargs.update(
 {
 "has_fp16_weights": target_base_layer.state.has_fp16_weights,
-"memory_efficient_backward": target_base_layer.state.memory_efficient_backward,
 "threshold": target_base_layer.state.threshold,
 "index": target_base_layer.index,
 }
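The generator line shown as context in the first hunk is what makes the random projections reproducible: seeding a CPU torch.Generator with projection_prng_key yields the same shared bases on every load, which is why storing them in the checkpoint (save_projection) is optional. A small standalone illustration, with arbitrary shapes following the (r, 1, min_dim) pattern above:

import torch

projection_prng_key, r, min_dim = 0, 4, 16

gen1 = torch.Generator(device="cpu").manual_seed(projection_prng_key)
gen2 = torch.Generator(device="cpu").manual_seed(projection_prng_key)

# Same key, same device, same call order -> bitwise-identical bases.
a1 = torch.rand((r, 1, min_dim), generator=gen1)
a2 = torch.rand((r, 1, min_dim), generator=gen2)
assert torch.equal(a1, a2)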
