
Commit c25744c

tests passing and fixes following feedback
rebase
1 parent: b1aabde

File tree

11 files changed (+202, -192 lines)

src/peft/__init__.py

Lines changed: 4 additions & 2 deletions

@@ -87,6 +87,8 @@
     PromptEncoderReparameterizationType,
     PromptTuningConfig,
     PromptTuningInit,
+    RandLoraConfig,
+    RandLoraModel,
     TrainableTokensConfig,
     TrainableTokensModel,
     VBLoRAConfig,
@@ -95,8 +97,6 @@
     VeraModel,
     XLoraConfig,
     XLoraModel,
-    RandLoraConfig,
-    RandLoraModel,
     get_eva_state_dict,
     initialize_lora_eva_weights,
 )
@@ -180,6 +180,8 @@
     "PromptLearningConfig",
     "PromptTuningConfig",
     "PromptTuningInit",
+    "RandLoraConfig",
+    "RandLoraModel",
     "TaskType",
     "TrainableTokensConfig",
     "TrainableTokensModel",

src/peft/tuners/__init__.py

Lines changed: 4 additions & 3 deletions

@@ -39,11 +39,12 @@
 from .poly import PolyConfig, PolyModel
 from .prefix_tuning import PrefixEncoder, PrefixTuningConfig
 from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit
+from .randlora import RandLoraConfig, RandLoraModel
 from .trainable_tokens import TrainableTokensConfig, TrainableTokensModel
 from .vblora import VBLoRAConfig, VBLoRAModel
 from .vera import VeraConfig, VeraModel
 from .xlora import XLoraConfig, XLoraModel
-from .randlora import RandLoraConfig, RandLoraModel
+
 
 __all__ = [
     "AdaLoraConfig",
@@ -89,6 +90,8 @@
     "PromptEncoderReparameterizationType",
     "PromptTuningConfig",
     "PromptTuningInit",
+    "RandLoraConfig",
+    "RandLoraModel",
     "TrainableTokensConfig",
     "TrainableTokensModel",
     "VBLoRAConfig",
@@ -97,8 +100,6 @@
     "VeraModel",
     "XLoraConfig",
     "XLoraModel",
-    "RandLoraConfig",
-    "RandLoraModel",
     "get_eva_state_dict",
     "initialize_lora_eva_weights",
 ]

src/peft/tuners/randlora/__init__.py

Lines changed: 4 additions & 5 deletions

@@ -1,4 +1,4 @@
-# Copyright 2023-present the HuggingFace Inc. team.
+# Copyright 2025-present the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,11 +20,10 @@
 from .model import RandLoraModel
 
 
-__all__ = ["RandLoraConfig", "RandLoraLayer", "Linear", "RandLoraModel"]
+__all__ = ["Linear", "RandLoraConfig", "RandLoraLayer", "RandLoraModel"]
+
+register_peft_method(name="randlora", config_cls=RandLoraConfig, model_cls=RandLoraModel, prefix="randlora_")
 
-register_peft_method(
-    name="randlora", config_cls=RandLoraConfig, model_cls=RandLoraModel, prefix="randlora_"
-)
 
 def __getattr__(name):
     if (name == "Linear8bitLt") and is_bnb_available():
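Note on usage (not part of this commit): once `register_peft_method` wires the "randlora" name to `RandLoraConfig` and `RandLoraModel` with the `randlora_` parameter prefix, the adapter is driven through the generic PEFT entry points rather than by instantiating `RandLoraModel` directly. The sketch below assumes a standard `transformers` causal LM; the model name and target module names are placeholders.

from transformers import AutoModelForCausalLM

from peft import RandLoraConfig, get_peft_model

# Placeholder base model; any model with linear target modules works the same way.
base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# RandLoraConfig is resolved to RandLoraModel via the registration above,
# and the injected parameters carry the "randlora_" prefix in the state dict.
config = RandLoraConfig(target_modules=["q_proj", "v_proj"])
peft_model = get_peft_model(base_model, config)
peft_model.print_trainable_parameters()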

src/peft/tuners/randlora/bnb.py

Lines changed: 36 additions & 37 deletions

@@ -1,4 +1,4 @@
-# Copyright 2024-present the HuggingFace Inc. team.
+# Copyright 2025-present the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -124,10 +124,11 @@ def unmerge(self) -> None:
                 ).to(weight.device)
                 state.reset_grads()
 
-    def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
+    def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
         """
-        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct order
-        to fit the target layers' dimensions
+        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
+        correct order to fit the target layers' dimensions
+
         Args:
             adapter (str):
                 The name of the adapter for which the delta weight should be computed.
@@ -153,15 +154,15 @@ def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
         randlora_lambda = randlora_lambda.float()
         randlora_gamma = randlora_gamma.float()
 
-        #The trainable paramters are always applied to randlora_A, the smallest basis.
+        # The trainable paramters are always applied to randlora_A, the smallest basis.
         min_dim, max_dim = min(self.out_features, self.in_features), max(self.out_features, self.in_features)
 
         # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
         # we initialize these matrices with the largest required size for each dimension.
         # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-        sliced_A = randlora_A[:, : self.n, : min_dim]
-        sliced_B = randlora_B[: max_dim, : self.n, :]
-        #Flattening the matrices over the rank and number of bases dimensions is more memory efficient
+        sliced_A = randlora_A[:, : self.n, :min_dim]
+        sliced_B = randlora_B[:max_dim, : self.n, :]
+        # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
         update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
         if min_dim == self.in_features:
@@ -188,11 +189,11 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 
         # cast back the weights
         # TODO: why?, taken from the VeRA implementation
-        self.randlora_lambda[adapter].data = randlora_lambda.to(dtype)
-        self.randlora_gamma[adapter].data = randlora_gamma.to(dtype)
+        self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].data.to(dtype)
+        self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].data.to(dtype)
 
         scaling = self.scaling[adapter]
-
+
         return output_tensor * scaling
 
     def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
@@ -206,9 +207,9 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
             torch.Tensor: Output tensor after applying the RandLora adaptation.
 
         Note:
-            This method implements the RandLora-specific forward pass. It applies the shared projections (randlora_A and
-            randlora_B) along with the per-layer trainable parameters (lambda and gamma) to compute the adapter
-            output.
+            This method implements the RandLora-specific forward pass. It applies the shared projections
+            (randlora_A and randlora_B) along with the per-layer trainable parameters (lambda and gamma) to compute
+            the adapter output.
         """
         if self.disable_adapters:
             if self.merged:
@@ -221,7 +222,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
             for active_adapter in self.active_adapters:
                 if active_adapter not in self.randlora_lambda.keys():
                     continue
-
+
                 update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
                 requires_conversion = not torch.is_autocast_enabled()
                 if requires_conversion:
@@ -232,14 +233,12 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 
                 dropout = self.randlora_dropout[active_adapter]
                 x_temp = dropout(x.to(update_A.dtype))
-
-                adapter_output = torch.nn.functional.linear(
-                    torch.nn.functional.linear(x_temp, update_B), update_A
-                )
+
+                adapter_output = torch.nn.functional.linear(torch.nn.functional.linear(x_temp, update_B), update_A)
 
                 if requires_conversion:
                     adapter_output = adapter_output.to(expected_dtype)
-
+
                 scaling = self.scaling[active_adapter]
                 result = result + adapter_output * scaling
 
@@ -337,10 +336,11 @@ def unmerge(self) -> None:
                     weight.device
                 )
 
-    def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
+    def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
         """
-        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct order
-        to fit the target layers' dimensions
+        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
+        correct order to fit the target layers' dimensions
+
         Args:
             adapter (str):
                 The name of the adapter for which the delta weight should be computed.
@@ -366,15 +366,15 @@ def get_scaled_bases(self, adapter) -> List[torch.Tensor, torch.Tensor, torch.dtype]:
         randlora_lambda = randlora_lambda.float()
         randlora_gamma = randlora_gamma.float()
 
-        #The trainable paramters are always applied to randlora_A, the smallest basis.
+        # The trainable paramters are always applied to randlora_A, the smallest basis.
         min_dim, max_dim = min(self.out_features, self.in_features), max(self.out_features, self.in_features)
 
         # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
         # we initialize these matrices with the largest required size for each dimension.
-        # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-        sliced_A = randlora_A[:, : self.n, : min_dim]
-        sliced_B = randlora_B[: max_dim, : self.n, :]
-        #Flattening the matrices over the rank and number of bases dimensions is more memory efficient
+        # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
+        sliced_A = randlora_A[:, : self.n, :min_dim]
+        sliced_B = randlora_B[:max_dim, : self.n, :]
+        # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
         update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
         if min_dim == self.in_features:
@@ -385,6 +385,7 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
     def get_delta_weight(self, adapter) -> torch.Tensor:
         """
         Compute the delta weight for the given adapter.
+
         Args:
             adapter (str):
                 The name of the adapter for which the delta weight should be computed.
@@ -400,13 +401,13 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 
         # cast back the weights
         # TODO: why?, taken from the VeRA implementation
-        self.randlora_lambda[adapter].data = randlora_lambda.to(dtype)
-        self.randlora_gamma[adapter].data = randlora_gamma.to(dtype)
+        self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].to(dtype)
+        self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].to(dtype)
 
         scaling = self.scaling[adapter]
-
+
         return output_tensor * scaling
-
+
     def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         if self.disable_adapters:
             if self.merged:
@@ -419,7 +420,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
             result = result.clone()
             for active_adapter in self.active_adapters:
                 if active_adapter not in self.randlora_lambda.keys():
-                        continue
+                    continue
                 update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
                 requires_conversion = not torch.is_autocast_enabled()
                 if requires_conversion:
@@ -431,16 +432,14 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
                 dropout = self.randlora_dropout[active_adapter]
                 x_temp = dropout(x.to(update_A.dtype))
 
-                adapter_output = torch.nn.functional.linear(
-                    torch.nn.functional.linear(x_temp, update_B), update_A
-                )
+                adapter_output = torch.nn.functional.linear(torch.nn.functional.linear(x_temp, update_B), update_A)
 
                 if requires_conversion:
                     adapter_output = adapter_output.to(expected_dtype)
 
                 scaling = self.scaling[active_adapter]
                 result = result + adapter_output * scaling
-
+
         # Ensure the output tensor has the same dtype as the input tensor
         return result.to(x.dtype)
 
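Aside on the math in `get_scaled_bases` above (illustrative only, not part of the diff): the shared random bases are sliced to the adapted layer's dimensions, the smaller basis is scaled by the per-layer lambda and gamma parameters, the (rank, number-of-bases) dimensions are flattened away, and the adapter output is two chained linear projections. The standalone sketch below mirrors that flow with made-up shapes and a plain element-wise scaling; the real code routes the scaling through the custom `UniqueBaseGrad` autograd function and derives every dimension from the wrapped layer.

import torch
import torch.nn.functional as F

# Illustrative shapes only.
in_features, out_features = 768, 3072
rank, num_bases = 32, 6
min_dim, max_dim = min(in_features, out_features), max(in_features, out_features)

# Frozen random bases, shared across all adapted layers.
basis_A = torch.randn(rank, num_bases, min_dim) / rank   # smallest basis
basis_B = torch.randn(max_dim, num_bases, rank)          # largest basis

# Per-layer trainable scaling parameters (lambda and gamma).
randlora_lambda = torch.randn(rank, num_bases, requires_grad=True)
randlora_gamma = torch.full((num_bases, min_dim), 1.0 / max_dim, requires_grad=True)

# Scale the smallest basis, then flatten (rank, num_bases) so the update
# behaves like an ordinary pair of low-rank matrices.
scaled_A = basis_A * randlora_lambda[:, :, None] * randlora_gamma[None, :, :]
update_A = scaled_A.flatten(end_dim=1)    # (rank * num_bases, min_dim)
update_B = basis_B.flatten(start_dim=1)   # (max_dim, num_bases * rank)

x = torch.randn(4, in_features)
# Here min_dim == in_features, so update_A projects down and update_B projects back up.
adapter_output = F.linear(F.linear(x, update_A), update_B)   # shape (4, out_features)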

src/peft/tuners/randlora/config.py

Lines changed: 31 additions & 20 deletions

@@ -1,4 +1,4 @@
-# Copyright 2023-present the HuggingFace Inc. team.
+# Copyright 2025-present the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,40 +13,44 @@
 # limitations under the License.
 
 import warnings
-import math
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
 
 from peft.config import PeftConfig
 from peft.utils import PeftType
 
+
 @dataclass
 class RandLoraConfig(PeftConfig):
     """
     This is the configuration class to store the configuration of a [`RandLoraModel`].
 
-    Paper: {}.
+    Paper: https://arxiv.org/pdf/2502.00987.
 
     Args:
         r (`int`, *optional*, defaults to `32`):
-            RandLora's random basis rank dimension. This parameter is inversely proportional to the amount of trainable parameters.
+            RandLora's random basis rank dimension. This parameter is inversely proportional to the amount of trainable
+            parameters.
         target_modules (`Union[List[str], str]`):
            The names of the modules to apply RandLora to. Only linear layers are supported.
         projection_prng_key (`int`):
-            RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a checkpoint
-            that did not include these projections. Defaults to `int(math.exp(1)*3.1415*1000)`.
+            RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a
+            checkpoint that did not include these projections. Defaults to `0`.
         save_projection (`bool`):
-            Whether to save the global basis_A / basis_B random basis in the state dict alongside per layer lambda / gamma diagonal matrices.
-            weights. This will increase the size of the checkpoint, but guarantee that we can reload the checkpoint on
-            all system configurations. Defaults to `True`.
+            Whether to save the global basis_A / basis_B random basis in the state dict alongside per layer lambda /
+            gamma diagonal matrices. This will increase the size of the checkpoint, but guarantee that we can
+            reload the checkpoint on all system configurations. Defaults to `True`.
         sparse (`bool`):
-            Whether to use sparse random bases as described in the RandLora paper. The current implementation is a proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
+            Whether to use sparse random bases as described in the RandLora paper. The current implementation is a
+            proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
         very_sparse (`bool`):
-            Whether to use very sparse random bases. The current implementation is a proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
+            Whether to use very sparse random bases. The current implementation is a proof of concept where the
+            sparseness is not used to improve speed or memory usage. Defaults to `False`.
         randlora_dropout (`float`):
            The dropout probability for RandLora layers.
         randlora_alpha (`float`):
-            The scaling coefficient for RandLora layers, this would be typically be the same as LoRA, e.g. 2 times the rank.
+            The scaling coefficient for RandLora layers, this would be typically be the same as LoRA, e.g. 2 times the
+            rank.
         fan_in_fan_out (`bool`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
@@ -57,12 +61,12 @@ class RandLoraConfig(PeftConfig):
         modules_to_save (`List[str]`):
            List of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint.
         init_weights (`bool`):
-            Whether to initialize the weights of the RandLora layers with their default initialization. Don't change this
-            setting, except if you know exactly what you're doing.
+            Whether to initialize the weights of the RandLora layers with their default initialization. Don't change
+            this setting, except if you know exactly what you're doing.
         layers_to_transform (`Union[List[int],int]`):
-            The layer indexes to transform, if this argument is specified, it will apply the RandLora transformations on
-            the layer indexes that are specified in this list. If a single integer is passed, it will apply the RandLora
-            transformations on the layer at this index.
+            The layer indexes to transform, if this argument is specified, it will apply the RandLora transformations
+            on the layer indexes that are specified in this list. If a single integer is passed, it will apply the
+            RandLora transformations on the layer at this index.
         layers_pattern (`str`):
            The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
            pattern is not in the common layers pattern.
@@ -81,7 +85,7 @@ class RandLoraConfig(PeftConfig):
         },
     )
     projection_prng_key: int = field(
-        default=int(math.exp(1)*3.1415*1000),
+        default=0,
         metadata={
             "help": (
                 "RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a "
@@ -124,8 +128,15 @@ class RandLoraConfig(PeftConfig):
         default=False,
         metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
     )
-    randlora_alpha: int = field(default=64, metadata={"help": "Scaling coefficient in the adapter layers, typically 2 times the rank of the random bases."})
-    bias: str = field(default="none", metadata={"help": "Bias type for RandLora. Can be 'none', 'all' or 'randlora_only'"})
+    randlora_alpha: int = field(
+        default=64,
+        metadata={
+            "help": "Scaling coefficient in the adapter layers, typically 2 times the rank of the random bases."
+        },
+    )
+    bias: str = field(
+        default="none", metadata={"help": "Bias type for RandLora. Can be 'none', 'all' or 'randlora_only'"}
+    )
     modules_to_save: Optional[List[str]] = field(
         default=None,
         metadata={
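To make the reworked config fields concrete, a hedged construction sketch follows. It only uses fields documented in the docstring above (`r`, `randlora_alpha`, `target_modules`, `randlora_dropout`, `save_projection`); the chosen values and target module names are placeholders, and the defaults come from the dataclass itself (r=32, randlora_alpha=64, projection_prng_key=0, save_projection=True).

from peft import RandLoraConfig

config = RandLoraConfig(
    r=32,                                 # rank of the shared random bases
    randlora_alpha=64,                    # scaling coefficient, typically 2 * r
    target_modules=["q_proj", "v_proj"],  # placeholder linear layer names
    randlora_dropout=0.05,
    save_projection=True,                 # keep basis_A / basis_B in the checkpoint
)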
