
Commit 7942a9c

better hyper-parameters and conformity with new dtype casts

1 parent a1f0539

File tree

7 files changed (+62, -39 lines)


src/peft/tuners/randlora/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,4 +1,5 @@
 # Copyright 2025-present the HuggingFace Inc. team.
+
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
```

src/peft/tuners/randlora/bnb.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -162,6 +162,7 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
         # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
         sliced_A = randlora_A[:, : self.num_bases, :min_dim]
         sliced_B = randlora_B[:max_dim, : self.num_bases, :]
+
         # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
         update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
@@ -216,6 +217,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
                 continue

             update_B, update_A = self.get_scaled_bases(active_adapter)
+
             requires_conversion = not torch.is_autocast_enabled()
             if requires_conversion:
                 expected_dtype = result.dtype
@@ -382,7 +384,6 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
             adapter (str):
                 The name of the adapter for which the delta weight should be computed.
         """
-
         update_B, update_A = self.get_scaled_bases(adapter)

         update = update_B @ update_A
@@ -405,7 +406,9 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         for active_adapter in self.active_adapters:
             if active_adapter not in self.randlora_lambda.keys():
                 continue
+
             update_B, update_A = self.get_scaled_bases(active_adapter)
+
             requires_conversion = not torch.is_autocast_enabled()
             if requires_conversion:
                 expected_dtype = result.dtype
```
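The two `forward` hunks above share a dtype-conversion pattern: when autocast is not active, the low-rank update is computed in the dtype of the random bases and cast back to the dtype of the quantized base layer's output before being added. A minimal, self-contained sketch of that pattern follows; the toy tensors, shapes, and the identity matrix standing in for the quantized base layer are assumptions for illustration, not PEFT code.

```python
import torch

x = torch.randn(2, 8, dtype=torch.float16)         # input in half precision
update_A = torch.randn(8, 4, dtype=torch.float32)  # stand-ins for the scaled random bases
update_B = torch.randn(4, 8, dtype=torch.float32)
scaling = 640 / 32                                  # randlora_alpha / r

result = x @ torch.eye(8, dtype=x.dtype)            # stand-in for the quantized base layer output

requires_conversion = not torch.is_autocast_enabled()
if requires_conversion:
    expected_dtype = result.dtype
    x = x.to(update_A.dtype)                        # compute the update in the bases' dtype

output = (x @ update_A @ update_B) * scaling
if requires_conversion:
    output = output.to(expected_dtype)              # cast back before adding to the result
result = result + output
```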

src/peft/tuners/randlora/config.py

Lines changed: 15 additions & 16 deletions
```diff
@@ -14,7 +14,7 @@

 import warnings
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union

 from peft.config import PeftConfig
 from peft.utils import PeftType
@@ -28,10 +28,10 @@ class RandLoraConfig(PeftConfig):
     Paper: https://arxiv.org/pdf/2502.00987.

     Args:
-        r (`int`, *optional*, defaults to `10`):
+        r (`int`, *optional*, defaults to `32`):
             RandLora's random basis rank dimension. Contrary to Lora, this parameter is inversely proportional to the amount of trainable
             parameters as reducing it increases trainable parameters.
-        target_modules (`Union[List[str], str]`):
+        target_modules (`Union[list[str], str]`):
             The names of the modules to apply RandLora to. Only linear layers are supported.
         projection_prng_key (`int`):
             RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a
@@ -52,21 +52,20 @@ class RandLoraConfig(PeftConfig):
         randlora_dropout (`float`):
             The dropout probability for RandLora layers.
         randlora_alpha (`float`):
-            The scaling coefficient for RandLora layers, this would be typically be the same as LoRA, e.g. 2 times the
-            rank.
+            The scaling coefficient for RandLora layers, this would typically be 20 times the rank.
         fan_in_fan_out (`bool`):
             Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
             `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
         bias (`str`):
             Bias type. Can be 'none', 'all' or 'randlora_only'. If 'all' or 'randlora_only', the corresponding biases
             will be updated during training. Be aware that this means that, even when disabling the adapters, the model
             will not produce the same output as the base model would have without adaptation.
-        modules_to_save (`List[str]`):
-            List of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint.
+        modules_to_save (`list[str]`):
+            list of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint.
         init_weights (`bool`):
             Whether to initialize the weights of the RandLora layers with their default initialization. Don't change
             this setting, except if you know exactly what you're doing.
-        layers_to_transform (`Union[List[int],int]`):
+        layers_to_transform (`Union[list[int],int]`):
            The layer indexes to transform, if this argument is specified, it will apply the RandLora transformations
            on the layer indexes that are specified in this list. If a single integer is passed, it will apply the
            RandLora transformations on the layer at this index.
@@ -75,13 +74,13 @@ class RandLoraConfig(PeftConfig):
         pattern is not in the common layers pattern.
     """

-    r: int = field(default=10, metadata={"help": "RandLora random basis rank"})
+    r: int = field(default=32, metadata={"help": "RandLora random basis rank"})

-    target_modules: Optional[Union[List[str], str]] = field(
+    target_modules: Optional[Union[list[str], str]] = field(
         default=None,
         metadata={
             "help": (
-                "List of module names or regex expression of the module names to replace with RandLora."
+                "list of module names or regex expression of the module names to replace with RandLora."
                 "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                 "Only linear layers are supported."
             )
@@ -132,19 +131,19 @@ class RandLoraConfig(PeftConfig):
         metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
     )
     randlora_alpha: int = field(
-        default=20,
+        default=640,
         metadata={
-            "help": "Scaling coefficient in the adapter layers, typically 2 times the rank of the random bases."
+            "help": "Scaling coefficient in the adapter layers, typically 20 times the rank of the random bases."
         },
     )
     bias: str = field(
         default="none", metadata={"help": "Bias type for RandLora. Can be 'none', 'all' or 'randlora_only'"}
     )
-    modules_to_save: Optional[List[str]] = field(
+    modules_to_save: Optional[list[str]] = field(
         default=None,
         metadata={
             "help": (
-                "List of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint. For"
+                "list of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint. For"
                 " example, in Sequence Classification or Token Classification tasks, the final layer"
                 " `classifier/score` are randomly initialized and as such need to be trainable and saved."
             )
@@ -159,7 +158,7 @@ class RandLoraConfig(PeftConfig):
             ),
         },
     )
-    layers_to_transform: Optional[Union[List[int], int]] = field(
+    layers_to_transform: Optional[Union[list[int], int]] = field(
         default=None,
         metadata={
             "help": (
```

src/peft/tuners/randlora/layer.py

Lines changed: 20 additions & 12 deletions
```diff
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import math
 import warnings
-from typing import List, Optional
+from typing import Optional

 import torch
 import torch.nn as nn
@@ -72,6 +71,9 @@ def __init__(self, base_layer: nn.Module, **kwargs):
         self._disable_adapters = False
         self.merged_adapters = []

+        # flag to enable/disable casting of input to weight dtype during forward call
+        self.cast_input_dtype_enabled = True
+
         base_layer = self.get_base_layer()
         if isinstance(base_layer, nn.Linear):
             in_features, out_features = base_layer.in_features, base_layer.out_features
@@ -118,7 +120,7 @@ def update_layer(
             requires_grad=True,
         )

-        self.scaling[adapter_name] = randlora_alpha / r / math.sqrt(self.num_bases)
+        self.scaling[adapter_name] = randlora_alpha / r

         # non trainable references to randlora_A/B buffers
         self.randlora_A = randlora_A
@@ -153,6 +155,7 @@ def update_layer(
         )
         if randlora_A_param.shape[0] < self.r[adapter_name]:
             raise ValueError(error_tmpl.format("randlora_A", randlora_A_param.shape[0], self.r[adapter_name]))
+
         if randlora_B_param.shape[-1] < self.r[adapter_name]:
             raise ValueError(error_tmpl.format("randlora_B", randlora_B_param.shape[-1], self.r[adapter_name]))

@@ -169,9 +172,7 @@ def reset_randlora_parameters(self, adapter_name):
         if adapter_name in self.randlora_lambda.keys():
             with torch.no_grad():
                 nn.init.zeros_(self.randlora_lambda[adapter_name])
-                nn.init.ones_(self.randlora_gamma[adapter_name]).fill_(
-                    1 / max(self.randlora_gamma[adapter_name].shape)
-                )
+                nn.init.constant_(self.randlora_gamma[adapter_name], 1 / max(self.randlora_gamma[adapter_name].shape))


 class Linear(nn.Linear, RandLoraLayer):
@@ -198,7 +199,7 @@ def __init__(
         self.update_layer(adapter_name, randlora_A, randlora_B, r, randlora_alpha, randlora_dropout, init_weights)
         self.is_target_conv_1d_layer = is_target_conv_1d_layer

-    def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
+    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
         """
         Merge the active adapter weights into the base weights

@@ -207,7 +208,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N
                 If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                 before merging the weights. This is useful if you want to check if the merge operation will produce
                 NaNs. Defaults to `False`.
-            adapter_names (`List[str]`, *optional*):
+            adapter_names (`list[str]`, *optional*):
                 The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                 to `None`.
         """
@@ -219,6 +220,8 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N
         for active_adapter in adapter_names:
             if active_adapter in self.randlora_lambda.keys():
                 base_layer = self.get_base_layer()
+                orig_dtype = base_layer.weight.dtype
+
                 if safe_merge:
                     # Note that safe_merge will be slower than the normal merge
                     # because of the copy operation.
@@ -231,9 +234,11 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N
                             f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                         )

-                    base_layer.weight.data = orig_weights
+                    base_layer.weight.data = orig_weights.to(orig_dtype)
                 else:
-                    base_layer.weight.data += self.get_delta_weight(active_adapter)
+                    delta_weight = self.get_delta_weight(active_adapter)
+                    base_layer.weight.data += delta_weight.to(orig_dtype)
+
                 self.merged_adapters.append(active_adapter)

     def unmerge(self) -> None:
@@ -242,9 +247,12 @@ def unmerge(self) -> None:
             return

         while len(self.merged_adapters) > 0:
+            base_layer = self.get_base_layer()
+            orig_dtype = base_layer.weight.dtype
             active_adapter = self.merged_adapters.pop()
             if active_adapter in self.randlora_lambda.keys():
-                self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter)
+                delta_weight = self.get_delta_weight(active_adapter)
+                base_layer.weight.data -= delta_weight.to(orig_dtype)

     def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -289,7 +297,7 @@ def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
         update_B = sliced_B.flatten(start_dim=1)
         update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)

-        # Since update_A is applied on the smallest dimension, test whether update_A or update_B should applied first. This is done to reduce trainable parameters.
+        # Since update_A is applied on the smallest dimension, test whether update_A or update_B should be applied first. This is done to reduce trainable parameters.
         if min_dim == self.in_features:
             return update_A, update_B
         return update_B.T, update_A.T
```
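Two behavioural changes stand out in this file: the adapter scaling is simplified to `randlora_alpha / r` (the `1 / sqrt(num_bases)` factor and the `math` import are gone), and `merge`/`unmerge` now cast the delta weight back to the base weight's dtype before the in-place update. A minimal sketch of both ideas, not the library code itself; the toy linear layer and the random stand-in for `get_delta_weight` are assumptions.

```python
import torch
import torch.nn as nn

randlora_alpha, r = 640, 32
scaling = randlora_alpha / r   # 20.0; previously randlora_alpha / r / sqrt(num_bases)

base_layer = nn.Linear(16, 16).to(torch.bfloat16)
orig_dtype = base_layer.weight.dtype

# stand-in for get_delta_weight(); in PEFT it is built from the shared random bases
delta_weight = scaling * 0.01 * torch.randn(16, 16, dtype=torch.float32)

base_layer.weight.data += delta_weight.to(orig_dtype)  # merge: cast keeps the weight in bf16
base_layer.weight.data -= delta_weight.to(orig_dtype)  # unmerge: same cast on the way out
assert base_layer.weight.dtype == orig_dtype
```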

src/peft/tuners/randlora/model.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -188,6 +188,7 @@ def _init_randlora_A_randlora_B(self, config: RandLoraConfig, adapter_name: str)

         # deterministic init of randlora_A and randlora_B if we know the key
         generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key)
+
         # The gamma matrix is applied on A meaning it can be unique (shared) accross the n scaling matrices.
         # We also set randlora_A as the smallest matrix to reduce trainable parameters.
         randlora_A = _kaiming_init((config.r, 1, min_dim), generator=generator)
```
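Apart from the added blank line, this hunk shows how the shared bases are initialised deterministically from `projection_prng_key`, which is why they never need to be stored in the checkpoint. A small sketch of that idea under stated assumptions: `make_basis`, the shapes, and the use of `torch.randn` in place of the library's `_kaiming_init` are all illustrative.

```python
import torch

def make_basis(key: int) -> torch.Tensor:
    # the same key always yields the same CPU generator state, hence the same basis
    generator = torch.Generator(device="cpu").manual_seed(key)
    return torch.randn(32, 1, 64, generator=generator)

projection_prng_key = 0xFF  # illustrative key, mirroring the test config further down
assert torch.equal(make_basis(projection_prng_key), make_basis(projection_prng_key))
```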

tests/test_custom_models.py

Lines changed: 19 additions & 8 deletions
```diff
@@ -515,17 +515,28 @@
     ########
     # RandLora #
     ########
-    ("Vanilla MLP 1 RandLora", "MLP", RandLoraConfig, {"target_modules": "lin0"}),
-    ("Vanilla MLP 2 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0"]}),
-    ("Vanilla MLP 3 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin1"]}),
-    ("Vanilla MLP 4 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0", "lin1"]}),
-    ("Vanilla MLP 5 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0", "lin1"], "sparse": True}),
-    ("Vanilla MLP 6 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0", "lin1"], "very_sparse": True}),
+    # We have to reduce the default scaling parameter to avoid nans when using large learning rates
+    ("Vanilla MLP 1 RandLora", "MLP", RandLoraConfig, {"target_modules": "lin0", "randlora_alpha": 64}),
+    ("Vanilla MLP 2 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0"], "randlora_alpha": 64}),
+    ("Vanilla MLP 3 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin1"], "randlora_alpha": 64}),
+    ("Vanilla MLP 4 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0", "lin1"], "randlora_alpha": 64}),
+    (
+        "Vanilla MLP 5 RandLora",
+        "MLP",
+        RandLoraConfig,
+        {"target_modules": ["lin0", "lin1"], "sparse": True, "randlora_alpha": 64},
+    ),
+    (
+        "Vanilla MLP 6 RandLora",
+        "MLP",
+        RandLoraConfig,
+        {"target_modules": ["lin0", "lin1"], "very_sparse": True, "randlora_alpha": 64},
+    ),
     (
         "Vanilla MLP 7 RandLora",
         "MLP",
         RandLoraConfig,
-        {"target_modules": ["lin0"], "modules_to_save": ["lin1"]},
+        {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "randlora_alpha": 64},
     ),
 ]

@@ -1465,7 +1476,7 @@ def test_parameters_after_loading_model(self, test_name, model_id, config_cls, c
             lr = 0.1  # otherwise we get nan
         elif "mha" in model_id.lower():
             lr = 1e-3  # we get exploding gradients with MHA when learning rate is too high
-        elif issubclass(config_cls, VBLoRAConfig):
+        elif issubclass(config_cls, VBLoRAConfig) or issubclass(config_cls, RandLoraConfig):
             lr = 0.01  # otherwise we get nan
         optimizer = torch.optim.SGD(model.parameters(), lr=lr)

```
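The test parametrisation keeps `randlora_alpha` at 64 rather than the new default of 640, and the learning-rate branch now treats RandLora like VB-LoRA. The arithmetic behind the comment in the diff, as a toy illustration with the values taken from the diff itself:

```python
r = 32
alpha_default, alpha_tests = 640, 64
print(alpha_default / r)  # 20.0 -> large update scaling, prone to NaNs with big SGD steps
print(alpha_tests / r)    #  2.0 -> tamer scaling used in the tests, with lr capped at 0.01
```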

tests/testing_common.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -145,8 +145,8 @@
     },
     # RandLoRA
     {
-        "r": 10,
-        "randlora_alpha": 20,
+        "r": 32,
+        "randlora_alpha": 64,
         "target_modules": None,
         "randlora_dropout": 0.05,
         "projection_prng_key": 0xFF,
```
