
Commit 7f14c50

Paul Albert (PaulAlbert31) authored and committed
reverting licence change
rebase
1 parent e90b52a commit 7f14c50

File tree

10 files changed, +439 −55 lines changed


src/peft/tuners/randlora/bnb.py

Lines changed: 14 additions & 30 deletions
@@ -124,7 +124,7 @@ def unmerge(self) -> None:
 ).to(weight.device)
 state.reset_grads()

-def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
+def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
 """
 Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
 correct order to fit the target layers' dimensions
@@ -160,15 +160,15 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dt
 # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
 # we initialize these matrices with the largest required size for each dimension.
 # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-sliced_A = randlora_A[:, : self.n, :min_dim]
-sliced_B = randlora_B[:max_dim, : self.n, :]
+sliced_A = randlora_A[:, : self.num_bases, :min_dim]
+sliced_B = randlora_B[:max_dim, : self.num_bases, :]
 # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
 update_B = sliced_B.flatten(start_dim=1)
 update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
 if min_dim == self.in_features:
-return update_A, update_B, dtype
+return update_A, update_B

-return update_B.T, update_A.T, dtype
+return update_B.T, update_A.T

 def get_delta_weight(self, adapter) -> torch.Tensor:
 """
@@ -179,19 +179,11 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 The name of the adapter for which the delta weight should be computed.
 """

-update_B, update_A, dtype = self.get_scaled_bases(adapter)
+update_B, update_A = self.get_scaled_bases(adapter)

 update = update_B @ update_A
 output_tensor = transpose(update, self.fan_in_fan_out)

-if dtype != self.randlora_B[adapter].dtype:
-output_tensor = output_tensor.to(dtype=dtype)
-
-# cast back the weights
-# TODO: why?, taken from the VeRA implementation
-self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].data.to(dtype)
-self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].data.to(dtype)
-
 scaling = self.scaling[adapter]

 return output_tensor * scaling
@@ -223,7 +215,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 if active_adapter not in self.randlora_lambda.keys():
 continue

-update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
+update_B, update_A = self.get_scaled_bases(active_adapter)
 requires_conversion = not torch.is_autocast_enabled()
 if requires_conversion:
 expected_dtype = result.dtype
@@ -336,7 +328,7 @@ def unmerge(self) -> None:
 weight.device
 )

-def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dtype]:
+def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
 """
 Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
 correct order to fit the target layers' dimensions
@@ -372,15 +364,15 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor, torch.dt
 # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
 # we initialize these matrices with the largest required size for each dimension.
 # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-sliced_A = randlora_A[:, : self.n, :min_dim]
-sliced_B = randlora_B[:max_dim, : self.n, :]
+sliced_A = randlora_A[:, : self.num_bases, :min_dim]
+sliced_B = randlora_B[:max_dim, : self.num_bases, :]
 # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
 update_B = sliced_B.flatten(start_dim=1)
 update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
 if min_dim == self.in_features:
-return update_A, update_B, dtype
+return update_A, update_B

-return update_B.T, update_A.T, dtype
+return update_B.T, update_A.T

 def get_delta_weight(self, adapter) -> torch.Tensor:
 """
@@ -391,19 +383,11 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 The name of the adapter for which the delta weight should be computed.
 """

-update_B, update_A, dtype = self.get_scaled_bases(adapter)
+update_B, update_A = self.get_scaled_bases(adapter)

 update = update_B @ update_A
 output_tensor = transpose(update, self.fan_in_fan_out)

-if dtype != self.randlora_B[adapter].dtype:
-output_tensor = output_tensor.to(dtype=dtype)
-
-# cast back the weights
-# TODO: why?, taken from the VeRA implementation
-self.randlora_lambda[adapter].data = self.randlora_lambda[adapter].to(dtype)
-self.randlora_gamma[adapter].data = self.randlora_gamma[adapter].to(dtype)
-
 scaling = self.scaling[adapter]

 return output_tensor * scaling
@@ -421,7 +405,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 for active_adapter in self.active_adapters:
 if active_adapter not in self.randlora_lambda.keys():
 continue
-update_B, update_A, dtype = self.get_scaled_bases(active_adapter)
+update_B, update_A = self.get_scaled_bases(active_adapter)
 requires_conversion = not torch.is_autocast_enabled()
 if requires_conversion:
 expected_dtype = result.dtype
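Note on the slicing above: the shared randlora_A / randlora_B pair is stored at the largest size any adapted layer needs, and get_scaled_bases slices out the submatrices for a given layer and orients them so the resulting delta has shape (out_features, in_features). Below is a minimal standalone sketch of that logic; the lambda/gamma scaling applied through UniqueBaseGrad is omitted, and the helper name and shapes are illustrative, not PEFT's internals.

import torch

def sliced_update_sketch(randlora_A, randlora_B, num_bases, in_features, out_features):
    # randlora_A: (rank, max_num_bases, global_min_dim), randlora_B: (global_max_dim, max_num_bases, rank)
    min_dim, max_dim = min(in_features, out_features), max(in_features, out_features)
    # Slice the shared bases down to this layer's dimensions.
    sliced_A = randlora_A[:, :num_bases, :min_dim]   # (rank, num_bases, min_dim)
    sliced_B = randlora_B[:max_dim, :num_bases, :]   # (max_dim, num_bases, rank)
    # Flatten over the rank / number-of-bases dimensions.
    update_A = sliced_A.flatten(end_dim=1)           # (rank * num_bases, min_dim)
    update_B = sliced_B.flatten(start_dim=1)         # (max_dim, num_bases * rank)
    # Return the pair oriented so that "second @ first" has shape (out_features, in_features).
    if min_dim == in_features:
        return update_A, update_B
    return update_B.T, update_A.T

# Shape check for both orientations (random values, purely illustrative):
rank, num_bases = 4, 3
for in_f, out_f in [(16, 32), (32, 16)]:
    A = torch.randn(rank, num_bases, min(in_f, out_f))
    B = torch.randn(max(in_f, out_f), num_bases, rank)
    first, second = sliced_update_sketch(A, B, num_bases, in_f, out_f)
    assert (second @ first).shape == (out_f, in_f)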

src/peft/tuners/randlora/config.py

Lines changed: 12 additions & 9 deletions
@@ -28,9 +28,9 @@ class RandLoraConfig(PeftConfig):
 Paper: https://arxiv.org/pdf/2502.00987.

 Args:
-r (`int`, *optional*, defaults to `32`):
-RandLora's random basis rank dimension. This parameter is inversely proportional to the amount of trainable
-parameters.
+r (`int`, *optional*, defaults to `10`):
+RandLora's random basis rank dimension. Contrary to LoRA, this parameter is inversely proportional to the number of
+trainable parameters: reducing it increases the number of trainable parameters.
 target_modules (`Union[List[str], str]`):
 The names of the modules to apply RandLora to. Only linear layers are supported.
 projection_prng_key (`int`):
@@ -41,11 +41,14 @@ class RandLoraConfig(PeftConfig):
 gamma diagonal matrices. This will increase the size of the checkpoint, but guarantee that we can
 reload the checkpoint on all system configurations. Defaults to `True`.
 sparse (`bool`):
-Whether to use sparse random bases as described in the RandLora paper. The current implementation is a
-proof of concept where the sparseness is not used to improve speed or memory usage. Defaults to `False`.
+Whether to use sparse random bases as described in the RandLora paper. The bases are ternary sparse bases (containing only -1, 0 and 1) where the attribution probability is 1/6 for -1 and 1 and 2/3 for 0.
+These sparse matrices are intended to enable matmul-free computation in the future, see https://arxiv.org/pdf/2406.02528v1.
+The current implementation is, however, a proof of concept where the sparseness is not used to improve speed or memory usage. Using sparse matrices typically does not reduce performance and can even help reduce overfitting.
+Defaults to `False`.
 very_sparse (`bool`):
-Whether to use very sparse random bases. The current implementation is a proof of concept where the
-sparseness is not used to improve speed or memory usage. Defaults to `False`.
+Whether to use highly sparse random bases as described in the RandLora paper. The very sparse bases are ternary sparse bases (containing only -1, 0 and 1): given a matrix with smallest dimension d, the attribution probability is 1/√d for -1 and 1 and 1 - 2/√d for 0.
+Using these sparse matrices can further reduce overfitting compared to the `sparse` alternative but will most likely decrease performance as a result. Use carefully.
+Defaults to `False`.
 randlora_dropout (`float`):
 The dropout probability for RandLora layers.
 randlora_alpha (`float`):
@@ -72,7 +75,7 @@ class RandLoraConfig(PeftConfig):
 pattern is not in the common layers pattern.
 """

-r: int = field(default=32, metadata={"help": "RandLora random basis rank"})
+r: int = field(default=10, metadata={"help": "RandLora random basis rank"})

 target_modules: Optional[Union[List[str], str]] = field(
 default=None,
@@ -129,7 +132,7 @@ class RandLoraConfig(PeftConfig):
 metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
 )
 randlora_alpha: int = field(
-default=64,
+default=20,
 metadata={
 "help": "Scaling coefficient in the adapter layers, typically 2 times the rank of the random bases."
 },
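For reference, the sampling distribution described in the new `sparse` / `very_sparse` docstrings can be written out as a tiny sampler. This is only an illustration of the stated probabilities; the helper below is hypothetical and is not how PEFT actually builds or seeds the projections (that is driven by `projection_prng_key` in model.py).

import math
import torch

def ternary_basis_sketch(shape, very_sparse=False, generator=None):
    # sparse:      P(-1) = P(+1) = 1/6,        P(0) = 2/3
    # very sparse: P(-1) = P(+1) = 1/sqrt(d),  P(0) = 1 - 2/sqrt(d), d = smallest dimension
    p = 1.0 / math.sqrt(min(shape)) if very_sparse else 1.0 / 6.0
    u = torch.rand(shape, generator=generator)
    basis = torch.zeros(shape)
    basis[u < p] = 1.0          # probability p of +1
    basis[u > 1.0 - p] = -1.0   # probability p of -1
    return basis

gen = torch.Generator().manual_seed(0)
base = ternary_basis_sketch((512, 768), very_sparse=True, generator=gen)
print(base.unique())            # tensor([-1., 0., 1.])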

src/peft/tuners/randlora/layer.py

Lines changed: 5 additions & 5 deletions
@@ -246,7 +246,7 @@ def unmerge(self) -> None:
 if active_adapter in self.randlora_lambda.keys():
 self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter)

-def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor, torch.dtype]:
+def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
 """
 Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct
 order to fit the target layers' dimensions
@@ -291,8 +291,8 @@ def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor, torch.d

 # Since update_A is applied on the smallest dimension, test whether update_A or update_B should applied first. This is done to reduce trainable parameters.
 if min_dim == self.in_features:
-return update_A, update_B, dtype
-return update_B.T, update_A.T, dtype
+return update_A, update_B
+return update_B.T, update_A.T

 def get_delta_weight(self, adapter) -> torch.Tensor:
 """
@@ -303,7 +303,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
 The name of the adapter for which the delta weight should be computed.
 """

-update_B, update_A, dtype = self.get_scaled_bases(adapter)
+update_B, update_A = self.get_scaled_bases(adapter)

 update = (update_B.T @ update_A.T).T
 output_tensor = transpose(update, self.fan_in_fan_out)
@@ -326,7 +326,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 if active_adapter not in self.randlora_lambda.keys():
 continue
 dropout = self.randlora_dropout[active_adapter]
-update_B, update_A, _ = self.get_scaled_bases(active_adapter)
+update_B, update_A = self.get_scaled_bases(active_adapter)
 x = x.to(update_A.dtype)
 scaling = self.scaling[active_adapter]
 result = result + F.linear(F.linear(dropout(x), update_B), update_A) * scaling
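The forward line above applies the two low-rank factors in sequence instead of materializing the full delta weight. A quick standalone check (shapes and names here are illustrative, dropout omitted) that the factored form F.linear(F.linear(x, update_B), update_A) matches applying the dense update, mirroring what get_delta_weight builds:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
in_features, out_features, k = 16, 32, 12
x = torch.randn(5, in_features)
update_B = torch.randn(k, in_features)    # applied to the input first
update_A = torch.randn(out_features, k)   # projects up to the output dimension
scaling = 0.5

factored = F.linear(F.linear(x, update_B), update_A) * scaling   # factored application, as in forward
dense = F.linear(x, update_A @ update_B) * scaling               # full (out, in) delta weight
assert torch.allclose(factored, dense, rtol=1e-4, atol=1e-5)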

src/peft/tuners/randlora/model.py

Lines changed: 1 addition & 1 deletion
@@ -151,6 +151,7 @@ def _init_randlora_A_randlora_B_sparse(self, config: RandLoraConfig, adapter_nam

 # deterministic init of randlora_A and randlora_B if we know the key
 generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key)
+
 # The gamma matrix is applied on A meaning it can be unique (shared) accross the n scaling matrices.
 # We also set randlora_A as the smallest matrix to reduce trainable parameters.
 randlora_A = torch.rand((config.r, 1, min_dim), generator=generator)
@@ -369,7 +370,6 @@ def _create_new_module(randlora_config, randlora_A, randlora_B, adapter_name, ta
 eightbit_kwargs.update(
 {
 "has_fp16_weights": target_base_layer.state.has_fp16_weights,
-"memory_efficient_backward": target_base_layer.state.memory_efficient_backward,
 "threshold": target_base_layer.state.threshold,
 "index": target_base_layer.index,
 }
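The generator line shown as context in the first hunk is what makes the random projections reproducible: seeding a CPU torch.Generator with projection_prng_key yields the same shared bases on every load, which is why storing them in the checkpoint (save_projection) is optional. A small standalone illustration, with arbitrary shapes following the (r, 1, min_dim) pattern above:

import torch

projection_prng_key, r, min_dim = 0, 4, 16

gen1 = torch.Generator(device="cpu").manual_seed(projection_prng_key)
gen2 = torch.Generator(device="cpu").manual_seed(projection_prng_key)

# Same key, same device, same call order -> bitwise-identical bases.
a1 = torch.rand((r, 1, min_dim), generator=gen1)
a2 = torch.rand((r, 1, min_dim), generator=gen2)
assert torch.equal(a1, a2)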
