Commit 95c8fb1

Commit message: new tests for shared weights, temp multi-gpu fix, better var names and docstrings
1 parent: 7942a9c


9 files changed: +396 −66 lines changed


src/peft/tuners/randlora/bnb.py

Lines changed: 46 additions & 25 deletions
@@ -59,11 +59,18 @@ def __init__(
         )
 
     def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
-        if self.merged:
-            warnings.warn(
-                f"Already following adapters were merged {','.join(self.merged_adapters)}. "
-                f"You are now additionally merging {','.join(self.active_adapters)}."
-            )
+        """
+        Merge the active adapter weights into the base weights
+
+        Args:
+            safe_merge (`bool`, *optional*):
+                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
+                before merging the weights. This is useful if you want to check if the merge operation will produce
+                NaNs. Defaults to `False`.
+            adapter_names (`list[str]`, *optional*):
+                The list of adapter names that should be merged. If None, all active adapters will be merged.
+                Defaults to `None`.
+        """
 
         adapter_names = check_adapters_to_merge(self, adapter_names)
         if not adapter_names:
@@ -98,6 +105,9 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
             self.merged_adapters.append(active_adapter)
 
     def unmerge(self) -> None:
+        """
+        This method unmerges all merged adapter layers from the base weights.
+        """
         if not self.merged:
             warnings.warn("Already unmerged. Nothing to do")
             return
@@ -124,7 +134,7 @@ def unmerge(self) -> None:
             ).to(weight.device)
             state.reset_grads()
 
-    def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
+    def get_scaled_bases(self, adapter, device=None) -> list[torch.Tensor, torch.Tensor]:
         """
         Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
         correct order to fit the target layers' dimensions
@@ -137,16 +147,17 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
         randlora_A = self.randlora_A[adapter]
         randlora_B = self.randlora_B[adapter]
 
-        device = randlora_B.device
+        if device is None:
+            device = randlora_B.device
         dtype = randlora_B.dtype
 
         # In case users wants to merge the adapter weights that are in
         # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
         # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
         cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)
 
-        randlora_lambda = self.randlora_lambda[adapter]
-        randlora_gamma = self.randlora_gamma[adapter]
+        randlora_lambda = self.randlora_lambda[adapter].to(device)
+        randlora_gamma = self.randlora_gamma[adapter].to(device)
 
         if cast_to_fp32:
             randlora_A = randlora_A.float()
@@ -160,8 +171,8 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
         # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
         # we initialize these matrices with the largest required size for each dimension.
         # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-        sliced_A = randlora_A[:, : self.num_bases, :min_dim]
-        sliced_B = randlora_B[:max_dim, : self.num_bases, :]
+        sliced_A = randlora_A[:, : self.num_bases, :min_dim].to(device)
+        sliced_B = randlora_B[:max_dim, : self.num_bases, :].to(device)
 
         # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
@@ -216,7 +227,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
                 if active_adapter not in self.randlora_lambda.keys():
                     continue
 
-                update_B, update_A = self.get_scaled_bases(active_adapter)
+                update_B, update_A = self.get_scaled_bases(active_adapter, device=x.device)
 
                 requires_conversion = not torch.is_autocast_enabled()
                 if requires_conversion:
@@ -275,11 +286,18 @@ def __init__(
         )
 
     def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
-        if self.merged:
-            warnings.warn(
-                f"Already following adapters were merged {','.join(self.merged_adapters)}. "
-                f"You are now additionally merging {','.join(self.active_adapters)}."
-            )
+        """
+        Merge the active adapter weights into the base weights
+
+        Args:
+            safe_merge (`bool`, *optional*):
+                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
+                before merging the weights. This is useful if you want to check if the merge operation will produce
+                NaNs. Defaults to `False`.
+            adapter_names (`list[str]`, *optional*):
+                The list of adapter names that should be merged. If None, all active adapters will be merged.
+                Defaults to `None`.
+        """
 
         adapter_names = check_adapters_to_merge(self, adapter_names)
         if not adapter_names:
@@ -309,6 +327,9 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
             self.merged_adapters.append(active_adapter)
 
     def unmerge(self) -> None:
+        """
+        This method unmerges all merged adapter layers from the base weights.
+        """
         if not self.merged:
             warnings.warn("Already unmerged. Nothing to do")
             return
@@ -330,7 +351,7 @@ def unmerge(self) -> None:
                 weight.device
             )
 
-    def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
+    def get_scaled_bases(self, adapter, device=None) -> list[torch.Tensor, torch.Tensor]:
         """
         Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
         correct order to fit the target layers' dimensions
@@ -342,17 +363,17 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
 
         randlora_A = self.randlora_A[adapter]
         randlora_B = self.randlora_B[adapter]
-
-        device = randlora_B.device
+        if device is None:
+            device = randlora_B.device
         dtype = randlora_B.dtype
 
         # In case users wants to merge the adapter weights that are in
         # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
         # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
         cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)
 
-        randlora_lambda = self.randlora_lambda[adapter]
-        randlora_gamma = self.randlora_gamma[adapter]
+        randlora_lambda = self.randlora_lambda[adapter].to(device)
+        randlora_gamma = self.randlora_gamma[adapter].to(device)
 
         if cast_to_fp32:
             randlora_A = randlora_A.float()
@@ -366,8 +387,8 @@ def get_scaled_bases(self, adapter) -> list[torch.Tensor, torch.Tensor]:
         # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
         # we initialize these matrices with the largest required size for each dimension.
        # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-        sliced_A = randlora_A[:, : self.num_bases, :min_dim]
-        sliced_B = randlora_B[:max_dim, : self.num_bases, :]
+        sliced_A = randlora_A[:, : self.num_bases, :min_dim].to(device)
+        sliced_B = randlora_B[:max_dim, : self.num_bases, :].to(device)
         # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
         update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
@@ -407,7 +428,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
                 if active_adapter not in self.randlora_lambda.keys():
                     continue
 
-                update_B, update_A = self.get_scaled_bases(active_adapter)
+                update_B, update_A = self.get_scaled_bases(active_adapter, device=x.device)
 
                 requires_conversion = not torch.is_autocast_enabled()
                 if requires_conversion:
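
The recurring change in this file is the temporary multi-GPU fix: get_scaled_bases now accepts a device argument and the forward passes hand it x.device, so the per-adapter lambda/gamma parameters and the sliced shared bases are moved onto the device of the incoming activations before the update is computed. Below is a minimal, self-contained sketch of that pattern, assuming simplified tensor shapes; it is not the actual PEFT implementation, only an illustration of the device-handling logic.

import torch

def get_scaled_bases_sketch(randlora_A, randlora_B, randlora_lambda, randlora_gamma, device=None):
    # Single-device case: fall back to wherever the shared bases already live.
    if device is None:
        device = randlora_B.device
    # Multi-GPU case: `device` is the device of the current input batch (x.device),
    # so everything participating in the update is moved there first.
    randlora_lambda = randlora_lambda.to(device)
    randlora_gamma = randlora_gamma.to(device)
    sliced_A = randlora_A.to(device)
    sliced_B = randlora_B.to(device)
    # Scale the shared base A by the trainable lambda/gamma factors (same broadcasting
    # pattern as UniqueBaseGrad.forward), then flatten for the two linear maps.
    update_A = (randlora_lambda[:, :, None] * sliced_A * randlora_gamma[None,]).flatten(end_dim=1)
    update_B = sliced_B.flatten(start_dim=1)
    return update_B, update_A

# Schematic call site inside forward: update_B, update_A = get_scaled_bases_sketch(A, B, lam, gam, device=x.device)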

src/peft/tuners/randlora/config.py

Lines changed: 15 additions & 11 deletions
@@ -29,26 +29,30 @@ class RandLoraConfig(PeftConfig):
 
     Args:
         r (`int`, *optional*, defaults to `32`):
-            RandLora's random basis rank dimension. Contrary to Lora, this parameter is inversely proportional to the amount of trainable
-            parameters as reducing it increases trainable parameters.
+            RandLora's random basis rank dimension. Contrary to Lora, this parameter is inversely proportional to the
+            amount of trainable parameters as reducing it increases trainable parameters.
         target_modules (`Union[list[str], str]`):
             The names of the modules to apply RandLora to. Only linear layers are supported.
         projection_prng_key (`int`):
             RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a
             checkpoint that did not include these projections. Defaults to `0`.
         save_projection (`bool`):
             Whether to save the global basis_A / basis_B random basis in the state dict alongside per layer lambda /
-            gamma diagonal matrices. This will increase the size of the checkpoint, but guarantee that we can
-            reload the checkpoint on all system configurations. Defaults to `True`.
+            gamma diagonal matrices. This will increase the size of the checkpoint, but guarantee that we can reload
+            the checkpoint on all system configurations. Defaults to `True`.
         sparse (`bool`):
-            Whether to use sparse random bases as described in the RandLora paper. The bases are ternary sparse bases (only containing -1, 0 and 1) where the attribution probability is 1/6 for -1 and 1 and 2/3 for 0.
-            These sparse matrices aim to be used for matmul free computation in the future, see https://arxiv.org/pdf/2406.02528v1
-            The current implementation is a proof of concept however where the sparseness is not used to improve speed or memory usage. Using sparse matrices typically does not reduce performance and can even help reduce overfitting.
-            Defaults to `False`.
+            Whether to use sparse random bases as described in the RandLora paper. The bases are ternary sparse bases
+            (only containing -1, 0 and 1) where the attribution probability is 1/6 for -1 and 1 and 2/3 for 0. These
+            sparse matrices aim to be used for matmul free computation in the future, see
+            https://arxiv.org/pdf/2406.02528v1 The current implementation is a proof of concept however where the
+            sparseness is not used to improve speed or memory usage. Using sparse matrices typically does not reduce
+            performance and can even help reduce overfitting. Defaults to `False`.
         very_sparse (`bool`):
-            Whether to use highly sparse random bases as described in the RandLora paper. The very sparse bases are ternary sparse bases (only containing -1, 0 and 1) given a matrix with smallest dimension d, the attribution probability is 1/√D for -1 and 1 and 1- 2/√D for 0.
-            Using these sparse matrices can further reduce overfitting over the `sparse` alternatives but will most likely decrease performance as a results. Use carefully.
-            Defaults to `False`.
+            Whether to use highly sparse random bases as described in the RandLora paper. The very sparse bases are
+            ternary sparse bases (only containing -1, 0 and 1) given a matrix with smallest dimension d, the
+            attribution probability is 1/√D for -1 and 1 and 1- 2/√D for 0. Using these sparse matrices can further
+            reduce overfitting over the `sparse` alternatives but will most likely decrease performance as a results.
+            Use carefully. Defaults to `False`.
         randlora_dropout (`float`):
             The dropout probability for RandLora layers.
         randlora_alpha (`float`):
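
Since this hunk only reflows the RandLoraConfig docstring, the documented parameters stay the same. The snippet below is a hedged usage sketch built only from the fields documented above; the target module names are placeholders, and the actual constructor may accept additional arguments or different defaults.

from peft import RandLoraConfig, get_peft_model  # assumes RandLoraConfig is exported at the package top level

config = RandLoraConfig(
    r=32,                                  # random basis rank; lowering r INCREASES trainable parameters
    target_modules=["q_proj", "v_proj"],   # placeholder names; only linear layers are supported
    projection_prng_key=0,                 # PRNG key used to (re)generate basis_A / basis_B
    save_projection=True,                  # store the shared bases in the checkpoint for portability
    sparse=False,                          # ternary sparse bases (-1, 0, 1); proof of concept only
    very_sparse=False,                     # even sparser bases; may trade performance for less overfitting
    randlora_dropout=0.0,
)
# model = get_peft_model(base_model, config)  # base_model: a supported transformers model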

src/peft/tuners/randlora/layer.py

Lines changed: 13 additions & 10 deletions
@@ -30,9 +30,9 @@ class UniqueBaseGrad(torch.autograd.Function):
     # Memory efficent for a unique base
     @staticmethod
     def forward(ctx, randlora_A, randlora_lambda, randlora_gamma):
-        Out = randlora_lambda[:, :, None] * randlora_A * randlora_gamma[None,]
+        out = randlora_lambda[:, :, None] * randlora_A * randlora_gamma[None,]
         ctx.save_for_backward(randlora_A, randlora_lambda, randlora_gamma)
-        return Out
+        return out
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -242,6 +242,9 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
                 self.merged_adapters.append(active_adapter)
 
     def unmerge(self) -> None:
+        """
+        This method unmerges all merged adapter layers from the base weights.
+        """
         if not self.merged:
             warnings.warn("Already unmerged. Nothing to do.")
             return
@@ -254,7 +257,7 @@ def unmerge(self) -> None:
                 delta_weight = self.get_delta_weight(active_adapter)
                 base_layer.weight.data -= delta_weight.to(orig_dtype)
 
-    def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
+    def get_scaled_bases(self, adapter, device=None) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct
         order to fit the target layers' dimensions
@@ -266,17 +269,17 @@ def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
 
         randlora_A = self.randlora_A[adapter]
         randlora_B = self.randlora_B[adapter]
-
-        device = randlora_B.device
+        if device is None:
+            device = randlora_B.device
         dtype = randlora_B.dtype
 
         # In case users wants to merge the adapter weights that are in
         # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
         # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
         cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)
 
-        randlora_lambda = self.randlora_lambda[adapter]
-        randlora_gamma = self.randlora_gamma[adapter]
+        randlora_lambda = self.randlora_lambda[adapter].to(device)
+        randlora_gamma = self.randlora_gamma[adapter].to(device)
 
         if cast_to_fp32:
             randlora_A = randlora_A.float()
@@ -290,8 +293,8 @@ def get_scaled_bases(self, adapter) -> tuple[torch.Tensor, torch.Tensor]:
         # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
         # we initialize these matrices with the largest required size for each dimension.
         # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
-        sliced_A = randlora_A[:, : self.num_bases, :min_dim]
-        sliced_B = randlora_B[:max_dim, : self.num_bases, :]
+        sliced_A = randlora_A[:, : self.num_bases, :min_dim].to(device)
+        sliced_B = randlora_B[:max_dim, : self.num_bases, :].to(device)
 
         # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
         update_B = sliced_B.flatten(start_dim=1)
@@ -334,7 +337,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
                 if active_adapter not in self.randlora_lambda.keys():
                     continue
                 dropout = self.randlora_dropout[active_adapter]
-                update_B, update_A = self.get_scaled_bases(active_adapter)
+                update_B, update_A = self.get_scaled_bases(active_adapter, device=x.device)
                 x = x.to(update_A.dtype)
                 scaling = self.scaling[active_adapter]
                 result = result + F.linear(F.linear(dropout(x), update_B), update_A) * scaling
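For context on the unchanged surrounding code: the adapter delta is applied as two chained linear maps over the (possibly dropped-out) activations, then scaled and added to the base layer's output. The sketch below, with made-up dimensions, only illustrates that call pattern; it does not reproduce the real shared-basis slicing or the RandLora parameterization.

import torch
import torch.nn.functional as F

# Made-up dimensions for illustration only.
batch, in_features, out_features, proj = 2, 16, 32, 8

x = torch.randn(batch, in_features)
update_B = torch.randn(proj, in_features)    # projects activations down to the shared-basis space
update_A = torch.randn(out_features, proj)   # projects back up to the layer's output size
dropout = torch.nn.Dropout(p=0.1)
scaling = 2.0

base_result = torch.zeros(batch, out_features)  # stand-in for self.base_layer(x)
# Same call pattern as the forward pass above: two chained F.linear calls, then scale and add.
result = base_result + F.linear(F.linear(dropout(x), update_B), update_A) * scaling
print(result.shape)  # torch.Size([2, 32])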
