
Commit 3f69d8f

REFACTOR: integrate GraLoRA tests into existing test files
1 parent 925ad72 commit 3f69d8f

7 files changed: +164 additions, -1086 deletions

docs/source/_toctree.yml

Lines changed: 2 additions & 0 deletions
@@ -116,6 +116,8 @@
     title: VeRA
   - local: package_reference/fourierft
     title: FourierFT
+  - local: package_reference/gralora
+    title: GraLoRA
   - local: package_reference/vblora
     title: VB-LoRA
   - local: package_reference/hra

src/peft/tuners/gralora/config.py

Lines changed: 71 additions & 11 deletions
@@ -21,6 +21,57 @@
 
 @dataclass
 class GraloraConfig(PeftConfig):
+    """
+    This is the configuration class to store the configuration of a [`GraloraModel`].
+
+    Args:
+        r (`int`):
+            GraLoRA attention dimension determines the rank of the GraLoRA adapter.
+            The total parameter count of the GraLoRA adapter is the same as LoRA with the same rank r, while the expressivity is multiplied by gralora_k.
+        hybrid_r (`int`):
+            Hybrid GraLoRA rank determines the rank allocated to the vanilla LoRA method when using the Hybrid GraLoRA method.
+            Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0.
+            The parameter count of the GraLoRA adapter is determined by r + hybrid_r.
+        target_modules (`Union[List[str], str]`):
+            List of module names or regex expression of the module names to replace with GraLoRA.
+            For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'.
+            This can also be a wildcard 'all-linear' which matches all linear/Conv1D
+            (if the model is a PreTrainedModel, the output layer excluded).
+            If not specified, modules will be chosen according to the model architecture. If the architecture is
+            not known, an error will be raised -- in this case, you should specify the target modules manually.
+            To avoid targeting any modules (because you want to apply `target_parameters`), set
+            `target_modules=[]`.
+        gralora_alpha (`int`):
+            GraLoRA alpha is the scaling factor for the GraLoRA adapter.
+            Scale becomes gralora_alpha / (r + hybrid_r).
+        gralora_dropout (`float`):
+            GraLoRA dropout is the dropout probability for the GraLoRA adapter.
+            It is used to prevent overfitting and improve the generalization of the GraLoRA adapter.
+        gralora_k (`int`):
+            GraLoRA k determines the number of subblocks in the GraLoRA adapter.
+            The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid.
+            The total parameter count is preserved regardless of gralora_k.
+            The entire rank of the GraLoRA adapter is increased by a factor of gralora_k, while the rank of each subblock is reduced by a factor of gralora_k.
+            gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher.
+        fan_in_fan_out (`bool`):
+            Set this to True if the layer to replace stores weight like (fan_in, fan_out).
+            For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
+        bias (`str`):
+            Bias type for GraLoRA. Can be 'none', 'all' or 'gralora_only'.
+            If 'all' or 'gralora_only', the corresponding biases will be updated during training.
+            Be aware that this means that, even when disabling the adapters, the model will not produce the same output as the base model would have without adaptation.
+        init_weights (`bool`):
+            Whether to initialize the weights of the GraLoRA layers with their default initialization.
+            Don't change this setting, except if you know exactly what you're doing.
+        layers_to_transform (`Union[List[int], int]`):
+            The layer indexes to transform. If this argument is specified, PEFT will transform only the layer indexes that are specified inside this list.
+            If a single integer is passed, PEFT will transform only the layer at this index.
+            This only works when target_modules is a list of str.
+        layers_pattern (`Optional[Union[List[str], str]]`):
+            The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern.
+            This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
+    """
+
     r: int = field(
         default=32,
         metadata={
@@ -44,18 +95,23 @@ class GraloraConfig(PeftConfig):
         default=None,
         metadata={
             "help": (
-                "List of module names or regex expression of the module names to replace with gralora. "
+                "List of module names or regex expression of the module names to replace with GraLoRA. "
                 "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
-                "Only linear layers are supported."
+                "This can also be a wildcard 'all-linear' which matches all linear/Conv1D "
+                "(if the model is a PreTrainedModel, the output layer excluded). "
+                "If not specified, modules will be chosen according to the model architecture. If the architecture is "
+                "not known, an error will be raised -- in this case, you should specify the target modules manually. "
+                "To avoid targeting any modules (because you want to apply `target_parameters`), set "
+                "`target_modules=[]`."
             )
         },
     )
     gralora_alpha: int = field(
         default=64,
         metadata={
             "help": (
-                "gralora alpha is the scaling factor for the GraLoRA adapter."
-                "Scale becomes gralora_alpha / (r + hybrid_r)."
+                "gralora alpha is the scaling factor for the GraLoRA adapter. "
+                "Scale becomes gralora_alpha / (r + hybrid_r)."
             )
         },
     )
@@ -64,8 +120,11 @@ class GraloraConfig(PeftConfig):
         default=2,
         metadata={
             "help": (
-                "gralora_k determines the number of subblocks in the GraLoRA adapter."
-                "The total parameter count is preserved regardles of gralora_k, while the expressivitiy is multiplied by gralora_k."
+                "gralora_k determines the number of subblocks in the GraLoRA adapter. "
+                "The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid. "
+                "The total parameter count is preserved regardless of gralora_k. "
+                "The entire rank of the GraLoRA adapter is increased by a factor of gralora_k, while the rank of each subblock is reduced by a factor of gralora_k. "
+                "gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher."
             )
         },
     )
@@ -99,18 +158,19 @@ class GraloraConfig(PeftConfig):
         default=None,
         metadata={
             "help": (
-                "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers"
-                " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only"
-                " the layer at this index."
+                "The layer indexes to transform. If this argument is specified, PEFT will transform only the layer indexes that are specified inside this list. "
+                "If a single integer is passed, PEFT will transform only the layer at this index. "
+                "This only works when target_modules is a list of str."
             )
         },
     )
     layers_pattern: Optional[str] = field(
         default=None,
         metadata={
             "help": (
-                "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer"
-                " pattern is not in the common layers pattern."
+                "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. "
+                "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the "
+                "model, which is often called `'layers'` or `'h'`."
            )
        },
    )
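
The docstring and help texts added above pin down how the GraLoRA hyperparameters interact: r must be divisible by gralora_k, hybrid_r > 0 switches on the hybrid variant, and the adapter scale is gralora_alpha / (r + hybrid_r). Below is a minimal usage sketch, assuming a PEFT install that includes this GraLoRA branch; ToyModel and its dimensions are hypothetical stand-ins chosen so the divisibility constraints hold, and the import path simply mirrors the config module shown in this diff.

import torch.nn as nn

from peft import get_peft_model
from peft.tuners.gralora.config import GraloraConfig


class ToyModel(nn.Module):
    # hypothetical module; 64 in/out features are divisible by gralora_k below
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(64, 64)

    def forward(self, x):
        return self.proj(x)


config = GraloraConfig(
    target_modules=["proj"],
    r=32,              # must be divisible by gralora_k
    gralora_k=2,       # recommended for rank 32 or lower
    gralora_alpha=64,
    hybrid_r=0,        # > 0 would enable the Hybrid GraLoRA variant
)
model = get_peft_model(ToyModel(), config)
model.print_trainable_parameters()
print("scale =", config.gralora_alpha / (config.r + config.hybrid_r))  # 64 / (32 + 0) = 2.0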

src/peft/tuners/gralora/layer.py

Lines changed: 0 additions & 6 deletions
@@ -271,12 +271,6 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
         in_features = self.in_features
         out_features = self.out_features
         gralora_rank = r
-        if in_features % gralora_k != 0:
-            raise ValueError(f"in_features should be divisible by gralora_k, but got {in_features} and {gralora_k}")
-        elif out_features % gralora_k != 0:
-            raise ValueError(f"out_features should be divisible by gralora_k, but got {out_features} and {gralora_k}")
-        elif gralora_rank % gralora_k != 0:
-            raise ValueError(f"rank should be divisible by gralora_k, but got {gralora_rank} and {gralora_k}")
         subblock_gralora_rank = gralora_rank // gralora_k
 
         # scatter gralora_A to get the scattered weight matrix
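
The block removed here made the shape constraints explicit: in_features, out_features, and the rank must all be divisible by gralora_k, and each subblock then works with rank // gralora_k. The following standalone sketch repeats that arithmetic for illustration only; it is not the library code, and the helper name is made up.

def gralora_subblock_shape(in_features: int, out_features: int, rank: int, gralora_k: int):
    # Same divisibility checks that the removed block performed.
    for name, value in (("in_features", in_features), ("out_features", out_features), ("rank", rank)):
        if value % gralora_k != 0:
            raise ValueError(f"{name} should be divisible by gralora_k, but got {value} and {gralora_k}")
    # Each of the gralora_k input/output blocks is adapted with rank // gralora_k.
    return in_features // gralora_k, out_features // gralora_k, rank // gralora_k


print(gralora_subblock_shape(64, 64, 32, gralora_k=2))  # (32, 32, 16)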

src/peft/tuners/gralora/model.py

Lines changed: 1 addition & 9 deletions
@@ -15,23 +15,15 @@
 from __future__ import annotations
 
 import warnings
-from dataclasses import asdict
-from enum import Enum
-from typing import Optional
 
 import torch
-import torch.nn as nn
-from tqdm import tqdm
 from transformers.pytorch_utils import Conv1D
 
-from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
+from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
 from peft.utils import (
     TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
-    ModulesToSaveWrapper,
-    _get_submodules,
 )
 
-from .config import GraloraConfig
 from .layer import GraloraLayer, Linear
 
 

tests/test_custom_models.py

Lines changed: 39 additions & 9 deletions
@@ -680,6 +680,18 @@
         GraloraConfig,
         {"target_modules": ["lin0"], "modules_to_save": ["lin1"]},
     ),
+    (
+        "Vanilla MLP 6 GraLoRA",
+        "MLP",
+        GraloraConfig,
+        {"target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"]},
+    ),
+    (
+        "Vanilla MLP 7 Hybrid GraLoRA",
+        "MLP",
+        GraloraConfig,
+        {"target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"], "hybrid_r": 4},
+    ),
     (
         "Embedding + transformers Conv1D 1 GraLoRA",
         "EmbConv1D",
@@ -3124,12 +3136,12 @@ def test_add_weighted_adapter_subtraction_with_negative_weights(self):
         cancelled_B = module.lora_B["cancelled"].weight.data
 
         # The weights should be approximately zero (they cancel out)
-        assert torch.allclose(
-            cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5
-        ), f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}"
-        assert torch.allclose(
-            cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5
-        ), f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}"
+        assert torch.allclose(cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5), (
+            f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}"
+        )
+        assert torch.allclose(cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5), (
+            f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}"
+        )
 
     def test_add_weighted_adapter_negative_weight_with_different_scaling(self):
         # Test negative weights with different scaling factors (lora_alpha)
@@ -3440,6 +3452,24 @@ def test_dora_save_and_load_remapping(self):
         for k in state_dict:
             assert torch.allclose(state_dict[k], state_dict_loaded[k])
 
+    def test_gralora_and_hybrid_gralora_parameter_count(self):
+        # Here we test that the parameter count of GraLoRA is preserved
+        # when rank r + hybrid_r is the same, regardless of the value of gralora_k.
+        model1 = MLP()
+        config1 = GraloraConfig(target_modules=["lin0"], r=12, gralora_k=2, hybrid_r=0)
+        model1 = get_peft_model(model1, config1)
+        model2 = MLP()
+        config2 = GraloraConfig(target_modules=["lin0"], r=10, gralora_k=2, hybrid_r=2)
+        model2 = get_peft_model(model2, config2)
+        model3 = MLP()
+        config3 = GraloraConfig(target_modules=["lin0"], r=10, gralora_k=5, hybrid_r=2)
+        model3 = get_peft_model(model3, config3)
+        trainable_params1, all_params1 = model1.get_nb_trainable_parameters()
+        trainable_params2, all_params2 = model2.get_nb_trainable_parameters()
+        trainable_params3, all_params3 = model3.get_nb_trainable_parameters()
+        assert trainable_params1 == trainable_params2 == trainable_params3
+        assert all_params1 == all_params2 == all_params3
+
     @pytest.mark.parametrize("with_forward_call", [False, True])
     def test_mha_gradients_set_correctly(self, with_forward_call):
         # check for this bug: https://github.com/huggingface/peft/issues/761#issuecomment-1893804738
@@ -3535,9 +3565,9 @@ def test_multirank_2(self):
             if isinstance(module, BaseTunerLayer):
                 rank_expected = rank_pattern.get(key, r)
                 rank_current = module.lora_A[adapter].weight.shape[0]
-                assert (
-                    rank_current == rank_expected
-                ), f"Rank {rank_current} is not equal to expected {rank_expected}"
+                assert rank_current == rank_expected, (
+                    f"Rank {rank_current} is not equal to expected {rank_expected}"
+                )
 
 
 class TestLayerRepr:
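
The new test asserts that GraLoRA's trainable parameter count depends only on r + hybrid_r, not on gralora_k. The sketch below replays that check outside the test suite, assuming a PEFT install with this branch; TinyMLP is a hypothetical stand-in for the test file's MLP fixture (its exact layer sizes are not shown in this diff), while the three config settings are copied verbatim from the test.

import torch.nn as nn

from peft import get_peft_model
from peft.tuners.gralora.config import GraloraConfig


class TinyMLP(nn.Module):
    # hypothetical stand-in; 20 features are divisible by both gralora_k values used below
    def __init__(self):
        super().__init__()
        self.lin0 = nn.Linear(20, 20)
        self.lin1 = nn.Linear(20, 2)

    def forward(self, x):
        return self.lin1(self.lin0(x))


counts = []
for r, k, hybrid_r in [(12, 2, 0), (10, 2, 2), (10, 5, 2)]:  # r + hybrid_r == 12 in every case
    cfg = GraloraConfig(target_modules=["lin0"], r=r, gralora_k=k, hybrid_r=hybrid_r)
    peft_model = get_peft_model(TinyMLP(), cfg)
    counts.append(peft_model.get_nb_trainable_parameters())  # (trainable, total)

assert counts[0] == counts[1] == counts[2], counts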
