From 6dfa24e4b62652a7b6fcfda01ce873a44999c956 Mon Sep 17 00:00:00 2001 From: yeonjoon-jung01 Date: Thu, 16 Oct 2025 16:43:12 +0900 Subject: [PATCH 01/11] feat: Add Gralora configuration and basic implementation --- src/peft/__init__.py | 4 + src/peft/tuners/__init__.py | 3 + src/peft/tuners/gralora/__init__.py | 20 ++ src/peft/tuners/gralora/config.py | 82 ++++++ src/peft/tuners/gralora/layer.py | 267 ++++++++++++++++++++ src/peft/tuners/gralora/model.py | 377 ++++++++++++++++++++++++++++ src/peft/utils/peft_types.py | 2 + 7 files changed, 755 insertions(+) create mode 100644 src/peft/tuners/gralora/__init__.py create mode 100644 src/peft/tuners/gralora/config.py create mode 100644 src/peft/tuners/gralora/layer.py create mode 100644 src/peft/tuners/gralora/model.py diff --git a/src/peft/__init__.py b/src/peft/__init__.py index f8fdd48ff0..9a89b19554 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -64,6 +64,8 @@ EvaConfig, FourierFTConfig, FourierFTModel, + GraloraConfig, + GraloraModel, HRAConfig, HRAModel, IA3Config, @@ -163,6 +165,8 @@ "EvaConfig", "FourierFTConfig", "FourierFTModel", + "GraloraConfig", + "GraloraModel", "HRAConfig", "HRAModel", "IA3Config", diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 3bf53d7da9..364bbb8fb2 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -20,6 +20,7 @@ from .cpt import CPTConfig, CPTEmbedding from .delora import DeloraConfig, DeloraModel from .fourierft import FourierFTConfig, FourierFTModel +from .gralora import GraloraConfig, GraloraModel from .hra import HRAConfig, HRAModel from .ia3 import IA3Config, IA3Model from .ln_tuning import LNTuningConfig, LNTuningModel @@ -74,6 +75,8 @@ "EvaConfig", "FourierFTConfig", "FourierFTModel", + "GraloraConfig", + "GraloraModel", "HRAConfig", "HRAModel", "IA3Config", diff --git a/src/peft/tuners/gralora/__init__.py b/src/peft/tuners/gralora/__init__.py new file mode 100644 index 0000000000..db1927c442 --- /dev/null +++ b/src/peft/tuners/gralora/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import GraloraConfig +from .layer import GraloraLayer +from .model import GraloraModel + + +__all__ = ["GraloraConfig", "GraloraLayer", "GraloraModel"] diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py new file mode 100644 index 0000000000..fb919fbcbf --- /dev/null +++ b/src/peft/tuners/gralora/config.py @@ -0,0 +1,82 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class GraloraConfig(PeftConfig): + r: int = field(default=8, metadata={"help": "gralora attention dimension"}) + hybrid_r: int = field( + default=0, metadata={"help": "hybrid_r is the rank allocated to vanilla LoRA method when using Hybrid GraLoRA"} + ) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with gralora." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + "Only linear layers are supported." + ) + }, + ) + gralora_alpha: int = field(default=8, metadata={"help": "gralora alpha"}) + gralora_dropout: float = field(default=0.0, metadata={"help": "gralora dropout"}) + gralora_k: int = field(default=2, metadata={"help": "gralora k"}) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + bias: str = field( + default="none", metadata={"help": "Bias type for gralora. Can be 'none', 'all' or 'gralora_only'"} + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": ( + "List of modules apart from gralora layers to be set as trainable and saved in the final checkpoint. For" + " example, in Sequence Classification or Token Classification tasks, the final layer" + " `classifier/score` are randomly initialized and as such need to be trainable and saved." + ) + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": ( + "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers" + " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only" + " the layer at this index." + ) + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer" + " pattern is not in the common layers pattern." + ) + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.GRALORA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py new file mode 100644 index 0000000000..6e6c220145 --- /dev/null +++ b/src/peft/tuners/gralora/layer.py @@ -0,0 +1,267 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
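For orientation, the `GraloraConfig` defined above is used like any other PEFT config. A minimal usage sketch (the base model and the `q_proj`/`v_proj` target names are illustrative; the only hard constraint in the implementation below is that `r - hybrid_r` is divisible by `gralora_k`):

```python
from transformers import AutoModelForCausalLM

from peft import GraloraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config = GraloraConfig(
    r=16,              # total rank per adapted layer
    gralora_alpha=32,  # scaling numerator; effective scaling is gralora_alpha / r
    gralora_k=4,       # number of diagonal blocks the weight matrix is split into
    hybrid_r=4,        # rank reserved for a vanilla LoRA component (Hybrid GraLoRA)
    target_modules=["q_proj", "v_proj"],  # assumed module names for a decoder model
)
model = get_peft_model(base_model, config)
model.print_trainable_parameters()
```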
+ +import math +from typing import Optional + +import torch +import torch.nn as nn +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTunerLayer + + +class GraloraLayer(BaseTunerLayer): + # List all names of layers that may contain adapter weight + adapter_layer_names = ("gralora_A", "gralora_B", "gralora_A_general", "gralora_B_general") + other_param_names = ("r", "hybrid_r", "gralora_alpha", "scaling", "gralora_dropout") + + def __init__(self, base_layer: nn.Module, **kwargs): + self.base_layer = base_layer + self.r = {} + self.gralora_alpha = {} + self.gralora_k = {} + self.hybrid_r = {} + self.scaling = {} + self.gralora_dropout = nn.ModuleDict({}) + + # Set to `None` otherwise to avoid computation with random weight + self.gralora_A = nn.ParameterDict({}) + self.gralora_B = nn.ParameterDict({}) + self.gralora_A_general = nn.ModuleDict({}) + self.gralora_B_general = nn.ModuleDict({}) + + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + + self.in_features = in_features + self.out_features = out_features + self.kwargs = kwargs + + def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: + """ + Move the adapter of the given name to the device of the base layer. + """ + from peft.tuners.vera.buffer_dict import BufferDict + + if device is None: + # check weight and qweight (for GPTQ) + for weight_name in ("weight", "qweight"): + weight = getattr(self.get_base_layer(), weight_name, None) + if weight is not None: + device = weight.device + dtype = weight.dtype + break + else: + # no break encountered: could not determine the device + return + + # loop through all potential adapter layers and move them to the device of the base layer; be careful to only + # move this specific adapter to the device, as the other adapters could be on different devices + # see #1639 + for adapter_layer_name in self.adapter_layer_names + self.other_param_names: + adapter_layer = getattr(self, adapter_layer_name, None) + if not isinstance(adapter_layer, (nn.ModuleDict, nn.ParameterDict, BufferDict)): + continue + if adapter_name not in adapter_layer: + continue + if weight.dtype.is_floating_point or weight.dtype.is_complex: + adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device, dtype=dtype) + else: + adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device) + + @property + def merged(self) -> bool: + return bool(self.merged_adapters) + + @property + def bias(self) -> torch.Tensor: + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + return base_layer.bias + elif isinstance(base_layer, Conv1D): + return base_layer.bias + else: + return None + + def update_layer( + self, + adapter_name, + module_name, + r, + gralora_alpha, + gralora_dropout, + gralora_k: int = 2, + hybrid_r: int = 0, + ): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.gralora_alpha[adapter_name] = gralora_alpha + self.gralora_k[adapter_name] = gralora_k + self.hybrid_r[adapter_name] = hybrid_r + + if gralora_dropout > 0.0: + 
gralora_dropout_layer = nn.Dropout(p=gralora_dropout) + else: + gralora_dropout_layer = nn.Identity() + + self.gralora_dropout.update(nn.ModuleDict({adapter_name: gralora_dropout_layer})) + + # Actual trainable parameters + subblock_in_features = self.in_features // gralora_k + subblock_out_features = self.out_features // gralora_k + + gralora_r = r - hybrid_r # gralora_r is the rank allocated to gralora method + assert gralora_r % gralora_k == 0, f"r should be divisible by gralora_k, but got {r} and {gralora_k}" + + gralora_A = nn.ParameterList() + gralora_B = nn.ParameterList() + for _ in range(gralora_k): + new_A = nn.Parameter(torch.zeros(gralora_r, subblock_in_features)) + new_B = nn.Parameter(torch.zeros(subblock_out_features, gralora_r)) + nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + gralora_A.append(new_A) + gralora_B.append(new_B) + # stack A and B and transpose to get the final shape + gralora_A = torch.stack(tuple(gralora_A), dim=0) # [N, rank, in_features//N] + gralora_A = gralora_A.transpose(1, 2).contiguous() # [N, in_features//N, rank] + + gralora_B = torch.stack(tuple(gralora_B), dim=0) # [N, out_features//N, rank] + gralora_B = gralora_B.transpose(1, 2).contiguous() # [N, rank, out_features//N] + + if hybrid_r > 0: + general_gralora_A = nn.Linear(self.in_features, hybrid_r, bias=False) + general_gralora_B = nn.Linear(hybrid_r, self.out_features, bias=False) + nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) + nn.init.zeros_(general_gralora_B.weight) + else: + general_gralora_A = nn.Identity() + general_gralora_B = nn.Identity() + + self.gralora_A[adapter_name] = gralora_A + self.gralora_B[adapter_name] = gralora_B + self.gralora_A_general[adapter_name] = general_gralora_A + self.gralora_B_general[adapter_name] = general_gralora_B + + self.module_name = module_name + + self.scaling[adapter_name] = gralora_alpha / r + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters) + + +class Linear(nn.Linear, GraloraLayer): + # Gralora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + module_name, + r: int = 0, + gralora_alpha: int = 1, + gralora_dropout: float = 0.0, + gralora_k: int = 2, + hybrid_r: int = 0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + **kwargs, + ) -> None: + # this gets the init from nn.Linear's super perspective, i.e. 
nn.Module.__init__, which should always be called + super(nn.Linear, self).__init__() + GraloraLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer(adapter_name, module_name, r, gralora_alpha, gralora_dropout, gralora_k, hybrid_r) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + raise NotImplementedError("Merging is not supported for GraloraLayer yet.") + + def unmerge(self) -> None: + raise NotImplementedError("Unmerging is not supported for GraloraLayer yet.") + + def get_delta_weight(self, adapter) -> torch.Tensor: + raise NotImplementedError("Getting delta weight is not supported for GraloraLayer yet.") + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.gralora_A.keys(): + continue + gralora_A = self.gralora_A[active_adapter] + gralora_B = self.gralora_B[active_adapter] + + gralora_A_general = self.gralora_A_general[active_adapter] + gralora_B_general = self.gralora_B_general[active_adapter] + + r = self.r[active_adapter] + gralora_k = self.gralora_k[active_adapter] + hybrid_r = self.hybrid_r[active_adapter] + + assert len(gralora_A) == len(gralora_B) + + dropout = self.gralora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + gralora_dtype = gralora_A.dtype + gralora_rank = r - hybrid_r + + B, L, in_features = x.shape + N = gralora_k + subblock_gralora_rank = gralora_rank // N + + output = torch.einsum( + "bljr, jro -> bljo", + torch.einsum( + "blni, nir -> blnr", + dropout(x.to(gralora_dtype)).view(B, L, N, in_features // N), + gralora_A, + ) + .view(B, L, N, N, subblock_gralora_rank) + .permute(0, 1, 3, 2, 4) + .reshape(B, L, N, N * subblock_gralora_rank), + gralora_B, + ).reshape(B, L, -1) + result += scaling * output.to(torch_result_dtype) + if hybrid_r > 0: + result += scaling * gralora_B_general(gralora_A_general(dropout(x.to(gralora_dtype)))).to( + torch_result_dtype + ) + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "gralora." + rep diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py new file mode 100644 index 0000000000..26ae333174 --- /dev/null +++ b/src/peft/tuners/gralora/model.py @@ -0,0 +1,377 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
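The nested einsum in `forward()` above implements the block-wise GraLoRA update, with an explicit permutation that exchanges information across blocks. A self-contained numerical sketch of that exact tensor path, with toy sizes chosen purely for illustration:

```python
import torch

# Toy sizes (illustrative): batch B, sequence length L, N diagonal blocks
B, L, in_features, out_features = 2, 3, 8, 8
N = 2                  # gralora_k
rank = 4               # gralora_r = r - hybrid_r; must be divisible by N
sub_rank = rank // N

x = torch.randn(B, L, in_features)
gralora_A = torch.randn(N, in_features // N, rank)    # [N, in//N, rank]
gralora_B = torch.randn(N, rank, out_features // N)   # [N, rank, out//N]

# Project each input block through its own A factor
t = torch.einsum("blni, nir -> blnr", x.view(B, L, N, in_features // N), gralora_A)
# Split each block's rank into N chunks and swap the block axes: this is the
# "information exchange" step that distinguishes GraLoRA from N independent LoRAs
t = (
    t.view(B, L, N, N, sub_rank)
    .permute(0, 1, 3, 2, 4)
    .reshape(B, L, N, N * sub_rank)
)
# Map through the per-block B factors and concatenate the output blocks
delta = torch.einsum("bljr, jro -> bljo", t, gralora_B).reshape(B, L, -1)
assert delta.shape == (B, L, out_features)  # scaled by gralora_alpha / r in the layer
```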
+ +from __future__ import annotations + +import re +import warnings +from dataclasses import asdict +from enum import Enum +from typing import Optional + +import torch +import torch.nn as nn +from tqdm import tqdm +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft.utils import ( + TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _get_submodules, +) + +from .config import GraloraConfig +from .layer import GraloraLayer, Linear + + +class GraloraModel(BaseTuner): + """ + Creates Vector-based Random Matrix Adaptation (Gralora) model from a pretrained transformers model. + + Args: + model ([`~transformers.PreTrainedModel`]): The model to be adapted. + config ([`GraloraConfig`]): The configuration of the Gralora model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The Gralora model. + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import GraloraConfig, get_peft_model + + >>> base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + >>> config = GraloraConfig(r=128) + >>> model = get_peft_model(base_model, config) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`GraloraConfig`]): The configuration of the Gralora model. + """ + + prefix: str = "gralora_" + + def __init__(self, model, config, adapter_name) -> None: + super().__init__(model, config, adapter_name) + + def _check_new_adapter_config(self, config: GraloraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + # the below todo is copied from LoRA + # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check + # does not fully correspond to the error message. + if (len(self.peft_config) > 1) and (config.bias != "none"): + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." + ) + + for existing_config in self.peft_config.values(): + if existing_config is config: + # skip the current config + continue + + if existing_config.projection_prng_key != config.projection_prng_key: + raise ValueError( + f"Gralora PRNG initialisation key must be the same for all adapters. Got {config.projection_prng_key=} but " + f"previous config had {existing_config.projection_prng_key}." 
+ ) + + @staticmethod + def _check_target_module_exists(gralora_config, key): + return check_target_module_exists(gralora_config, key) + + def _create_and_replace( + self, + gralora_config, + adapter_name, + target, + target_name, + parent, + current_key, + **optional_kwargs, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + pattern = re.compile(r"layers\.(\d+)\.(.+)") + match = pattern.search(current_key) + if match: + module_name = match.group(2).replace(".", "__") + else: + raise ValueError("Invalid target module type") + + r = gralora_config.r + bias = hasattr(target, "bias") and target.bias is not None + kwargs = { + "r": r, + "gralora_alpha": gralora_config.gralora_alpha, + "gralora_dropout": gralora_config.gralora_dropout, + "gralora_k": gralora_config.gralora_k, + "fan_in_fan_out": gralora_config.fan_in_fan_out, + "hybrid_r": gralora_config.hybrid_r, + } + kwargs["bias"] = bias + + if isinstance(target, Linear): + target.update_layer( + adapter_name, + module_name, + r, + gralora_config.gralora_alpha, + gralora_config.gralora_dropout, + gralora_config.gralora_k, + gralora_config.hybrid_r, + ) + else: + new_module = self._create_new_module(gralora_config, adapter_name, target, module_name, **kwargs) + if adapter_name not in self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _replace_module(parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if "gralora_" in name: + module.to(child.weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = self.peft_config[active_adapter].bias + if bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "gralora_only": + for m in model.modules(): + if isinstance(m, GraloraLayer) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(gralora_config, adapter_name, target, module_name, **kwargs): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." 
+ ) + kwargs["fan_in_fan_out"] = gralora_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + kwargs["is_target_conv_1d_layer"] = True + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = gralora_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`." + ) + new_module = Linear( + target, + adapter_name, + module_name, + **kwargs, + ) + + return new_module + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + for active_adapter in self.active_adapters: + val = self.peft_config[active_adapter].bias + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name): + for module in self.model.modules(): + if isinstance(module, GraloraLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. 
Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + # we cannot use self.prefix as we want to include non-trainable gralora parameters + key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + setattr(parent, target_name, target.modules_to_save[target.active_adapter]) + + return self.model + + def delete_adapter(self, adapter_name: str): + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + # we cannot use self.prefix as we want to include non-trainable gralora parameters + key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, GraloraLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapter[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ): + r""" + This method merges the Gralora layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lrasa-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self): + """ + Gets back the base model by removing all the Gralora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 8f55a8f2b8..ddac0c8c70 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -48,6 +48,7 @@ class PeftType(str, enum.Enum): - WAVEFT - OSF - DELORA + - GRALORA """ PROMPT_TUNING = "PROMPT_TUNING" @@ -80,6 +81,7 @@ class PeftType(str, enum.Enum): WAVEFT = "WAVEFT" OSF = "OSF" DELORA = "DELORA" + GRALORA = "GRALORA" class TaskType(str, enum.Enum): From bfa1ef7633c76274e31509e3d7289fadd8a85e60 Mon Sep 17 00:00:00 2001 From: HaohanTsao Date: Thu, 16 Oct 2025 16:37:49 +0800 Subject: [PATCH 02/11] ENH Support merge/unmerge in GraLoRA functionality; support init_weights parameter for flexible initialization --- src/peft/tuners/gralora/__init__.py | 6 +- src/peft/tuners/gralora/config.py | 12 +- src/peft/tuners/gralora/layer.py | 222 ++++++++++++++++++++++++++-- src/peft/tuners/gralora/model.py | 39 ++--- 4 files changed, 233 insertions(+), 46 deletions(-) diff --git a/src/peft/tuners/gralora/__init__.py b/src/peft/tuners/gralora/__init__.py index db1927c442..830e0a477c 100644 --- a/src/peft/tuners/gralora/__init__.py +++ b/src/peft/tuners/gralora/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from peft.utils import register_peft_method + from .config import GraloraConfig from .layer import GraloraLayer from .model import GraloraModel __all__ = ["GraloraConfig", "GraloraLayer", "GraloraModel"] + +register_peft_method(name="gralora", config_cls=GraloraConfig, model_cls=GraloraModel) diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py index fb919fbcbf..9e78b81afa 100644 --- a/src/peft/tuners/gralora/config.py +++ b/src/peft/tuners/gralora/config.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -55,6 +55,15 @@ class GraloraConfig(PeftConfig): ) }, ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the GraLoRA layers with their default initialization. " + "Don't change this setting, except if you know exactly what you're doing." 
+ ) + }, + ) layers_to_transform: Optional[Union[list[int], int]] = field( default=None, metadata={ @@ -76,6 +85,7 @@ class GraloraConfig(PeftConfig): ) def __post_init__(self): + super().__post_init__() self.peft_type = PeftType.GRALORA self.target_modules = ( set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 6e6c220145..926469f8a0 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. import math +import warnings from typing import Optional import torch @@ -20,6 +21,7 @@ from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils.other import transpose class GraloraLayer(BaseTunerLayer): @@ -62,7 +64,7 @@ def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optio """ Move the adapter of the given name to the device of the base layer. """ - from peft.tuners.vera.buffer_dict import BufferDict + from peft.tuners._buffer_dict import BufferDict if device is None: # check weight and qweight (for GPTQ) @@ -113,6 +115,7 @@ def update_layer( gralora_dropout, gralora_k: int = 2, hybrid_r: int = 0, + init_weights: bool = True, ): if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") @@ -141,21 +144,34 @@ def update_layer( for _ in range(gralora_k): new_A = nn.Parameter(torch.zeros(gralora_r, subblock_in_features)) new_B = nn.Parameter(torch.zeros(subblock_out_features, gralora_r)) - nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + if init_weights: + # Initialize to identity: A is random, B is zero + nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + # new_B is already initialized to zeros + else: + # Initialize to random: both A and B are random (for testing) + nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + nn.init.kaiming_uniform_(new_B, a=math.sqrt(5)) gralora_A.append(new_A) gralora_B.append(new_B) # stack A and B and transpose to get the final shape - gralora_A = torch.stack(tuple(gralora_A), dim=0) # [N, rank, in_features//N] - gralora_A = gralora_A.transpose(1, 2).contiguous() # [N, in_features//N, rank] + gralora_A = torch.stack(tuple(gralora_A), dim=0) # [N, gralora_r, in_features//N] + gralora_A = gralora_A.transpose(1, 2).contiguous() # [N, in_features//N, gralora_r] - gralora_B = torch.stack(tuple(gralora_B), dim=0) # [N, out_features//N, rank] - gralora_B = gralora_B.transpose(1, 2).contiguous() # [N, rank, out_features//N] + gralora_B = torch.stack(tuple(gralora_B), dim=0) # [N, out_features//N, gralora_r] + gralora_B = gralora_B.transpose(1, 2).contiguous() # [N, gralora_r, out_features//N] if hybrid_r > 0: general_gralora_A = nn.Linear(self.in_features, hybrid_r, bias=False) general_gralora_B = nn.Linear(hybrid_r, self.out_features, bias=False) - nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) - nn.init.zeros_(general_gralora_B.weight) + if init_weights: + # Initialize to identity: A is random, B is zero + nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) + nn.init.zeros_(general_gralora_B.weight) + else: + # Initialize to random: both A and B are random (for testing) 
+ nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(general_gralora_B.weight, a=math.sqrt(5)) else: general_gralora_A = nn.Identity() general_gralora_B = nn.Identity() @@ -185,6 +201,7 @@ def __init__( gralora_k: int = 2, hybrid_r: int = 0, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + init_weights: bool = True, **kwargs, ) -> None: # this gets the init from nn.Linear's super perspective, i.e. nn.Module.__init__, which should always be called @@ -193,16 +210,176 @@ def __init__( self.fan_in_fan_out = fan_in_fan_out self._active_adapter = adapter_name - self.update_layer(adapter_name, module_name, r, gralora_alpha, gralora_dropout, gralora_k, hybrid_r) + self.update_layer( + adapter_name, module_name, r, gralora_alpha, gralora_dropout, gralora_k, hybrid_r, init_weights + ) def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - raise NotImplementedError("Merging is not supported for GraloraLayer yet.") + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + from peft.tuners.tuners_utils import check_adapters_to_merge + + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.gralora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + delta_weight = self.get_delta_weight(active_adapter) + orig_weights += delta_weight + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + delta_weight = self.get_delta_weight(active_adapter) + base_layer.weight.data += delta_weight + + self.merged_adapters.append(active_adapter) def unmerge(self) -> None: - raise NotImplementedError("Unmerging is not supported for GraloraLayer yet.") + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.gralora_A.keys(): + delta_weight = self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= delta_weight def get_delta_weight(self, adapter) -> torch.Tensor: - raise NotImplementedError("Getting delta weight is not supported for GraloraLayer yet.") + """ + Compute the delta weight for GraLoRA adapter. + + GraLoRA applies block-wise low-rank adaptation with information exchange. + This method computes the equivalent weight matrix that would be added to + the base weight during merge. 
+ + Args: + adapter (str): The name of the adapter + + Returns: + torch.Tensor: The delta weight matrix with shape [out_features, in_features] + """ + gralora_A = self.gralora_A[adapter] # [N, in_features//N, rank] + gralora_B = self.gralora_B[adapter] # [N, rank, out_features//N] + gralora_A_general = self.gralora_A_general[adapter] + gralora_B_general = self.gralora_B_general[adapter] + + device = gralora_A.device + dtype = gralora_A.dtype + + gralora_k = self.gralora_k[adapter] + hybrid_r = self.hybrid_r[adapter] + r = self.r[adapter] + + # Handle CPU fp16/bf16 casting + cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + + if cast_to_fp32: + gralora_A = gralora_A.float() + gralora_B = gralora_B.float() + + # Get dimensions + in_features = self.in_features + out_features = self.out_features + subblock_in = in_features // gralora_k + subblock_out = out_features // gralora_k + gralora_rank = r - hybrid_r + subblock_gralora_rank = gralora_rank // gralora_k + + # Simulate the forward pass computation to get equivalent weight matrix + # We need to compute: W_delta such that W_delta @ x = gralora_forward(x) - base_forward(x) + + # Create an identity matrix for each input dimension and compute output + # This gives us the columns of the weight matrix + delta_weight = torch.zeros(out_features, in_features, device=device, dtype=gralora_A.dtype) + + # Process in batches to avoid memory issues + batch_size = min(256, in_features) + for start_idx in range(0, in_features, batch_size): + end_idx = min(start_idx + batch_size, in_features) + batch_len = end_idx - start_idx + + # Create identity input: [batch_len, in_features] + x = torch.zeros(batch_len, in_features, device=device, dtype=gralora_A.dtype) + for i in range(batch_len): + x[i, start_idx + i] = 1.0 + + # Apply GraLoRA transformation (following forward logic) + # x shape: [batch_len, in_features] + N = gralora_k + + # Reshape x: [batch_len, N, in_features//N] + x_reshaped = x.view(batch_len, N, in_features // N) + + # Apply gralora_A: [batch_len, N, in_features//N] @ [N, in_features//N, rank] + # Result: [batch_len, N, rank] + temp = torch.einsum("bni, nir -> bnr", x_reshaped, gralora_A) + + # Reshape and permute for information exchange + # [batch_len, N, rank] -> [batch_len, N, N, subblock_rank] + temp = temp.view(batch_len, N, N, subblock_gralora_rank) + # Permute: [batch_len, N, N, subblock_rank] -> [batch_len, N, N, subblock_rank] + temp = temp.permute(0, 2, 1, 3) + # Reshape: [batch_len, N, N * subblock_rank] + temp = temp.reshape(batch_len, N, N * subblock_gralora_rank) + + # Apply gralora_B: [batch_len, N, N*subblock_rank] @ [N, rank, out_features//N] + # Note: rank here is actually gralora_rank = N * subblock_gralora_rank + # Result: [batch_len, N, out_features//N] + output = torch.einsum("bnr, nro -> bno", temp, gralora_B) + + # Reshape to [batch_len, out_features] + output = output.reshape(batch_len, out_features) + + # Store in delta_weight (transpose because weight is [out, in]) + delta_weight[:, start_idx:end_idx] = output.T + + # Add hybrid LoRA component if present + if hybrid_r > 0: + # general_A: [in_features, hybrid_r], general_B: [hybrid_r, out_features] + weight_A_general = gralora_A_general.weight # [hybrid_r, in_features] + weight_B_general = gralora_B_general.weight # [out_features, hybrid_r] + + if cast_to_fp32: + weight_A_general = weight_A_general.float() + weight_B_general = weight_B_general.float() + + # Compute delta for hybrid part: [out_features, hybrid_r] @ [hybrid_r, 
in_features] + delta_weight += weight_B_general @ weight_A_general + + # Apply scaling and transpose if needed + delta_weight = transpose(delta_weight, self.fan_in_fan_out) * self.scaling[adapter] + + # Cast back if needed + if cast_to_fp32: + delta_weight = delta_weight.to(dtype=dtype) + + return delta_weight def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: previous_dtype = x.dtype @@ -216,6 +393,13 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: else: result = self.base_layer(x, *args, **kwargs) torch_result_dtype = result.dtype + + # Handle 2D input: [batch, features] -> [batch, 1, features] + # This is common for MLPs and other non-sequence models + x_is_2d = x.ndim == 2 + if x_is_2d: + x = x.unsqueeze(1) # [B, F] -> [B, 1, F] + for active_adapter in self.active_adapters: if active_adapter not in self.gralora_A.keys(): continue @@ -253,11 +437,17 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: .reshape(B, L, N, N * subblock_gralora_rank), gralora_B, ).reshape(B, L, -1) + + # Squeeze back to 2D if input was 2D + if x_is_2d: + output = output.squeeze(1) # [B, 1, F] -> [B, F] + result += scaling * output.to(torch_result_dtype) if hybrid_r > 0: - result += scaling * gralora_B_general(gralora_A_general(dropout(x.to(gralora_dtype)))).to( - torch_result_dtype - ) + hybrid_output = gralora_B_general(gralora_A_general(dropout(x.to(gralora_dtype)))) + if x_is_2d: + hybrid_output = hybrid_output.squeeze(1) + result += scaling * hybrid_output.to(torch_result_dtype) result = result.to(previous_dtype) return result diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index 26ae333174..c5c159dacf 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ from __future__ import annotations -import re import warnings from dataclasses import asdict from enum import Enum @@ -27,7 +26,7 @@ from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists from peft.utils import ( - TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, ModulesToSaveWrapper, _get_submodules, ) @@ -85,17 +84,6 @@ def _check_new_adapter_config(self, config: GraloraConfig) -> None: "set bias to 'none' for all adapters." ) - for existing_config in self.peft_config.values(): - if existing_config is config: - # skip the current config - continue - - if existing_config.projection_prng_key != config.projection_prng_key: - raise ValueError( - f"Gralora PRNG initialisation key must be the same for all adapters. Got {config.projection_prng_key=} but " - f"previous config had {existing_config.projection_prng_key}." 
- ) - @staticmethod def _check_target_module_exists(gralora_config, key): return check_target_module_exists(gralora_config, key) @@ -113,13 +101,6 @@ def _create_and_replace( if current_key is None: raise ValueError("Current Key shouldn't be `None`") - pattern = re.compile(r"layers\.(\d+)\.(.+)") - match = pattern.search(current_key) - if match: - module_name = match.group(2).replace(".", "__") - else: - raise ValueError("Invalid target module type") - r = gralora_config.r bias = hasattr(target, "bias") and target.bias is not None kwargs = { @@ -129,22 +110,24 @@ def _create_and_replace( "gralora_k": gralora_config.gralora_k, "fan_in_fan_out": gralora_config.fan_in_fan_out, "hybrid_r": gralora_config.hybrid_r, + "init_weights": gralora_config.init_weights, } kwargs["bias"] = bias if isinstance(target, Linear): target.update_layer( adapter_name, - module_name, + current_key, r, gralora_config.gralora_alpha, gralora_config.gralora_dropout, gralora_config.gralora_k, gralora_config.hybrid_r, + gralora_config.init_weights, ) else: - new_module = self._create_new_module(gralora_config, adapter_name, target, module_name, **kwargs) - if adapter_name not in self.active_adapter: + new_module = self._create_new_module(gralora_config, adapter_name, target, current_key, **kwargs) + if adapter_name not in self.active_adapters: # adding an additional adapter: it is not automatically trainable new_module.requires_grad_(False) self._replace_module(parent, target_name, new_module, target) @@ -267,22 +250,22 @@ def disable_adapter_layers(self): warnings.warn(msg) self._set_adapter_layers(enabled=False) - def set_adapter(self, adapter_name): + def set_adapter(self, adapter_name, inference_mode: bool = False): for module in self.model.modules(): if isinstance(module, GraloraLayer): if module.merged: warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") module.unmerge() - module.set_adapter(adapter_name) + module.set_adapter(adapter_name, inference_mode=inference_mode) self.active_adapter = adapter_name @staticmethod def _prepare_adapter_config(peft_config, model_config): if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: raise ValueError("Please specify `target_modules` in `peft_config`") peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] ) return peft_config From 9813b170bb949f1169505fb5427fd04fd37f0732 Mon Sep 17 00:00:00 2001 From: HaohanTsao Date: Thu, 16 Oct 2025 16:54:05 +0800 Subject: [PATCH 03/11] TST Add test suite for GraLoRA. --- src/peft/tuners/gralora/layer.py | 9 +- src/peft/tuners/gralora/model.py | 4 +- tests/test_config.py | 2 + tests/test_decoder_models.py | 25 ++ tests/test_gralora.py | 533 +++++++++++++++++++++++++++++++ 5 files changed, 566 insertions(+), 7 deletions(-) create mode 100644 tests/test_gralora.py diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 926469f8a0..907730d49b 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -224,8 +224,8 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. 
adapter_names (`list[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. - Defaults to `None`. + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ from peft.tuners.tuners_utils import check_adapters_to_merge @@ -274,9 +274,8 @@ def get_delta_weight(self, adapter) -> torch.Tensor: """ Compute the delta weight for GraLoRA adapter. - GraLoRA applies block-wise low-rank adaptation with information exchange. - This method computes the equivalent weight matrix that would be added to - the base weight during merge. + GraLoRA applies block-wise low-rank adaptation with information exchange. This method computes the equivalent + weight matrix that would be added to the base weight during merge. Args: adapter (str): The name of the adapter diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index c5c159dacf..b7f6e15097 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -323,8 +323,8 @@ def merge_and_unload( self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None ): r""" - This method merges the Gralora layers into the base model. This is needed if someone wants to use the base model - as a standalone model. + This method merges the Gralora layers into the base model. This is needed if someone wants to use the base + model as a standalone model. Args: progressbar (`bool`): diff --git a/tests/test_config.py b/tests/test_config.py index 9277d3bb68..5cb7523d84 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -27,6 +27,7 @@ BoneConfig, C3AConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LNTuningConfig, @@ -64,6 +65,7 @@ (BoneConfig, {}), (C3AConfig, {}), (FourierFTConfig, {}), + (GraloraConfig, {}), (HRAConfig, {}), (IA3Config, {}), (LNTuningConfig, {}), diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index 5b23fa74e2..acb0d9c7d2 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -34,6 +34,7 @@ CPTConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -137,6 +138,30 @@ "target_modules": None, }, ), + ( + GraloraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "gralora_alpha": 16, + "target_modules": None, + "gralora_dropout": 0.05, + "gralora_k": 2, + "hybrid_r": 0, + }, + ), + ( + GraloraConfig, + { + "task_type": "CAUSAL_LM", + "r": 16, + "gralora_alpha": 32, + "target_modules": None, + "gralora_dropout": 0.05, + "gralora_k": 4, + "hybrid_r": 4, + }, + ), ( HRAConfig, { diff --git a/tests/test_gralora.py b/tests/test_gralora.py new file mode 100644 index 0000000000..af43ac22e4 --- /dev/null +++ b/tests/test_gralora.py @@ -0,0 +1,533 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
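The test suite that follows leans on the equivalence between `get_delta_weight` and the adapter's forward contribution. A small sanity-check sketch of that relationship (the `Tiny` module, sizes, and tolerance are made up for illustration; only the GraLoRA API calls come from the patch above):

```python
import torch

from peft import get_peft_model
from peft.tuners.gralora import GraloraConfig


class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.lin(x)


torch.manual_seed(0)
config = GraloraConfig(target_modules=["lin"], r=8, gralora_k=2, init_weights=False)
model = get_peft_model(Tiny(), config)

layer = model.base_model.model.lin          # the GraLoRA-wrapped Linear
delta = layer.get_delta_weight("default")   # [out_features, in_features], scaling included

x = torch.randn(4, 16)
expected = layer.base_layer(x) + x @ delta.T
assert torch.allclose(model(x), expected, atol=1e-5)
```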
+ +# This test file is for tests specific to GraLoRA, since GraLoRA has some specific features +# like block-diagonal structure, hybrid mode, and tensor permutation for information exchange. + +import pytest +import torch +from safetensors import safe_open +from torch import nn + +from peft import PeftModel, get_peft_model +from peft.tuners.gralora import GraloraConfig + + +class MLP(nn.Module): + """Simple MLP for testing""" + + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + +class TestGralora: + @pytest.fixture + def mlp(self): + torch.manual_seed(0) + model = MLP() + return model + + @pytest.fixture + def mlp_gralora_pure(self, mlp): + """Pure GraLoRA without hybrid component""" + torch.manual_seed(0) + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=0, + gralora_alpha=32, + gralora_dropout=0.1, + ) + peft_model = get_peft_model(mlp, config) + return peft_model + + @pytest.fixture + def mlp_gralora_hybrid(self): + """Hybrid GraLoRA with vanilla LoRA component""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=4, + gralora_alpha=32, + gralora_dropout=0.1, + ) + peft_model = get_peft_model(mlp, config) + return peft_model + + def test_gralora_config_validation(self): + """Test that config validation works correctly""" + # Valid config + config = GraloraConfig(r=16, gralora_k=4, hybrid_r=0) + assert config.r == 16 + assert config.gralora_k == 4 + assert config.hybrid_r == 0 + + # Hybrid config + config = GraloraConfig(r=16, gralora_k=4, hybrid_r=4) + assert config.r == 16 + assert config.hybrid_r == 4 + + def test_gralora_parameter_shapes(self, mlp_gralora_hybrid): + """Test that GraLoRA parameters have correct shapes""" + for name, module in mlp_gralora_hybrid.named_modules(): + if hasattr(module, "gralora_A"): + adapter_name = "default" + gralora_A = module.gralora_A[adapter_name] + gralora_B = module.gralora_B[adapter_name] + gralora_A_general = module.gralora_A_general[adapter_name] + gralora_B_general = module.gralora_B_general[adapter_name] + + in_features = module.in_features + out_features = module.out_features + k = 4 + gralora_rank = 16 - 4 # r - hybrid_r + + # Check GraLoRA block shapes + # Each block has full gralora_rank, not gralora_rank // k + assert gralora_A.shape == (k, in_features // k, gralora_rank) + assert gralora_B.shape == (k, gralora_rank, out_features // k) + + # Check hybrid component shapes + assert gralora_A_general.weight.shape == (4, in_features) + assert gralora_B_general.weight.shape == (out_features, 4) + + def test_gralora_block_diagonal_structure(self): + """Test that pure GraLoRA produces block-diagonal delta weights""" + # Use init_weights=False to have non-zero B matrices + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=0, + init_weights=False, # Both A and B initialized randomly + ) + model = get_peft_model(mlp, config) + + for name, module in model.named_modules(): + if hasattr(module, 
"get_delta_weight"): + adapter_name = "default" + delta_weight = module.get_delta_weight(adapter_name) + + k = 4 + in_features = module.in_features + out_features = module.out_features + block_size_in = in_features // k + block_size_out = out_features // k + + # Check diagonal blocks have non-zero values + for i in range(k): + row_start = i * block_size_out + row_end = (i + 1) * block_size_out + col_start = i * block_size_in + col_end = (i + 1) * block_size_in + + block = delta_weight[row_start:row_end, col_start:col_end] + block_norm = torch.norm(block).item() + # Diagonal blocks should have some values (initialized with kaiming) + assert block_norm > 0, f"Diagonal block [{i},{i}] is zero" + + def test_gralora_forward_pass(self, mlp_gralora_hybrid): + """Test that forward pass works without errors""" + mlp_gralora_hybrid.eval() + x = torch.randn(5, 10) + + with torch.no_grad(): + output = mlp_gralora_hybrid(x) + + assert output.shape == (5, 2) + assert not torch.isnan(output).any() + assert not torch.isinf(output).any() + + def test_gralora_backward_pass(self, mlp_gralora_hybrid): + """Test that backward pass computes gradients correctly""" + mlp_gralora_hybrid.train() + x = torch.randn(5, 10) + + output = mlp_gralora_hybrid(x) + loss = output.sum() + loss.backward() + + # Check that GraLoRA parameters have gradients + for name, param in mlp_gralora_hybrid.named_parameters(): + if "gralora" in name and param.requires_grad: + assert param.grad is not None, f"Parameter {name} has no gradient" + assert not torch.isnan(param.grad).any(), f"Parameter {name} has NaN gradients" + + def test_gralora_pure_vs_hybrid_params(self): + """Test that pure and hybrid modes have same total parameters but different distribution""" + torch.manual_seed(0) + mlp_pure = MLP() + config_pure = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=0, + ) + model_pure = get_peft_model(mlp_pure, config_pure) + + torch.manual_seed(0) + mlp_hybrid = MLP() + config_hybrid = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=4, + ) + model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) + + def count_trainable_params(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + params_pure = count_trainable_params(model_pure) + params_hybrid = count_trainable_params(model_hybrid) + + # Pure and hybrid should have same total parameters (r is constant) + # but distributed differently between block-diagonal and full-rank components + assert params_pure == params_hybrid, ( + f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" + ) + + # Check that hybrid has general components + has_general = False + for name, _ in model_hybrid.named_modules(): + if "gralora_A_general" in name or "gralora_B_general" in name: + has_general = True + break + assert has_general, "Hybrid mode should have general components" + + def test_gralora_save_load_roundtrip(self, mlp_gralora_hybrid, tmp_path): + """Test that save/load preserves model behavior""" + mlp_gralora_hybrid.eval() + x = torch.randn(5, 10) + + # Get output before save + with torch.no_grad(): + output_before = mlp_gralora_hybrid(x) + + # Save adapter + mlp_gralora_hybrid.save_pretrained(tmp_path) + + # Load adapter + torch.manual_seed(0) + new_mlp = MLP() + loaded_model = PeftModel.from_pretrained(new_mlp, tmp_path) + loaded_model.eval() + + # Get output after load + with torch.no_grad(): + output_after = loaded_model(x) + + # Outputs should be very close + assert 
torch.allclose(output_before, output_after, atol=1e-5, rtol=1e-5) + + def test_gralora_state_dict_structure(self, mlp_gralora_hybrid, tmp_path): + """Test that state dict contains only necessary parameters""" + mlp_gralora_hybrid.save_pretrained(tmp_path) + + # Load state dict + sd = {} + with safe_open(tmp_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd[key] = f.get_tensor(key) + + # Check that gralora parameters are present + assert any("gralora_A" in key for key in sd), "gralora_A not found in state dict" + assert any("gralora_B" in key for key in sd), "gralora_B not found in state dict" + + # For hybrid mode, check hybrid components + assert any("gralora_A_general" in key for key in sd), "gralora_A_general not found" + assert any("gralora_B_general" in key for key in sd), "gralora_B_general not found" + + def test_gralora_merge_and_unload(self, mlp_gralora_hybrid): + """Test merge_and_unload functionality""" + mlp_gralora_hybrid.eval() + x = torch.randn(5, 10) + + # Get output before merge + with torch.no_grad(): + output_before = mlp_gralora_hybrid(x) + + # Merge and unload + merged_model = mlp_gralora_hybrid.merge_and_unload() + merged_model.eval() + + # Get output after merge + with torch.no_grad(): + output_after = merged_model(x) + + # Outputs should be very close + assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) + + # Check that merged model has no GraLoRA layers + has_gralora = any("gralora" in name for name, _ in merged_model.named_parameters()) + assert not has_gralora, "Merged model still has GraLoRA parameters" + + def test_gralora_merge_unmerge(self): + """Test merge/unmerge functionality""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + model.eval() + + x = torch.randn(5, 10) + + # Output before merge + with torch.no_grad(): + output_before = model(x) + + # Merge adapter using PEFT API + model.merge_adapter() + + with torch.no_grad(): + output_merged = model(x) + + # Outputs should be the same after merge + assert torch.allclose(output_before, output_merged, atol=1e-4, rtol=1e-4) + + # Unmerge adapter using PEFT API + model.unmerge_adapter() + + with torch.no_grad(): + output_unmerged = model(x) + + # Outputs should be the same after unmerge + assert torch.allclose(output_before, output_unmerged, atol=1e-4, rtol=1e-4) + + def test_gralora_multiple_adapters(self): + """Test adding and switching between multiple adapters""" + torch.manual_seed(0) + mlp = MLP() + + # Use init_weights=False to have non-zero outputs + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + torch.manual_seed(42) # Different seed for second adapter + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) + model.add_adapter("adapter2", config2) + + x = torch.randn(5, 10) + + # Test adapter1 + model.set_adapter("adapter1") + with torch.no_grad(): + output1 = model(x) + + # Test adapter2 + model.set_adapter("adapter2") + with torch.no_grad(): + output2 = model(x) + + # Different adapters should give different outputs + assert not torch.allclose(output1, output2, atol=1e-3, rtol=1e-3) + + def test_gralora_dtype_compatibility(self): + """Test that GraLoRA works with different dtypes""" + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + if 
dtype == torch.bfloat16 and not torch.cuda.is_available(): + # Skip bfloat16 on CPU if not supported + continue + + torch.manual_seed(0) + mlp = MLP().to(dtype) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10).to(dtype) + output = model(x) + + assert output.dtype == dtype, f"Output dtype mismatch for {dtype}" + + def test_gralora_disable_adapters(self): + """Test disabling adapters""" + torch.manual_seed(0) + mlp = MLP() + # Use init_weights=False to have non-zero effect + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=4, + init_weights=False, + ) + model = get_peft_model(mlp, config) + model.eval() + x = torch.randn(5, 10) + + # Output with adapter enabled + with torch.no_grad(): + output_enabled = model(x) + + # Output with adapter disabled + with model.disable_adapter(): + with torch.no_grad(): + output_disabled = model(x) + + # Outputs should be different + assert not torch.allclose(output_enabled, output_disabled, atol=1e-6, rtol=1e-6) + + def test_gralora_different_k_values(self): + """Test GraLoRA with different k values""" + for k in [2, 4]: + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=k * 4, # Make sure r is divisible by k + gralora_k=k, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + output = model(x) + + assert output.shape == (5, 2) + assert not torch.isnan(output).any() + + def test_gralora_rank_divisibility_check(self): + """Test that invalid rank/k combinations raise errors""" + torch.manual_seed(0) + mlp = MLP() + + # This should raise an error because (r - hybrid_r) is not divisible by k + # r=15, hybrid_r=0, k=4 -> gralora_rank=15, 15 % 4 != 0 + config = GraloraConfig( + target_modules=["lin1"], + r=15, + gralora_k=4, + hybrid_r=0, + ) + + with pytest.raises(AssertionError, match="r should be divisible by gralora_k"): + get_peft_model(mlp, config) + + def test_gralora_trainable_parameters_only(self, mlp_gralora_hybrid): + """Test that only GraLoRA parameters are trainable""" + for name, param in mlp_gralora_hybrid.named_parameters(): + if "gralora" in name or "modules_to_save" in name: + assert param.requires_grad, f"GraLoRA parameter {name} should be trainable" + else: + assert not param.requires_grad, f"Base parameter {name} should be frozen" + + def test_gralora_save_pretrained_files(self, mlp_gralora_hybrid, tmp_path): + """Test that save_pretrained creates expected files""" + mlp_gralora_hybrid.save_pretrained(tmp_path) + + # Check for config file + assert (tmp_path / "adapter_config.json").exists() + + # Check for weights file (either .bin or .safetensors) + assert (tmp_path / "adapter_model.safetensors").exists() or (tmp_path / "adapter_model.bin").exists() + + def test_gralora_information_exchange_via_permutation(self, mlp_gralora_pure): + """ + Test that information exchange happens through tensor permutation. Even though delta weights are + block-diagonal, the forward pass should allow information flow between blocks via the permutation operation. 
+ """ + mlp_gralora_pure.eval() + + # Create two inputs that differ only in specific blocks + x1 = torch.randn(1, 10) + x2 = x1.clone() + + # Modify only the first block (assuming k=4, block size = 10//4 = 2.5, rounded to 2-3 features) + x2[0, :5] += 1.0 # Modify first block + + with torch.no_grad(): + out1 = mlp_gralora_pure(x1) + out2 = mlp_gralora_pure(x2) + + # Due to information exchange, changing one block should affect all outputs + # (not just outputs corresponding to that block) + diff = (out1 - out2).abs() + + # All output dimensions should be affected (not just the first block's outputs) + assert (diff > 1e-6).all(), "Information exchange not happening correctly" + + def test_gralora_scaling_factor(self): + """Test that scaling factor is correctly applied""" + torch.manual_seed(0) + mlp = MLP() + + # Create two configs with different alpha values + config_alpha16 = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_alpha=16, + gralora_k=2, + hybrid_r=0, + ) + + config_alpha32 = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_alpha=32, + gralora_k=2, + hybrid_r=0, + ) + + model_alpha16 = get_peft_model(MLP(), config_alpha16) + model_alpha32 = get_peft_model(MLP(), config_alpha32) + + # Copy weights to make them identical except for scaling + for (n1, p1), (n2, p2) in zip(model_alpha16.named_parameters(), model_alpha32.named_parameters()): + if "gralora" in n1: + p2.data = p1.data.clone() + + x = torch.randn(5, 10) + + model_alpha16.eval() + model_alpha32.eval() + + with torch.no_grad(): + out1 = model_alpha16(x) + out2 = model_alpha32(x) + + # Outputs should be different due to different scaling + assert not torch.allclose(out1, out2, atol=1e-6, rtol=1e-6) From c1fe6c4ae497e5da6295ad9d1f59d5246753a969 Mon Sep 17 00:00:00 2001 From: HaohanTsao Date: Fri, 17 Oct 2025 11:30:53 +0800 Subject: [PATCH 04/11] FIX & TEST: Fix GraLoRA bugs in get_peft_config_as_dict and improve test coverage --- src/peft/tuners/gralora/model.py | 4 +- tests/test_gralora.py | 549 +++++++++++++++++++++++++++++++ 2 files changed, 551 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index b7f6e15097..0273b30ab5 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -228,8 +228,8 @@ def get_peft_config_as_dict(self, inference: bool = False): config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} if inference: config["inference_mode"] = True - config_dict[key] = config - return config + config_dict[key] = config + return config_dict def _set_adapter_layers(self, enabled=True): for module in self.model.modules(): diff --git a/tests/test_gralora.py b/tests/test_gralora.py index af43ac22e4..7e2ca5a078 100644 --- a/tests/test_gralora.py +++ b/tests/test_gralora.py @@ -531,3 +531,552 @@ def test_gralora_scaling_factor(self): # Outputs should be different due to different scaling assert not torch.allclose(out1, out2, atol=1e-6, rtol=1e-6) + + def test_gralora_safe_merge_success(self): + """Test safe_merge with valid weights""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + with torch.no_grad(): + output_before = model(x) + + # Test safe merge + model.base_model.model.lin1.merge(safe_merge=True) + + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, 
atol=1e-4, rtol=1e-4) + + def test_gralora_safe_merge_detects_nan(self): + """Test that safe_merge detects NaN values""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + + # Inject NaN into adapter weights (use .data to avoid requires_grad error) + model.base_model.model.lin1.gralora_A["default"].data[0, 0, 0] = float("nan") + + # safe_merge should raise ValueError + with pytest.raises(ValueError, match="NaNs detected"): + model.base_model.model.lin1.merge(safe_merge=True) + + def test_gralora_unmerge_warning_when_not_merged(self): + """Test that unmerge warns when already unmerged""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + # Try to unmerge without merging first + with pytest.warns(UserWarning, match="Already unmerged"): + model.base_model.model.lin1.unmerge() + + def test_gralora_hybrid_forward_computation(self): + """Test that hybrid LoRA component is used in forward pass""" + torch.manual_seed(0) + mlp_hybrid = MLP() + mlp_pure = MLP() + + config_hybrid = GraloraConfig( + target_modules=["lin1"], + r=16, + gralora_k=4, + hybrid_r=4, + init_weights=False, + ) + model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) + + config_pure = GraloraConfig( + target_modules=["lin1"], + r=16, + gralora_k=4, + hybrid_r=0, + init_weights=False, + ) + model_pure = get_peft_model(mlp_pure, config_pure) + + x = torch.randn(5, 10) + + with torch.no_grad(): + output_hybrid = model_hybrid(x) + output_pure = model_pure(x) + + # Outputs should be different due to hybrid component + assert not torch.allclose(output_hybrid, output_pure, atol=1e-3) + + def test_gralora_invalid_rank_zero(self): + """Test that r=0 raises error""" + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=0, gralora_k=2) + + with pytest.raises(ValueError, match="`r` should be a positive integer"): + get_peft_model(mlp, config) + + def test_gralora_invalid_rank_negative(self): + """Test that negative r raises error""" + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=-1, gralora_k=2) + + with pytest.raises(ValueError, match="`r` should be a positive integer"): + get_peft_model(mlp, config) + + def test_gralora_bias_all(self): + """Test bias='all' configuration""" + torch.manual_seed(0) + mlp = MLP(bias=True) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + bias="all", + ) + model = get_peft_model(mlp, config) + + # Check that all bias parameters are trainable + bias_params = [name for name, param in model.named_parameters() if "bias" in name and param.requires_grad] + assert len(bias_params) > 0, "At least some bias parameters should be trainable" + + def test_gralora_bias_gralora_only(self): + """Test bias='gralora_only' configuration""" + torch.manual_seed(0) + mlp = MLP(bias=True) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + bias="gralora_only", + ) + model = get_peft_model(mlp, config) + + # Only GraLoRA layer biases should be trainable + assert model.base_model.model.lin1.bias.requires_grad + assert not model.base_model.model.lin0.bias.requires_grad + + def test_gralora_multiple_adapters_with_bias_raises(self): + """Test that multiple adapters with bias raises error""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") + model = get_peft_model(mlp, 
config1) + + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") + + with pytest.raises(ValueError, match="supports only 1 adapter with bias"): + model.add_adapter("adapter2", config2) + + def test_gralora_cpu_fp16_merge(self): + """Test merge with fp16 on CPU""" + torch.manual_seed(0) + mlp = MLP().to(torch.float16) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10).to(torch.float16) + + with torch.no_grad(): + output_before = model(x) + + # Merge (should handle CPU fp16 correctly) + model.merge_adapter() + + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) + + def test_gralora_cpu_bf16_merge(self): + """Test merge with bf16 on CPU (if supported)""" + # Check if bfloat16 is supported + try: + _ = torch.randn(2, 2).to(torch.bfloat16) + except RuntimeError: + pytest.skip("bfloat16 not supported on this system") + + torch.manual_seed(0) + mlp = MLP().to(torch.bfloat16) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10).to(torch.bfloat16) + + with torch.no_grad(): + output_before = model(x) + + # Merge with hybrid component + model.merge_adapter() + + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) + + def test_gralora_disable_adapter_layers_warns_with_bias(self): + """Test that disable_adapter_layers warns when bias is configured""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + bias="all", + ) + model = get_peft_model(mlp, config) + + with pytest.warns(UserWarning, match="disabling adapter layers with bias"): + model.disable_adapter_layers() + + def test_gralora_set_adapter_warns_when_merged(self): + """Test that set_adapter warns and unmerges when model is merged""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model.add_adapter("adapter2", config2) + + # Merge first adapter + model.merge_adapter() + + # Setting adapter should warn and unmerge + with pytest.warns(UserWarning, match="Adapter cannot be set when the model is merged"): + model.set_adapter("adapter2") + + # Model should be unmerged now + assert not model.base_model.model.lin1.merged + + def test_gralora_delete_adapter(self): + """Test deleting an adapter""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config, adapter_name="adapter1") + + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model.add_adapter("adapter2", config2) + + # Delete adapter1 + model.delete_adapter("adapter1") + + assert "adapter1" not in model.peft_config + assert "adapter2" in model.peft_config + + def test_gralora_delete_nonexistent_adapter_raises(self): + """Test that deleting nonexistent adapter raises error""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + with pytest.raises(ValueError, match="Adapter .* does not exist"): + 
model.delete_adapter("nonexistent") + + def test_gralora_unload_without_merge(self): + """Test unload without merging""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Get base model output + with model.disable_adapter(): + with torch.no_grad(): + base_output = model(x) + + # Unload without merge + unloaded_model = model.unload() + + with torch.no_grad(): + unloaded_output = unloaded_model(x) + + # Should match base model output (no merge) + assert torch.allclose(base_output, unloaded_output, atol=1e-5) + + def test_gralora_get_peft_config_as_dict(self): + """Test get_peft_config_as_dict method""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=4, + gralora_alpha=16, + ) + model = get_peft_model(mlp, config) + + config_dict = model.get_peft_config_as_dict(inference=False) + + assert "default" in config_dict + assert config_dict["default"]["r"] == 8 + assert config_dict["default"]["gralora_k"] == 2 + assert config_dict["default"]["hybrid_r"] == 4 + + def test_gralora_get_peft_config_as_dict_inference_mode(self): + """Test get_peft_config_as_dict with inference=True""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + config_dict = model.get_peft_config_as_dict(inference=True) + + assert config_dict["default"]["inference_mode"] is True + + def test_gralora_merge_with_hybrid_component(self): + """Test that merge works correctly with hybrid component""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=16, + gralora_k=4, + hybrid_r=4, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + with torch.no_grad(): + output_before = model(x) + + # Merge + model.merge_adapter() + + with torch.no_grad(): + output_after = model(x) + + # Outputs should be very close + assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) + + def test_gralora_repr(self): + """Test __repr__ method""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + repr_str = repr(model.base_model.model.lin1) + assert "gralora" in repr_str.lower() + + def test_gralora_merge_with_adapter_names(self): + """Test merge with specific adapter names""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + torch.manual_seed(42) + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) + model.add_adapter("adapter2", config2) + + x = torch.randn(5, 10) + + # Set to adapter1 and get output + model.set_adapter("adapter1") + with torch.no_grad(): + output_before = model(x) + + # Merge only adapter1 + model.base_model.model.lin1.merge(adapter_names=["adapter1"]) + + with torch.no_grad(): + output_after = model(x) + + # Outputs should be close + assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) + + def test_gralora_enable_disable_adapter_layers(self): + """Test enable/disable adapter layers""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model 
= get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Get output with adapter enabled + with torch.no_grad(): + output_enabled = model(x) + + # Disable adapters + model.disable_adapter_layers() + + with torch.no_grad(): + output_disabled = model(x) + + # Enable adapters + model.enable_adapter_layers() + + with torch.no_grad(): + output_re_enabled = model(x) + + # Output with disabled adapter should be different + assert not torch.allclose(output_enabled, output_disabled, atol=1e-6) + # Output after re-enabling should match original + assert torch.allclose(output_enabled, output_re_enabled, atol=1e-6) + + def test_gralora_forward_with_merged_adapter(self): + """Test forward pass with merged adapter""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Get output before merge + with torch.no_grad(): + output_before = model(x) + + # Merge adapter + model.merge_adapter() + + # Forward with merged adapter (should take merged path) + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, atol=1e-4) + + def test_gralora_forward_with_disable_adapters_and_merged(self): + """Test forward when disable_adapters=True and model is merged""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Merge adapter + model.merge_adapter() + + # Get output with merged adapter + with torch.no_grad(): + output_merged = model(x) + + # Disable adapters (should unmerge) + with model.disable_adapter(): + with torch.no_grad(): + output_disabled = model(x) + + # Outputs should be different + assert not torch.allclose(output_merged, output_disabled, atol=1e-5) + + def test_gralora_bias_invalid_option_raises(self): + """Test that invalid bias option raises NotImplementedError""" + torch.manual_seed(0) + mlp = MLP() + + # Create config with invalid bias (need to bypass validation) + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + # Manually set invalid bias to trigger the error + model.peft_config["default"].bias = "invalid_option" + + with pytest.raises(NotImplementedError, match="Requested bias"): + model._mark_only_adapters_as_trainable(model.model) + + def test_gralora_merge_empty_adapter_names(self): + """Test merge with empty adapter_names returns early""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + # Call merge with empty list (should return early) + model.base_model.model.lin1.merge(adapter_names=[]) + + # Model should not be merged + assert not model.base_model.model.lin1.merged + + def test_gralora_add_non_active_adapter(self): + """Test adding adapter that is not active (should not be trainable)""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + # Keep adapter1 active + model.set_adapter("adapter1") + + # Add adapter2 (should not be active/trainable initially) + torch.manual_seed(42) + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model.add_adapter("adapter2", config2) + + # adapter2 parameters should exist but might not be in active_adapters 
initially + assert "adapter2" in model.base_model.model.lin1.gralora_A + + def test_gralora_forward_with_no_adapter_in_active_list(self): + """Test forward when active_adapter is not in gralora_A keys""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config, adapter_name="adapter1") + + x = torch.randn(5, 10) + + # Manually set _active_adapter to include non-existent adapter + original_adapter = model.base_model.model.lin1._active_adapter + model.base_model.model.lin1._active_adapter = ["nonexistent", "adapter1"] + + # Should still work (skip nonexistent adapter) + with torch.no_grad(): + output = model(x) + + assert output.shape == (5, 2) + + # Restore + model.base_model.model.lin1._active_adapter = original_adapter From 4f1444f0aa394ebc40378fc3a057a213203735ce Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Sat, 18 Oct 2025 19:11:25 +0900 Subject: [PATCH 05/11] Refactor GraLoRA weight computation to improve efficiency in delta-weight calculation. --- src/peft/tuners/gralora/layer.py | 67 +++++++++----------------------- 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 907730d49b..52de8c1b11 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -310,57 +310,28 @@ def get_delta_weight(self, adapter) -> torch.Tensor: gralora_rank = r - hybrid_r subblock_gralora_rank = gralora_rank // gralora_k - # Simulate the forward pass computation to get equivalent weight matrix - # We need to compute: W_delta such that W_delta @ x = gralora_forward(x) - base_forward(x) - - # Create an identity matrix for each input dimension and compute output - # This gives us the columns of the weight matrix - delta_weight = torch.zeros(out_features, in_features, device=device, dtype=gralora_A.dtype) - - # Process in batches to avoid memory issues - batch_size = min(256, in_features) - for start_idx in range(0, in_features, batch_size): - end_idx = min(start_idx + batch_size, in_features) - batch_len = end_idx - start_idx - - # Create identity input: [batch_len, in_features] - x = torch.zeros(batch_len, in_features, device=device, dtype=gralora_A.dtype) - for i in range(batch_len): - x[i, start_idx + i] = 1.0 - - # Apply GraLoRA transformation (following forward logic) - # x shape: [batch_len, in_features] - N = gralora_k - - # Reshape x: [batch_len, N, in_features//N] - x_reshaped = x.view(batch_len, N, in_features // N) - - # Apply gralora_A: [batch_len, N, in_features//N] @ [N, in_features//N, rank] - # Result: [batch_len, N, rank] - temp = torch.einsum("bni, nir -> bnr", x_reshaped, gralora_A) - - # Reshape and permute for information exchange - # [batch_len, N, rank] -> [batch_len, N, N, subblock_rank] - temp = temp.view(batch_len, N, N, subblock_gralora_rank) - # Permute: [batch_len, N, N, subblock_rank] -> [batch_len, N, N, subblock_rank] - temp = temp.permute(0, 2, 1, 3) - # Reshape: [batch_len, N, N * subblock_rank] - temp = temp.reshape(batch_len, N, N * subblock_gralora_rank) - - # Apply gralora_B: [batch_len, N, N*subblock_rank] @ [N, rank, out_features//N] - # Note: rank here is actually gralora_rank = N * subblock_gralora_rank - # Result: [batch_len, N, out_features//N] - output = torch.einsum("bnr, nro -> bno", temp, gralora_B) - - # Reshape to [batch_len, out_features] - output = output.reshape(batch_len, out_features) - - # Store in delta_weight (transpose because weight is [out, in]) - 
delta_weight[:, start_idx:end_idx] = output.T + # scatter gralora_A to get the scattered weight matrix + l_indices = torch.arange(in_features, device=device) + n_indices = (l_indices // (in_features // gralora_k)) + i_indices = (l_indices % (in_features // gralora_k)) + gralora_A_scattered = torch.zeros(in_features, gralora_k, gralora_rank, device=device, dtype=dtype) + gralora_A_scattered.scatter_(1, + n_indices.unsqueeze(1).unsqueeze(2).expand(-1, 1, gralora_rank), + gralora_A[n_indices, i_indices, :].unsqueeze(1) + ) + + # compute the delta weight + delta_weight = torch.einsum( + "ikr, kro -> iko", + gralora_A_scattered + .view(in_features, gralora_k, gralora_k, subblock_gralora_rank) + .permute(0, 2, 1, 3) + .reshape(in_features, gralora_k, gralora_rank), + gralora_B, + ).reshape(in_features, out_features).T # Add hybrid LoRA component if present if hybrid_r > 0: - # general_A: [in_features, hybrid_r], general_B: [hybrid_r, out_features] weight_A_general = gralora_A_general.weight # [hybrid_r, in_features] weight_B_general = gralora_B_general.weight # [out_features, hybrid_r] From 94315028e75030e03c87089b9fb28c3b4dbeb029 Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Fri, 24 Oct 2025 00:29:36 +0900 Subject: [PATCH 06/11] Refactor GraLoRA code for clearer documentation, simplified inheritance, and more intuitive hybrid_r handling. --- src/peft/tuners/gralora/config.py | 43 +++++- src/peft/tuners/gralora/layer.py | 117 ++++++---------- src/peft/tuners/gralora/model.py | 218 +----------------------------- 3 files changed, 86 insertions(+), 292 deletions(-) diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py index 9e78b81afa..b88b26a77a 100644 --- a/src/peft/tuners/gralora/config.py +++ b/src/peft/tuners/gralora/config.py @@ -21,23 +21,54 @@ @dataclass class GraloraConfig(PeftConfig): - r: int = field(default=8, metadata={"help": "gralora attention dimension"}) + r: int = field( + default=32, + metadata={ + "help": ( + "GraLoRA attention dimension determines the rank of the GraLoRA adapter. " + "The total parameter count of the GraLoRA adapter is same as LoRA with same rank r, while the expressivitiy is multiplied by gralora_k." + ) + }, + ) hybrid_r: int = field( - default=0, metadata={"help": "hybrid_r is the rank allocated to vanilla LoRA method when using Hybrid GraLoRA"} + default=0, + metadata={ + "help": ( + "hybrid_r is the rank allocated to vanilla LoRA method when using Hybrid GraLoRA method. " + "Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0. " + "r + hybrid_r determines the parameter count of the GraLoRA adapter." + ) + }, ) target_modules: Optional[Union[list[str], str]] = field( default=None, metadata={ "help": ( - "List of module names or regex expression of the module names to replace with gralora." + "List of module names or regex expression of the module names to replace with gralora. " "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " "Only linear layers are supported." ) }, ) - gralora_alpha: int = field(default=8, metadata={"help": "gralora alpha"}) + gralora_alpha: int = field( + default=64, + metadata={ + "help": ( + "gralora alpha is the scaling factor for the GraLoRA adapter." + "Scale becomes gralora_alpha / (r + hybrid_r)." 
+ ) + }, + ) gralora_dropout: float = field(default=0.0, metadata={"help": "gralora dropout"}) - gralora_k: int = field(default=2, metadata={"help": "gralora k"}) + gralora_k: int = field( + default=2, + metadata={ + "help": ( + "gralora_k determines the number of subblocks in the GraLoRA adapter." + "The total parameter count is preserved regardles of gralora_k, while the expressivitiy is multiplied by gralora_k." + ) + }, + ) fan_in_fan_out: bool = field( default=False, metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, @@ -90,3 +121,5 @@ def __post_init__(self): self.target_modules = ( set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules ) + if self.r % self.gralora_k != 0: + raise ValueError(f"r should be divisible by gralora_k, but got {self.r} and {self.gralora_k}") diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 52de8c1b11..4aefa02152 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -38,7 +38,6 @@ def __init__(self, base_layer: nn.Module, **kwargs): self.scaling = {} self.gralora_dropout = nn.ModuleDict({}) - # Set to `None` otherwise to avoid computation with random weight self.gralora_A = nn.ParameterDict({}) self.gralora_B = nn.ParameterDict({}) self.gralora_A_general = nn.ModuleDict({}) @@ -55,57 +54,13 @@ def __init__(self, base_layer: nn.Module, **kwargs): in_features, out_features = ( base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape ) + else: + raise NotImplementedError(f"Unsupported layer type {type(base_layer)}") self.in_features = in_features self.out_features = out_features self.kwargs = kwargs - def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: - """ - Move the adapter of the given name to the device of the base layer. 
- """ - from peft.tuners._buffer_dict import BufferDict - - if device is None: - # check weight and qweight (for GPTQ) - for weight_name in ("weight", "qweight"): - weight = getattr(self.get_base_layer(), weight_name, None) - if weight is not None: - device = weight.device - dtype = weight.dtype - break - else: - # no break encountered: could not determine the device - return - - # loop through all potential adapter layers and move them to the device of the base layer; be careful to only - # move this specific adapter to the device, as the other adapters could be on different devices - # see #1639 - for adapter_layer_name in self.adapter_layer_names + self.other_param_names: - adapter_layer = getattr(self, adapter_layer_name, None) - if not isinstance(adapter_layer, (nn.ModuleDict, nn.ParameterDict, BufferDict)): - continue - if adapter_name not in adapter_layer: - continue - if weight.dtype.is_floating_point or weight.dtype.is_complex: - adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device, dtype=dtype) - else: - adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device) - - @property - def merged(self) -> bool: - return bool(self.merged_adapters) - - @property - def bias(self) -> torch.Tensor: - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - return base_layer.bias - elif isinstance(base_layer, Conv1D): - return base_layer.bias - else: - return None - def update_layer( self, adapter_name, @@ -119,6 +74,8 @@ def update_layer( ): if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + elif hybrid_r < 0: + raise ValueError(f"`hybrid_r` should be a non-negative integer value but the value passed is {hybrid_r}") self.r[adapter_name] = r self.gralora_alpha[adapter_name] = gralora_alpha @@ -133,21 +90,29 @@ def update_layer( self.gralora_dropout.update(nn.ModuleDict({adapter_name: gralora_dropout_layer})) # Actual trainable parameters + if self.in_features % gralora_k != 0: + raise ValueError( + f"in_features should be divisible by gralora_k, but got {self.in_features} and {gralora_k}" + ) + if self.out_features % gralora_k != 0: + raise ValueError( + f"out_features should be divisible by gralora_k, but got {self.out_features} and {gralora_k}" + ) subblock_in_features = self.in_features // gralora_k subblock_out_features = self.out_features // gralora_k - gralora_r = r - hybrid_r # gralora_r is the rank allocated to gralora method - assert gralora_r % gralora_k == 0, f"r should be divisible by gralora_k, but got {r} and {gralora_k}" + # gralora_r is the rank allocated to GraLoRA method; hybrid_r is the rank allocated to vanilla LoRA + gralora_r = r - gralora_A = nn.ParameterList() - gralora_B = nn.ParameterList() + gralora_A = [] + gralora_B = [] for _ in range(gralora_k): - new_A = nn.Parameter(torch.zeros(gralora_r, subblock_in_features)) - new_B = nn.Parameter(torch.zeros(subblock_out_features, gralora_r)) + new_A = nn.Parameter(torch.empty(gralora_r, subblock_in_features)) + new_B = nn.Parameter(torch.empty(subblock_out_features, gralora_r)) if init_weights: # Initialize to identity: A is random, B is zero nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) - # new_B is already initialized to zeros + nn.init.zeros_(new_B) else: # Initialize to random: both A and B are random (for testing) nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) @@ -183,7 +148,7 @@ def update_layer( self.module_name = module_name - self.scaling[adapter_name] = gralora_alpha / r + self.scaling[adapter_name] = gralora_alpha / 
(gralora_r + hybrid_r) self._move_adapter_to_device_of_base_layer(adapter_name) self.set_adapter(self.active_adapters) @@ -305,30 +270,38 @@ def get_delta_weight(self, adapter) -> torch.Tensor: # Get dimensions in_features = self.in_features out_features = self.out_features - subblock_in = in_features // gralora_k - subblock_out = out_features // gralora_k - gralora_rank = r - hybrid_r + gralora_rank = r + if in_features % gralora_k != 0: + raise ValueError(f"in_features should be divisible by gralora_k, but got {in_features} and {gralora_k}") + elif out_features % gralora_k != 0: + raise ValueError(f"out_features should be divisible by gralora_k, but got {out_features} and {gralora_k}") + elif gralora_rank % gralora_k != 0: + raise ValueError(f"rank should be divisible by gralora_k, but got {gralora_rank} and {gralora_k}") subblock_gralora_rank = gralora_rank // gralora_k # scatter gralora_A to get the scattered weight matrix l_indices = torch.arange(in_features, device=device) - n_indices = (l_indices // (in_features // gralora_k)) - i_indices = (l_indices % (in_features // gralora_k)) + n_indices = l_indices // (in_features // gralora_k) + i_indices = l_indices % (in_features // gralora_k) gralora_A_scattered = torch.zeros(in_features, gralora_k, gralora_rank, device=device, dtype=dtype) - gralora_A_scattered.scatter_(1, + gralora_A_scattered.scatter_( + 1, n_indices.unsqueeze(1).unsqueeze(2).expand(-1, 1, gralora_rank), - gralora_A[n_indices, i_indices, :].unsqueeze(1) + gralora_A[n_indices, i_indices, :].unsqueeze(1), ) # compute the delta weight - delta_weight = torch.einsum( - "ikr, kro -> iko", - gralora_A_scattered - .view(in_features, gralora_k, gralora_k, subblock_gralora_rank) - .permute(0, 2, 1, 3) - .reshape(in_features, gralora_k, gralora_rank), - gralora_B, - ).reshape(in_features, out_features).T + delta_weight = ( + torch.einsum( + "ikr, kro -> iko", + gralora_A_scattered.view(in_features, gralora_k, gralora_k, subblock_gralora_rank) + .permute(0, 2, 1, 3) + .reshape(in_features, gralora_k, gralora_rank), + gralora_B, + ) + .reshape(in_features, out_features) + .T + ) # Add hybrid LoRA component if present if hybrid_r > 0: @@ -380,16 +353,14 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: gralora_B_general = self.gralora_B_general[active_adapter] r = self.r[active_adapter] + gralora_rank = r gralora_k = self.gralora_k[active_adapter] hybrid_r = self.hybrid_r[active_adapter] - assert len(gralora_A) == len(gralora_B) - dropout = self.gralora_dropout[active_adapter] scaling = self.scaling[active_adapter] gralora_dtype = gralora_A.dtype - gralora_rank = r - hybrid_r B, L, in_features = x.shape N = gralora_k diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index 0273b30ab5..3d7fae2cc6 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -63,30 +63,12 @@ class GraloraModel(BaseTuner): - **peft_config** ([`GraloraConfig`]): The configuration of the Gralora model. """ + # The unique prefix for GraLoRA method prefix: str = "gralora_" + # The class of tuner layer for GraLoRA method + tuner_layer_cls = GraloraLayer - def __init__(self, model, config, adapter_name) -> None: - super().__init__(model, config, adapter_name) - - def _check_new_adapter_config(self, config: GraloraConfig) -> None: - """ - A helper method to check the config when a new adapter is being added. - - Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. 
- - """ - # the below todo is copied from LoRA - # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check - # does not fully correspond to the error message. - if (len(self.peft_config) > 1) and (config.bias != "none"): - raise ValueError( - f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " - "set bias to 'none' for all adapters." - ) - - @staticmethod - def _check_target_module_exists(gralora_config, key): - return check_target_module_exists(gralora_config, key) + target_module_mapping = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING def _create_and_replace( self, @@ -132,54 +114,6 @@ def _create_and_replace( new_module.requires_grad_(False) self._replace_module(parent, target_name, new_module, target) - @staticmethod - def _replace_module(parent, child_name, new_module, child): - setattr(parent, child_name, new_module) - # It's not necessary to set requires_grad here, as that is handled by - # _mark_only_adapters_as_trainable - - # child layer wraps the original module, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - if not hasattr(new_module, "base_layer"): - new_module.weight = child.weight - if hasattr(child, "bias"): - new_module.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_module, "base_layer"): - new_module.base_layer.state = child.state - else: - new_module.state = child.state - new_module.to(child.weight.device) - - # dispatch to correct device - for name, module in new_module.named_modules(): - if "gralora_" in name: - module.to(child.weight.device) - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - for n, p in model.named_parameters(): - if self.prefix not in n: - p.requires_grad = False - - for active_adapter in self.active_adapters: - bias = self.peft_config[active_adapter].bias - if bias == "none": - continue - - if bias == "all": - for n, p in model.named_parameters(): - if "bias" in n: - p.requires_grad = True - elif bias == "gralora_only": - for m in model.modules(): - if isinstance(m, GraloraLayer) and hasattr(m, "bias") and m.bias is not None: - m.bias.requires_grad = True - else: - raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") - @staticmethod def _create_new_module(gralora_config, adapter_name, target, module_name, **kwargs): if isinstance(target, BaseTunerLayer): @@ -214,147 +148,3 @@ def _create_new_module(gralora_config, adapter_name, target, module_name, **kwar ) return new_module - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def get_peft_config_as_dict(self, inference: bool = False): - config_dict = {} - for key, value in self.peft_config.items(): - config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} - if inference: - config["inference_mode"] = True - config_dict[key] = config - return config_dict - - def _set_adapter_layers(self, enabled=True): - for module in self.model.modules(): - if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): - module.enable_adapters(enabled) - - def enable_adapter_layers(self): - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self): - for active_adapter in self.active_adapters: - val = self.peft_config[active_adapter].bias - if val != "none": - msg = ( - f"Careful, 
disabling adapter layers with bias configured to be '{val}' does not produce the same " - "output as the the base model would without adaption." - ) - warnings.warn(msg) - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name, inference_mode: bool = False): - for module in self.model.modules(): - if isinstance(module, GraloraLayer): - if module.merged: - warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") - module.unmerge() - module.set_adapter(adapter_name, inference_mode=inference_mode) - self.active_adapter = adapter_name - - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] - ) - return peft_config - - def _unload_and_optionally_merge( - self, - merge=True, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ): - # we cannot use self.prefix as we want to include non-trainable gralora parameters - key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] - desc = "Unloading " + ("and merging " if merge else "") + "model" - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - - if hasattr(target, "base_layer"): - if merge: - target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - - self._replace_module(parent, target_name, target.get_base_layer(), target) - elif isinstance(target, ModulesToSaveWrapper): - # save any additional trainable modules part of `modules_to_save` - setattr(parent, target_name, target.modules_to_save[target.active_adapter]) - - return self.model - - def delete_adapter(self, adapter_name: str): - """ - Deletes an existing adapter. - - Args: - adapter_name (str): Name of the adapter to be deleted. - """ - if adapter_name not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter_name} does not exist") - del self.peft_config[adapter_name] - - # we cannot use self.prefix as we want to include non-trainable gralora parameters - key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] - new_adapter = None - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, GraloraLayer): - target.delete_adapter(adapter_name) - if new_adapter is None: - new_adapter = target.active_adapter[:] - - self.active_adapter = new_adapter or [] - - def merge_and_unload( - self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None - ): - r""" - This method merges the Gralora layers into the base model. This is needed if someone wants to use the base - model as a standalone model. - - Args: - progressbar (`bool`): - whether to show a progressbar indicating the unload and merge process - safe_merge (`bool`): - whether to activate the safe merging check to check if there is any potential Nan in the adapter - weights - adapter_names (`list[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. 
- - Example: - - ```py - >>> from transformers import AutoModelForCausalLM - >>> from peft import PeftModel - - >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") - >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lrasa-sfttrainer-sample" - >>> model = PeftModel.from_pretrained(base_model, peft_model_id) - >>> merged_model = model.merge_and_unload() - ``` - """ - return self._unload_and_optionally_merge( - progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names - ) - - def unload(self): - """ - Gets back the base model by removing all the Gralora modules without merging. This gives back the original base - model. - """ - return self._unload_and_optionally_merge(merge=False) From dec25f55ee7bc0a81e2ea206a42264e4ac211666 Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Fri, 24 Oct 2025 00:30:11 +0900 Subject: [PATCH 07/11] Update test code for the GraLoRA method --- tests/test_custom_models.py | 53 ++++++++++++++++++++++++++++++------- tests/test_gralora.py | 43 +++++------------------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index ed83db98cb..30628f2bdf 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -38,6 +38,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LNTuningConfig, @@ -666,6 +667,25 @@ "init_weights": True, }, ), + ########### + # GraLoRA # + ########### + ("Vanilla MLP 1 GraLoRA", "MLP", GraloraConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 GraLoRA", "MLP", GraloraConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 GraLoRA", "MLP", GraloraConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 GraLoRA", "MLP", GraloraConfig, {"target_modules": ["lin0", "lin1"]}), + ( + "Vanilla MLP 5 GraLoRA", + "MLP", + GraloraConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}, + ), + ( + "Embedding + transformers Conv1D 1 GraLoRA", + "EmbConv1D", + GraloraConfig, + {"target_modules": ["conv1d"], "gralora_k": 1}, + ), ########## # VBLoRA # ########## @@ -979,6 +999,20 @@ {"n_frequency": 10, "target_modules": ["lin0"]}, {"n_frequency": 10, "target_modules": ["lin1"]}, ), + ( + "GraLoRA Same", + "gralora", + GraloraConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin0"], "init_weights": False}, + ), + ( + "GraLoRA Different", + "gralora", + GraloraConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin1"], "init_weights": False}, + ), ( "SHiRA Same", "shira", @@ -1165,6 +1199,7 @@ VeraConfig: "vera_lambda_", RandLoraConfig: "randlora_", FourierFTConfig: "fourierft_", + GraloraConfig: "gralora_", C3AConfig: "c3a_", HRAConfig: "hra_", ShiraConfig: "shira_", @@ -3089,12 +3124,12 @@ def test_add_weighted_adapter_subtraction_with_negative_weights(self): cancelled_B = module.lora_B["cancelled"].weight.data # The weights should be approximately zero (they cancel out) - assert torch.allclose(cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5), ( - f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" - ) - assert torch.allclose(cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5), ( - f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" - ) + assert torch.allclose( + cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5 + ), f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" + assert torch.allclose( + cancelled_B, 
torch.zeros_like(cancelled_B), atol=1e-5 + ), f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" def test_add_weighted_adapter_negative_weight_with_different_scaling(self): # Test negative weights with different scaling factors (lora_alpha) @@ -3500,9 +3535,9 @@ def test_multirank_2(self): if isinstance(module, BaseTunerLayer): rank_expected = rank_pattern.get(key, r) rank_current = module.lora_A[adapter].weight.shape[0] - assert rank_current == rank_expected, ( - f"Rank {rank_current} is not equal to expected {rank_expected}" - ) + assert ( + rank_current == rank_expected + ), f"Rank {rank_current} is not equal to expected {rank_expected}" class TestLayerRepr: diff --git a/tests/test_gralora.py b/tests/test_gralora.py index 7e2ca5a078..59c2418a33 100644 --- a/tests/test_gralora.py +++ b/tests/test_gralora.py @@ -112,7 +112,7 @@ def test_gralora_parameter_shapes(self, mlp_gralora_hybrid): in_features = module.in_features out_features = module.out_features k = 4 - gralora_rank = 16 - 4 # r - hybrid_r + gralora_rank = 16 # Check GraLoRA block shapes # Each block has full gralora_rank, not gralora_rank // k @@ -203,7 +203,7 @@ def test_gralora_pure_vs_hybrid_params(self): mlp_hybrid = MLP() config_hybrid = GraloraConfig( target_modules=["lin1", "lin2"], - r=16, + r=12, gralora_k=4, hybrid_r=4, ) @@ -217,9 +217,9 @@ def count_trainable_params(model): # Pure and hybrid should have same total parameters (r is constant) # but distributed differently between block-diagonal and full-rank components - assert params_pure == params_hybrid, ( - f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" - ) + assert ( + params_pure == params_hybrid + ), f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" # Check that hybrid has general components has_general = False @@ -444,7 +444,7 @@ def test_gralora_rank_divisibility_check(self): hybrid_r=0, ) - with pytest.raises(AssertionError, match="r should be divisible by gralora_k"): + with pytest.raises(ValueError, match="r should be divisible by gralora_k"): get_peft_model(mlp, config) def test_gralora_trainable_parameters_only(self, mlp_gralora_hybrid): @@ -827,37 +827,6 @@ def test_gralora_unload_without_merge(self): # Should match base model output (no merge) assert torch.allclose(base_output, unloaded_output, atol=1e-5) - def test_gralora_get_peft_config_as_dict(self): - """Test get_peft_config_as_dict method""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=4, - gralora_alpha=16, - ) - model = get_peft_model(mlp, config) - - config_dict = model.get_peft_config_as_dict(inference=False) - - assert "default" in config_dict - assert config_dict["default"]["r"] == 8 - assert config_dict["default"]["gralora_k"] == 2 - assert config_dict["default"]["hybrid_r"] == 4 - - def test_gralora_get_peft_config_as_dict_inference_mode(self): - """Test get_peft_config_as_dict with inference=True""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - config_dict = model.get_peft_config_as_dict(inference=True) - - assert config_dict["default"]["inference_mode"] is True - def test_gralora_merge_with_hybrid_component(self): """Test that merge works correctly with hybrid component""" torch.manual_seed(0) From 925ad7260e4a686df6ae55b4fbe953d4d789dfc9 Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Fri, 24 Oct 2025 19:05:19 
+0900 Subject: [PATCH 08/11] ADD: documentations, examples, and test code for GraLoRA method --- docs/source/package_reference/gralora.md | 32 +++ examples/gralora_finetuning/README.md | 71 ++++++ .../gralora_finetuning/gralora_finetuning.py | 213 ++++++++++++++++++ tests/test_encoder_decoder_models.py | 8 + tests/test_feature_extraction_models.py | 8 + tests/test_seq_classifier.py | 8 + 6 files changed, 340 insertions(+) create mode 100644 docs/source/package_reference/gralora.md create mode 100644 examples/gralora_finetuning/README.md create mode 100644 examples/gralora_finetuning/gralora_finetuning.py diff --git a/docs/source/package_reference/gralora.md b/docs/source/package_reference/gralora.md new file mode 100644 index 0000000000..3d499756c1 --- /dev/null +++ b/docs/source/package_reference/gralora.md @@ -0,0 +1,32 @@ +# GraLoRA + +[**Granular Low-Rank Adaptation (GraLoRA)**](https://huggingface.co/papers/2505.20355) is a PEFT method designed to enhance the **expressivity** of low-rank adaptation while improving **robustness to outlier** activations, based on insights from well-known issues in quantization. + +![GraLoRA Overview](https://github.com/SqueezeBits/GraLoRA/raw/main/figure/gralora_overview.png) + +Unlike standard LoRA, which applies a single low-rank adapter across the entire feature space, GraLoRA introduces a structured and fine-grained adaptation scheme. It divides the adaptation space into a grid of $𝑘^2$ smaller, independent adapter pairs, each responsible for a localized subset of the input and output dimensions. As a result, each adapter operates on a subspace that is $k$ times smaller in both dimensions than the original LoRA adapter. + +This granular decomposition enables spatially localized and context-aware updates, effectively increasing representational capacity without additional parameters or computational cost. By isolating the influence of extreme activations within smaller subspaces, GraLoRA mitigates gradient distortion and preserves inter-channel balance during adaptation. + +--- + +The abstract from the paper is: + +*Low-Rank Adaptation (LoRA) is a popular method for parameter-efficient fine- +tuning (PEFT) of generative models, valued for its simplicity and effectiveness. +Despite recent enhancements, LoRA still suffers from a fundamental limitation: +overfitting when the bottleneck is widened. It performs best at ranks 32–64, yet its +accuracy stagnates or declines at higher ranks, still falling short of full fine-tuning +(FFT) performance. We identify the root cause as LoRA’s structural bottleneck, +which introduces gradient entanglement to the unrelated input channels and distorts +gradient propagation. To address this, we introduce a novel structure, Granular +Low-Rank Adaptation (GraLoRA) that partitions weight matrices into sub-blocks, +each with its own low-rank adapter. With negligible computational or storage cost, +GraLoRA overcomes LoRA’s limitations, effectively increases the representational +capacity, and more closely approximates FFT behavior. Experiments on code +generation, commonsense reasoning, mathematical reasoning, general language +understanding, and image generation benchmarks show that GraLoRA consistently +outperforms LoRA and other baselines, achieving up to +8.5% absolute gain in +Pass@1 on HumanEval+. 
These improvements hold across model sizes and rank
+settings, making GraLoRA a scalable and robust solution for PEFT.*
+
diff --git a/examples/gralora_finetuning/README.md b/examples/gralora_finetuning/README.md
new file mode 100644
index 0000000000..a911ab86d5
--- /dev/null
+++ b/examples/gralora_finetuning/README.md
@@ -0,0 +1,71 @@
+# GraLoRA: Granular Low-Rank Adaptation
+
+![GraLoRA Overview](https://github.com/SqueezeBits/GraLoRA/raw/main/figure/gralora_overview.png)
+
+## Introduction
+[**Granular Low-Rank Adaptation (GraLoRA)**](https://huggingface.co/papers/2505.20355) is a PEFT method designed to enhance the **expressivity** of low-rank adaptation while improving **robustness to outlier** activations, based on insights from well-known issues in quantization.
+
+GraLoRA introduces a structured and fine-grained adaptation scheme. It divides the adaptation space into a grid of $k^2$ smaller, independent adapter pairs, each responsible for a localized subset of the input and output dimensions.
+
+## Quick start
+
+Compared to a standard PEFT training setup with LoRA, you only need to swap your `LoraConfig` for a `GraloraConfig`. The snippet below uses `trl`'s `SFTTrainer` for supervised fine-tuning.
+
+```python
+import torch
+from datasets import load_dataset
+from peft import GraloraConfig, get_peft_model
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import SFTConfig, SFTTrainer
+
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
+dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
+gralora_config = GraloraConfig()
+peft_model = get_peft_model(model, gralora_config)
+training_args = SFTConfig(output_dir="gralora-llama-3-8b", dataset_text_field="text", max_seq_length=2048)
+trainer = SFTTrainer(
+    model=peft_model,
+    args=training_args,
+    train_dataset=dataset,
+    processing_class=tokenizer,
+)
+trainer.train()
+peft_model.save_pretrained("gralora-llama-3-8b")
+```
+
+Run the finetuning script with:
+```bash
+python examples/gralora_finetuning/gralora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco
+```
+
+## Use the model on 🤗
+You can load and use the model like any other 🤗 Transformers model.
+```python
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3-8B", dtype=torch.bfloat16, device_map="auto"
+)
+peft_model = PeftModel.from_pretrained(model, "gralora-llama-3-8b")
+```
+
+## Additional Notes
+While `gralora_k` defaults to 2, you can increase this value to create more fine-grained adapters. A `gralora_k` of 4 is recommended when the total rank (`r + hybrid_r`) is 64 or higher.
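For a bit more control than the defaults, the following is a minimal, illustrative configuration sketch. The projection names assume a LLaMA-style decoder (they match the defaults used in `gralora_finetuning.py`), and the numbers are only one example of a rank budget that satisfies the divisibility constraints.

```python
from peft import GraloraConfig

# Illustrative values only -- adapt them to your model and rank budget.
config = GraloraConfig(
    r=64,                     # total GraLoRA rank; must be divisible by gralora_k
    gralora_k=4,              # 4 x 4 grid of sub-block adapters per targeted weight
    hybrid_r=0,               # set > 0 to reserve part of the budget for a vanilla LoRA path
    gralora_alpha=128,        # scaling factor; effective scale is gralora_alpha / (r + hybrid_r)
    gralora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
```

Note that `r` must be divisible by `gralora_k`, and the input and output dimensions of every targeted layer must also be divisible by `gralora_k`; otherwise PEFT raises a `ValueError`.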
+ + + + +## Citation +``` +@misc{jung2025graloragranularlowrankadaptation, + title={GraLoRA: Granular Low-Rank Adaptation for Parameter-Efficient Fine-Tuning}, + author={Yeonjoon Jung and Daehyun Ahn and Hyungjun Kim and Taesu Kim and Eunhyeok Park}, + year={2025}, + eprint={2505.20355}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2505.20355}, +} +``` diff --git a/examples/gralora_finetuning/gralora_finetuning.py b/examples/gralora_finetuning/gralora_finetuning.py new file mode 100644 index 0000000000..1dcdcf46ee --- /dev/null +++ b/examples/gralora_finetuning/gralora_finetuning.py @@ -0,0 +1,213 @@ +# This script is based on examples/dora_finetuning/dora_finetuning.py +import os + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import GraloraConfig, get_peft_model, prepare_model_for_kbit_training + + +def train_model( + base_model: str, + data_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + val_set_size: int, + quantize: bool, + eval_step: int, + save_step: int, + device: str, + gralora_r: int, + gralora_alpha: int, + gralora_dropout: float, + gralora_target_modules: str, + gralora_k: int, + hybrid_r: int, + hub_model_id: str, + push_to_hub: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + + # Setup device + if device == "auto": + device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + else: + device = torch.device(device) + print(f"Using device: {device}") + + # load tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + + # Quantized GraLoRA: IF YOU WANNA QUANTIZE THE MODEL + if quantize: + if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) or torch.xpu.is_available(): + bnb_4bit_compute_dtype = torch.bfloat16 + else: + bnb_4bit_compute_dtype = torch.float16 + model = AutoModelForCausalLM.from_pretrained( + base_model, + token=hf_token, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=bnb_4bit_compute_dtype, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ), + ) + # setup for quantized training + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token) + # GraLoRA config for the PEFT model + gralora_config = GraloraConfig( + r=gralora_r, # Rank of matrix + gralora_alpha=gralora_alpha, + target_modules=( + gralora_target_modules.split(",") + if gralora_target_modules + else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + ), + gralora_dropout=gralora_dropout, + gralora_k=gralora_k, + hybrid_r=hybrid_r, + bias="none", + ) + + # get the peft model with GraLoRA config + model = get_peft_model(model, gralora_config) + + model.to(device) # MODEL TO GPU/CUDA + tokenizer.pad_token = tokenizer.eos_token + + # Load the dataset + dataset = load_dataset(data_path) + + def tokenize_function(examples): + inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=cutoff_len) + inputs["labels"] = inputs["input_ids"].copy() # setting labels for a language modeling task + return inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = 
dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + # Define training arguments + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + push_to_hub=push_to_hub, + hub_model_id=hub_model_id, + gradient_accumulation_steps=16, + fp16=True, + learning_rate=learning_rate, + hub_token=hf_token, + ) + + # Clear device cache to free memory + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + data_collator=data_collator, + ) + + # Start model training + trainer.train() + + # Save and push the trained model and tokenizer + if push_to_hub: + # Push the main model to the hub + trainer.push_to_hub(commit_message="Fine-tuned model") + + # Save the model and tokenizer locally + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune LLaMA with GraLoRA and PEFT") + parser.add_argument("--base_model", type=str, default="huggyllama/llama-7b", help="Base model path or name") + parser.add_argument( + "--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name" + ) + parser.add_argument( + "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization") + parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") + parser.add_argument("--quantize", action="store_true", help="Use quantization") + parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--device", type=str, default="auto", help="Device to use for training") + parser.add_argument("--gralora_r", type=int, default=8, help="LoRA rank") + parser.add_argument("--gralora_alpha", type=int, default=16, help="LoRA alpha") + parser.add_argument("--gralora_dropout", type=float, default=0.05, help="LoRA dropout rate") + parser.add_argument( + "--gralora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA" + ) + parser.add_argument("--gralora_k", type=int, default=2, help="GraLoRA k") + parser.add_argument("--hybrid_r", type=int, default=0, help="Hybrid rank") + parser.add_argument( + "--hub_model_id", + type=str, + default="path/to/repo", + help="Repository name to push the model on the Hugging Face Hub", + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + args 
= parser.parse_args() + train_model( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device=args.device, + gralora_r=args.gralora_r, + gralora_alpha=args.gralora_alpha, + gralora_dropout=args.gralora_dropout, + gralora_target_modules=args.gralora_target_modules, + gralora_k=args.gralora_k, + hybrid_r=args.hybrid_r, + hub_model_id=args.hub_model_id, + push_to_hub=args.push_to_hub, + ) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index c4e38f934b..42b12e66e0 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -24,6 +24,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -100,6 +101,13 @@ "task_type": "SEQ_2_SEQ_LM", }, ), + ( + GraloraConfig, + { + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + }, + ), ( HRAConfig, { diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index a5377827f4..6bfd254ec4 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -22,6 +22,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -98,6 +99,13 @@ "target_modules": None, }, ), + ( + GraloraConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + }, + ), ( HRAConfig, { diff --git a/tests/test_seq_classifier.py b/tests/test_seq_classifier.py index 03869c3a7a..bee83a879a 100644 --- a/tests/test_seq_classifier.py +++ b/tests/test_seq_classifier.py @@ -22,6 +22,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -99,6 +100,13 @@ "target_modules": None, }, ), + ( + GraloraConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + }, + ), ( HRAConfig, { From 3f69d8f64a07bc6a12e1fd5b93e19c1088fae3ae Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Sat, 25 Oct 2025 03:19:51 +0900 Subject: [PATCH 09/11] REFACTOR: integrate GraLoRA tests into existing test files --- docs/source/_toctree.yml | 2 + src/peft/tuners/gralora/config.py | 82 ++- src/peft/tuners/gralora/layer.py | 6 - src/peft/tuners/gralora/model.py | 10 +- tests/test_custom_models.py | 48 +- tests/test_gralora.py | 1051 ----------------------------- tests/test_initialization.py | 51 ++ 7 files changed, 164 insertions(+), 1086 deletions(-) delete mode 100644 tests/test_gralora.py diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index ecee4aedf1..3ecc3e7e9b 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -116,6 +116,8 @@ title: VeRA - local: package_reference/fourierft title: FourierFT + - local: package_reference/gralora + title: GraLoRA - local: package_reference/vblora title: VB-LoRA - local: package_reference/hra diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py index b88b26a77a..57e2fb3f47 100644 --- a/src/peft/tuners/gralora/config.py +++ b/src/peft/tuners/gralora/config.py @@ -21,6 +21,57 @@ @dataclass class GraloraConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`GraloraModel`]. + + Args: + r (`int`): + GraLoRA attention dimension determines the rank of the GraLoRA adapter. 
+ The total parameter count of the GraLoRA adapter is same as LoRA with same rank r, while the expressivitiy is multiplied by gralora_k. + hybrid_r (`int`): + Hybrid GraLoRA rank determines the rank allocated to vanilla LoRA method when using Hybrid GraLoRA method. + Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0. + The parameter count of the GraLoRA adapter is r + hybrid_r. + target_modules (`Union[List[str], str]`): + List of module names or regex expression of the module names to replace with GraLoRA. " + For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + This can also be a wildcard 'all-linear' which matches all linear/Conv1D " + "(if the model is a PreTrainedModel, the output layer excluded). " + If not specified, modules will be chosen according to the model architecture, If the architecture is " + not known, an error will be raised -- in this case, you should specify the target modules manually. " + To avoid targeting any modules (because you want to apply `target_parameters`), set " + `target_modules=[]`. + gralora_alpha (`int`): GraLoRA alpha. + GraLoRA alpha is the scaling factor for the GraLoRA adapter. + Scale becomes gralora_alpha / (r + hybrid_r). + gralora_dropout (`float`): + GraLoRA dropout is the dropout probability for the GraLoRA adapter. + It is used to prevent overfitting and improve the generalization of the GraLoRA adapter. + gralora_k (`int`): + GraLoRA k determines the number of subblocks in the GraLoRA adapter. + The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid. + The total parameter count is preserved regardles of gralora_k. + The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each subblock is reduced by gralora_k. + gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). + For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + bias (`str`): + Bias type for gralora. Can be 'none', 'all' or 'gralora_only'. + If 'all' or 'gralora_only', the corresponding biases will be updated during training. + Be aware that this means that, even when disabling the adapters, the model will not produce the same output as the base model would have without adaptation. + init_weights (`bool`): + Whether to initialize the weights of the GraLoRA layers with their default initialization. + Don't change this setting, except if you know exactly what you're doing. + layers_to_transform (`Union[List[int], int]`): + The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. + If a single integer is passed, PEFT will transform only the layer at this index. + This only works when target_modules is a list of str. + layers_pattern (`Optional[Union[List[str], str]]`): + The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. + This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. 
+ """ + r: int = field( default=32, metadata={ @@ -44,9 +95,14 @@ class GraloraConfig(PeftConfig): default=None, metadata={ "help": ( - "List of module names or regex expression of the module names to replace with gralora. " + "List of module names or regex expression of the module names to replace with LoRA. " "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " - "Only linear layers are supported." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D " + "(if the model is a PreTrainedModel, the output layer excluded). " + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually. " + "To avoid targeting any modules (because you want to apply `target_parameters`), set " + "`target_modules=[]`." ) }, ) @@ -54,8 +110,8 @@ class GraloraConfig(PeftConfig): default=64, metadata={ "help": ( - "gralora alpha is the scaling factor for the GraLoRA adapter." - "Scale becomes gralora_alpha / (r + hybrid_r)." + "gralora alpha is the scaling factor for the GraLoRA adapter. " + "Scale becomes gralora_alpha / (r + hybrid_r). " ) }, ) @@ -64,8 +120,11 @@ class GraloraConfig(PeftConfig): default=2, metadata={ "help": ( - "gralora_k determines the number of subblocks in the GraLoRA adapter." - "The total parameter count is preserved regardles of gralora_k, while the expressivitiy is multiplied by gralora_k." + "gralora_k determines the number of subblocks in the GraLoRA adapter. " + "The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid. " + "The total parameter count is preserved regardles of gralora_k. " + "The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each subblock is reduced by gralora_k. " + "gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher. " ) }, ) @@ -99,9 +158,9 @@ class GraloraConfig(PeftConfig): default=None, metadata={ "help": ( - "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers" - " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only" - " the layer at this index." + "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. " + "If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." ) }, ) @@ -109,8 +168,9 @@ class GraloraConfig(PeftConfig): default=None, metadata={ "help": ( - "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer" - " pattern is not in the common layers pattern." + "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. " + "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the " + "model, which is often called `'layers'` or `'h'`." 
) }, ) diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 4aefa02152..4303669aa4 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -271,12 +271,6 @@ def get_delta_weight(self, adapter) -> torch.Tensor: in_features = self.in_features out_features = self.out_features gralora_rank = r - if in_features % gralora_k != 0: - raise ValueError(f"in_features should be divisible by gralora_k, but got {in_features} and {gralora_k}") - elif out_features % gralora_k != 0: - raise ValueError(f"out_features should be divisible by gralora_k, but got {out_features} and {gralora_k}") - elif gralora_rank % gralora_k != 0: - raise ValueError(f"rank should be divisible by gralora_k, but got {gralora_rank} and {gralora_k}") subblock_gralora_rank = gralora_rank // gralora_k # scatter gralora_A to get the scattered weight matrix diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index 3d7fae2cc6..23a25d4c9c 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -15,23 +15,15 @@ from __future__ import annotations import warnings -from dataclasses import asdict -from enum import Enum -from typing import Optional import torch -import torch.nn as nn -from tqdm import tqdm from transformers.pytorch_utils import Conv1D -from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer from peft.utils import ( TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, - ModulesToSaveWrapper, - _get_submodules, ) -from .config import GraloraConfig from .layer import GraloraLayer, Linear diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 30628f2bdf..8d9820a195 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -680,6 +680,18 @@ GraloraConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}, ), + ( + "Vanilla MLP 6 GraLoRA", + "MLP", + GraloraConfig, + {"target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"]}, + ), + ( + "Vanilla MLP 7 Hybrid GraLoRA", + "MLP", + GraloraConfig, + {"target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"], "hybrid_r": 4}, + ), ( "Embedding + transformers Conv1D 1 GraLoRA", "EmbConv1D", @@ -3124,12 +3136,12 @@ def test_add_weighted_adapter_subtraction_with_negative_weights(self): cancelled_B = module.lora_B["cancelled"].weight.data # The weights should be approximately zero (they cancel out) - assert torch.allclose( - cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5 - ), f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" - assert torch.allclose( - cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5 - ), f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" + assert torch.allclose(cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5), ( + f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" + ) + assert torch.allclose(cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5), ( + f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" + ) def test_add_weighted_adapter_negative_weight_with_different_scaling(self): # Test negative weights with different scaling factors (lora_alpha) @@ -3440,6 +3452,24 @@ def test_dora_save_and_load_remapping(self): for k in state_dict: assert torch.allclose(state_dict[k], state_dict_loaded[k]) + def test_gralora_and_hybrid_gralora_parameter_count(self): + # Here we test the parameter count 
of GraLoRA is preserved + # when rank r + hybrid_r is the same regardless of the value of gralora_k. + model1 = MLP() + config1 = GraloraConfig(target_modules=["lin0"], r=12, gralora_k=2, hybrid_r=0) + model1 = get_peft_model(model1, config1) + model2 = MLP() + config2 = GraloraConfig(target_modules=["lin0"], r=10, gralora_k=2, hybrid_r=2) + model2 = get_peft_model(model2, config2) + model3 = MLP() + config3 = GraloraConfig(target_modules=["lin0"], r=10, gralora_k=5, hybrid_r=2) + model3 = get_peft_model(model3, config3) + trainable_params1, all_params1 = model1.get_nb_trainable_parameters() + trainable_params2, all_params2 = model2.get_nb_trainable_parameters() + trainable_params3, all_params3 = model3.get_nb_trainable_parameters() + assert trainable_params1 == trainable_params2 == trainable_params3 + assert all_params1 == all_params2 == all_params3 + @pytest.mark.parametrize("with_forward_call", [False, True]) def test_mha_gradients_set_correctly(self, with_forward_call): # check for this bug: https://github.com/huggingface/peft/issues/761#issuecomment-1893804738 @@ -3535,9 +3565,9 @@ def test_multirank_2(self): if isinstance(module, BaseTunerLayer): rank_expected = rank_pattern.get(key, r) rank_current = module.lora_A[adapter].weight.shape[0] - assert ( - rank_current == rank_expected - ), f"Rank {rank_current} is not equal to expected {rank_expected}" + assert rank_current == rank_expected, ( + f"Rank {rank_current} is not equal to expected {rank_expected}" + ) class TestLayerRepr: diff --git a/tests/test_gralora.py b/tests/test_gralora.py deleted file mode 100644 index 59c2418a33..0000000000 --- a/tests/test_gralora.py +++ /dev/null @@ -1,1051 +0,0 @@ -# Copyright 2025-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This test file is for tests specific to GraLoRA, since GraLoRA has some specific features -# like block-diagonal structure, hybrid mode, and tensor permutation for information exchange. 
- -import pytest -import torch -from safetensors import safe_open -from torch import nn - -from peft import PeftModel, get_peft_model -from peft.tuners.gralora import GraloraConfig - - -class MLP(nn.Module): - """Simple MLP for testing""" - - def __init__(self, bias=True): - super().__init__() - self.relu = nn.ReLU() - self.lin0 = nn.Linear(10, 20, bias=bias) - self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape - self.lin2 = nn.Linear(20, 20, bias=bias) - self.lin3 = nn.Linear(20, 2, bias=bias) - self.sm = nn.LogSoftmax(dim=-1) - - def forward(self, X): - X = self.lin0(X) - X = self.relu(X) - X = self.lin1(X) - X = self.relu(X) - X = self.lin2(X) - X = self.relu(X) - X = self.lin3(X) - X = self.sm(X) - return X - - -class TestGralora: - @pytest.fixture - def mlp(self): - torch.manual_seed(0) - model = MLP() - return model - - @pytest.fixture - def mlp_gralora_pure(self, mlp): - """Pure GraLoRA without hybrid component""" - torch.manual_seed(0) - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=0, - gralora_alpha=32, - gralora_dropout=0.1, - ) - peft_model = get_peft_model(mlp, config) - return peft_model - - @pytest.fixture - def mlp_gralora_hybrid(self): - """Hybrid GraLoRA with vanilla LoRA component""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=4, - gralora_alpha=32, - gralora_dropout=0.1, - ) - peft_model = get_peft_model(mlp, config) - return peft_model - - def test_gralora_config_validation(self): - """Test that config validation works correctly""" - # Valid config - config = GraloraConfig(r=16, gralora_k=4, hybrid_r=0) - assert config.r == 16 - assert config.gralora_k == 4 - assert config.hybrid_r == 0 - - # Hybrid config - config = GraloraConfig(r=16, gralora_k=4, hybrid_r=4) - assert config.r == 16 - assert config.hybrid_r == 4 - - def test_gralora_parameter_shapes(self, mlp_gralora_hybrid): - """Test that GraLoRA parameters have correct shapes""" - for name, module in mlp_gralora_hybrid.named_modules(): - if hasattr(module, "gralora_A"): - adapter_name = "default" - gralora_A = module.gralora_A[adapter_name] - gralora_B = module.gralora_B[adapter_name] - gralora_A_general = module.gralora_A_general[adapter_name] - gralora_B_general = module.gralora_B_general[adapter_name] - - in_features = module.in_features - out_features = module.out_features - k = 4 - gralora_rank = 16 - - # Check GraLoRA block shapes - # Each block has full gralora_rank, not gralora_rank // k - assert gralora_A.shape == (k, in_features // k, gralora_rank) - assert gralora_B.shape == (k, gralora_rank, out_features // k) - - # Check hybrid component shapes - assert gralora_A_general.weight.shape == (4, in_features) - assert gralora_B_general.weight.shape == (out_features, 4) - - def test_gralora_block_diagonal_structure(self): - """Test that pure GraLoRA produces block-diagonal delta weights""" - # Use init_weights=False to have non-zero B matrices - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=0, - init_weights=False, # Both A and B initialized randomly - ) - model = get_peft_model(mlp, config) - - for name, module in model.named_modules(): - if hasattr(module, "get_delta_weight"): - adapter_name = "default" - delta_weight = module.get_delta_weight(adapter_name) - - k = 4 - in_features = module.in_features - out_features = module.out_features - block_size_in = in_features // 
k - block_size_out = out_features // k - - # Check diagonal blocks have non-zero values - for i in range(k): - row_start = i * block_size_out - row_end = (i + 1) * block_size_out - col_start = i * block_size_in - col_end = (i + 1) * block_size_in - - block = delta_weight[row_start:row_end, col_start:col_end] - block_norm = torch.norm(block).item() - # Diagonal blocks should have some values (initialized with kaiming) - assert block_norm > 0, f"Diagonal block [{i},{i}] is zero" - - def test_gralora_forward_pass(self, mlp_gralora_hybrid): - """Test that forward pass works without errors""" - mlp_gralora_hybrid.eval() - x = torch.randn(5, 10) - - with torch.no_grad(): - output = mlp_gralora_hybrid(x) - - assert output.shape == (5, 2) - assert not torch.isnan(output).any() - assert not torch.isinf(output).any() - - def test_gralora_backward_pass(self, mlp_gralora_hybrid): - """Test that backward pass computes gradients correctly""" - mlp_gralora_hybrid.train() - x = torch.randn(5, 10) - - output = mlp_gralora_hybrid(x) - loss = output.sum() - loss.backward() - - # Check that GraLoRA parameters have gradients - for name, param in mlp_gralora_hybrid.named_parameters(): - if "gralora" in name and param.requires_grad: - assert param.grad is not None, f"Parameter {name} has no gradient" - assert not torch.isnan(param.grad).any(), f"Parameter {name} has NaN gradients" - - def test_gralora_pure_vs_hybrid_params(self): - """Test that pure and hybrid modes have same total parameters but different distribution""" - torch.manual_seed(0) - mlp_pure = MLP() - config_pure = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=0, - ) - model_pure = get_peft_model(mlp_pure, config_pure) - - torch.manual_seed(0) - mlp_hybrid = MLP() - config_hybrid = GraloraConfig( - target_modules=["lin1", "lin2"], - r=12, - gralora_k=4, - hybrid_r=4, - ) - model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) - - def count_trainable_params(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - params_pure = count_trainable_params(model_pure) - params_hybrid = count_trainable_params(model_hybrid) - - # Pure and hybrid should have same total parameters (r is constant) - # but distributed differently between block-diagonal and full-rank components - assert ( - params_pure == params_hybrid - ), f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" - - # Check that hybrid has general components - has_general = False - for name, _ in model_hybrid.named_modules(): - if "gralora_A_general" in name or "gralora_B_general" in name: - has_general = True - break - assert has_general, "Hybrid mode should have general components" - - def test_gralora_save_load_roundtrip(self, mlp_gralora_hybrid, tmp_path): - """Test that save/load preserves model behavior""" - mlp_gralora_hybrid.eval() - x = torch.randn(5, 10) - - # Get output before save - with torch.no_grad(): - output_before = mlp_gralora_hybrid(x) - - # Save adapter - mlp_gralora_hybrid.save_pretrained(tmp_path) - - # Load adapter - torch.manual_seed(0) - new_mlp = MLP() - loaded_model = PeftModel.from_pretrained(new_mlp, tmp_path) - loaded_model.eval() - - # Get output after load - with torch.no_grad(): - output_after = loaded_model(x) - - # Outputs should be very close - assert torch.allclose(output_before, output_after, atol=1e-5, rtol=1e-5) - - def test_gralora_state_dict_structure(self, mlp_gralora_hybrid, tmp_path): - """Test that state dict contains only necessary parameters""" - 
mlp_gralora_hybrid.save_pretrained(tmp_path) - - # Load state dict - sd = {} - with safe_open(tmp_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: - for key in f.keys(): - sd[key] = f.get_tensor(key) - - # Check that gralora parameters are present - assert any("gralora_A" in key for key in sd), "gralora_A not found in state dict" - assert any("gralora_B" in key for key in sd), "gralora_B not found in state dict" - - # For hybrid mode, check hybrid components - assert any("gralora_A_general" in key for key in sd), "gralora_A_general not found" - assert any("gralora_B_general" in key for key in sd), "gralora_B_general not found" - - def test_gralora_merge_and_unload(self, mlp_gralora_hybrid): - """Test merge_and_unload functionality""" - mlp_gralora_hybrid.eval() - x = torch.randn(5, 10) - - # Get output before merge - with torch.no_grad(): - output_before = mlp_gralora_hybrid(x) - - # Merge and unload - merged_model = mlp_gralora_hybrid.merge_and_unload() - merged_model.eval() - - # Get output after merge - with torch.no_grad(): - output_after = merged_model(x) - - # Outputs should be very close - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - # Check that merged model has no GraLoRA layers - has_gralora = any("gralora" in name for name, _ in merged_model.named_parameters()) - assert not has_gralora, "Merged model still has GraLoRA parameters" - - def test_gralora_merge_unmerge(self): - """Test merge/unmerge functionality""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - model.eval() - - x = torch.randn(5, 10) - - # Output before merge - with torch.no_grad(): - output_before = model(x) - - # Merge adapter using PEFT API - model.merge_adapter() - - with torch.no_grad(): - output_merged = model(x) - - # Outputs should be the same after merge - assert torch.allclose(output_before, output_merged, atol=1e-4, rtol=1e-4) - - # Unmerge adapter using PEFT API - model.unmerge_adapter() - - with torch.no_grad(): - output_unmerged = model(x) - - # Outputs should be the same after unmerge - assert torch.allclose(output_before, output_unmerged, atol=1e-4, rtol=1e-4) - - def test_gralora_multiple_adapters(self): - """Test adding and switching between multiple adapters""" - torch.manual_seed(0) - mlp = MLP() - - # Use init_weights=False to have non-zero outputs - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - torch.manual_seed(42) # Different seed for second adapter - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) - model.add_adapter("adapter2", config2) - - x = torch.randn(5, 10) - - # Test adapter1 - model.set_adapter("adapter1") - with torch.no_grad(): - output1 = model(x) - - # Test adapter2 - model.set_adapter("adapter2") - with torch.no_grad(): - output2 = model(x) - - # Different adapters should give different outputs - assert not torch.allclose(output1, output2, atol=1e-3, rtol=1e-3) - - def test_gralora_dtype_compatibility(self): - """Test that GraLoRA works with different dtypes""" - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - if dtype == torch.bfloat16 and not torch.cuda.is_available(): - # Skip bfloat16 on CPU if not supported - continue - - torch.manual_seed(0) - mlp = MLP().to(dtype) - config = GraloraConfig( - 
target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10).to(dtype) - output = model(x) - - assert output.dtype == dtype, f"Output dtype mismatch for {dtype}" - - def test_gralora_disable_adapters(self): - """Test disabling adapters""" - torch.manual_seed(0) - mlp = MLP() - # Use init_weights=False to have non-zero effect - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=4, - init_weights=False, - ) - model = get_peft_model(mlp, config) - model.eval() - x = torch.randn(5, 10) - - # Output with adapter enabled - with torch.no_grad(): - output_enabled = model(x) - - # Output with adapter disabled - with model.disable_adapter(): - with torch.no_grad(): - output_disabled = model(x) - - # Outputs should be different - assert not torch.allclose(output_enabled, output_disabled, atol=1e-6, rtol=1e-6) - - def test_gralora_different_k_values(self): - """Test GraLoRA with different k values""" - for k in [2, 4]: - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=k * 4, # Make sure r is divisible by k - gralora_k=k, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - output = model(x) - - assert output.shape == (5, 2) - assert not torch.isnan(output).any() - - def test_gralora_rank_divisibility_check(self): - """Test that invalid rank/k combinations raise errors""" - torch.manual_seed(0) - mlp = MLP() - - # This should raise an error because (r - hybrid_r) is not divisible by k - # r=15, hybrid_r=0, k=4 -> gralora_rank=15, 15 % 4 != 0 - config = GraloraConfig( - target_modules=["lin1"], - r=15, - gralora_k=4, - hybrid_r=0, - ) - - with pytest.raises(ValueError, match="r should be divisible by gralora_k"): - get_peft_model(mlp, config) - - def test_gralora_trainable_parameters_only(self, mlp_gralora_hybrid): - """Test that only GraLoRA parameters are trainable""" - for name, param in mlp_gralora_hybrid.named_parameters(): - if "gralora" in name or "modules_to_save" in name: - assert param.requires_grad, f"GraLoRA parameter {name} should be trainable" - else: - assert not param.requires_grad, f"Base parameter {name} should be frozen" - - def test_gralora_save_pretrained_files(self, mlp_gralora_hybrid, tmp_path): - """Test that save_pretrained creates expected files""" - mlp_gralora_hybrid.save_pretrained(tmp_path) - - # Check for config file - assert (tmp_path / "adapter_config.json").exists() - - # Check for weights file (either .bin or .safetensors) - assert (tmp_path / "adapter_model.safetensors").exists() or (tmp_path / "adapter_model.bin").exists() - - def test_gralora_information_exchange_via_permutation(self, mlp_gralora_pure): - """ - Test that information exchange happens through tensor permutation. Even though delta weights are - block-diagonal, the forward pass should allow information flow between blocks via the permutation operation. 
- """ - mlp_gralora_pure.eval() - - # Create two inputs that differ only in specific blocks - x1 = torch.randn(1, 10) - x2 = x1.clone() - - # Modify only the first block (assuming k=4, block size = 10//4 = 2.5, rounded to 2-3 features) - x2[0, :5] += 1.0 # Modify first block - - with torch.no_grad(): - out1 = mlp_gralora_pure(x1) - out2 = mlp_gralora_pure(x2) - - # Due to information exchange, changing one block should affect all outputs - # (not just outputs corresponding to that block) - diff = (out1 - out2).abs() - - # All output dimensions should be affected (not just the first block's outputs) - assert (diff > 1e-6).all(), "Information exchange not happening correctly" - - def test_gralora_scaling_factor(self): - """Test that scaling factor is correctly applied""" - torch.manual_seed(0) - mlp = MLP() - - # Create two configs with different alpha values - config_alpha16 = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_alpha=16, - gralora_k=2, - hybrid_r=0, - ) - - config_alpha32 = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_alpha=32, - gralora_k=2, - hybrid_r=0, - ) - - model_alpha16 = get_peft_model(MLP(), config_alpha16) - model_alpha32 = get_peft_model(MLP(), config_alpha32) - - # Copy weights to make them identical except for scaling - for (n1, p1), (n2, p2) in zip(model_alpha16.named_parameters(), model_alpha32.named_parameters()): - if "gralora" in n1: - p2.data = p1.data.clone() - - x = torch.randn(5, 10) - - model_alpha16.eval() - model_alpha32.eval() - - with torch.no_grad(): - out1 = model_alpha16(x) - out2 = model_alpha32(x) - - # Outputs should be different due to different scaling - assert not torch.allclose(out1, out2, atol=1e-6, rtol=1e-6) - - def test_gralora_safe_merge_success(self): - """Test safe_merge with valid weights""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - with torch.no_grad(): - output_before = model(x) - - # Test safe merge - model.base_model.model.lin1.merge(safe_merge=True) - - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - def test_gralora_safe_merge_detects_nan(self): - """Test that safe_merge detects NaN values""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - - # Inject NaN into adapter weights (use .data to avoid requires_grad error) - model.base_model.model.lin1.gralora_A["default"].data[0, 0, 0] = float("nan") - - # safe_merge should raise ValueError - with pytest.raises(ValueError, match="NaNs detected"): - model.base_model.model.lin1.merge(safe_merge=True) - - def test_gralora_unmerge_warning_when_not_merged(self): - """Test that unmerge warns when already unmerged""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - # Try to unmerge without merging first - with pytest.warns(UserWarning, match="Already unmerged"): - model.base_model.model.lin1.unmerge() - - def test_gralora_hybrid_forward_computation(self): - """Test that hybrid LoRA component is used in forward pass""" - torch.manual_seed(0) - mlp_hybrid = MLP() - mlp_pure = MLP() - - config_hybrid = GraloraConfig( - target_modules=["lin1"], - r=16, - gralora_k=4, - hybrid_r=4, - 
init_weights=False, - ) - model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) - - config_pure = GraloraConfig( - target_modules=["lin1"], - r=16, - gralora_k=4, - hybrid_r=0, - init_weights=False, - ) - model_pure = get_peft_model(mlp_pure, config_pure) - - x = torch.randn(5, 10) - - with torch.no_grad(): - output_hybrid = model_hybrid(x) - output_pure = model_pure(x) - - # Outputs should be different due to hybrid component - assert not torch.allclose(output_hybrid, output_pure, atol=1e-3) - - def test_gralora_invalid_rank_zero(self): - """Test that r=0 raises error""" - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=0, gralora_k=2) - - with pytest.raises(ValueError, match="`r` should be a positive integer"): - get_peft_model(mlp, config) - - def test_gralora_invalid_rank_negative(self): - """Test that negative r raises error""" - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=-1, gralora_k=2) - - with pytest.raises(ValueError, match="`r` should be a positive integer"): - get_peft_model(mlp, config) - - def test_gralora_bias_all(self): - """Test bias='all' configuration""" - torch.manual_seed(0) - mlp = MLP(bias=True) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - bias="all", - ) - model = get_peft_model(mlp, config) - - # Check that all bias parameters are trainable - bias_params = [name for name, param in model.named_parameters() if "bias" in name and param.requires_grad] - assert len(bias_params) > 0, "At least some bias parameters should be trainable" - - def test_gralora_bias_gralora_only(self): - """Test bias='gralora_only' configuration""" - torch.manual_seed(0) - mlp = MLP(bias=True) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - bias="gralora_only", - ) - model = get_peft_model(mlp, config) - - # Only GraLoRA layer biases should be trainable - assert model.base_model.model.lin1.bias.requires_grad - assert not model.base_model.model.lin0.bias.requires_grad - - def test_gralora_multiple_adapters_with_bias_raises(self): - """Test that multiple adapters with bias raises error""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") - model = get_peft_model(mlp, config1) - - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") - - with pytest.raises(ValueError, match="supports only 1 adapter with bias"): - model.add_adapter("adapter2", config2) - - def test_gralora_cpu_fp16_merge(self): - """Test merge with fp16 on CPU""" - torch.manual_seed(0) - mlp = MLP().to(torch.float16) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10).to(torch.float16) - - with torch.no_grad(): - output_before = model(x) - - # Merge (should handle CPU fp16 correctly) - model.merge_adapter() - - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) - - def test_gralora_cpu_bf16_merge(self): - """Test merge with bf16 on CPU (if supported)""" - # Check if bfloat16 is supported - try: - _ = torch.randn(2, 2).to(torch.bfloat16) - except RuntimeError: - pytest.skip("bfloat16 not supported on this system") - - torch.manual_seed(0) - mlp = MLP().to(torch.bfloat16) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 
10).to(torch.bfloat16) - - with torch.no_grad(): - output_before = model(x) - - # Merge with hybrid component - model.merge_adapter() - - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) - - def test_gralora_disable_adapter_layers_warns_with_bias(self): - """Test that disable_adapter_layers warns when bias is configured""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - bias="all", - ) - model = get_peft_model(mlp, config) - - with pytest.warns(UserWarning, match="disabling adapter layers with bias"): - model.disable_adapter_layers() - - def test_gralora_set_adapter_warns_when_merged(self): - """Test that set_adapter warns and unmerges when model is merged""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model.add_adapter("adapter2", config2) - - # Merge first adapter - model.merge_adapter() - - # Setting adapter should warn and unmerge - with pytest.warns(UserWarning, match="Adapter cannot be set when the model is merged"): - model.set_adapter("adapter2") - - # Model should be unmerged now - assert not model.base_model.model.lin1.merged - - def test_gralora_delete_adapter(self): - """Test deleting an adapter""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config, adapter_name="adapter1") - - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model.add_adapter("adapter2", config2) - - # Delete adapter1 - model.delete_adapter("adapter1") - - assert "adapter1" not in model.peft_config - assert "adapter2" in model.peft_config - - def test_gralora_delete_nonexistent_adapter_raises(self): - """Test that deleting nonexistent adapter raises error""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - with pytest.raises(ValueError, match="Adapter .* does not exist"): - model.delete_adapter("nonexistent") - - def test_gralora_unload_without_merge(self): - """Test unload without merging""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Get base model output - with model.disable_adapter(): - with torch.no_grad(): - base_output = model(x) - - # Unload without merge - unloaded_model = model.unload() - - with torch.no_grad(): - unloaded_output = unloaded_model(x) - - # Should match base model output (no merge) - assert torch.allclose(base_output, unloaded_output, atol=1e-5) - - def test_gralora_merge_with_hybrid_component(self): - """Test that merge works correctly with hybrid component""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=16, - gralora_k=4, - hybrid_r=4, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - with torch.no_grad(): - output_before = model(x) - - # Merge - model.merge_adapter() - - with torch.no_grad(): - output_after = model(x) - - # Outputs should be very close - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - def test_gralora_repr(self): - """Test __repr__ method""" - 
torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - repr_str = repr(model.base_model.model.lin1) - assert "gralora" in repr_str.lower() - - def test_gralora_merge_with_adapter_names(self): - """Test merge with specific adapter names""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - torch.manual_seed(42) - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) - model.add_adapter("adapter2", config2) - - x = torch.randn(5, 10) - - # Set to adapter1 and get output - model.set_adapter("adapter1") - with torch.no_grad(): - output_before = model(x) - - # Merge only adapter1 - model.base_model.model.lin1.merge(adapter_names=["adapter1"]) - - with torch.no_grad(): - output_after = model(x) - - # Outputs should be close - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - def test_gralora_enable_disable_adapter_layers(self): - """Test enable/disable adapter layers""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Get output with adapter enabled - with torch.no_grad(): - output_enabled = model(x) - - # Disable adapters - model.disable_adapter_layers() - - with torch.no_grad(): - output_disabled = model(x) - - # Enable adapters - model.enable_adapter_layers() - - with torch.no_grad(): - output_re_enabled = model(x) - - # Output with disabled adapter should be different - assert not torch.allclose(output_enabled, output_disabled, atol=1e-6) - # Output after re-enabling should match original - assert torch.allclose(output_enabled, output_re_enabled, atol=1e-6) - - def test_gralora_forward_with_merged_adapter(self): - """Test forward pass with merged adapter""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Get output before merge - with torch.no_grad(): - output_before = model(x) - - # Merge adapter - model.merge_adapter() - - # Forward with merged adapter (should take merged path) - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-4) - - def test_gralora_forward_with_disable_adapters_and_merged(self): - """Test forward when disable_adapters=True and model is merged""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Merge adapter - model.merge_adapter() - - # Get output with merged adapter - with torch.no_grad(): - output_merged = model(x) - - # Disable adapters (should unmerge) - with model.disable_adapter(): - with torch.no_grad(): - output_disabled = model(x) - - # Outputs should be different - assert not torch.allclose(output_merged, output_disabled, atol=1e-5) - - def test_gralora_bias_invalid_option_raises(self): - """Test that invalid bias option raises NotImplementedError""" - torch.manual_seed(0) - mlp = MLP() - - # Create config with invalid bias (need to bypass validation) - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = 
get_peft_model(mlp, config) - - # Manually set invalid bias to trigger the error - model.peft_config["default"].bias = "invalid_option" - - with pytest.raises(NotImplementedError, match="Requested bias"): - model._mark_only_adapters_as_trainable(model.model) - - def test_gralora_merge_empty_adapter_names(self): - """Test merge with empty adapter_names returns early""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - # Call merge with empty list (should return early) - model.base_model.model.lin1.merge(adapter_names=[]) - - # Model should not be merged - assert not model.base_model.model.lin1.merged - - def test_gralora_add_non_active_adapter(self): - """Test adding adapter that is not active (should not be trainable)""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - # Keep adapter1 active - model.set_adapter("adapter1") - - # Add adapter2 (should not be active/trainable initially) - torch.manual_seed(42) - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model.add_adapter("adapter2", config2) - - # adapter2 parameters should exist but might not be in active_adapters initially - assert "adapter2" in model.base_model.model.lin1.gralora_A - - def test_gralora_forward_with_no_adapter_in_active_list(self): - """Test forward when active_adapter is not in gralora_A keys""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config, adapter_name="adapter1") - - x = torch.randn(5, 10) - - # Manually set _active_adapter to include non-existent adapter - original_adapter = model.base_model.model.lin1._active_adapter - model.base_model.model.lin1._active_adapter = ["nonexistent", "adapter1"] - - # Should still work (skip nonexistent adapter) - with torch.no_grad(): - output = model(x) - - assert output.shape == (5, 2) - - # Restore - model.base_model.model.lin1._active_adapter = original_adapter diff --git a/tests/test_initialization.py b/tests/test_initialization.py index 3475247cd8..cbc41e5671 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -38,6 +38,7 @@ C3AConfig, DeloraConfig, EvaConfig, + GraloraConfig, IA3Config, LoftQConfig, LoKrConfig, @@ -2157,6 +2158,56 @@ def test_init_weights_false_shifts_output(self, data): assert not torch.allclose(y_base, y_peft, atol=1e-6, rtol=1e-6) +class TestGraLoRAInitialization: + """Basic sanity tests for the GraLoRA tuner.""" + + torch_device = infer_device() + + def get_model(self, bias=True): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 30, bias=bias) + self.lin1 = nn.Linear(30, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + return X + + return MLP(bias=bias).to(self.torch_device).eval() + + @pytest.fixture + def data(self): + torch.manual_seed(0) + return torch.randn(4, 10, device=self.torch_device) + + def test_gralora_with_incompatible_gralora_k_and_r_raises(self): + model = self.get_model() + r = 6 + gralora_k = 4 + # msg = f"r should be divisible by gralora_k, but got {config.r} and {config.gralora_k}" + msg = f"r should be divisible by gralora_k, but got {r} and {gralora_k}" + with pytest.raises(ValueError, match=re.escape(msg)): + GraloraConfig(target_modules=["lin0"], r=r, gralora_k=gralora_k) + + def 
From 430e89625142d9244aae82abf8386fb81229a666 Mon Sep 17 00:00:00 2001
From: "yeonjoon.jung"
Date: Mon, 27 Oct 2025 21:39:27 +0900
Subject: [PATCH 10/11] UPDATE document format in GraLoRA

---
 src/peft/tuners/gralora/config.py | 64 +++++++++++++++----------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py
index 57e2fb3f47..1458bca3e2 100644
--- a/src/peft/tuners/gralora/config.py
+++ b/src/peft/tuners/gralora/config.py
@@ -26,50 +26,48 @@ class GraloraConfig(PeftConfig):
     Args:
         r (`int`):
-            GraLoRA attention dimension determines the rank of the GraLoRA adapter.
-            The total parameter count of the GraLoRA adapter is same as LoRA with same rank r, while the expressivitiy is multiplied by gralora_k.
+            GraLoRA attention dimension determines the rank of the GraLoRA adapter. The total parameter count of the
+            GraLoRA adapter is the same as LoRA with the same rank r, while the expressivity is multiplied by gralora_k.
         hybrid_r (`int`):
             Hybrid GraLoRA rank determines the rank allocated to vanilla LoRA method when using Hybrid GraLoRA method.
-            Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0.
-            The parameter count of the GraLoRA adapter is r + hybrid_r.
+            Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0. The
+            parameter count of the GraLoRA adapter is r + hybrid_r.
         target_modules (`Union[List[str], str]`):
-            List of module names or regex expression of the module names to replace with GraLoRA. "
-            For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
-            This can also be a wildcard 'all-linear' which matches all linear/Conv1D "
-            "(if the model is a PreTrainedModel, the output layer excluded). "
-            If not specified, modules will be chosen according to the model architecture, If the architecture is "
-            not known, an error will be raised -- in this case, you should specify the target modules manually. "
-            To avoid targeting any modules (because you want to apply `target_parameters`), set "
-            `target_modules=[]`.
+            List of module names or regex expression of the module names to replace with GraLoRA. For example, ['q',
+            'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. This can also be a wildcard 'all-linear'
+            which matches all linear/Conv1D (if the model is a PreTrainedModel, the output layer excluded). If not
+            specified, modules will be chosen according to the model architecture. If the architecture is not known,
+            an error will be raised -- in this case, you should specify the target modules manually. To avoid
+            targeting any modules (because you want to apply `target_parameters`), set `target_modules=[]`.
         gralora_alpha (`int`):
             GraLoRA alpha.
-            GraLoRA alpha is the scaling factor for the GraLoRA adapter.
-            Scale becomes gralora_alpha / (r + hybrid_r).
+            GraLoRA alpha is the scaling factor for the GraLoRA adapter. Scale becomes gralora_alpha / (r + hybrid_r).
         gralora_dropout (`float`):
-            GraLoRA dropout is the dropout probability for the GraLoRA adapter.
-            It is used to prevent overfitting and improve the generalization of the GraLoRA adapter.
+            GraLoRA dropout is the dropout probability for the GraLoRA adapter. It is used to prevent overfitting and
+            improve the generalization of the GraLoRA adapter.
         gralora_k (`int`):
-            GraLoRA k determines the number of subblocks in the GraLoRA adapter.
-            The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid.
-            The total parameter count is preserved regardles of gralora_k.
-            The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each subblock is reduced by gralora_k.
-            gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher.
+            GraLoRA k determines the number of subblocks in the GraLoRA adapter. The rank r must be divisible by
+            gralora_k for the GraLoRA adapter to be valid. The total parameter count is preserved regardless of
+            gralora_k. The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each
+            subblock is reduced by gralora_k. gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is
+            recommended for rank 64 or higher.
         fan_in_fan_out (`bool`):
-            Set this to True if the layer to replace stores weight like (fan_in, fan_out).
-            For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
+            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
+            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
         bias (`str`):
-            Bias type for gralora. Can be 'none', 'all' or 'gralora_only'.
-            If 'all' or 'gralora_only', the corresponding biases will be updated during training.
-            Be aware that this means that, even when disabling the adapters, the model will not produce the same output as the base model would have without adaptation.
+            Bias type for gralora. Can be 'none', 'all' or 'gralora_only'. If 'all' or 'gralora_only', the
+            corresponding biases will be updated during training. Be aware that this means that, even when disabling
+            the adapters, the model will not produce the same output as the base model would have without adaptation.
         init_weights (`bool`):
-            Whether to initialize the weights of the GraLoRA layers with their default initialization.
-            Don't change this setting, except if you know exactly what you're doing.
+            Whether to initialize the weights of the GraLoRA layers with their default initialization. Don't change
+            this setting, except if you know exactly what you're doing.
         layers_to_transform (`Union[List[int], int]`):
-            The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list.
-            If a single integer is passed, PEFT will transform only the layer at this index.
-            This only works when target_modules is a list of str.
+            The layer indexes to transform. If this argument is specified, PEFT will transform only the layer indexes
+            that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at
+            this index. This only works when target_modules is a list of str.
         layers_pattern (`Optional[Union[List[str], str]]`):
-            The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern.
-            This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
+            The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is
+            not in the common layers pattern. This only works when target_modules is a list of str. This should target
+            the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
     """

     r: int = field(

From 351877f894693060f8a2eb75a5652f15b0e7cd63 Mon Sep 17 00:00:00 2001
From: "yeonjoon.jung"
Date: Mon, 27 Oct 2025 23:15:49 +0900
Subject: [PATCH 11/11] FIX CPU casting in GraLoRA get_delta_weight function

---
 src/peft/tuners/gralora/layer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py
index 4303669aa4..d6f78665f0 100644
--- a/src/peft/tuners/gralora/layer.py
+++ b/src/peft/tuners/gralora/layer.py
@@ -277,7 +277,9 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
         l_indices = torch.arange(in_features, device=device)
         n_indices = l_indices // (in_features // gralora_k)
         i_indices = l_indices % (in_features // gralora_k)
-        gralora_A_scattered = torch.zeros(in_features, gralora_k, gralora_rank, device=device, dtype=dtype)
+        gralora_A_scattered = torch.zeros(
+            in_features, gralora_k, gralora_rank, device=device, dtype=torch.float32 if cast_to_fp32 else dtype
+        )
         gralora_A_scattered.scatter_(
             1,
             n_indices.unsqueeze(1).unsqueeze(2).expand(-1, 1, gralora_rank),