From 6dfa24e4b62652a7b6fcfda01ce873a44999c956 Mon Sep 17 00:00:00 2001 From: yeonjoon-jung01 Date: Thu, 16 Oct 2025 16:43:12 +0900 Subject: [PATCH 01/11] feat: Add Gralora configuration and basic implementation --- src/peft/__init__.py | 4 + src/peft/tuners/__init__.py | 3 + src/peft/tuners/gralora/__init__.py | 20 ++ src/peft/tuners/gralora/config.py | 82 ++++++ src/peft/tuners/gralora/layer.py | 267 ++++++++++++++++++++ src/peft/tuners/gralora/model.py | 377 ++++++++++++++++++++++++++++ src/peft/utils/peft_types.py | 2 + 7 files changed, 755 insertions(+) create mode 100644 src/peft/tuners/gralora/__init__.py create mode 100644 src/peft/tuners/gralora/config.py create mode 100644 src/peft/tuners/gralora/layer.py create mode 100644 src/peft/tuners/gralora/model.py diff --git a/src/peft/__init__.py b/src/peft/__init__.py index f8fdd48ff0..9a89b19554 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -64,6 +64,8 @@ EvaConfig, FourierFTConfig, FourierFTModel, + GraloraConfig, + GraloraModel, HRAConfig, HRAModel, IA3Config, @@ -163,6 +165,8 @@ "EvaConfig", "FourierFTConfig", "FourierFTModel", + "GraloraConfig", + "GraloraModel", "HRAConfig", "HRAModel", "IA3Config", diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 3bf53d7da9..364bbb8fb2 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -20,6 +20,7 @@ from .cpt import CPTConfig, CPTEmbedding from .delora import DeloraConfig, DeloraModel from .fourierft import FourierFTConfig, FourierFTModel +from .gralora import GraloraConfig, GraloraModel from .hra import HRAConfig, HRAModel from .ia3 import IA3Config, IA3Model from .ln_tuning import LNTuningConfig, LNTuningModel @@ -74,6 +75,8 @@ "EvaConfig", "FourierFTConfig", "FourierFTModel", + "GraloraConfig", + "GraloraModel", "HRAConfig", "HRAModel", "IA3Config", diff --git a/src/peft/tuners/gralora/__init__.py b/src/peft/tuners/gralora/__init__.py new file mode 100644 index 0000000000..db1927c442 --- /dev/null +++ b/src/peft/tuners/gralora/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import GraloraConfig +from .layer import GraloraLayer +from .model import GraloraModel + + +__all__ = ["GraloraConfig", "GraloraLayer", "GraloraModel"] diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py new file mode 100644 index 0000000000..fb919fbcbf --- /dev/null +++ b/src/peft/tuners/gralora/config.py @@ -0,0 +1,82 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class GraloraConfig(PeftConfig): + r: int = field(default=8, metadata={"help": "gralora attention dimension"}) + hybrid_r: int = field( + default=0, metadata={"help": "hybrid_r is the rank allocated to vanilla LoRA method when using Hybrid GraLoRA"} + ) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with gralora." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + "Only linear layers are supported." + ) + }, + ) + gralora_alpha: int = field(default=8, metadata={"help": "gralora alpha"}) + gralora_dropout: float = field(default=0.0, metadata={"help": "gralora dropout"}) + gralora_k: int = field(default=2, metadata={"help": "gralora k"}) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + bias: str = field( + default="none", metadata={"help": "Bias type for gralora. Can be 'none', 'all' or 'gralora_only'"} + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": ( + "List of modules apart from gralora layers to be set as trainable and saved in the final checkpoint. For" + " example, in Sequence Classification or Token Classification tasks, the final layer" + " `classifier/score` are randomly initialized and as such need to be trainable and saved." + ) + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": ( + "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers" + " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only" + " the layer at this index." + ) + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer" + " pattern is not in the common layers pattern." + ) + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.GRALORA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py new file mode 100644 index 0000000000..6e6c220145 --- /dev/null +++ b/src/peft/tuners/gralora/layer.py @@ -0,0 +1,267 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
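For orientation, the `GraloraConfig` defined above is used like any other PEFT config. A minimal usage sketch (the base model and the `q_proj`/`v_proj` target names are illustrative; the only hard constraint in the implementation below is that `r - hybrid_r` is divisible by `gralora_k`):

```python
from transformers import AutoModelForCausalLM

from peft import GraloraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config = GraloraConfig(
    r=16,              # total rank per adapted layer
    gralora_alpha=32,  # scaling numerator; effective scaling is gralora_alpha / r
    gralora_k=4,       # number of diagonal blocks the weight matrix is split into
    hybrid_r=4,        # rank reserved for a vanilla LoRA component (Hybrid GraLoRA)
    target_modules=["q_proj", "v_proj"],  # assumed module names for a decoder model
)
model = get_peft_model(base_model, config)
model.print_trainable_parameters()
```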
+ +import math +from typing import Optional + +import torch +import torch.nn as nn +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTunerLayer + + +class GraloraLayer(BaseTunerLayer): + # List all names of layers that may contain adapter weight + adapter_layer_names = ("gralora_A", "gralora_B", "gralora_A_general", "gralora_B_general") + other_param_names = ("r", "hybrid_r", "gralora_alpha", "scaling", "gralora_dropout") + + def __init__(self, base_layer: nn.Module, **kwargs): + self.base_layer = base_layer + self.r = {} + self.gralora_alpha = {} + self.gralora_k = {} + self.hybrid_r = {} + self.scaling = {} + self.gralora_dropout = nn.ModuleDict({}) + + # Set to `None` otherwise to avoid computation with random weight + self.gralora_A = nn.ParameterDict({}) + self.gralora_B = nn.ParameterDict({}) + self.gralora_A_general = nn.ModuleDict({}) + self.gralora_B_general = nn.ModuleDict({}) + + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + + self.in_features = in_features + self.out_features = out_features + self.kwargs = kwargs + + def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: + """ + Move the adapter of the given name to the device of the base layer. + """ + from peft.tuners.vera.buffer_dict import BufferDict + + if device is None: + # check weight and qweight (for GPTQ) + for weight_name in ("weight", "qweight"): + weight = getattr(self.get_base_layer(), weight_name, None) + if weight is not None: + device = weight.device + dtype = weight.dtype + break + else: + # no break encountered: could not determine the device + return + + # loop through all potential adapter layers and move them to the device of the base layer; be careful to only + # move this specific adapter to the device, as the other adapters could be on different devices + # see #1639 + for adapter_layer_name in self.adapter_layer_names + self.other_param_names: + adapter_layer = getattr(self, adapter_layer_name, None) + if not isinstance(adapter_layer, (nn.ModuleDict, nn.ParameterDict, BufferDict)): + continue + if adapter_name not in adapter_layer: + continue + if weight.dtype.is_floating_point or weight.dtype.is_complex: + adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device, dtype=dtype) + else: + adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device) + + @property + def merged(self) -> bool: + return bool(self.merged_adapters) + + @property + def bias(self) -> torch.Tensor: + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + return base_layer.bias + elif isinstance(base_layer, Conv1D): + return base_layer.bias + else: + return None + + def update_layer( + self, + adapter_name, + module_name, + r, + gralora_alpha, + gralora_dropout, + gralora_k: int = 2, + hybrid_r: int = 0, + ): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.gralora_alpha[adapter_name] = gralora_alpha + self.gralora_k[adapter_name] = gralora_k + self.hybrid_r[adapter_name] = hybrid_r + + if gralora_dropout > 0.0: + 
gralora_dropout_layer = nn.Dropout(p=gralora_dropout) + else: + gralora_dropout_layer = nn.Identity() + + self.gralora_dropout.update(nn.ModuleDict({adapter_name: gralora_dropout_layer})) + + # Actual trainable parameters + subblock_in_features = self.in_features // gralora_k + subblock_out_features = self.out_features // gralora_k + + gralora_r = r - hybrid_r # gralora_r is the rank allocated to gralora method + assert gralora_r % gralora_k == 0, f"r should be divisible by gralora_k, but got {r} and {gralora_k}" + + gralora_A = nn.ParameterList() + gralora_B = nn.ParameterList() + for _ in range(gralora_k): + new_A = nn.Parameter(torch.zeros(gralora_r, subblock_in_features)) + new_B = nn.Parameter(torch.zeros(subblock_out_features, gralora_r)) + nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + gralora_A.append(new_A) + gralora_B.append(new_B) + # stack A and B and transpose to get the final shape + gralora_A = torch.stack(tuple(gralora_A), dim=0) # [N, rank, in_features//N] + gralora_A = gralora_A.transpose(1, 2).contiguous() # [N, in_features//N, rank] + + gralora_B = torch.stack(tuple(gralora_B), dim=0) # [N, out_features//N, rank] + gralora_B = gralora_B.transpose(1, 2).contiguous() # [N, rank, out_features//N] + + if hybrid_r > 0: + general_gralora_A = nn.Linear(self.in_features, hybrid_r, bias=False) + general_gralora_B = nn.Linear(hybrid_r, self.out_features, bias=False) + nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) + nn.init.zeros_(general_gralora_B.weight) + else: + general_gralora_A = nn.Identity() + general_gralora_B = nn.Identity() + + self.gralora_A[adapter_name] = gralora_A + self.gralora_B[adapter_name] = gralora_B + self.gralora_A_general[adapter_name] = general_gralora_A + self.gralora_B_general[adapter_name] = general_gralora_B + + self.module_name = module_name + + self.scaling[adapter_name] = gralora_alpha / r + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters) + + +class Linear(nn.Linear, GraloraLayer): + # Gralora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + module_name, + r: int = 0, + gralora_alpha: int = 1, + gralora_dropout: float = 0.0, + gralora_k: int = 2, + hybrid_r: int = 0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + **kwargs, + ) -> None: + # this gets the init from nn.Linear's super perspective, i.e. 
nn.Module.__init__, which should always be called + super(nn.Linear, self).__init__() + GraloraLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer(adapter_name, module_name, r, gralora_alpha, gralora_dropout, gralora_k, hybrid_r) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + raise NotImplementedError("Merging is not supported for GraloraLayer yet.") + + def unmerge(self) -> None: + raise NotImplementedError("Unmerging is not supported for GraloraLayer yet.") + + def get_delta_weight(self, adapter) -> torch.Tensor: + raise NotImplementedError("Getting delta weight is not supported for GraloraLayer yet.") + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.gralora_A.keys(): + continue + gralora_A = self.gralora_A[active_adapter] + gralora_B = self.gralora_B[active_adapter] + + gralora_A_general = self.gralora_A_general[active_adapter] + gralora_B_general = self.gralora_B_general[active_adapter] + + r = self.r[active_adapter] + gralora_k = self.gralora_k[active_adapter] + hybrid_r = self.hybrid_r[active_adapter] + + assert len(gralora_A) == len(gralora_B) + + dropout = self.gralora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + gralora_dtype = gralora_A.dtype + gralora_rank = r - hybrid_r + + B, L, in_features = x.shape + N = gralora_k + subblock_gralora_rank = gralora_rank // N + + output = torch.einsum( + "bljr, jro -> bljo", + torch.einsum( + "blni, nir -> blnr", + dropout(x.to(gralora_dtype)).view(B, L, N, in_features // N), + gralora_A, + ) + .view(B, L, N, N, subblock_gralora_rank) + .permute(0, 1, 3, 2, 4) + .reshape(B, L, N, N * subblock_gralora_rank), + gralora_B, + ).reshape(B, L, -1) + result += scaling * output.to(torch_result_dtype) + if hybrid_r > 0: + result += scaling * gralora_B_general(gralora_A_general(dropout(x.to(gralora_dtype)))).to( + torch_result_dtype + ) + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "gralora." + rep diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py new file mode 100644 index 0000000000..26ae333174 --- /dev/null +++ b/src/peft/tuners/gralora/model.py @@ -0,0 +1,377 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
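The nested einsum in `forward()` above implements the block-wise GraLoRA update, with an explicit permutation that exchanges information across blocks. A self-contained numerical sketch of that exact tensor path, with toy sizes chosen purely for illustration:

```python
import torch

# Toy sizes (illustrative): batch B, sequence length L, N diagonal blocks
B, L, in_features, out_features = 2, 3, 8, 8
N = 2                  # gralora_k
rank = 4               # gralora_r = r - hybrid_r; must be divisible by N
sub_rank = rank // N

x = torch.randn(B, L, in_features)
gralora_A = torch.randn(N, in_features // N, rank)    # [N, in//N, rank]
gralora_B = torch.randn(N, rank, out_features // N)   # [N, rank, out//N]

# Project each input block through its own A factor
t = torch.einsum("blni, nir -> blnr", x.view(B, L, N, in_features // N), gralora_A)
# Split each block's rank into N chunks and swap the block axes: this is the
# "information exchange" step that distinguishes GraLoRA from N independent LoRAs
t = (
    t.view(B, L, N, N, sub_rank)
    .permute(0, 1, 3, 2, 4)
    .reshape(B, L, N, N * sub_rank)
)
# Map through the per-block B factors and concatenate the output blocks
delta = torch.einsum("bljr, jro -> bljo", t, gralora_B).reshape(B, L, -1)
assert delta.shape == (B, L, out_features)  # scaled by gralora_alpha / r in the layer
```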
+ +from __future__ import annotations + +import re +import warnings +from dataclasses import asdict +from enum import Enum +from typing import Optional + +import torch +import torch.nn as nn +from tqdm import tqdm +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft.utils import ( + TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _get_submodules, +) + +from .config import GraloraConfig +from .layer import GraloraLayer, Linear + + +class GraloraModel(BaseTuner): + """ + Creates Vector-based Random Matrix Adaptation (Gralora) model from a pretrained transformers model. + + Args: + model ([`~transformers.PreTrainedModel`]): The model to be adapted. + config ([`GraloraConfig`]): The configuration of the Gralora model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The Gralora model. + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import GraloraConfig, get_peft_model + + >>> base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + >>> config = GraloraConfig(r=128) + >>> model = get_peft_model(base_model, config) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`GraloraConfig`]): The configuration of the Gralora model. + """ + + prefix: str = "gralora_" + + def __init__(self, model, config, adapter_name) -> None: + super().__init__(model, config, adapter_name) + + def _check_new_adapter_config(self, config: GraloraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + # the below todo is copied from LoRA + # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check + # does not fully correspond to the error message. + if (len(self.peft_config) > 1) and (config.bias != "none"): + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." + ) + + for existing_config in self.peft_config.values(): + if existing_config is config: + # skip the current config + continue + + if existing_config.projection_prng_key != config.projection_prng_key: + raise ValueError( + f"Gralora PRNG initialisation key must be the same for all adapters. Got {config.projection_prng_key=} but " + f"previous config had {existing_config.projection_prng_key}." 
+ ) + + @staticmethod + def _check_target_module_exists(gralora_config, key): + return check_target_module_exists(gralora_config, key) + + def _create_and_replace( + self, + gralora_config, + adapter_name, + target, + target_name, + parent, + current_key, + **optional_kwargs, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + pattern = re.compile(r"layers\.(\d+)\.(.+)") + match = pattern.search(current_key) + if match: + module_name = match.group(2).replace(".", "__") + else: + raise ValueError("Invalid target module type") + + r = gralora_config.r + bias = hasattr(target, "bias") and target.bias is not None + kwargs = { + "r": r, + "gralora_alpha": gralora_config.gralora_alpha, + "gralora_dropout": gralora_config.gralora_dropout, + "gralora_k": gralora_config.gralora_k, + "fan_in_fan_out": gralora_config.fan_in_fan_out, + "hybrid_r": gralora_config.hybrid_r, + } + kwargs["bias"] = bias + + if isinstance(target, Linear): + target.update_layer( + adapter_name, + module_name, + r, + gralora_config.gralora_alpha, + gralora_config.gralora_dropout, + gralora_config.gralora_k, + gralora_config.hybrid_r, + ) + else: + new_module = self._create_new_module(gralora_config, adapter_name, target, module_name, **kwargs) + if adapter_name not in self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _replace_module(parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if "gralora_" in name: + module.to(child.weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = self.peft_config[active_adapter].bias + if bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "gralora_only": + for m in model.modules(): + if isinstance(m, GraloraLayer) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(gralora_config, adapter_name, target, module_name, **kwargs): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." 
+ ) + kwargs["fan_in_fan_out"] = gralora_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + kwargs["is_target_conv_1d_layer"] = True + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = gralora_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`." + ) + new_module = Linear( + target, + adapter_name, + module_name, + **kwargs, + ) + + return new_module + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + for active_adapter in self.active_adapters: + val = self.peft_config[active_adapter].bias + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name): + for module in self.model.modules(): + if isinstance(module, GraloraLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. 
Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + # we cannot use self.prefix as we want to include non-trainable gralora parameters + key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + setattr(parent, target_name, target.modules_to_save[target.active_adapter]) + + return self.model + + def delete_adapter(self, adapter_name: str): + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + # we cannot use self.prefix as we want to include non-trainable gralora parameters + key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, GraloraLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapter[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ): + r""" + This method merges the Gralora layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lrasa-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self): + """ + Gets back the base model by removing all the Gralora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 8f55a8f2b8..ddac0c8c70 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -48,6 +48,7 @@ class PeftType(str, enum.Enum): - WAVEFT - OSF - DELORA + - GRALORA """ PROMPT_TUNING = "PROMPT_TUNING" @@ -80,6 +81,7 @@ class PeftType(str, enum.Enum): WAVEFT = "WAVEFT" OSF = "OSF" DELORA = "DELORA" + GRALORA = "GRALORA" class TaskType(str, enum.Enum): From bfa1ef7633c76274e31509e3d7289fadd8a85e60 Mon Sep 17 00:00:00 2001 From: HaohanTsao Date: Thu, 16 Oct 2025 16:37:49 +0800 Subject: [PATCH 02/11] ENH Support merge/unmerge in GraLoRA functionality; support init_weights parameter for flexible initialization --- src/peft/tuners/gralora/__init__.py | 6 +- src/peft/tuners/gralora/config.py | 12 +- src/peft/tuners/gralora/layer.py | 222 ++++++++++++++++++++++++++-- src/peft/tuners/gralora/model.py | 39 ++--- 4 files changed, 233 insertions(+), 46 deletions(-) diff --git a/src/peft/tuners/gralora/__init__.py b/src/peft/tuners/gralora/__init__.py index db1927c442..830e0a477c 100644 --- a/src/peft/tuners/gralora/__init__.py +++ b/src/peft/tuners/gralora/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from peft.utils import register_peft_method + from .config import GraloraConfig from .layer import GraloraLayer from .model import GraloraModel __all__ = ["GraloraConfig", "GraloraLayer", "GraloraModel"] + +register_peft_method(name="gralora", config_cls=GraloraConfig, model_cls=GraloraModel) diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py index fb919fbcbf..9e78b81afa 100644 --- a/src/peft/tuners/gralora/config.py +++ b/src/peft/tuners/gralora/config.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -55,6 +55,15 @@ class GraloraConfig(PeftConfig): ) }, ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the GraLoRA layers with their default initialization. " + "Don't change this setting, except if you know exactly what you're doing." 
+ ) + }, + ) layers_to_transform: Optional[Union[list[int], int]] = field( default=None, metadata={ @@ -76,6 +85,7 @@ class GraloraConfig(PeftConfig): ) def __post_init__(self): + super().__post_init__() self.peft_type = PeftType.GRALORA self.target_modules = ( set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 6e6c220145..926469f8a0 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. import math +import warnings from typing import Optional import torch @@ -20,6 +21,7 @@ from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils.other import transpose class GraloraLayer(BaseTunerLayer): @@ -62,7 +64,7 @@ def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optio """ Move the adapter of the given name to the device of the base layer. """ - from peft.tuners.vera.buffer_dict import BufferDict + from peft.tuners._buffer_dict import BufferDict if device is None: # check weight and qweight (for GPTQ) @@ -113,6 +115,7 @@ def update_layer( gralora_dropout, gralora_k: int = 2, hybrid_r: int = 0, + init_weights: bool = True, ): if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") @@ -141,21 +144,34 @@ def update_layer( for _ in range(gralora_k): new_A = nn.Parameter(torch.zeros(gralora_r, subblock_in_features)) new_B = nn.Parameter(torch.zeros(subblock_out_features, gralora_r)) - nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + if init_weights: + # Initialize to identity: A is random, B is zero + nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + # new_B is already initialized to zeros + else: + # Initialize to random: both A and B are random (for testing) + nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) + nn.init.kaiming_uniform_(new_B, a=math.sqrt(5)) gralora_A.append(new_A) gralora_B.append(new_B) # stack A and B and transpose to get the final shape - gralora_A = torch.stack(tuple(gralora_A), dim=0) # [N, rank, in_features//N] - gralora_A = gralora_A.transpose(1, 2).contiguous() # [N, in_features//N, rank] + gralora_A = torch.stack(tuple(gralora_A), dim=0) # [N, gralora_r, in_features//N] + gralora_A = gralora_A.transpose(1, 2).contiguous() # [N, in_features//N, gralora_r] - gralora_B = torch.stack(tuple(gralora_B), dim=0) # [N, out_features//N, rank] - gralora_B = gralora_B.transpose(1, 2).contiguous() # [N, rank, out_features//N] + gralora_B = torch.stack(tuple(gralora_B), dim=0) # [N, out_features//N, gralora_r] + gralora_B = gralora_B.transpose(1, 2).contiguous() # [N, gralora_r, out_features//N] if hybrid_r > 0: general_gralora_A = nn.Linear(self.in_features, hybrid_r, bias=False) general_gralora_B = nn.Linear(hybrid_r, self.out_features, bias=False) - nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) - nn.init.zeros_(general_gralora_B.weight) + if init_weights: + # Initialize to identity: A is random, B is zero + nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) + nn.init.zeros_(general_gralora_B.weight) + else: + # Initialize to random: both A and B are random (for testing) 
+ nn.init.kaiming_uniform_(general_gralora_A.weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(general_gralora_B.weight, a=math.sqrt(5)) else: general_gralora_A = nn.Identity() general_gralora_B = nn.Identity() @@ -185,6 +201,7 @@ def __init__( gralora_k: int = 2, hybrid_r: int = 0, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + init_weights: bool = True, **kwargs, ) -> None: # this gets the init from nn.Linear's super perspective, i.e. nn.Module.__init__, which should always be called @@ -193,16 +210,176 @@ def __init__( self.fan_in_fan_out = fan_in_fan_out self._active_adapter = adapter_name - self.update_layer(adapter_name, module_name, r, gralora_alpha, gralora_dropout, gralora_k, hybrid_r) + self.update_layer( + adapter_name, module_name, r, gralora_alpha, gralora_dropout, gralora_k, hybrid_r, init_weights + ) def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - raise NotImplementedError("Merging is not supported for GraloraLayer yet.") + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + from peft.tuners.tuners_utils import check_adapters_to_merge + + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.gralora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + delta_weight = self.get_delta_weight(active_adapter) + orig_weights += delta_weight + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + delta_weight = self.get_delta_weight(active_adapter) + base_layer.weight.data += delta_weight + + self.merged_adapters.append(active_adapter) def unmerge(self) -> None: - raise NotImplementedError("Unmerging is not supported for GraloraLayer yet.") + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.gralora_A.keys(): + delta_weight = self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= delta_weight def get_delta_weight(self, adapter) -> torch.Tensor: - raise NotImplementedError("Getting delta weight is not supported for GraloraLayer yet.") + """ + Compute the delta weight for GraLoRA adapter. + + GraLoRA applies block-wise low-rank adaptation with information exchange. + This method computes the equivalent weight matrix that would be added to + the base weight during merge. 
+ + Args: + adapter (str): The name of the adapter + + Returns: + torch.Tensor: The delta weight matrix with shape [out_features, in_features] + """ + gralora_A = self.gralora_A[adapter] # [N, in_features//N, rank] + gralora_B = self.gralora_B[adapter] # [N, rank, out_features//N] + gralora_A_general = self.gralora_A_general[adapter] + gralora_B_general = self.gralora_B_general[adapter] + + device = gralora_A.device + dtype = gralora_A.dtype + + gralora_k = self.gralora_k[adapter] + hybrid_r = self.hybrid_r[adapter] + r = self.r[adapter] + + # Handle CPU fp16/bf16 casting + cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + + if cast_to_fp32: + gralora_A = gralora_A.float() + gralora_B = gralora_B.float() + + # Get dimensions + in_features = self.in_features + out_features = self.out_features + subblock_in = in_features // gralora_k + subblock_out = out_features // gralora_k + gralora_rank = r - hybrid_r + subblock_gralora_rank = gralora_rank // gralora_k + + # Simulate the forward pass computation to get equivalent weight matrix + # We need to compute: W_delta such that W_delta @ x = gralora_forward(x) - base_forward(x) + + # Create an identity matrix for each input dimension and compute output + # This gives us the columns of the weight matrix + delta_weight = torch.zeros(out_features, in_features, device=device, dtype=gralora_A.dtype) + + # Process in batches to avoid memory issues + batch_size = min(256, in_features) + for start_idx in range(0, in_features, batch_size): + end_idx = min(start_idx + batch_size, in_features) + batch_len = end_idx - start_idx + + # Create identity input: [batch_len, in_features] + x = torch.zeros(batch_len, in_features, device=device, dtype=gralora_A.dtype) + for i in range(batch_len): + x[i, start_idx + i] = 1.0 + + # Apply GraLoRA transformation (following forward logic) + # x shape: [batch_len, in_features] + N = gralora_k + + # Reshape x: [batch_len, N, in_features//N] + x_reshaped = x.view(batch_len, N, in_features // N) + + # Apply gralora_A: [batch_len, N, in_features//N] @ [N, in_features//N, rank] + # Result: [batch_len, N, rank] + temp = torch.einsum("bni, nir -> bnr", x_reshaped, gralora_A) + + # Reshape and permute for information exchange + # [batch_len, N, rank] -> [batch_len, N, N, subblock_rank] + temp = temp.view(batch_len, N, N, subblock_gralora_rank) + # Permute: [batch_len, N, N, subblock_rank] -> [batch_len, N, N, subblock_rank] + temp = temp.permute(0, 2, 1, 3) + # Reshape: [batch_len, N, N * subblock_rank] + temp = temp.reshape(batch_len, N, N * subblock_gralora_rank) + + # Apply gralora_B: [batch_len, N, N*subblock_rank] @ [N, rank, out_features//N] + # Note: rank here is actually gralora_rank = N * subblock_gralora_rank + # Result: [batch_len, N, out_features//N] + output = torch.einsum("bnr, nro -> bno", temp, gralora_B) + + # Reshape to [batch_len, out_features] + output = output.reshape(batch_len, out_features) + + # Store in delta_weight (transpose because weight is [out, in]) + delta_weight[:, start_idx:end_idx] = output.T + + # Add hybrid LoRA component if present + if hybrid_r > 0: + # general_A: [in_features, hybrid_r], general_B: [hybrid_r, out_features] + weight_A_general = gralora_A_general.weight # [hybrid_r, in_features] + weight_B_general = gralora_B_general.weight # [out_features, hybrid_r] + + if cast_to_fp32: + weight_A_general = weight_A_general.float() + weight_B_general = weight_B_general.float() + + # Compute delta for hybrid part: [out_features, hybrid_r] @ [hybrid_r, 
in_features] + delta_weight += weight_B_general @ weight_A_general + + # Apply scaling and transpose if needed + delta_weight = transpose(delta_weight, self.fan_in_fan_out) * self.scaling[adapter] + + # Cast back if needed + if cast_to_fp32: + delta_weight = delta_weight.to(dtype=dtype) + + return delta_weight def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: previous_dtype = x.dtype @@ -216,6 +393,13 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: else: result = self.base_layer(x, *args, **kwargs) torch_result_dtype = result.dtype + + # Handle 2D input: [batch, features] -> [batch, 1, features] + # This is common for MLPs and other non-sequence models + x_is_2d = x.ndim == 2 + if x_is_2d: + x = x.unsqueeze(1) # [B, F] -> [B, 1, F] + for active_adapter in self.active_adapters: if active_adapter not in self.gralora_A.keys(): continue @@ -253,11 +437,17 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: .reshape(B, L, N, N * subblock_gralora_rank), gralora_B, ).reshape(B, L, -1) + + # Squeeze back to 2D if input was 2D + if x_is_2d: + output = output.squeeze(1) # [B, 1, F] -> [B, F] + result += scaling * output.to(torch_result_dtype) if hybrid_r > 0: - result += scaling * gralora_B_general(gralora_A_general(dropout(x.to(gralora_dtype)))).to( - torch_result_dtype - ) + hybrid_output = gralora_B_general(gralora_A_general(dropout(x.to(gralora_dtype)))) + if x_is_2d: + hybrid_output = hybrid_output.squeeze(1) + result += scaling * hybrid_output.to(torch_result_dtype) result = result.to(previous_dtype) return result diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index 26ae333174..c5c159dacf 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2025-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ from __future__ import annotations -import re import warnings from dataclasses import asdict from enum import Enum @@ -27,7 +26,7 @@ from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists from peft.utils import ( - TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, ModulesToSaveWrapper, _get_submodules, ) @@ -85,17 +84,6 @@ def _check_new_adapter_config(self, config: GraloraConfig) -> None: "set bias to 'none' for all adapters." ) - for existing_config in self.peft_config.values(): - if existing_config is config: - # skip the current config - continue - - if existing_config.projection_prng_key != config.projection_prng_key: - raise ValueError( - f"Gralora PRNG initialisation key must be the same for all adapters. Got {config.projection_prng_key=} but " - f"previous config had {existing_config.projection_prng_key}." 
- ) - @staticmethod def _check_target_module_exists(gralora_config, key): return check_target_module_exists(gralora_config, key) @@ -113,13 +101,6 @@ def _create_and_replace( if current_key is None: raise ValueError("Current Key shouldn't be `None`") - pattern = re.compile(r"layers\.(\d+)\.(.+)") - match = pattern.search(current_key) - if match: - module_name = match.group(2).replace(".", "__") - else: - raise ValueError("Invalid target module type") - r = gralora_config.r bias = hasattr(target, "bias") and target.bias is not None kwargs = { @@ -129,22 +110,24 @@ def _create_and_replace( "gralora_k": gralora_config.gralora_k, "fan_in_fan_out": gralora_config.fan_in_fan_out, "hybrid_r": gralora_config.hybrid_r, + "init_weights": gralora_config.init_weights, } kwargs["bias"] = bias if isinstance(target, Linear): target.update_layer( adapter_name, - module_name, + current_key, r, gralora_config.gralora_alpha, gralora_config.gralora_dropout, gralora_config.gralora_k, gralora_config.hybrid_r, + gralora_config.init_weights, ) else: - new_module = self._create_new_module(gralora_config, adapter_name, target, module_name, **kwargs) - if adapter_name not in self.active_adapter: + new_module = self._create_new_module(gralora_config, adapter_name, target, current_key, **kwargs) + if adapter_name not in self.active_adapters: # adding an additional adapter: it is not automatically trainable new_module.requires_grad_(False) self._replace_module(parent, target_name, new_module, target) @@ -267,22 +250,22 @@ def disable_adapter_layers(self): warnings.warn(msg) self._set_adapter_layers(enabled=False) - def set_adapter(self, adapter_name): + def set_adapter(self, adapter_name, inference_mode: bool = False): for module in self.model.modules(): if isinstance(module, GraloraLayer): if module.merged: warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") module.unmerge() - module.set_adapter(adapter_name) + module.set_adapter(adapter_name, inference_mode=inference_mode) self.active_adapter = adapter_name @staticmethod def _prepare_adapter_config(peft_config, model_config): if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: raise ValueError("Please specify `target_modules` in `peft_config`") peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_ORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] ) return peft_config From 9813b170bb949f1169505fb5427fd04fd37f0732 Mon Sep 17 00:00:00 2001 From: HaohanTsao Date: Thu, 16 Oct 2025 16:54:05 +0800 Subject: [PATCH 03/11] TST Add test suite for GraLoRA. --- src/peft/tuners/gralora/layer.py | 9 +- src/peft/tuners/gralora/model.py | 4 +- tests/test_config.py | 2 + tests/test_decoder_models.py | 25 ++ tests/test_gralora.py | 533 +++++++++++++++++++++++++++++++ 5 files changed, 566 insertions(+), 7 deletions(-) create mode 100644 tests/test_gralora.py diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 926469f8a0..907730d49b 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -224,8 +224,8 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. 
adapter_names (`list[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. - Defaults to `None`. + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ from peft.tuners.tuners_utils import check_adapters_to_merge @@ -274,9 +274,8 @@ def get_delta_weight(self, adapter) -> torch.Tensor: """ Compute the delta weight for GraLoRA adapter. - GraLoRA applies block-wise low-rank adaptation with information exchange. - This method computes the equivalent weight matrix that would be added to - the base weight during merge. + GraLoRA applies block-wise low-rank adaptation with information exchange. This method computes the equivalent + weight matrix that would be added to the base weight during merge. Args: adapter (str): The name of the adapter diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index c5c159dacf..b7f6e15097 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -323,8 +323,8 @@ def merge_and_unload( self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None ): r""" - This method merges the Gralora layers into the base model. This is needed if someone wants to use the base model - as a standalone model. + This method merges the Gralora layers into the base model. This is needed if someone wants to use the base + model as a standalone model. Args: progressbar (`bool`): diff --git a/tests/test_config.py b/tests/test_config.py index 9277d3bb68..5cb7523d84 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -27,6 +27,7 @@ BoneConfig, C3AConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LNTuningConfig, @@ -64,6 +65,7 @@ (BoneConfig, {}), (C3AConfig, {}), (FourierFTConfig, {}), + (GraloraConfig, {}), (HRAConfig, {}), (IA3Config, {}), (LNTuningConfig, {}), diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index 5b23fa74e2..acb0d9c7d2 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -34,6 +34,7 @@ CPTConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -137,6 +138,30 @@ "target_modules": None, }, ), + ( + GraloraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "gralora_alpha": 16, + "target_modules": None, + "gralora_dropout": 0.05, + "gralora_k": 2, + "hybrid_r": 0, + }, + ), + ( + GraloraConfig, + { + "task_type": "CAUSAL_LM", + "r": 16, + "gralora_alpha": 32, + "target_modules": None, + "gralora_dropout": 0.05, + "gralora_k": 4, + "hybrid_r": 4, + }, + ), ( HRAConfig, { diff --git a/tests/test_gralora.py b/tests/test_gralora.py new file mode 100644 index 0000000000..af43ac22e4 --- /dev/null +++ b/tests/test_gralora.py @@ -0,0 +1,533 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
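The test suite that follows leans on the equivalence between `get_delta_weight` and the adapter's forward contribution. A small sanity-check sketch of that relationship (the `Tiny` module, sizes, and tolerance are made up for illustration; only the GraLoRA API calls come from the patch above):

```python
import torch

from peft import get_peft_model
from peft.tuners.gralora import GraloraConfig


class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.lin(x)


torch.manual_seed(0)
config = GraloraConfig(target_modules=["lin"], r=8, gralora_k=2, init_weights=False)
model = get_peft_model(Tiny(), config)

layer = model.base_model.model.lin          # the GraLoRA-wrapped Linear
delta = layer.get_delta_weight("default")   # [out_features, in_features], scaling included

x = torch.randn(4, 16)
expected = layer.base_layer(x) + x @ delta.T
assert torch.allclose(model(x), expected, atol=1e-5)
```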
+ +# This test file is for tests specific to GraLoRA, since GraLoRA has some specific features +# like block-diagonal structure, hybrid mode, and tensor permutation for information exchange. + +import pytest +import torch +from safetensors import safe_open +from torch import nn + +from peft import PeftModel, get_peft_model +from peft.tuners.gralora import GraloraConfig + + +class MLP(nn.Module): + """Simple MLP for testing""" + + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + +class TestGralora: + @pytest.fixture + def mlp(self): + torch.manual_seed(0) + model = MLP() + return model + + @pytest.fixture + def mlp_gralora_pure(self, mlp): + """Pure GraLoRA without hybrid component""" + torch.manual_seed(0) + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=0, + gralora_alpha=32, + gralora_dropout=0.1, + ) + peft_model = get_peft_model(mlp, config) + return peft_model + + @pytest.fixture + def mlp_gralora_hybrid(self): + """Hybrid GraLoRA with vanilla LoRA component""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=4, + gralora_alpha=32, + gralora_dropout=0.1, + ) + peft_model = get_peft_model(mlp, config) + return peft_model + + def test_gralora_config_validation(self): + """Test that config validation works correctly""" + # Valid config + config = GraloraConfig(r=16, gralora_k=4, hybrid_r=0) + assert config.r == 16 + assert config.gralora_k == 4 + assert config.hybrid_r == 0 + + # Hybrid config + config = GraloraConfig(r=16, gralora_k=4, hybrid_r=4) + assert config.r == 16 + assert config.hybrid_r == 4 + + def test_gralora_parameter_shapes(self, mlp_gralora_hybrid): + """Test that GraLoRA parameters have correct shapes""" + for name, module in mlp_gralora_hybrid.named_modules(): + if hasattr(module, "gralora_A"): + adapter_name = "default" + gralora_A = module.gralora_A[adapter_name] + gralora_B = module.gralora_B[adapter_name] + gralora_A_general = module.gralora_A_general[adapter_name] + gralora_B_general = module.gralora_B_general[adapter_name] + + in_features = module.in_features + out_features = module.out_features + k = 4 + gralora_rank = 16 - 4 # r - hybrid_r + + # Check GraLoRA block shapes + # Each block has full gralora_rank, not gralora_rank // k + assert gralora_A.shape == (k, in_features // k, gralora_rank) + assert gralora_B.shape == (k, gralora_rank, out_features // k) + + # Check hybrid component shapes + assert gralora_A_general.weight.shape == (4, in_features) + assert gralora_B_general.weight.shape == (out_features, 4) + + def test_gralora_block_diagonal_structure(self): + """Test that pure GraLoRA produces block-diagonal delta weights""" + # Use init_weights=False to have non-zero B matrices + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=0, + init_weights=False, # Both A and B initialized randomly + ) + model = get_peft_model(mlp, config) + + for name, module in model.named_modules(): + if hasattr(module, 
"get_delta_weight"): + adapter_name = "default" + delta_weight = module.get_delta_weight(adapter_name) + + k = 4 + in_features = module.in_features + out_features = module.out_features + block_size_in = in_features // k + block_size_out = out_features // k + + # Check diagonal blocks have non-zero values + for i in range(k): + row_start = i * block_size_out + row_end = (i + 1) * block_size_out + col_start = i * block_size_in + col_end = (i + 1) * block_size_in + + block = delta_weight[row_start:row_end, col_start:col_end] + block_norm = torch.norm(block).item() + # Diagonal blocks should have some values (initialized with kaiming) + assert block_norm > 0, f"Diagonal block [{i},{i}] is zero" + + def test_gralora_forward_pass(self, mlp_gralora_hybrid): + """Test that forward pass works without errors""" + mlp_gralora_hybrid.eval() + x = torch.randn(5, 10) + + with torch.no_grad(): + output = mlp_gralora_hybrid(x) + + assert output.shape == (5, 2) + assert not torch.isnan(output).any() + assert not torch.isinf(output).any() + + def test_gralora_backward_pass(self, mlp_gralora_hybrid): + """Test that backward pass computes gradients correctly""" + mlp_gralora_hybrid.train() + x = torch.randn(5, 10) + + output = mlp_gralora_hybrid(x) + loss = output.sum() + loss.backward() + + # Check that GraLoRA parameters have gradients + for name, param in mlp_gralora_hybrid.named_parameters(): + if "gralora" in name and param.requires_grad: + assert param.grad is not None, f"Parameter {name} has no gradient" + assert not torch.isnan(param.grad).any(), f"Parameter {name} has NaN gradients" + + def test_gralora_pure_vs_hybrid_params(self): + """Test that pure and hybrid modes have same total parameters but different distribution""" + torch.manual_seed(0) + mlp_pure = MLP() + config_pure = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=0, + ) + model_pure = get_peft_model(mlp_pure, config_pure) + + torch.manual_seed(0) + mlp_hybrid = MLP() + config_hybrid = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=4, + ) + model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) + + def count_trainable_params(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + params_pure = count_trainable_params(model_pure) + params_hybrid = count_trainable_params(model_hybrid) + + # Pure and hybrid should have same total parameters (r is constant) + # but distributed differently between block-diagonal and full-rank components + assert params_pure == params_hybrid, ( + f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" + ) + + # Check that hybrid has general components + has_general = False + for name, _ in model_hybrid.named_modules(): + if "gralora_A_general" in name or "gralora_B_general" in name: + has_general = True + break + assert has_general, "Hybrid mode should have general components" + + def test_gralora_save_load_roundtrip(self, mlp_gralora_hybrid, tmp_path): + """Test that save/load preserves model behavior""" + mlp_gralora_hybrid.eval() + x = torch.randn(5, 10) + + # Get output before save + with torch.no_grad(): + output_before = mlp_gralora_hybrid(x) + + # Save adapter + mlp_gralora_hybrid.save_pretrained(tmp_path) + + # Load adapter + torch.manual_seed(0) + new_mlp = MLP() + loaded_model = PeftModel.from_pretrained(new_mlp, tmp_path) + loaded_model.eval() + + # Get output after load + with torch.no_grad(): + output_after = loaded_model(x) + + # Outputs should be very close + assert 
torch.allclose(output_before, output_after, atol=1e-5, rtol=1e-5) + + def test_gralora_state_dict_structure(self, mlp_gralora_hybrid, tmp_path): + """Test that state dict contains only necessary parameters""" + mlp_gralora_hybrid.save_pretrained(tmp_path) + + # Load state dict + sd = {} + with safe_open(tmp_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd[key] = f.get_tensor(key) + + # Check that gralora parameters are present + assert any("gralora_A" in key for key in sd), "gralora_A not found in state dict" + assert any("gralora_B" in key for key in sd), "gralora_B not found in state dict" + + # For hybrid mode, check hybrid components + assert any("gralora_A_general" in key for key in sd), "gralora_A_general not found" + assert any("gralora_B_general" in key for key in sd), "gralora_B_general not found" + + def test_gralora_merge_and_unload(self, mlp_gralora_hybrid): + """Test merge_and_unload functionality""" + mlp_gralora_hybrid.eval() + x = torch.randn(5, 10) + + # Get output before merge + with torch.no_grad(): + output_before = mlp_gralora_hybrid(x) + + # Merge and unload + merged_model = mlp_gralora_hybrid.merge_and_unload() + merged_model.eval() + + # Get output after merge + with torch.no_grad(): + output_after = merged_model(x) + + # Outputs should be very close + assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) + + # Check that merged model has no GraLoRA layers + has_gralora = any("gralora" in name for name, _ in merged_model.named_parameters()) + assert not has_gralora, "Merged model still has GraLoRA parameters" + + def test_gralora_merge_unmerge(self): + """Test merge/unmerge functionality""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + model.eval() + + x = torch.randn(5, 10) + + # Output before merge + with torch.no_grad(): + output_before = model(x) + + # Merge adapter using PEFT API + model.merge_adapter() + + with torch.no_grad(): + output_merged = model(x) + + # Outputs should be the same after merge + assert torch.allclose(output_before, output_merged, atol=1e-4, rtol=1e-4) + + # Unmerge adapter using PEFT API + model.unmerge_adapter() + + with torch.no_grad(): + output_unmerged = model(x) + + # Outputs should be the same after unmerge + assert torch.allclose(output_before, output_unmerged, atol=1e-4, rtol=1e-4) + + def test_gralora_multiple_adapters(self): + """Test adding and switching between multiple adapters""" + torch.manual_seed(0) + mlp = MLP() + + # Use init_weights=False to have non-zero outputs + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + torch.manual_seed(42) # Different seed for second adapter + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) + model.add_adapter("adapter2", config2) + + x = torch.randn(5, 10) + + # Test adapter1 + model.set_adapter("adapter1") + with torch.no_grad(): + output1 = model(x) + + # Test adapter2 + model.set_adapter("adapter2") + with torch.no_grad(): + output2 = model(x) + + # Different adapters should give different outputs + assert not torch.allclose(output1, output2, atol=1e-3, rtol=1e-3) + + def test_gralora_dtype_compatibility(self): + """Test that GraLoRA works with different dtypes""" + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + if 
dtype == torch.bfloat16 and not torch.cuda.is_available(): + # Skip bfloat16 on CPU if not supported + continue + + torch.manual_seed(0) + mlp = MLP().to(dtype) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10).to(dtype) + output = model(x) + + assert output.dtype == dtype, f"Output dtype mismatch for {dtype}" + + def test_gralora_disable_adapters(self): + """Test disabling adapters""" + torch.manual_seed(0) + mlp = MLP() + # Use init_weights=False to have non-zero effect + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=16, + gralora_k=4, + hybrid_r=4, + init_weights=False, + ) + model = get_peft_model(mlp, config) + model.eval() + x = torch.randn(5, 10) + + # Output with adapter enabled + with torch.no_grad(): + output_enabled = model(x) + + # Output with adapter disabled + with model.disable_adapter(): + with torch.no_grad(): + output_disabled = model(x) + + # Outputs should be different + assert not torch.allclose(output_enabled, output_disabled, atol=1e-6, rtol=1e-6) + + def test_gralora_different_k_values(self): + """Test GraLoRA with different k values""" + for k in [2, 4]: + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1", "lin2"], + r=k * 4, # Make sure r is divisible by k + gralora_k=k, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + output = model(x) + + assert output.shape == (5, 2) + assert not torch.isnan(output).any() + + def test_gralora_rank_divisibility_check(self): + """Test that invalid rank/k combinations raise errors""" + torch.manual_seed(0) + mlp = MLP() + + # This should raise an error because (r - hybrid_r) is not divisible by k + # r=15, hybrid_r=0, k=4 -> gralora_rank=15, 15 % 4 != 0 + config = GraloraConfig( + target_modules=["lin1"], + r=15, + gralora_k=4, + hybrid_r=0, + ) + + with pytest.raises(AssertionError, match="r should be divisible by gralora_k"): + get_peft_model(mlp, config) + + def test_gralora_trainable_parameters_only(self, mlp_gralora_hybrid): + """Test that only GraLoRA parameters are trainable""" + for name, param in mlp_gralora_hybrid.named_parameters(): + if "gralora" in name or "modules_to_save" in name: + assert param.requires_grad, f"GraLoRA parameter {name} should be trainable" + else: + assert not param.requires_grad, f"Base parameter {name} should be frozen" + + def test_gralora_save_pretrained_files(self, mlp_gralora_hybrid, tmp_path): + """Test that save_pretrained creates expected files""" + mlp_gralora_hybrid.save_pretrained(tmp_path) + + # Check for config file + assert (tmp_path / "adapter_config.json").exists() + + # Check for weights file (either .bin or .safetensors) + assert (tmp_path / "adapter_model.safetensors").exists() or (tmp_path / "adapter_model.bin").exists() + + def test_gralora_information_exchange_via_permutation(self, mlp_gralora_pure): + """ + Test that information exchange happens through tensor permutation. Even though delta weights are + block-diagonal, the forward pass should allow information flow between blocks via the permutation operation. 
+ """ + mlp_gralora_pure.eval() + + # Create two inputs that differ only in specific blocks + x1 = torch.randn(1, 10) + x2 = x1.clone() + + # Modify only the first block (assuming k=4, block size = 10//4 = 2.5, rounded to 2-3 features) + x2[0, :5] += 1.0 # Modify first block + + with torch.no_grad(): + out1 = mlp_gralora_pure(x1) + out2 = mlp_gralora_pure(x2) + + # Due to information exchange, changing one block should affect all outputs + # (not just outputs corresponding to that block) + diff = (out1 - out2).abs() + + # All output dimensions should be affected (not just the first block's outputs) + assert (diff > 1e-6).all(), "Information exchange not happening correctly" + + def test_gralora_scaling_factor(self): + """Test that scaling factor is correctly applied""" + torch.manual_seed(0) + mlp = MLP() + + # Create two configs with different alpha values + config_alpha16 = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_alpha=16, + gralora_k=2, + hybrid_r=0, + ) + + config_alpha32 = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_alpha=32, + gralora_k=2, + hybrid_r=0, + ) + + model_alpha16 = get_peft_model(MLP(), config_alpha16) + model_alpha32 = get_peft_model(MLP(), config_alpha32) + + # Copy weights to make them identical except for scaling + for (n1, p1), (n2, p2) in zip(model_alpha16.named_parameters(), model_alpha32.named_parameters()): + if "gralora" in n1: + p2.data = p1.data.clone() + + x = torch.randn(5, 10) + + model_alpha16.eval() + model_alpha32.eval() + + with torch.no_grad(): + out1 = model_alpha16(x) + out2 = model_alpha32(x) + + # Outputs should be different due to different scaling + assert not torch.allclose(out1, out2, atol=1e-6, rtol=1e-6) From c1fe6c4ae497e5da6295ad9d1f59d5246753a969 Mon Sep 17 00:00:00 2001 From: HaohanTsao Date: Fri, 17 Oct 2025 11:30:53 +0800 Subject: [PATCH 04/11] FIX & TEST: Fix GraLoRA bugs in get_peft_config_as_dict and improve test coverage --- src/peft/tuners/gralora/model.py | 4 +- tests/test_gralora.py | 549 +++++++++++++++++++++++++++++++ 2 files changed, 551 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index b7f6e15097..0273b30ab5 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -228,8 +228,8 @@ def get_peft_config_as_dict(self, inference: bool = False): config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} if inference: config["inference_mode"] = True - config_dict[key] = config - return config + config_dict[key] = config + return config_dict def _set_adapter_layers(self, enabled=True): for module in self.model.modules(): diff --git a/tests/test_gralora.py b/tests/test_gralora.py index af43ac22e4..7e2ca5a078 100644 --- a/tests/test_gralora.py +++ b/tests/test_gralora.py @@ -531,3 +531,552 @@ def test_gralora_scaling_factor(self): # Outputs should be different due to different scaling assert not torch.allclose(out1, out2, atol=1e-6, rtol=1e-6) + + def test_gralora_safe_merge_success(self): + """Test safe_merge with valid weights""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + with torch.no_grad(): + output_before = model(x) + + # Test safe merge + model.base_model.model.lin1.merge(safe_merge=True) + + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, 
atol=1e-4, rtol=1e-4) + + def test_gralora_safe_merge_detects_nan(self): + """Test that safe_merge detects NaN values""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + ) + model = get_peft_model(mlp, config) + + # Inject NaN into adapter weights (use .data to avoid requires_grad error) + model.base_model.model.lin1.gralora_A["default"].data[0, 0, 0] = float("nan") + + # safe_merge should raise ValueError + with pytest.raises(ValueError, match="NaNs detected"): + model.base_model.model.lin1.merge(safe_merge=True) + + def test_gralora_unmerge_warning_when_not_merged(self): + """Test that unmerge warns when already unmerged""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + # Try to unmerge without merging first + with pytest.warns(UserWarning, match="Already unmerged"): + model.base_model.model.lin1.unmerge() + + def test_gralora_hybrid_forward_computation(self): + """Test that hybrid LoRA component is used in forward pass""" + torch.manual_seed(0) + mlp_hybrid = MLP() + mlp_pure = MLP() + + config_hybrid = GraloraConfig( + target_modules=["lin1"], + r=16, + gralora_k=4, + hybrid_r=4, + init_weights=False, + ) + model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) + + config_pure = GraloraConfig( + target_modules=["lin1"], + r=16, + gralora_k=4, + hybrid_r=0, + init_weights=False, + ) + model_pure = get_peft_model(mlp_pure, config_pure) + + x = torch.randn(5, 10) + + with torch.no_grad(): + output_hybrid = model_hybrid(x) + output_pure = model_pure(x) + + # Outputs should be different due to hybrid component + assert not torch.allclose(output_hybrid, output_pure, atol=1e-3) + + def test_gralora_invalid_rank_zero(self): + """Test that r=0 raises error""" + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=0, gralora_k=2) + + with pytest.raises(ValueError, match="`r` should be a positive integer"): + get_peft_model(mlp, config) + + def test_gralora_invalid_rank_negative(self): + """Test that negative r raises error""" + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=-1, gralora_k=2) + + with pytest.raises(ValueError, match="`r` should be a positive integer"): + get_peft_model(mlp, config) + + def test_gralora_bias_all(self): + """Test bias='all' configuration""" + torch.manual_seed(0) + mlp = MLP(bias=True) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + bias="all", + ) + model = get_peft_model(mlp, config) + + # Check that all bias parameters are trainable + bias_params = [name for name, param in model.named_parameters() if "bias" in name and param.requires_grad] + assert len(bias_params) > 0, "At least some bias parameters should be trainable" + + def test_gralora_bias_gralora_only(self): + """Test bias='gralora_only' configuration""" + torch.manual_seed(0) + mlp = MLP(bias=True) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + bias="gralora_only", + ) + model = get_peft_model(mlp, config) + + # Only GraLoRA layer biases should be trainable + assert model.base_model.model.lin1.bias.requires_grad + assert not model.base_model.model.lin0.bias.requires_grad + + def test_gralora_multiple_adapters_with_bias_raises(self): + """Test that multiple adapters with bias raises error""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") + model = get_peft_model(mlp, 
config1) + + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") + + with pytest.raises(ValueError, match="supports only 1 adapter with bias"): + model.add_adapter("adapter2", config2) + + def test_gralora_cpu_fp16_merge(self): + """Test merge with fp16 on CPU""" + torch.manual_seed(0) + mlp = MLP().to(torch.float16) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=0, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10).to(torch.float16) + + with torch.no_grad(): + output_before = model(x) + + # Merge (should handle CPU fp16 correctly) + model.merge_adapter() + + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) + + def test_gralora_cpu_bf16_merge(self): + """Test merge with bf16 on CPU (if supported)""" + # Check if bfloat16 is supported + try: + _ = torch.randn(2, 2).to(torch.bfloat16) + except RuntimeError: + pytest.skip("bfloat16 not supported on this system") + + torch.manual_seed(0) + mlp = MLP().to(torch.bfloat16) + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10).to(torch.bfloat16) + + with torch.no_grad(): + output_before = model(x) + + # Merge with hybrid component + model.merge_adapter() + + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) + + def test_gralora_disable_adapter_layers_warns_with_bias(self): + """Test that disable_adapter_layers warns when bias is configured""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + bias="all", + ) + model = get_peft_model(mlp, config) + + with pytest.warns(UserWarning, match="disabling adapter layers with bias"): + model.disable_adapter_layers() + + def test_gralora_set_adapter_warns_when_merged(self): + """Test that set_adapter warns and unmerges when model is merged""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model.add_adapter("adapter2", config2) + + # Merge first adapter + model.merge_adapter() + + # Setting adapter should warn and unmerge + with pytest.warns(UserWarning, match="Adapter cannot be set when the model is merged"): + model.set_adapter("adapter2") + + # Model should be unmerged now + assert not model.base_model.model.lin1.merged + + def test_gralora_delete_adapter(self): + """Test deleting an adapter""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config, adapter_name="adapter1") + + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model.add_adapter("adapter2", config2) + + # Delete adapter1 + model.delete_adapter("adapter1") + + assert "adapter1" not in model.peft_config + assert "adapter2" in model.peft_config + + def test_gralora_delete_nonexistent_adapter_raises(self): + """Test that deleting nonexistent adapter raises error""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + with pytest.raises(ValueError, match="Adapter .* does not exist"): + 
model.delete_adapter("nonexistent") + + def test_gralora_unload_without_merge(self): + """Test unload without merging""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Get base model output + with model.disable_adapter(): + with torch.no_grad(): + base_output = model(x) + + # Unload without merge + unloaded_model = model.unload() + + with torch.no_grad(): + unloaded_output = unloaded_model(x) + + # Should match base model output (no merge) + assert torch.allclose(base_output, unloaded_output, atol=1e-5) + + def test_gralora_get_peft_config_as_dict(self): + """Test get_peft_config_as_dict method""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + hybrid_r=4, + gralora_alpha=16, + ) + model = get_peft_model(mlp, config) + + config_dict = model.get_peft_config_as_dict(inference=False) + + assert "default" in config_dict + assert config_dict["default"]["r"] == 8 + assert config_dict["default"]["gralora_k"] == 2 + assert config_dict["default"]["hybrid_r"] == 4 + + def test_gralora_get_peft_config_as_dict_inference_mode(self): + """Test get_peft_config_as_dict with inference=True""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + config_dict = model.get_peft_config_as_dict(inference=True) + + assert config_dict["default"]["inference_mode"] is True + + def test_gralora_merge_with_hybrid_component(self): + """Test that merge works correctly with hybrid component""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=16, + gralora_k=4, + hybrid_r=4, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + with torch.no_grad(): + output_before = model(x) + + # Merge + model.merge_adapter() + + with torch.no_grad(): + output_after = model(x) + + # Outputs should be very close + assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) + + def test_gralora_repr(self): + """Test __repr__ method""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + repr_str = repr(model.base_model.model.lin1) + assert "gralora" in repr_str.lower() + + def test_gralora_merge_with_adapter_names(self): + """Test merge with specific adapter names""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + torch.manual_seed(42) + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) + model.add_adapter("adapter2", config2) + + x = torch.randn(5, 10) + + # Set to adapter1 and get output + model.set_adapter("adapter1") + with torch.no_grad(): + output_before = model(x) + + # Merge only adapter1 + model.base_model.model.lin1.merge(adapter_names=["adapter1"]) + + with torch.no_grad(): + output_after = model(x) + + # Outputs should be close + assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) + + def test_gralora_enable_disable_adapter_layers(self): + """Test enable/disable adapter layers""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model 
= get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Get output with adapter enabled + with torch.no_grad(): + output_enabled = model(x) + + # Disable adapters + model.disable_adapter_layers() + + with torch.no_grad(): + output_disabled = model(x) + + # Enable adapters + model.enable_adapter_layers() + + with torch.no_grad(): + output_re_enabled = model(x) + + # Output with disabled adapter should be different + assert not torch.allclose(output_enabled, output_disabled, atol=1e-6) + # Output after re-enabling should match original + assert torch.allclose(output_enabled, output_re_enabled, atol=1e-6) + + def test_gralora_forward_with_merged_adapter(self): + """Test forward pass with merged adapter""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Get output before merge + with torch.no_grad(): + output_before = model(x) + + # Merge adapter + model.merge_adapter() + + # Forward with merged adapter (should take merged path) + with torch.no_grad(): + output_after = model(x) + + assert torch.allclose(output_before, output_after, atol=1e-4) + + def test_gralora_forward_with_disable_adapters_and_merged(self): + """Test forward when disable_adapters=True and model is merged""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig( + target_modules=["lin1"], + r=8, + gralora_k=2, + init_weights=False, + ) + model = get_peft_model(mlp, config) + + x = torch.randn(5, 10) + + # Merge adapter + model.merge_adapter() + + # Get output with merged adapter + with torch.no_grad(): + output_merged = model(x) + + # Disable adapters (should unmerge) + with model.disable_adapter(): + with torch.no_grad(): + output_disabled = model(x) + + # Outputs should be different + assert not torch.allclose(output_merged, output_disabled, atol=1e-5) + + def test_gralora_bias_invalid_option_raises(self): + """Test that invalid bias option raises NotImplementedError""" + torch.manual_seed(0) + mlp = MLP() + + # Create config with invalid bias (need to bypass validation) + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + # Manually set invalid bias to trigger the error + model.peft_config["default"].bias = "invalid_option" + + with pytest.raises(NotImplementedError, match="Requested bias"): + model._mark_only_adapters_as_trainable(model.model) + + def test_gralora_merge_empty_adapter_names(self): + """Test merge with empty adapter_names returns early""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config) + + # Call merge with empty list (should return early) + model.base_model.model.lin1.merge(adapter_names=[]) + + # Model should not be merged + assert not model.base_model.model.lin1.merged + + def test_gralora_add_non_active_adapter(self): + """Test adding adapter that is not active (should not be trainable)""" + torch.manual_seed(0) + mlp = MLP() + config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config1, adapter_name="adapter1") + + # Keep adapter1 active + model.set_adapter("adapter1") + + # Add adapter2 (should not be active/trainable initially) + torch.manual_seed(42) + config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model.add_adapter("adapter2", config2) + + # adapter2 parameters should exist but might not be in active_adapters 
initially + assert "adapter2" in model.base_model.model.lin1.gralora_A + + def test_gralora_forward_with_no_adapter_in_active_list(self): + """Test forward when active_adapter is not in gralora_A keys""" + torch.manual_seed(0) + mlp = MLP() + config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) + model = get_peft_model(mlp, config, adapter_name="adapter1") + + x = torch.randn(5, 10) + + # Manually set _active_adapter to include non-existent adapter + original_adapter = model.base_model.model.lin1._active_adapter + model.base_model.model.lin1._active_adapter = ["nonexistent", "adapter1"] + + # Should still work (skip nonexistent adapter) + with torch.no_grad(): + output = model(x) + + assert output.shape == (5, 2) + + # Restore + model.base_model.model.lin1._active_adapter = original_adapter From 4f1444f0aa394ebc40378fc3a057a213203735ce Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Sat, 18 Oct 2025 19:11:25 +0900 Subject: [PATCH 05/11] Refactor GraLoRA weight computation to improve efficiency in delta-weight calculation. --- src/peft/tuners/gralora/layer.py | 67 +++++++++----------------------- 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 907730d49b..52de8c1b11 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -310,57 +310,28 @@ def get_delta_weight(self, adapter) -> torch.Tensor: gralora_rank = r - hybrid_r subblock_gralora_rank = gralora_rank // gralora_k - # Simulate the forward pass computation to get equivalent weight matrix - # We need to compute: W_delta such that W_delta @ x = gralora_forward(x) - base_forward(x) - - # Create an identity matrix for each input dimension and compute output - # This gives us the columns of the weight matrix - delta_weight = torch.zeros(out_features, in_features, device=device, dtype=gralora_A.dtype) - - # Process in batches to avoid memory issues - batch_size = min(256, in_features) - for start_idx in range(0, in_features, batch_size): - end_idx = min(start_idx + batch_size, in_features) - batch_len = end_idx - start_idx - - # Create identity input: [batch_len, in_features] - x = torch.zeros(batch_len, in_features, device=device, dtype=gralora_A.dtype) - for i in range(batch_len): - x[i, start_idx + i] = 1.0 - - # Apply GraLoRA transformation (following forward logic) - # x shape: [batch_len, in_features] - N = gralora_k - - # Reshape x: [batch_len, N, in_features//N] - x_reshaped = x.view(batch_len, N, in_features // N) - - # Apply gralora_A: [batch_len, N, in_features//N] @ [N, in_features//N, rank] - # Result: [batch_len, N, rank] - temp = torch.einsum("bni, nir -> bnr", x_reshaped, gralora_A) - - # Reshape and permute for information exchange - # [batch_len, N, rank] -> [batch_len, N, N, subblock_rank] - temp = temp.view(batch_len, N, N, subblock_gralora_rank) - # Permute: [batch_len, N, N, subblock_rank] -> [batch_len, N, N, subblock_rank] - temp = temp.permute(0, 2, 1, 3) - # Reshape: [batch_len, N, N * subblock_rank] - temp = temp.reshape(batch_len, N, N * subblock_gralora_rank) - - # Apply gralora_B: [batch_len, N, N*subblock_rank] @ [N, rank, out_features//N] - # Note: rank here is actually gralora_rank = N * subblock_gralora_rank - # Result: [batch_len, N, out_features//N] - output = torch.einsum("bnr, nro -> bno", temp, gralora_B) - - # Reshape to [batch_len, out_features] - output = output.reshape(batch_len, out_features) - - # Store in delta_weight (transpose because weight is [out, in]) - 
delta_weight[:, start_idx:end_idx] = output.T + # scatter gralora_A to get the scattered weight matrix + l_indices = torch.arange(in_features, device=device) + n_indices = (l_indices // (in_features // gralora_k)) + i_indices = (l_indices % (in_features // gralora_k)) + gralora_A_scattered = torch.zeros(in_features, gralora_k, gralora_rank, device=device, dtype=dtype) + gralora_A_scattered.scatter_(1, + n_indices.unsqueeze(1).unsqueeze(2).expand(-1, 1, gralora_rank), + gralora_A[n_indices, i_indices, :].unsqueeze(1) + ) + + # compute the delta weight + delta_weight = torch.einsum( + "ikr, kro -> iko", + gralora_A_scattered + .view(in_features, gralora_k, gralora_k, subblock_gralora_rank) + .permute(0, 2, 1, 3) + .reshape(in_features, gralora_k, gralora_rank), + gralora_B, + ).reshape(in_features, out_features).T # Add hybrid LoRA component if present if hybrid_r > 0: - # general_A: [in_features, hybrid_r], general_B: [hybrid_r, out_features] weight_A_general = gralora_A_general.weight # [hybrid_r, in_features] weight_B_general = gralora_B_general.weight # [out_features, hybrid_r] From 94315028e75030e03c87089b9fb28c3b4dbeb029 Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Fri, 24 Oct 2025 00:29:36 +0900 Subject: [PATCH 06/11] Refactor GraLoRA code for clearer documentation, simplified inheritance, and more intuitive hybrid_r handling. --- src/peft/tuners/gralora/config.py | 43 +++++- src/peft/tuners/gralora/layer.py | 117 ++++++---------- src/peft/tuners/gralora/model.py | 218 +----------------------------- 3 files changed, 86 insertions(+), 292 deletions(-) diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py index 9e78b81afa..b88b26a77a 100644 --- a/src/peft/tuners/gralora/config.py +++ b/src/peft/tuners/gralora/config.py @@ -21,23 +21,54 @@ @dataclass class GraloraConfig(PeftConfig): - r: int = field(default=8, metadata={"help": "gralora attention dimension"}) + r: int = field( + default=32, + metadata={ + "help": ( + "GraLoRA attention dimension determines the rank of the GraLoRA adapter. " + "The total parameter count of the GraLoRA adapter is same as LoRA with same rank r, while the expressivitiy is multiplied by gralora_k." + ) + }, + ) hybrid_r: int = field( - default=0, metadata={"help": "hybrid_r is the rank allocated to vanilla LoRA method when using Hybrid GraLoRA"} + default=0, + metadata={ + "help": ( + "hybrid_r is the rank allocated to vanilla LoRA method when using Hybrid GraLoRA method. " + "Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0. " + "r + hybrid_r determines the parameter count of the GraLoRA adapter." + ) + }, ) target_modules: Optional[Union[list[str], str]] = field( default=None, metadata={ "help": ( - "List of module names or regex expression of the module names to replace with gralora." + "List of module names or regex expression of the module names to replace with gralora. " "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " "Only linear layers are supported." ) }, ) - gralora_alpha: int = field(default=8, metadata={"help": "gralora alpha"}) + gralora_alpha: int = field( + default=64, + metadata={ + "help": ( + "gralora alpha is the scaling factor for the GraLoRA adapter." + "Scale becomes gralora_alpha / (r + hybrid_r)." 
+ ) + }, + ) gralora_dropout: float = field(default=0.0, metadata={"help": "gralora dropout"}) - gralora_k: int = field(default=2, metadata={"help": "gralora k"}) + gralora_k: int = field( + default=2, + metadata={ + "help": ( + "gralora_k determines the number of subblocks in the GraLoRA adapter." + "The total parameter count is preserved regardles of gralora_k, while the expressivitiy is multiplied by gralora_k." + ) + }, + ) fan_in_fan_out: bool = field( default=False, metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, @@ -90,3 +121,5 @@ def __post_init__(self): self.target_modules = ( set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules ) + if self.r % self.gralora_k != 0: + raise ValueError(f"r should be divisible by gralora_k, but got {self.r} and {self.gralora_k}") diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 52de8c1b11..4aefa02152 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -38,7 +38,6 @@ def __init__(self, base_layer: nn.Module, **kwargs): self.scaling = {} self.gralora_dropout = nn.ModuleDict({}) - # Set to `None` otherwise to avoid computation with random weight self.gralora_A = nn.ParameterDict({}) self.gralora_B = nn.ParameterDict({}) self.gralora_A_general = nn.ModuleDict({}) @@ -55,57 +54,13 @@ def __init__(self, base_layer: nn.Module, **kwargs): in_features, out_features = ( base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape ) + else: + raise NotImplementedError(f"Unsupported layer type {type(base_layer)}") self.in_features = in_features self.out_features = out_features self.kwargs = kwargs - def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: - """ - Move the adapter of the given name to the device of the base layer. 
- """ - from peft.tuners._buffer_dict import BufferDict - - if device is None: - # check weight and qweight (for GPTQ) - for weight_name in ("weight", "qweight"): - weight = getattr(self.get_base_layer(), weight_name, None) - if weight is not None: - device = weight.device - dtype = weight.dtype - break - else: - # no break encountered: could not determine the device - return - - # loop through all potential adapter layers and move them to the device of the base layer; be careful to only - # move this specific adapter to the device, as the other adapters could be on different devices - # see #1639 - for adapter_layer_name in self.adapter_layer_names + self.other_param_names: - adapter_layer = getattr(self, adapter_layer_name, None) - if not isinstance(adapter_layer, (nn.ModuleDict, nn.ParameterDict, BufferDict)): - continue - if adapter_name not in adapter_layer: - continue - if weight.dtype.is_floating_point or weight.dtype.is_complex: - adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device, dtype=dtype) - else: - adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device) - - @property - def merged(self) -> bool: - return bool(self.merged_adapters) - - @property - def bias(self) -> torch.Tensor: - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - return base_layer.bias - elif isinstance(base_layer, Conv1D): - return base_layer.bias - else: - return None - def update_layer( self, adapter_name, @@ -119,6 +74,8 @@ def update_layer( ): if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + elif hybrid_r < 0: + raise ValueError(f"`hybrid_r` should be a non-negative integer value but the value passed is {hybrid_r}") self.r[adapter_name] = r self.gralora_alpha[adapter_name] = gralora_alpha @@ -133,21 +90,29 @@ def update_layer( self.gralora_dropout.update(nn.ModuleDict({adapter_name: gralora_dropout_layer})) # Actual trainable parameters + if self.in_features % gralora_k != 0: + raise ValueError( + f"in_features should be divisible by gralora_k, but got {self.in_features} and {gralora_k}" + ) + if self.out_features % gralora_k != 0: + raise ValueError( + f"out_features should be divisible by gralora_k, but got {self.out_features} and {gralora_k}" + ) subblock_in_features = self.in_features // gralora_k subblock_out_features = self.out_features // gralora_k - gralora_r = r - hybrid_r # gralora_r is the rank allocated to gralora method - assert gralora_r % gralora_k == 0, f"r should be divisible by gralora_k, but got {r} and {gralora_k}" + # gralora_r is the rank allocated to GraLoRA method; hybrid_r is the rank allocated to vanilla LoRA + gralora_r = r - gralora_A = nn.ParameterList() - gralora_B = nn.ParameterList() + gralora_A = [] + gralora_B = [] for _ in range(gralora_k): - new_A = nn.Parameter(torch.zeros(gralora_r, subblock_in_features)) - new_B = nn.Parameter(torch.zeros(subblock_out_features, gralora_r)) + new_A = nn.Parameter(torch.empty(gralora_r, subblock_in_features)) + new_B = nn.Parameter(torch.empty(subblock_out_features, gralora_r)) if init_weights: # Initialize to identity: A is random, B is zero nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) - # new_B is already initialized to zeros + nn.init.zeros_(new_B) else: # Initialize to random: both A and B are random (for testing) nn.init.kaiming_uniform_(new_A, a=math.sqrt(5)) @@ -183,7 +148,7 @@ def update_layer( self.module_name = module_name - self.scaling[adapter_name] = gralora_alpha / r + self.scaling[adapter_name] = gralora_alpha / 
(gralora_r + hybrid_r) self._move_adapter_to_device_of_base_layer(adapter_name) self.set_adapter(self.active_adapters) @@ -305,30 +270,38 @@ def get_delta_weight(self, adapter) -> torch.Tensor: # Get dimensions in_features = self.in_features out_features = self.out_features - subblock_in = in_features // gralora_k - subblock_out = out_features // gralora_k - gralora_rank = r - hybrid_r + gralora_rank = r + if in_features % gralora_k != 0: + raise ValueError(f"in_features should be divisible by gralora_k, but got {in_features} and {gralora_k}") + elif out_features % gralora_k != 0: + raise ValueError(f"out_features should be divisible by gralora_k, but got {out_features} and {gralora_k}") + elif gralora_rank % gralora_k != 0: + raise ValueError(f"rank should be divisible by gralora_k, but got {gralora_rank} and {gralora_k}") subblock_gralora_rank = gralora_rank // gralora_k # scatter gralora_A to get the scattered weight matrix l_indices = torch.arange(in_features, device=device) - n_indices = (l_indices // (in_features // gralora_k)) - i_indices = (l_indices % (in_features // gralora_k)) + n_indices = l_indices // (in_features // gralora_k) + i_indices = l_indices % (in_features // gralora_k) gralora_A_scattered = torch.zeros(in_features, gralora_k, gralora_rank, device=device, dtype=dtype) - gralora_A_scattered.scatter_(1, + gralora_A_scattered.scatter_( + 1, n_indices.unsqueeze(1).unsqueeze(2).expand(-1, 1, gralora_rank), - gralora_A[n_indices, i_indices, :].unsqueeze(1) + gralora_A[n_indices, i_indices, :].unsqueeze(1), ) # compute the delta weight - delta_weight = torch.einsum( - "ikr, kro -> iko", - gralora_A_scattered - .view(in_features, gralora_k, gralora_k, subblock_gralora_rank) - .permute(0, 2, 1, 3) - .reshape(in_features, gralora_k, gralora_rank), - gralora_B, - ).reshape(in_features, out_features).T + delta_weight = ( + torch.einsum( + "ikr, kro -> iko", + gralora_A_scattered.view(in_features, gralora_k, gralora_k, subblock_gralora_rank) + .permute(0, 2, 1, 3) + .reshape(in_features, gralora_k, gralora_rank), + gralora_B, + ) + .reshape(in_features, out_features) + .T + ) # Add hybrid LoRA component if present if hybrid_r > 0: @@ -380,16 +353,14 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: gralora_B_general = self.gralora_B_general[active_adapter] r = self.r[active_adapter] + gralora_rank = r gralora_k = self.gralora_k[active_adapter] hybrid_r = self.hybrid_r[active_adapter] - assert len(gralora_A) == len(gralora_B) - dropout = self.gralora_dropout[active_adapter] scaling = self.scaling[active_adapter] gralora_dtype = gralora_A.dtype - gralora_rank = r - hybrid_r B, L, in_features = x.shape N = gralora_k diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index 0273b30ab5..3d7fae2cc6 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -63,30 +63,12 @@ class GraloraModel(BaseTuner): - **peft_config** ([`GraloraConfig`]): The configuration of the Gralora model. """ + # The unique prefix for GraLoRA method prefix: str = "gralora_" + # The class of tuner layer for GraLoRA method + tuner_layer_cls = GraloraLayer - def __init__(self, model, config, adapter_name) -> None: - super().__init__(model, config, adapter_name) - - def _check_new_adapter_config(self, config: GraloraConfig) -> None: - """ - A helper method to check the config when a new adapter is being added. - - Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. 
- - """ - # the below todo is copied from LoRA - # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check - # does not fully correspond to the error message. - if (len(self.peft_config) > 1) and (config.bias != "none"): - raise ValueError( - f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " - "set bias to 'none' for all adapters." - ) - - @staticmethod - def _check_target_module_exists(gralora_config, key): - return check_target_module_exists(gralora_config, key) + target_module_mapping = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING def _create_and_replace( self, @@ -132,54 +114,6 @@ def _create_and_replace( new_module.requires_grad_(False) self._replace_module(parent, target_name, new_module, target) - @staticmethod - def _replace_module(parent, child_name, new_module, child): - setattr(parent, child_name, new_module) - # It's not necessary to set requires_grad here, as that is handled by - # _mark_only_adapters_as_trainable - - # child layer wraps the original module, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - if not hasattr(new_module, "base_layer"): - new_module.weight = child.weight - if hasattr(child, "bias"): - new_module.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_module, "base_layer"): - new_module.base_layer.state = child.state - else: - new_module.state = child.state - new_module.to(child.weight.device) - - # dispatch to correct device - for name, module in new_module.named_modules(): - if "gralora_" in name: - module.to(child.weight.device) - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - for n, p in model.named_parameters(): - if self.prefix not in n: - p.requires_grad = False - - for active_adapter in self.active_adapters: - bias = self.peft_config[active_adapter].bias - if bias == "none": - continue - - if bias == "all": - for n, p in model.named_parameters(): - if "bias" in n: - p.requires_grad = True - elif bias == "gralora_only": - for m in model.modules(): - if isinstance(m, GraloraLayer) and hasattr(m, "bias") and m.bias is not None: - m.bias.requires_grad = True - else: - raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") - @staticmethod def _create_new_module(gralora_config, adapter_name, target, module_name, **kwargs): if isinstance(target, BaseTunerLayer): @@ -214,147 +148,3 @@ def _create_new_module(gralora_config, adapter_name, target, module_name, **kwar ) return new_module - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def get_peft_config_as_dict(self, inference: bool = False): - config_dict = {} - for key, value in self.peft_config.items(): - config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} - if inference: - config["inference_mode"] = True - config_dict[key] = config - return config_dict - - def _set_adapter_layers(self, enabled=True): - for module in self.model.modules(): - if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): - module.enable_adapters(enabled) - - def enable_adapter_layers(self): - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self): - for active_adapter in self.active_adapters: - val = self.peft_config[active_adapter].bias - if val != "none": - msg = ( - f"Careful, 
disabling adapter layers with bias configured to be '{val}' does not produce the same " - "output as the the base model would without adaption." - ) - warnings.warn(msg) - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name, inference_mode: bool = False): - for module in self.model.modules(): - if isinstance(module, GraloraLayer): - if module.merged: - warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") - module.unmerge() - module.set_adapter(adapter_name, inference_mode=inference_mode) - self.active_adapter = adapter_name - - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] - ) - return peft_config - - def _unload_and_optionally_merge( - self, - merge=True, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ): - # we cannot use self.prefix as we want to include non-trainable gralora parameters - key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] - desc = "Unloading " + ("and merging " if merge else "") + "model" - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - - if hasattr(target, "base_layer"): - if merge: - target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - - self._replace_module(parent, target_name, target.get_base_layer(), target) - elif isinstance(target, ModulesToSaveWrapper): - # save any additional trainable modules part of `modules_to_save` - setattr(parent, target_name, target.modules_to_save[target.active_adapter]) - - return self.model - - def delete_adapter(self, adapter_name: str): - """ - Deletes an existing adapter. - - Args: - adapter_name (str): Name of the adapter to be deleted. - """ - if adapter_name not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter_name} does not exist") - del self.peft_config[adapter_name] - - # we cannot use self.prefix as we want to include non-trainable gralora parameters - key_list = [key for key, _ in self.model.named_modules() if "gralora" not in key] - new_adapter = None - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, GraloraLayer): - target.delete_adapter(adapter_name) - if new_adapter is None: - new_adapter = target.active_adapter[:] - - self.active_adapter = new_adapter or [] - - def merge_and_unload( - self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None - ): - r""" - This method merges the Gralora layers into the base model. This is needed if someone wants to use the base - model as a standalone model. - - Args: - progressbar (`bool`): - whether to show a progressbar indicating the unload and merge process - safe_merge (`bool`): - whether to activate the safe merging check to check if there is any potential Nan in the adapter - weights - adapter_names (`list[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. 
- - Example: - - ```py - >>> from transformers import AutoModelForCausalLM - >>> from peft import PeftModel - - >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") - >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lrasa-sfttrainer-sample" - >>> model = PeftModel.from_pretrained(base_model, peft_model_id) - >>> merged_model = model.merge_and_unload() - ``` - """ - return self._unload_and_optionally_merge( - progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names - ) - - def unload(self): - """ - Gets back the base model by removing all the Gralora modules without merging. This gives back the original base - model. - """ - return self._unload_and_optionally_merge(merge=False) From dec25f55ee7bc0a81e2ea206a42264e4ac211666 Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Fri, 24 Oct 2025 00:30:11 +0900 Subject: [PATCH 07/11] Update test code for the GraLoRA method --- tests/test_custom_models.py | 53 ++++++++++++++++++++++++++++++------- tests/test_gralora.py | 43 +++++------------------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index ed83db98cb..30628f2bdf 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -38,6 +38,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LNTuningConfig, @@ -666,6 +667,25 @@ "init_weights": True, }, ), + ########### + # GraLoRA # + ########### + ("Vanilla MLP 1 GraLoRA", "MLP", GraloraConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 GraLoRA", "MLP", GraloraConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 GraLoRA", "MLP", GraloraConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 GraLoRA", "MLP", GraloraConfig, {"target_modules": ["lin0", "lin1"]}), + ( + "Vanilla MLP 5 GraLoRA", + "MLP", + GraloraConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}, + ), + ( + "Embedding + transformers Conv1D 1 GraLoRA", + "EmbConv1D", + GraloraConfig, + {"target_modules": ["conv1d"], "gralora_k": 1}, + ), ########## # VBLoRA # ########## @@ -979,6 +999,20 @@ {"n_frequency": 10, "target_modules": ["lin0"]}, {"n_frequency": 10, "target_modules": ["lin1"]}, ), + ( + "GraLoRA Same", + "gralora", + GraloraConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin0"], "init_weights": False}, + ), + ( + "GraLoRA Different", + "gralora", + GraloraConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin1"], "init_weights": False}, + ), ( "SHiRA Same", "shira", @@ -1165,6 +1199,7 @@ VeraConfig: "vera_lambda_", RandLoraConfig: "randlora_", FourierFTConfig: "fourierft_", + GraloraConfig: "gralora_", C3AConfig: "c3a_", HRAConfig: "hra_", ShiraConfig: "shira_", @@ -3089,12 +3124,12 @@ def test_add_weighted_adapter_subtraction_with_negative_weights(self): cancelled_B = module.lora_B["cancelled"].weight.data # The weights should be approximately zero (they cancel out) - assert torch.allclose(cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5), ( - f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" - ) - assert torch.allclose(cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5), ( - f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" - ) + assert torch.allclose( + cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5 + ), f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" + assert torch.allclose( + cancelled_B, 
torch.zeros_like(cancelled_B), atol=1e-5 + ), f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" def test_add_weighted_adapter_negative_weight_with_different_scaling(self): # Test negative weights with different scaling factors (lora_alpha) @@ -3500,9 +3535,9 @@ def test_multirank_2(self): if isinstance(module, BaseTunerLayer): rank_expected = rank_pattern.get(key, r) rank_current = module.lora_A[adapter].weight.shape[0] - assert rank_current == rank_expected, ( - f"Rank {rank_current} is not equal to expected {rank_expected}" - ) + assert ( + rank_current == rank_expected + ), f"Rank {rank_current} is not equal to expected {rank_expected}" class TestLayerRepr: diff --git a/tests/test_gralora.py b/tests/test_gralora.py index 7e2ca5a078..59c2418a33 100644 --- a/tests/test_gralora.py +++ b/tests/test_gralora.py @@ -112,7 +112,7 @@ def test_gralora_parameter_shapes(self, mlp_gralora_hybrid): in_features = module.in_features out_features = module.out_features k = 4 - gralora_rank = 16 - 4 # r - hybrid_r + gralora_rank = 16 # Check GraLoRA block shapes # Each block has full gralora_rank, not gralora_rank // k @@ -203,7 +203,7 @@ def test_gralora_pure_vs_hybrid_params(self): mlp_hybrid = MLP() config_hybrid = GraloraConfig( target_modules=["lin1", "lin2"], - r=16, + r=12, gralora_k=4, hybrid_r=4, ) @@ -217,9 +217,9 @@ def count_trainable_params(model): # Pure and hybrid should have same total parameters (r is constant) # but distributed differently between block-diagonal and full-rank components - assert params_pure == params_hybrid, ( - f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" - ) + assert ( + params_pure == params_hybrid + ), f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" # Check that hybrid has general components has_general = False @@ -444,7 +444,7 @@ def test_gralora_rank_divisibility_check(self): hybrid_r=0, ) - with pytest.raises(AssertionError, match="r should be divisible by gralora_k"): + with pytest.raises(ValueError, match="r should be divisible by gralora_k"): get_peft_model(mlp, config) def test_gralora_trainable_parameters_only(self, mlp_gralora_hybrid): @@ -827,37 +827,6 @@ def test_gralora_unload_without_merge(self): # Should match base model output (no merge) assert torch.allclose(base_output, unloaded_output, atol=1e-5) - def test_gralora_get_peft_config_as_dict(self): - """Test get_peft_config_as_dict method""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=4, - gralora_alpha=16, - ) - model = get_peft_model(mlp, config) - - config_dict = model.get_peft_config_as_dict(inference=False) - - assert "default" in config_dict - assert config_dict["default"]["r"] == 8 - assert config_dict["default"]["gralora_k"] == 2 - assert config_dict["default"]["hybrid_r"] == 4 - - def test_gralora_get_peft_config_as_dict_inference_mode(self): - """Test get_peft_config_as_dict with inference=True""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - config_dict = model.get_peft_config_as_dict(inference=True) - - assert config_dict["default"]["inference_mode"] is True - def test_gralora_merge_with_hybrid_component(self): """Test that merge works correctly with hybrid component""" torch.manual_seed(0) From 925ad7260e4a686df6ae55b4fbe953d4d789dfc9 Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Fri, 24 Oct 2025 19:05:19 
+0900 Subject: [PATCH 08/11] ADD: documentations, examples, and test code for GraLoRA method --- docs/source/package_reference/gralora.md | 32 +++ examples/gralora_finetuning/README.md | 71 ++++++ .../gralora_finetuning/gralora_finetuning.py | 213 ++++++++++++++++++ tests/test_encoder_decoder_models.py | 8 + tests/test_feature_extraction_models.py | 8 + tests/test_seq_classifier.py | 8 + 6 files changed, 340 insertions(+) create mode 100644 docs/source/package_reference/gralora.md create mode 100644 examples/gralora_finetuning/README.md create mode 100644 examples/gralora_finetuning/gralora_finetuning.py diff --git a/docs/source/package_reference/gralora.md b/docs/source/package_reference/gralora.md new file mode 100644 index 0000000000..3d499756c1 --- /dev/null +++ b/docs/source/package_reference/gralora.md @@ -0,0 +1,32 @@ +# GraLoRA + +[**Granular Low-Rank Adaptation (GraLoRA)**](https://huggingface.co/papers/2505.20355) is a PEFT method designed to enhance the **expressivity** of low-rank adaptation while improving **robustness to outlier** activations, based on insights from well-known issues in quantization. + +![GraLoRA Overview](https://github.com/SqueezeBits/GraLoRA/raw/main/figure/gralora_overview.png) + +Unlike standard LoRA, which applies a single low-rank adapter across the entire feature space, GraLoRA introduces a structured and fine-grained adaptation scheme. It divides the adaptation space into a grid of $𝑘^2$ smaller, independent adapter pairs, each responsible for a localized subset of the input and output dimensions. As a result, each adapter operates on a subspace that is $k$ times smaller in both dimensions than the original LoRA adapter. + +This granular decomposition enables spatially localized and context-aware updates, effectively increasing representational capacity without additional parameters or computational cost. By isolating the influence of extreme activations within smaller subspaces, GraLoRA mitigates gradient distortion and preserves inter-channel balance during adaptation. + +--- + +The abstract from the paper is: + +*Low-Rank Adaptation (LoRA) is a popular method for parameter-efficient fine- +tuning (PEFT) of generative models, valued for its simplicity and effectiveness. +Despite recent enhancements, LoRA still suffers from a fundamental limitation: +overfitting when the bottleneck is widened. It performs best at ranks 32–64, yet its +accuracy stagnates or declines at higher ranks, still falling short of full fine-tuning +(FFT) performance. We identify the root cause as LoRA’s structural bottleneck, +which introduces gradient entanglement to the unrelated input channels and distorts +gradient propagation. To address this, we introduce a novel structure, Granular +Low-Rank Adaptation (GraLoRA) that partitions weight matrices into sub-blocks, +each with its own low-rank adapter. With negligible computational or storage cost, +GraLoRA overcomes LoRA’s limitations, effectively increases the representational +capacity, and more closely approximates FFT behavior. Experiments on code +generation, commonsense reasoning, mathematical reasoning, general language +understanding, and image generation benchmarks show that GraLoRA consistently +outperforms LoRA and other baselines, achieving up to +8.5% absolute gain in +Pass@1 on HumanEval+. 
These improvements hold across model sizes and rank
+settings, making GraLoRA a scalable and robust solution for PEFT.*
+
diff --git a/examples/gralora_finetuning/README.md b/examples/gralora_finetuning/README.md
new file mode 100644
index 0000000000..a911ab86d5
--- /dev/null
+++ b/examples/gralora_finetuning/README.md
@@ -0,0 +1,71 @@
+# GraLoRA: Granular Low-Rank Adaptation
+
+![GraLoRA Overview](https://github.com/SqueezeBits/GraLoRA/raw/main/figure/gralora_overview.png)
+
+## Introduction
+[**Granular Low-Rank Adaptation (GraLoRA)**](https://huggingface.co/papers/2505.20355) is a PEFT method designed to enhance the **expressivity** of low-rank adaptation while improving **robustness to outlier** activations, based on insights from well-known issues in quantization.
+
+GraLoRA introduces a structured and fine-grained adaptation scheme. It divides the adaptation space into a grid of $k^2$ smaller, independent adapter pairs, each responsible for a localized subset of the input and output dimensions.
+
+## Quick start
+
+Compared to a standard PEFT training setup with LoRA, you only need to swap your `LoraConfig` for a `GraloraConfig`. The snippet below uses `trl`'s `SFTTrainer` for supervised fine-tuning.
+
+```python
+import torch
+from datasets import load_dataset
+from peft import GraloraConfig, get_peft_model
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import SFTConfig, SFTTrainer
+
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
+dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
+gralora_config = GraloraConfig()
+peft_model = get_peft_model(model, gralora_config)
+training_args = SFTConfig(output_dir="gralora-llama-3-8b", dataset_text_field="text", max_seq_length=2048)
+trainer = SFTTrainer(
+    model=peft_model,
+    args=training_args,
+    train_dataset=dataset,
+    processing_class=tokenizer,
+)
+trainer.train()
+peft_model.save_pretrained("gralora-llama-3-8b")
+```
+
+Run the finetuning script with:
+```bash
+python examples/gralora_finetuning/gralora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco
+```
+
+## Use the model on 🤗
+You can load and use the model like any other 🤗 Transformers model.
+```python
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3-8B", dtype=torch.bfloat16, device_map="auto"
+)
+peft_model = PeftModel.from_pretrained(model, "gralora-llama-3-8b")
+```
+
+## Additional Notes
+While `gralora_k` defaults to 2, you can increase this value to create more fine-grained adapters. A `gralora_k` of 4 is recommended when the total rank (`r + hybrid_r`) is 64 or higher.
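For a bit more control than the defaults, the following is a minimal, illustrative configuration sketch. The projection names assume a LLaMA-style decoder (they match the defaults used in `gralora_finetuning.py`), and the numbers are only one example of a rank budget that satisfies the divisibility constraints.

```python
from peft import GraloraConfig

# Illustrative values only -- adapt them to your model and rank budget.
config = GraloraConfig(
    r=64,                     # total GraLoRA rank; must be divisible by gralora_k
    gralora_k=4,              # 4 x 4 grid of sub-block adapters per targeted weight
    hybrid_r=0,               # set > 0 to reserve part of the budget for a vanilla LoRA path
    gralora_alpha=128,        # scaling factor; effective scale is gralora_alpha / (r + hybrid_r)
    gralora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
```

Note that `r` must be divisible by `gralora_k`, and the input and output dimensions of every targeted layer must also be divisible by `gralora_k`; otherwise PEFT raises a `ValueError`.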
+ + + + +## Citation +``` +@misc{jung2025graloragranularlowrankadaptation, + title={GraLoRA: Granular Low-Rank Adaptation for Parameter-Efficient Fine-Tuning}, + author={Yeonjoon Jung and Daehyun Ahn and Hyungjun Kim and Taesu Kim and Eunhyeok Park}, + year={2025}, + eprint={2505.20355}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2505.20355}, +} +``` diff --git a/examples/gralora_finetuning/gralora_finetuning.py b/examples/gralora_finetuning/gralora_finetuning.py new file mode 100644 index 0000000000..1dcdcf46ee --- /dev/null +++ b/examples/gralora_finetuning/gralora_finetuning.py @@ -0,0 +1,213 @@ +# This script is based on examples/dora_finetuning/dora_finetuning.py +import os + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import GraloraConfig, get_peft_model, prepare_model_for_kbit_training + + +def train_model( + base_model: str, + data_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + val_set_size: int, + quantize: bool, + eval_step: int, + save_step: int, + device: str, + gralora_r: int, + gralora_alpha: int, + gralora_dropout: float, + gralora_target_modules: str, + gralora_k: int, + hybrid_r: int, + hub_model_id: str, + push_to_hub: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + + # Setup device + if device == "auto": + device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + else: + device = torch.device(device) + print(f"Using device: {device}") + + # load tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + + # Quantized GraLoRA: IF YOU WANNA QUANTIZE THE MODEL + if quantize: + if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) or torch.xpu.is_available(): + bnb_4bit_compute_dtype = torch.bfloat16 + else: + bnb_4bit_compute_dtype = torch.float16 + model = AutoModelForCausalLM.from_pretrained( + base_model, + token=hf_token, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=bnb_4bit_compute_dtype, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ), + ) + # setup for quantized training + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token) + # GraLoRA config for the PEFT model + gralora_config = GraloraConfig( + r=gralora_r, # Rank of matrix + gralora_alpha=gralora_alpha, + target_modules=( + gralora_target_modules.split(",") + if gralora_target_modules + else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + ), + gralora_dropout=gralora_dropout, + gralora_k=gralora_k, + hybrid_r=hybrid_r, + bias="none", + ) + + # get the peft model with GraLoRA config + model = get_peft_model(model, gralora_config) + + model.to(device) # MODEL TO GPU/CUDA + tokenizer.pad_token = tokenizer.eos_token + + # Load the dataset + dataset = load_dataset(data_path) + + def tokenize_function(examples): + inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=cutoff_len) + inputs["labels"] = inputs["input_ids"].copy() # setting labels for a language modeling task + return inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = 
dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + # Define training arguments + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + push_to_hub=push_to_hub, + hub_model_id=hub_model_id, + gradient_accumulation_steps=16, + fp16=True, + learning_rate=learning_rate, + hub_token=hf_token, + ) + + # Clear device cache to free memory + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + data_collator=data_collator, + ) + + # Start model training + trainer.train() + + # Save and push the trained model and tokenizer + if push_to_hub: + # Push the main model to the hub + trainer.push_to_hub(commit_message="Fine-tuned model") + + # Save the model and tokenizer locally + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune LLaMA with GraLoRA and PEFT") + parser.add_argument("--base_model", type=str, default="huggyllama/llama-7b", help="Base model path or name") + parser.add_argument( + "--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name" + ) + parser.add_argument( + "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization") + parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") + parser.add_argument("--quantize", action="store_true", help="Use quantization") + parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--device", type=str, default="auto", help="Device to use for training") + parser.add_argument("--gralora_r", type=int, default=8, help="LoRA rank") + parser.add_argument("--gralora_alpha", type=int, default=16, help="LoRA alpha") + parser.add_argument("--gralora_dropout", type=float, default=0.05, help="LoRA dropout rate") + parser.add_argument( + "--gralora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA" + ) + parser.add_argument("--gralora_k", type=int, default=2, help="GraLoRA k") + parser.add_argument("--hybrid_r", type=int, default=0, help="Hybrid rank") + parser.add_argument( + "--hub_model_id", + type=str, + default="path/to/repo", + help="Repository name to push the model on the Hugging Face Hub", + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + args 
= parser.parse_args() + train_model( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device=args.device, + gralora_r=args.gralora_r, + gralora_alpha=args.gralora_alpha, + gralora_dropout=args.gralora_dropout, + gralora_target_modules=args.gralora_target_modules, + gralora_k=args.gralora_k, + hybrid_r=args.hybrid_r, + hub_model_id=args.hub_model_id, + push_to_hub=args.push_to_hub, + ) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index c4e38f934b..42b12e66e0 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -24,6 +24,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -100,6 +101,13 @@ "task_type": "SEQ_2_SEQ_LM", }, ), + ( + GraloraConfig, + { + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + }, + ), ( HRAConfig, { diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index a5377827f4..6bfd254ec4 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -22,6 +22,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -98,6 +99,13 @@ "target_modules": None, }, ), + ( + GraloraConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + }, + ), ( HRAConfig, { diff --git a/tests/test_seq_classifier.py b/tests/test_seq_classifier.py index 03869c3a7a..bee83a879a 100644 --- a/tests/test_seq_classifier.py +++ b/tests/test_seq_classifier.py @@ -22,6 +22,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + GraloraConfig, HRAConfig, IA3Config, LoraConfig, @@ -99,6 +100,13 @@ "target_modules": None, }, ), + ( + GraloraConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + }, + ), ( HRAConfig, { From 3f69d8f64a07bc6a12e1fd5b93e19c1088fae3ae Mon Sep 17 00:00:00 2001 From: "yeonjoon.jung" Date: Sat, 25 Oct 2025 03:19:51 +0900 Subject: [PATCH 09/11] REFACTOR: integrate GraLoRA tests into existing test files --- docs/source/_toctree.yml | 2 + src/peft/tuners/gralora/config.py | 82 ++- src/peft/tuners/gralora/layer.py | 6 - src/peft/tuners/gralora/model.py | 10 +- tests/test_custom_models.py | 48 +- tests/test_gralora.py | 1051 ----------------------------- tests/test_initialization.py | 51 ++ 7 files changed, 164 insertions(+), 1086 deletions(-) delete mode 100644 tests/test_gralora.py diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index ecee4aedf1..3ecc3e7e9b 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -116,6 +116,8 @@ title: VeRA - local: package_reference/fourierft title: FourierFT + - local: package_reference/gralora + title: GraLoRA - local: package_reference/vblora title: VB-LoRA - local: package_reference/hra diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py index b88b26a77a..57e2fb3f47 100644 --- a/src/peft/tuners/gralora/config.py +++ b/src/peft/tuners/gralora/config.py @@ -21,6 +21,57 @@ @dataclass class GraloraConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`GraloraModel`]. + + Args: + r (`int`): + GraLoRA attention dimension determines the rank of the GraLoRA adapter. 
+ The total parameter count of the GraLoRA adapter is same as LoRA with same rank r, while the expressivitiy is multiplied by gralora_k. + hybrid_r (`int`): + Hybrid GraLoRA rank determines the rank allocated to vanilla LoRA method when using Hybrid GraLoRA method. + Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0. + The parameter count of the GraLoRA adapter is r + hybrid_r. + target_modules (`Union[List[str], str]`): + List of module names or regex expression of the module names to replace with GraLoRA. " + For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + This can also be a wildcard 'all-linear' which matches all linear/Conv1D " + "(if the model is a PreTrainedModel, the output layer excluded). " + If not specified, modules will be chosen according to the model architecture, If the architecture is " + not known, an error will be raised -- in this case, you should specify the target modules manually. " + To avoid targeting any modules (because you want to apply `target_parameters`), set " + `target_modules=[]`. + gralora_alpha (`int`): GraLoRA alpha. + GraLoRA alpha is the scaling factor for the GraLoRA adapter. + Scale becomes gralora_alpha / (r + hybrid_r). + gralora_dropout (`float`): + GraLoRA dropout is the dropout probability for the GraLoRA adapter. + It is used to prevent overfitting and improve the generalization of the GraLoRA adapter. + gralora_k (`int`): + GraLoRA k determines the number of subblocks in the GraLoRA adapter. + The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid. + The total parameter count is preserved regardles of gralora_k. + The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each subblock is reduced by gralora_k. + gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). + For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + bias (`str`): + Bias type for gralora. Can be 'none', 'all' or 'gralora_only'. + If 'all' or 'gralora_only', the corresponding biases will be updated during training. + Be aware that this means that, even when disabling the adapters, the model will not produce the same output as the base model would have without adaptation. + init_weights (`bool`): + Whether to initialize the weights of the GraLoRA layers with their default initialization. + Don't change this setting, except if you know exactly what you're doing. + layers_to_transform (`Union[List[int], int]`): + The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. + If a single integer is passed, PEFT will transform only the layer at this index. + This only works when target_modules is a list of str. + layers_pattern (`Optional[Union[List[str], str]]`): + The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. + This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. 
+ """ + r: int = field( default=32, metadata={ @@ -44,9 +95,14 @@ class GraloraConfig(PeftConfig): default=None, metadata={ "help": ( - "List of module names or regex expression of the module names to replace with gralora. " + "List of module names or regex expression of the module names to replace with LoRA. " "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " - "Only linear layers are supported." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D " + "(if the model is a PreTrainedModel, the output layer excluded). " + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually. " + "To avoid targeting any modules (because you want to apply `target_parameters`), set " + "`target_modules=[]`." ) }, ) @@ -54,8 +110,8 @@ class GraloraConfig(PeftConfig): default=64, metadata={ "help": ( - "gralora alpha is the scaling factor for the GraLoRA adapter." - "Scale becomes gralora_alpha / (r + hybrid_r)." + "gralora alpha is the scaling factor for the GraLoRA adapter. " + "Scale becomes gralora_alpha / (r + hybrid_r). " ) }, ) @@ -64,8 +120,11 @@ class GraloraConfig(PeftConfig): default=2, metadata={ "help": ( - "gralora_k determines the number of subblocks in the GraLoRA adapter." - "The total parameter count is preserved regardles of gralora_k, while the expressivitiy is multiplied by gralora_k." + "gralora_k determines the number of subblocks in the GraLoRA adapter. " + "The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid. " + "The total parameter count is preserved regardles of gralora_k. " + "The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each subblock is reduced by gralora_k. " + "gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher. " ) }, ) @@ -99,9 +158,9 @@ class GraloraConfig(PeftConfig): default=None, metadata={ "help": ( - "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers" - " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only" - " the layer at this index." + "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. " + "If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." ) }, ) @@ -109,8 +168,9 @@ class GraloraConfig(PeftConfig): default=None, metadata={ "help": ( - "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer" - " pattern is not in the common layers pattern." + "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. " + "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the " + "model, which is often called `'layers'` or `'h'`." 
) }, ) diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py index 4aefa02152..4303669aa4 100644 --- a/src/peft/tuners/gralora/layer.py +++ b/src/peft/tuners/gralora/layer.py @@ -271,12 +271,6 @@ def get_delta_weight(self, adapter) -> torch.Tensor: in_features = self.in_features out_features = self.out_features gralora_rank = r - if in_features % gralora_k != 0: - raise ValueError(f"in_features should be divisible by gralora_k, but got {in_features} and {gralora_k}") - elif out_features % gralora_k != 0: - raise ValueError(f"out_features should be divisible by gralora_k, but got {out_features} and {gralora_k}") - elif gralora_rank % gralora_k != 0: - raise ValueError(f"rank should be divisible by gralora_k, but got {gralora_rank} and {gralora_k}") subblock_gralora_rank = gralora_rank // gralora_k # scatter gralora_A to get the scattered weight matrix diff --git a/src/peft/tuners/gralora/model.py b/src/peft/tuners/gralora/model.py index 3d7fae2cc6..23a25d4c9c 100644 --- a/src/peft/tuners/gralora/model.py +++ b/src/peft/tuners/gralora/model.py @@ -15,23 +15,15 @@ from __future__ import annotations import warnings -from dataclasses import asdict -from enum import Enum -from typing import Optional import torch -import torch.nn as nn -from tqdm import tqdm from transformers.pytorch_utils import Conv1D -from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer from peft.utils import ( TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, - ModulesToSaveWrapper, - _get_submodules, ) -from .config import GraloraConfig from .layer import GraloraLayer, Linear diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 30628f2bdf..8d9820a195 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -680,6 +680,18 @@ GraloraConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}, ), + ( + "Vanilla MLP 6 GraLoRA", + "MLP", + GraloraConfig, + {"target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"]}, + ), + ( + "Vanilla MLP 7 Hybrid GraLoRA", + "MLP", + GraloraConfig, + {"target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"], "hybrid_r": 4}, + ), ( "Embedding + transformers Conv1D 1 GraLoRA", "EmbConv1D", @@ -3124,12 +3136,12 @@ def test_add_weighted_adapter_subtraction_with_negative_weights(self): cancelled_B = module.lora_B["cancelled"].weight.data # The weights should be approximately zero (they cancel out) - assert torch.allclose( - cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5 - ), f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" - assert torch.allclose( - cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5 - ), f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" + assert torch.allclose(cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5), ( + f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" + ) + assert torch.allclose(cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5), ( + f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" + ) def test_add_weighted_adapter_negative_weight_with_different_scaling(self): # Test negative weights with different scaling factors (lora_alpha) @@ -3440,6 +3452,24 @@ def test_dora_save_and_load_remapping(self): for k in state_dict: assert torch.allclose(state_dict[k], state_dict_loaded[k]) + def test_gralora_and_hybrid_gralora_parameter_count(self): + # Here we test the parameter count 
of GraLoRA is preserved + # when rank r + hybrid_r is the same regardless of the value of gralora_k. + model1 = MLP() + config1 = GraloraConfig(target_modules=["lin0"], r=12, gralora_k=2, hybrid_r=0) + model1 = get_peft_model(model1, config1) + model2 = MLP() + config2 = GraloraConfig(target_modules=["lin0"], r=10, gralora_k=2, hybrid_r=2) + model2 = get_peft_model(model2, config2) + model3 = MLP() + config3 = GraloraConfig(target_modules=["lin0"], r=10, gralora_k=5, hybrid_r=2) + model3 = get_peft_model(model3, config3) + trainable_params1, all_params1 = model1.get_nb_trainable_parameters() + trainable_params2, all_params2 = model2.get_nb_trainable_parameters() + trainable_params3, all_params3 = model3.get_nb_trainable_parameters() + assert trainable_params1 == trainable_params2 == trainable_params3 + assert all_params1 == all_params2 == all_params3 + @pytest.mark.parametrize("with_forward_call", [False, True]) def test_mha_gradients_set_correctly(self, with_forward_call): # check for this bug: https://github.com/huggingface/peft/issues/761#issuecomment-1893804738 @@ -3535,9 +3565,9 @@ def test_multirank_2(self): if isinstance(module, BaseTunerLayer): rank_expected = rank_pattern.get(key, r) rank_current = module.lora_A[adapter].weight.shape[0] - assert ( - rank_current == rank_expected - ), f"Rank {rank_current} is not equal to expected {rank_expected}" + assert rank_current == rank_expected, ( + f"Rank {rank_current} is not equal to expected {rank_expected}" + ) class TestLayerRepr: diff --git a/tests/test_gralora.py b/tests/test_gralora.py deleted file mode 100644 index 59c2418a33..0000000000 --- a/tests/test_gralora.py +++ /dev/null @@ -1,1051 +0,0 @@ -# Copyright 2025-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This test file is for tests specific to GraLoRA, since GraLoRA has some specific features -# like block-diagonal structure, hybrid mode, and tensor permutation for information exchange. 
- -import pytest -import torch -from safetensors import safe_open -from torch import nn - -from peft import PeftModel, get_peft_model -from peft.tuners.gralora import GraloraConfig - - -class MLP(nn.Module): - """Simple MLP for testing""" - - def __init__(self, bias=True): - super().__init__() - self.relu = nn.ReLU() - self.lin0 = nn.Linear(10, 20, bias=bias) - self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape - self.lin2 = nn.Linear(20, 20, bias=bias) - self.lin3 = nn.Linear(20, 2, bias=bias) - self.sm = nn.LogSoftmax(dim=-1) - - def forward(self, X): - X = self.lin0(X) - X = self.relu(X) - X = self.lin1(X) - X = self.relu(X) - X = self.lin2(X) - X = self.relu(X) - X = self.lin3(X) - X = self.sm(X) - return X - - -class TestGralora: - @pytest.fixture - def mlp(self): - torch.manual_seed(0) - model = MLP() - return model - - @pytest.fixture - def mlp_gralora_pure(self, mlp): - """Pure GraLoRA without hybrid component""" - torch.manual_seed(0) - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=0, - gralora_alpha=32, - gralora_dropout=0.1, - ) - peft_model = get_peft_model(mlp, config) - return peft_model - - @pytest.fixture - def mlp_gralora_hybrid(self): - """Hybrid GraLoRA with vanilla LoRA component""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=4, - gralora_alpha=32, - gralora_dropout=0.1, - ) - peft_model = get_peft_model(mlp, config) - return peft_model - - def test_gralora_config_validation(self): - """Test that config validation works correctly""" - # Valid config - config = GraloraConfig(r=16, gralora_k=4, hybrid_r=0) - assert config.r == 16 - assert config.gralora_k == 4 - assert config.hybrid_r == 0 - - # Hybrid config - config = GraloraConfig(r=16, gralora_k=4, hybrid_r=4) - assert config.r == 16 - assert config.hybrid_r == 4 - - def test_gralora_parameter_shapes(self, mlp_gralora_hybrid): - """Test that GraLoRA parameters have correct shapes""" - for name, module in mlp_gralora_hybrid.named_modules(): - if hasattr(module, "gralora_A"): - adapter_name = "default" - gralora_A = module.gralora_A[adapter_name] - gralora_B = module.gralora_B[adapter_name] - gralora_A_general = module.gralora_A_general[adapter_name] - gralora_B_general = module.gralora_B_general[adapter_name] - - in_features = module.in_features - out_features = module.out_features - k = 4 - gralora_rank = 16 - - # Check GraLoRA block shapes - # Each block has full gralora_rank, not gralora_rank // k - assert gralora_A.shape == (k, in_features // k, gralora_rank) - assert gralora_B.shape == (k, gralora_rank, out_features // k) - - # Check hybrid component shapes - assert gralora_A_general.weight.shape == (4, in_features) - assert gralora_B_general.weight.shape == (out_features, 4) - - def test_gralora_block_diagonal_structure(self): - """Test that pure GraLoRA produces block-diagonal delta weights""" - # Use init_weights=False to have non-zero B matrices - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=0, - init_weights=False, # Both A and B initialized randomly - ) - model = get_peft_model(mlp, config) - - for name, module in model.named_modules(): - if hasattr(module, "get_delta_weight"): - adapter_name = "default" - delta_weight = module.get_delta_weight(adapter_name) - - k = 4 - in_features = module.in_features - out_features = module.out_features - block_size_in = in_features // 
k - block_size_out = out_features // k - - # Check diagonal blocks have non-zero values - for i in range(k): - row_start = i * block_size_out - row_end = (i + 1) * block_size_out - col_start = i * block_size_in - col_end = (i + 1) * block_size_in - - block = delta_weight[row_start:row_end, col_start:col_end] - block_norm = torch.norm(block).item() - # Diagonal blocks should have some values (initialized with kaiming) - assert block_norm > 0, f"Diagonal block [{i},{i}] is zero" - - def test_gralora_forward_pass(self, mlp_gralora_hybrid): - """Test that forward pass works without errors""" - mlp_gralora_hybrid.eval() - x = torch.randn(5, 10) - - with torch.no_grad(): - output = mlp_gralora_hybrid(x) - - assert output.shape == (5, 2) - assert not torch.isnan(output).any() - assert not torch.isinf(output).any() - - def test_gralora_backward_pass(self, mlp_gralora_hybrid): - """Test that backward pass computes gradients correctly""" - mlp_gralora_hybrid.train() - x = torch.randn(5, 10) - - output = mlp_gralora_hybrid(x) - loss = output.sum() - loss.backward() - - # Check that GraLoRA parameters have gradients - for name, param in mlp_gralora_hybrid.named_parameters(): - if "gralora" in name and param.requires_grad: - assert param.grad is not None, f"Parameter {name} has no gradient" - assert not torch.isnan(param.grad).any(), f"Parameter {name} has NaN gradients" - - def test_gralora_pure_vs_hybrid_params(self): - """Test that pure and hybrid modes have same total parameters but different distribution""" - torch.manual_seed(0) - mlp_pure = MLP() - config_pure = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=0, - ) - model_pure = get_peft_model(mlp_pure, config_pure) - - torch.manual_seed(0) - mlp_hybrid = MLP() - config_hybrid = GraloraConfig( - target_modules=["lin1", "lin2"], - r=12, - gralora_k=4, - hybrid_r=4, - ) - model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) - - def count_trainable_params(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - params_pure = count_trainable_params(model_pure) - params_hybrid = count_trainable_params(model_hybrid) - - # Pure and hybrid should have same total parameters (r is constant) - # but distributed differently between block-diagonal and full-rank components - assert ( - params_pure == params_hybrid - ), f"Pure ({params_pure}) and Hybrid ({params_hybrid}) should have same parameter count" - - # Check that hybrid has general components - has_general = False - for name, _ in model_hybrid.named_modules(): - if "gralora_A_general" in name or "gralora_B_general" in name: - has_general = True - break - assert has_general, "Hybrid mode should have general components" - - def test_gralora_save_load_roundtrip(self, mlp_gralora_hybrid, tmp_path): - """Test that save/load preserves model behavior""" - mlp_gralora_hybrid.eval() - x = torch.randn(5, 10) - - # Get output before save - with torch.no_grad(): - output_before = mlp_gralora_hybrid(x) - - # Save adapter - mlp_gralora_hybrid.save_pretrained(tmp_path) - - # Load adapter - torch.manual_seed(0) - new_mlp = MLP() - loaded_model = PeftModel.from_pretrained(new_mlp, tmp_path) - loaded_model.eval() - - # Get output after load - with torch.no_grad(): - output_after = loaded_model(x) - - # Outputs should be very close - assert torch.allclose(output_before, output_after, atol=1e-5, rtol=1e-5) - - def test_gralora_state_dict_structure(self, mlp_gralora_hybrid, tmp_path): - """Test that state dict contains only necessary parameters""" - 
mlp_gralora_hybrid.save_pretrained(tmp_path) - - # Load state dict - sd = {} - with safe_open(tmp_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: - for key in f.keys(): - sd[key] = f.get_tensor(key) - - # Check that gralora parameters are present - assert any("gralora_A" in key for key in sd), "gralora_A not found in state dict" - assert any("gralora_B" in key for key in sd), "gralora_B not found in state dict" - - # For hybrid mode, check hybrid components - assert any("gralora_A_general" in key for key in sd), "gralora_A_general not found" - assert any("gralora_B_general" in key for key in sd), "gralora_B_general not found" - - def test_gralora_merge_and_unload(self, mlp_gralora_hybrid): - """Test merge_and_unload functionality""" - mlp_gralora_hybrid.eval() - x = torch.randn(5, 10) - - # Get output before merge - with torch.no_grad(): - output_before = mlp_gralora_hybrid(x) - - # Merge and unload - merged_model = mlp_gralora_hybrid.merge_and_unload() - merged_model.eval() - - # Get output after merge - with torch.no_grad(): - output_after = merged_model(x) - - # Outputs should be very close - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - # Check that merged model has no GraLoRA layers - has_gralora = any("gralora" in name for name, _ in merged_model.named_parameters()) - assert not has_gralora, "Merged model still has GraLoRA parameters" - - def test_gralora_merge_unmerge(self): - """Test merge/unmerge functionality""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - model.eval() - - x = torch.randn(5, 10) - - # Output before merge - with torch.no_grad(): - output_before = model(x) - - # Merge adapter using PEFT API - model.merge_adapter() - - with torch.no_grad(): - output_merged = model(x) - - # Outputs should be the same after merge - assert torch.allclose(output_before, output_merged, atol=1e-4, rtol=1e-4) - - # Unmerge adapter using PEFT API - model.unmerge_adapter() - - with torch.no_grad(): - output_unmerged = model(x) - - # Outputs should be the same after unmerge - assert torch.allclose(output_before, output_unmerged, atol=1e-4, rtol=1e-4) - - def test_gralora_multiple_adapters(self): - """Test adding and switching between multiple adapters""" - torch.manual_seed(0) - mlp = MLP() - - # Use init_weights=False to have non-zero outputs - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - torch.manual_seed(42) # Different seed for second adapter - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, hybrid_r=0, init_weights=False) - model.add_adapter("adapter2", config2) - - x = torch.randn(5, 10) - - # Test adapter1 - model.set_adapter("adapter1") - with torch.no_grad(): - output1 = model(x) - - # Test adapter2 - model.set_adapter("adapter2") - with torch.no_grad(): - output2 = model(x) - - # Different adapters should give different outputs - assert not torch.allclose(output1, output2, atol=1e-3, rtol=1e-3) - - def test_gralora_dtype_compatibility(self): - """Test that GraLoRA works with different dtypes""" - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - if dtype == torch.bfloat16 and not torch.cuda.is_available(): - # Skip bfloat16 on CPU if not supported - continue - - torch.manual_seed(0) - mlp = MLP().to(dtype) - config = GraloraConfig( - 
target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10).to(dtype) - output = model(x) - - assert output.dtype == dtype, f"Output dtype mismatch for {dtype}" - - def test_gralora_disable_adapters(self): - """Test disabling adapters""" - torch.manual_seed(0) - mlp = MLP() - # Use init_weights=False to have non-zero effect - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=16, - gralora_k=4, - hybrid_r=4, - init_weights=False, - ) - model = get_peft_model(mlp, config) - model.eval() - x = torch.randn(5, 10) - - # Output with adapter enabled - with torch.no_grad(): - output_enabled = model(x) - - # Output with adapter disabled - with model.disable_adapter(): - with torch.no_grad(): - output_disabled = model(x) - - # Outputs should be different - assert not torch.allclose(output_enabled, output_disabled, atol=1e-6, rtol=1e-6) - - def test_gralora_different_k_values(self): - """Test GraLoRA with different k values""" - for k in [2, 4]: - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1", "lin2"], - r=k * 4, # Make sure r is divisible by k - gralora_k=k, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - output = model(x) - - assert output.shape == (5, 2) - assert not torch.isnan(output).any() - - def test_gralora_rank_divisibility_check(self): - """Test that invalid rank/k combinations raise errors""" - torch.manual_seed(0) - mlp = MLP() - - # This should raise an error because (r - hybrid_r) is not divisible by k - # r=15, hybrid_r=0, k=4 -> gralora_rank=15, 15 % 4 != 0 - config = GraloraConfig( - target_modules=["lin1"], - r=15, - gralora_k=4, - hybrid_r=0, - ) - - with pytest.raises(ValueError, match="r should be divisible by gralora_k"): - get_peft_model(mlp, config) - - def test_gralora_trainable_parameters_only(self, mlp_gralora_hybrid): - """Test that only GraLoRA parameters are trainable""" - for name, param in mlp_gralora_hybrid.named_parameters(): - if "gralora" in name or "modules_to_save" in name: - assert param.requires_grad, f"GraLoRA parameter {name} should be trainable" - else: - assert not param.requires_grad, f"Base parameter {name} should be frozen" - - def test_gralora_save_pretrained_files(self, mlp_gralora_hybrid, tmp_path): - """Test that save_pretrained creates expected files""" - mlp_gralora_hybrid.save_pretrained(tmp_path) - - # Check for config file - assert (tmp_path / "adapter_config.json").exists() - - # Check for weights file (either .bin or .safetensors) - assert (tmp_path / "adapter_model.safetensors").exists() or (tmp_path / "adapter_model.bin").exists() - - def test_gralora_information_exchange_via_permutation(self, mlp_gralora_pure): - """ - Test that information exchange happens through tensor permutation. Even though delta weights are - block-diagonal, the forward pass should allow information flow between blocks via the permutation operation. 
- """ - mlp_gralora_pure.eval() - - # Create two inputs that differ only in specific blocks - x1 = torch.randn(1, 10) - x2 = x1.clone() - - # Modify only the first block (assuming k=4, block size = 10//4 = 2.5, rounded to 2-3 features) - x2[0, :5] += 1.0 # Modify first block - - with torch.no_grad(): - out1 = mlp_gralora_pure(x1) - out2 = mlp_gralora_pure(x2) - - # Due to information exchange, changing one block should affect all outputs - # (not just outputs corresponding to that block) - diff = (out1 - out2).abs() - - # All output dimensions should be affected (not just the first block's outputs) - assert (diff > 1e-6).all(), "Information exchange not happening correctly" - - def test_gralora_scaling_factor(self): - """Test that scaling factor is correctly applied""" - torch.manual_seed(0) - mlp = MLP() - - # Create two configs with different alpha values - config_alpha16 = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_alpha=16, - gralora_k=2, - hybrid_r=0, - ) - - config_alpha32 = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_alpha=32, - gralora_k=2, - hybrid_r=0, - ) - - model_alpha16 = get_peft_model(MLP(), config_alpha16) - model_alpha32 = get_peft_model(MLP(), config_alpha32) - - # Copy weights to make them identical except for scaling - for (n1, p1), (n2, p2) in zip(model_alpha16.named_parameters(), model_alpha32.named_parameters()): - if "gralora" in n1: - p2.data = p1.data.clone() - - x = torch.randn(5, 10) - - model_alpha16.eval() - model_alpha32.eval() - - with torch.no_grad(): - out1 = model_alpha16(x) - out2 = model_alpha32(x) - - # Outputs should be different due to different scaling - assert not torch.allclose(out1, out2, atol=1e-6, rtol=1e-6) - - def test_gralora_safe_merge_success(self): - """Test safe_merge with valid weights""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - with torch.no_grad(): - output_before = model(x) - - # Test safe merge - model.base_model.model.lin1.merge(safe_merge=True) - - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - def test_gralora_safe_merge_detects_nan(self): - """Test that safe_merge detects NaN values""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - ) - model = get_peft_model(mlp, config) - - # Inject NaN into adapter weights (use .data to avoid requires_grad error) - model.base_model.model.lin1.gralora_A["default"].data[0, 0, 0] = float("nan") - - # safe_merge should raise ValueError - with pytest.raises(ValueError, match="NaNs detected"): - model.base_model.model.lin1.merge(safe_merge=True) - - def test_gralora_unmerge_warning_when_not_merged(self): - """Test that unmerge warns when already unmerged""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - # Try to unmerge without merging first - with pytest.warns(UserWarning, match="Already unmerged"): - model.base_model.model.lin1.unmerge() - - def test_gralora_hybrid_forward_computation(self): - """Test that hybrid LoRA component is used in forward pass""" - torch.manual_seed(0) - mlp_hybrid = MLP() - mlp_pure = MLP() - - config_hybrid = GraloraConfig( - target_modules=["lin1"], - r=16, - gralora_k=4, - hybrid_r=4, - 
init_weights=False, - ) - model_hybrid = get_peft_model(mlp_hybrid, config_hybrid) - - config_pure = GraloraConfig( - target_modules=["lin1"], - r=16, - gralora_k=4, - hybrid_r=0, - init_weights=False, - ) - model_pure = get_peft_model(mlp_pure, config_pure) - - x = torch.randn(5, 10) - - with torch.no_grad(): - output_hybrid = model_hybrid(x) - output_pure = model_pure(x) - - # Outputs should be different due to hybrid component - assert not torch.allclose(output_hybrid, output_pure, atol=1e-3) - - def test_gralora_invalid_rank_zero(self): - """Test that r=0 raises error""" - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=0, gralora_k=2) - - with pytest.raises(ValueError, match="`r` should be a positive integer"): - get_peft_model(mlp, config) - - def test_gralora_invalid_rank_negative(self): - """Test that negative r raises error""" - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=-1, gralora_k=2) - - with pytest.raises(ValueError, match="`r` should be a positive integer"): - get_peft_model(mlp, config) - - def test_gralora_bias_all(self): - """Test bias='all' configuration""" - torch.manual_seed(0) - mlp = MLP(bias=True) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - bias="all", - ) - model = get_peft_model(mlp, config) - - # Check that all bias parameters are trainable - bias_params = [name for name, param in model.named_parameters() if "bias" in name and param.requires_grad] - assert len(bias_params) > 0, "At least some bias parameters should be trainable" - - def test_gralora_bias_gralora_only(self): - """Test bias='gralora_only' configuration""" - torch.manual_seed(0) - mlp = MLP(bias=True) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - bias="gralora_only", - ) - model = get_peft_model(mlp, config) - - # Only GraLoRA layer biases should be trainable - assert model.base_model.model.lin1.bias.requires_grad - assert not model.base_model.model.lin0.bias.requires_grad - - def test_gralora_multiple_adapters_with_bias_raises(self): - """Test that multiple adapters with bias raises error""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") - model = get_peft_model(mlp, config1) - - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, bias="all") - - with pytest.raises(ValueError, match="supports only 1 adapter with bias"): - model.add_adapter("adapter2", config2) - - def test_gralora_cpu_fp16_merge(self): - """Test merge with fp16 on CPU""" - torch.manual_seed(0) - mlp = MLP().to(torch.float16) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=0, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10).to(torch.float16) - - with torch.no_grad(): - output_before = model(x) - - # Merge (should handle CPU fp16 correctly) - model.merge_adapter() - - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) - - def test_gralora_cpu_bf16_merge(self): - """Test merge with bf16 on CPU (if supported)""" - # Check if bfloat16 is supported - try: - _ = torch.randn(2, 2).to(torch.bfloat16) - except RuntimeError: - pytest.skip("bfloat16 not supported on this system") - - torch.manual_seed(0) - mlp = MLP().to(torch.bfloat16) - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - hybrid_r=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 
10).to(torch.bfloat16) - - with torch.no_grad(): - output_before = model(x) - - # Merge with hybrid component - model.merge_adapter() - - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-2, rtol=1e-2) - - def test_gralora_disable_adapter_layers_warns_with_bias(self): - """Test that disable_adapter_layers warns when bias is configured""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - bias="all", - ) - model = get_peft_model(mlp, config) - - with pytest.warns(UserWarning, match="disabling adapter layers with bias"): - model.disable_adapter_layers() - - def test_gralora_set_adapter_warns_when_merged(self): - """Test that set_adapter warns and unmerges when model is merged""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model.add_adapter("adapter2", config2) - - # Merge first adapter - model.merge_adapter() - - # Setting adapter should warn and unmerge - with pytest.warns(UserWarning, match="Adapter cannot be set when the model is merged"): - model.set_adapter("adapter2") - - # Model should be unmerged now - assert not model.base_model.model.lin1.merged - - def test_gralora_delete_adapter(self): - """Test deleting an adapter""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config, adapter_name="adapter1") - - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model.add_adapter("adapter2", config2) - - # Delete adapter1 - model.delete_adapter("adapter1") - - assert "adapter1" not in model.peft_config - assert "adapter2" in model.peft_config - - def test_gralora_delete_nonexistent_adapter_raises(self): - """Test that deleting nonexistent adapter raises error""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - with pytest.raises(ValueError, match="Adapter .* does not exist"): - model.delete_adapter("nonexistent") - - def test_gralora_unload_without_merge(self): - """Test unload without merging""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Get base model output - with model.disable_adapter(): - with torch.no_grad(): - base_output = model(x) - - # Unload without merge - unloaded_model = model.unload() - - with torch.no_grad(): - unloaded_output = unloaded_model(x) - - # Should match base model output (no merge) - assert torch.allclose(base_output, unloaded_output, atol=1e-5) - - def test_gralora_merge_with_hybrid_component(self): - """Test that merge works correctly with hybrid component""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=16, - gralora_k=4, - hybrid_r=4, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - with torch.no_grad(): - output_before = model(x) - - # Merge - model.merge_adapter() - - with torch.no_grad(): - output_after = model(x) - - # Outputs should be very close - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - def test_gralora_repr(self): - """Test __repr__ method""" - 
torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - repr_str = repr(model.base_model.model.lin1) - assert "gralora" in repr_str.lower() - - def test_gralora_merge_with_adapter_names(self): - """Test merge with specific adapter names""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - torch.manual_seed(42) - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2, init_weights=False) - model.add_adapter("adapter2", config2) - - x = torch.randn(5, 10) - - # Set to adapter1 and get output - model.set_adapter("adapter1") - with torch.no_grad(): - output_before = model(x) - - # Merge only adapter1 - model.base_model.model.lin1.merge(adapter_names=["adapter1"]) - - with torch.no_grad(): - output_after = model(x) - - # Outputs should be close - assert torch.allclose(output_before, output_after, atol=1e-4, rtol=1e-4) - - def test_gralora_enable_disable_adapter_layers(self): - """Test enable/disable adapter layers""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Get output with adapter enabled - with torch.no_grad(): - output_enabled = model(x) - - # Disable adapters - model.disable_adapter_layers() - - with torch.no_grad(): - output_disabled = model(x) - - # Enable adapters - model.enable_adapter_layers() - - with torch.no_grad(): - output_re_enabled = model(x) - - # Output with disabled adapter should be different - assert not torch.allclose(output_enabled, output_disabled, atol=1e-6) - # Output after re-enabling should match original - assert torch.allclose(output_enabled, output_re_enabled, atol=1e-6) - - def test_gralora_forward_with_merged_adapter(self): - """Test forward pass with merged adapter""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Get output before merge - with torch.no_grad(): - output_before = model(x) - - # Merge adapter - model.merge_adapter() - - # Forward with merged adapter (should take merged path) - with torch.no_grad(): - output_after = model(x) - - assert torch.allclose(output_before, output_after, atol=1e-4) - - def test_gralora_forward_with_disable_adapters_and_merged(self): - """Test forward when disable_adapters=True and model is merged""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig( - target_modules=["lin1"], - r=8, - gralora_k=2, - init_weights=False, - ) - model = get_peft_model(mlp, config) - - x = torch.randn(5, 10) - - # Merge adapter - model.merge_adapter() - - # Get output with merged adapter - with torch.no_grad(): - output_merged = model(x) - - # Disable adapters (should unmerge) - with model.disable_adapter(): - with torch.no_grad(): - output_disabled = model(x) - - # Outputs should be different - assert not torch.allclose(output_merged, output_disabled, atol=1e-5) - - def test_gralora_bias_invalid_option_raises(self): - """Test that invalid bias option raises NotImplementedError""" - torch.manual_seed(0) - mlp = MLP() - - # Create config with invalid bias (need to bypass validation) - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = 
get_peft_model(mlp, config) - - # Manually set invalid bias to trigger the error - model.peft_config["default"].bias = "invalid_option" - - with pytest.raises(NotImplementedError, match="Requested bias"): - model._mark_only_adapters_as_trainable(model.model) - - def test_gralora_merge_empty_adapter_names(self): - """Test merge with empty adapter_names returns early""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config) - - # Call merge with empty list (should return early) - model.base_model.model.lin1.merge(adapter_names=[]) - - # Model should not be merged - assert not model.base_model.model.lin1.merged - - def test_gralora_add_non_active_adapter(self): - """Test adding adapter that is not active (should not be trainable)""" - torch.manual_seed(0) - mlp = MLP() - config1 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config1, adapter_name="adapter1") - - # Keep adapter1 active - model.set_adapter("adapter1") - - # Add adapter2 (should not be active/trainable initially) - torch.manual_seed(42) - config2 = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model.add_adapter("adapter2", config2) - - # adapter2 parameters should exist but might not be in active_adapters initially - assert "adapter2" in model.base_model.model.lin1.gralora_A - - def test_gralora_forward_with_no_adapter_in_active_list(self): - """Test forward when active_adapter is not in gralora_A keys""" - torch.manual_seed(0) - mlp = MLP() - config = GraloraConfig(target_modules=["lin1"], r=8, gralora_k=2) - model = get_peft_model(mlp, config, adapter_name="adapter1") - - x = torch.randn(5, 10) - - # Manually set _active_adapter to include non-existent adapter - original_adapter = model.base_model.model.lin1._active_adapter - model.base_model.model.lin1._active_adapter = ["nonexistent", "adapter1"] - - # Should still work (skip nonexistent adapter) - with torch.no_grad(): - output = model(x) - - assert output.shape == (5, 2) - - # Restore - model.base_model.model.lin1._active_adapter = original_adapter diff --git a/tests/test_initialization.py b/tests/test_initialization.py index 3475247cd8..cbc41e5671 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -38,6 +38,7 @@ C3AConfig, DeloraConfig, EvaConfig, + GraloraConfig, IA3Config, LoftQConfig, LoKrConfig, @@ -2157,6 +2158,56 @@ def test_init_weights_false_shifts_output(self, data): assert not torch.allclose(y_base, y_peft, atol=1e-6, rtol=1e-6) +class TestGraLoRAInitialization: + """Basic sanity tests for the GraLoRA tuner.""" + + torch_device = infer_device() + + def get_model(self, bias=True): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 30, bias=bias) + self.lin1 = nn.Linear(30, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + return X + + return MLP(bias=bias).to(self.torch_device).eval() + + @pytest.fixture + def data(self): + torch.manual_seed(0) + return torch.randn(4, 10, device=self.torch_device) + + def test_gralora_with_incompatible_gralora_k_and_r_raises(self): + model = self.get_model() + r = 6 + gralora_k = 4 + # msg = f"r should be divisible by gralora_k, but got {config.r} and {config.gralora_k}" + msg = f"r should be divisible by gralora_k, but got {r} and {gralora_k}" + with pytest.raises(ValueError, match=re.escape(msg)): + GraloraConfig(target_modules=["lin0"], r=r, gralora_k=gralora_k) + + def 
From 430e89625142d9244aae82abf8386fb81229a666 Mon Sep 17 00:00:00 2001
From: "yeonjoon.jung"
Date: Mon, 27 Oct 2025 21:39:27 +0900
Subject: [PATCH 10/11] UPDATE document format in GraLoRA

---
 src/peft/tuners/gralora/config.py | 64 +++++++++++++++----------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/src/peft/tuners/gralora/config.py b/src/peft/tuners/gralora/config.py
index 57e2fb3f47..1458bca3e2 100644
--- a/src/peft/tuners/gralora/config.py
+++ b/src/peft/tuners/gralora/config.py
@@ -26,50 +26,48 @@ class GraloraConfig(PeftConfig):
     Args:
         r (`int`):
-            GraLoRA attention dimension determines the rank of the GraLoRA adapter.
-            The total parameter count of the GraLoRA adapter is same as LoRA with same rank r, while the expressivitiy is multiplied by gralora_k.
+            GraLoRA attention dimension determines the rank of the GraLoRA adapter. The total parameter count of the
+            GraLoRA adapter is the same as LoRA with the same rank r, while the expressivity is multiplied by gralora_k.
         hybrid_r (`int`):
             Hybrid GraLoRA rank determines the rank allocated to vanilla LoRA method when using Hybrid GraLoRA method.
-            Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0.
-            The parameter count of the GraLoRA adapter is r + hybrid_r.
+            Hybrid GraLoRA, a combination of GraLoRA and vanilla LoRA, becomes available when hybrid_r > 0. The
+            parameter count of the GraLoRA adapter is r + hybrid_r.
         target_modules (`Union[List[str], str]`):
-            List of module names or regex expression of the module names to replace with GraLoRA. "
-            For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
-            This can also be a wildcard 'all-linear' which matches all linear/Conv1D "
-            "(if the model is a PreTrainedModel, the output layer excluded). "
-            If not specified, modules will be chosen according to the model architecture, If the architecture is "
-            not known, an error will be raised -- in this case, you should specify the target modules manually. "
-            To avoid targeting any modules (because you want to apply `target_parameters`), set "
-            `target_modules=[]`.
+            List of module names or regex expression of the module names to replace with GraLoRA. For example, ['q',
+            'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. This can also be a wildcard 'all-linear'
+            which matches all linear/Conv1D (if the model is a PreTrainedModel, the output layer excluded). If not
+            specified, modules will be chosen according to the model architecture. If the architecture is not known,
+            an error will be raised -- in this case, you should specify the target modules manually. To avoid
+            targeting any modules (because you want to apply `target_parameters`), set `target_modules=[]`.
         gralora_alpha (`int`):
             GraLoRA alpha.
-            GraLoRA alpha is the scaling factor for the GraLoRA adapter.
-            Scale becomes gralora_alpha / (r + hybrid_r).
+            GraLoRA alpha is the scaling factor for the GraLoRA adapter. Scale becomes gralora_alpha / (r + hybrid_r).
         gralora_dropout (`float`):
-            GraLoRA dropout is the dropout probability for the GraLoRA adapter.
-            It is used to prevent overfitting and improve the generalization of the GraLoRA adapter.
+            GraLoRA dropout is the dropout probability for the GraLoRA adapter. It is used to prevent overfitting and
+            improve the generalization of the GraLoRA adapter.
         gralora_k (`int`):
-            GraLoRA k determines the number of subblocks in the GraLoRA adapter.
-            The rank r must be divisible by gralora_k for the GraLoRA adapter to be valid.
-            The total parameter count is preserved regardles of gralora_k.
-            The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each subblock is reduced by gralora_k.
-            gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is recommended for rank 64 or higher.
+            GraLoRA k determines the number of subblocks in the GraLoRA adapter. The rank r must be divisible by
+            gralora_k for the GraLoRA adapter to be valid. The total parameter count is preserved regardless of
+            gralora_k. The entire rank of the GraLoRA adapter is increased by gralora_k, while the rank of each
+            subblock is reduced by gralora_k. gralora_k=2 is recommended for rank 32 or lower, and gralora_k=4 is
+            recommended for rank 64 or higher.
         fan_in_fan_out (`bool`):
-            Set this to True if the layer to replace stores weight like (fan_in, fan_out).
-            For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
+            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
+            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
         bias (`str`):
-            Bias type for gralora. Can be 'none', 'all' or 'gralora_only'.
-            If 'all' or 'gralora_only', the corresponding biases will be updated during training.
-            Be aware that this means that, even when disabling the adapters, the model will not produce the same output as the base model would have without adaptation.
+            Bias type for gralora. Can be 'none', 'all' or 'gralora_only'. If 'all' or 'gralora_only', the
+            corresponding biases will be updated during training. Be aware that this means that, even when disabling
+            the adapters, the model will not produce the same output as the base model would have without adaptation.
         init_weights (`bool`):
-            Whether to initialize the weights of the GraLoRA layers with their default initialization.
-            Don't change this setting, except if you know exactly what you're doing.
+            Whether to initialize the weights of the GraLoRA layers with their default initialization. Don't change
+            this setting, except if you know exactly what you're doing.
         layers_to_transform (`Union[List[int], int]`):
-            The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list.
-            If a single integer is passed, PEFT will transform only the layer at this index.
-            This only works when target_modules is a list of str.
+            The layer indexes to transform. If this argument is specified, PEFT will transform only the layer indexes
+            that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at
+            this index. This only works when target_modules is a list of str.
         layers_pattern (`Optional[Union[List[str], str]]`):
-            The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern.
-            This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
+            The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is
+            not in the common layers pattern. This only works when target_modules is a list of str. This should target
+            the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
     """

     r: int = field(

From 351877f894693060f8a2eb75a5652f15b0e7cd63 Mon Sep 17 00:00:00 2001
From: "yeonjoon.jung"
Date: Mon, 27 Oct 2025 23:15:49 +0900
Subject: [PATCH 11/11] FIX CPU casting in GraLoRA get_delta_weight function

---
 src/peft/tuners/gralora/layer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/peft/tuners/gralora/layer.py b/src/peft/tuners/gralora/layer.py
index 4303669aa4..d6f78665f0 100644
--- a/src/peft/tuners/gralora/layer.py
+++ b/src/peft/tuners/gralora/layer.py
@@ -277,7 +277,9 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
         l_indices = torch.arange(in_features, device=device)
         n_indices = l_indices // (in_features // gralora_k)
         i_indices = l_indices % (in_features // gralora_k)
-        gralora_A_scattered = torch.zeros(in_features, gralora_k, gralora_rank, device=device, dtype=dtype)
+        gralora_A_scattered = torch.zeros(
+            in_features, gralora_k, gralora_rank, device=device, dtype=torch.float32 if cast_to_fp32 else dtype
+        )
         gralora_A_scattered.scatter_(
             1,
             n_indices.unsqueeze(1).unsqueeze(2).expand(-1, 1, gralora_rank),