# limitations under the License.

import math
+from typing import Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from .config import TRANSFORMERS_MODEL_CONFIG


-class AdaptedAttention(nn.Module):
-    """This module wraps a LLamaAttention module and injects adaption prompts."""
+class _BaseAdaptedAttention(nn.Module):
+    """Base module, which defines adaption prompts for multiple model types."""

-    def __init__(self, model_type: str, adapter_len: int, model):
+    def __init__(self, model_type: str, adapter_len: int, model, target_dtype=torch.float32):
        """
        Initialize object.

@@ -34,31 +35,128 @@ def __init__(self, model_type: str, adapter_len: int, model):
            adapter_len: The length of the adaption prompt to insert.
            model: The original transformer attention module that is being wrapped.
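+            target_dtype: The dtype to use for the adaption prompt and gate parameters (defaults to torch.float32).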
3637 """
37- assert not isinstance (model , AdaptedAttention )
38+ if isinstance (model , _BaseAdaptedAttention ):
39+ raise ValueError ("Unable to stack multiple adaption prompts" )
3840 super ().__init__ ()
3941 self .model_type = model_type
4042 self .model = model
4143 self .adapter_len = adapter_len
4244 # Assume all parameters of the attention model we are wrapping are on the same device.
45+
4346 device = next (model .parameters ()).device
4447 # Don't think this was specified in the paper, but we follow the official repo which used an Embedding
4548 # which initializes the tokens with standard normal values.
4649 # https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L234
4750 # (bsz, adapter_len, hidden_size)
48- target_dtype = (
49- model .q_proj .weight .dtype if model .q_proj .weight .dtype not in [torch .int8 , torch .uint8 ] else torch .float32
50- )
51+
5152 if hasattr (self .model , "hidden_size" ):
5253 # TODO: remove this clause after 2026-01-01
5354 hidden_size = self .model .hidden_size
5455 else : # changed in https://github.com/huggingface/transformers/pull/35235
5556 hidden_size = self .model .config .hidden_size
57+
58+ if hasattr (self .model , "num_heads" ):
59+ # TODO: remove this clause after 2026-01-01
60+ self .num_heads = self .model .num_heads
61+ else : # changed in https://github.com/huggingface/transformers/pull/35235
62+ self .num_heads = self .model .config .num_attention_heads
63+
5664 self .adaption_prompt = nn .Parameter (
5765 torch .empty (1 , adapter_len , hidden_size , device = device , dtype = target_dtype ).normal_ ()
5866 )
5967 # Initialize the gate to 0 as this is "zero-init".
6068 self .adaption_gate = nn .Parameter (torch .zeros (1 , device = device , dtype = target_dtype ))
6169
70+
+class AdaptedAttentionGPT(_BaseAdaptedAttention):
+    """This module wraps a GPT2Attention module and injects adaption prompts."""
+
+    def __init__(self, model_type, adapter_len, model):
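+        # Match the dtype of the wrapped c_proj weight, falling back to fp32 for quantized
+        # (int8/uint8) weights, since the prompt embedding cannot be initialized in an integer dtype.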
+        target_dtype = (
+            model.c_proj.weight.dtype if model.c_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32
+        )
+        super().__init__(model_type, adapter_len, model, target_dtype=target_dtype)
+
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        layer_past: Optional[tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+        """
+        Forward pass for the adapter which wraps the GPT2Attention module.
+        """
+        attn_outputs = self.model(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+
+        attn_output = attn_outputs[0]
+        add_outputs = attn_outputs[1:]
+
+        c_attn_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer
+
+        bsz = attn_output.shape[0]
+        q_len = attn_output.shape[1]
+        embed_dim = attn_output.shape[2]
+
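+        # GPT-2 uses a single fused projection (c_attn) whose output has width 3 * embed_dim;
+        # splitting by embed_dim yields query, key and value, and only the prompt's key and value are needed.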
+        _, key, value = getattr(self.model, c_attn_layer)(self.adaption_prompt).split(embed_dim, dim=2)
+
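+        # Expand to (bsz, num_heads, adapter_len, head_dim) so the prompt keys/values match the
+        # per-head attention layout.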
+        adapter_k = (
+            key.view(1, self.adapter_len, self.num_heads, self.model.head_dim).repeat(bsz, 1, 1, 1).transpose(1, 2)
+        )
+        adapter_v = (
+            value.view(1, self.adapter_len, self.num_heads, self.model.head_dim).repeat(bsz, 1, 1, 1).transpose(1, 2)
+        )
+        # Recompute the query states since they are not returned by the GPT2 forward pass.
+        compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states
+        query_states = compute_query_states(
+            self.model, hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states
+        )
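+        # query_states has shape (bsz, num_heads, q_len, head_dim).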
+
+        previous_dtype = query_states.dtype
+
+        scores = torch.matmul(query_states, adapter_k.transpose(2, 3).to(previous_dtype)) / math.sqrt(
+            self.model.head_dim
+        )
+        # Upcast attention to fp32
+        # (bsz, num_heads, q_len, adapter_len)
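+        # The zero-initialized adaption_gate scales the adapter contribution, so training starts
+        # from the unmodified GPT-2 attention output.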
+        scores = self.adaption_gate * F.softmax(scores, dim=-1, dtype=torch.float32).to(previous_dtype)
+        # (bsz, q_len, num_heads * head_dim)
+        adapter_output = torch.matmul(scores, adapter_v).transpose(1, 2).reshape(bsz, q_len, -1)
+
+        # Add the adaption prompt output to the original output.
+        hidden_state = attn_output + adapter_output
+
+        # Restore the original dtype.
+        hidden_state = hidden_state.to(previous_dtype)
+
+        # Add the additional attention outputs (attentions and cross attentions).
+        output = (hidden_state,) + add_outputs
+        return output
+
+
+class AdaptedAttention(_BaseAdaptedAttention):
+    """This module wraps a LlamaAttention module and injects adaption prompts."""
+
+    def __init__(self, model_type, adapter_len, model):
+        target_dtype = (
+            model.q_proj.weight.dtype if model.q_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32
+        )
+        super().__init__(model_type, adapter_len, model, target_dtype=target_dtype)
+
    def forward(self, **kwargs):
        """
        Forward pass for the adapter which wraps the original LlamaAttention module.