     Transformer,
     TransformerBlock,
 )
+from executorch.examples.models.llama.lora import LoRALinear
 from executorch.examples.models.llama.model_args import ModelArgs
 from executorch.examples.models.llama.rope import Rope

+from torchtune.models import convert_weights
+
 try:
     from .fairseq2 import convert_to_llama_checkpoint

@@ -37,6 +40,87 @@ def convert_to_llama_checkpoint(**kwargs):
 from ..model_base import EagerModelBase


+def construct_llm(model_args: ModelArgs) -> Transformer:
+    if model_args.attention_type not in ATTENTION_REGISTRY:
+        raise ValueError(
+            f"Unknown attention type: {model_args.attention_type}. "
+            f"Available: {list(ATTENTION_REGISTRY.keys())}"
+        )
+
+    rope = Rope(model_args)
+    layers = torch.nn.ModuleList()
+    cls = ATTENTION_REGISTRY[model_args.attention_type]
+
+    wq = (
+        LoRALinear(
+            in_dim=model_args.dim,
+            out_dim=model_args.n_heads * model_args.head_dim,
+            rank=model_args.r,  # todo
+            alpha=model_args.lora_alpha,  # todo
+            dropout=0.0,
+            use_bias=model_args.attention_qkv_bias,
+        )
+        if "q_proj" in model_args.target_modules
+        else (
+            torch.nn.Linear(
+                model_args.dim,
+                model_args.n_heads * model_args.head_dim,
+                bias=model_args.attention_qkv_bias,
+            )
+        )
+    )
+
+    wk = (
+        LoRALinear(
+            in_dim=model_args.dim,
+            out_dim=model_args.n_kv_heads * model_args.head_dim,
+            rank=model_args.r,  # todo
+            alpha=model_args.lora_alpha,  # todo
+            dropout=0.0,
+            use_bias=model_args.attention_qkv_bias,
+        )
+        if "k_proj" in model_args.target_modules
+        else (
+            torch.nn.Linear(
+                model_args.dim,
+                model_args.n_kv_heads * model_args.head_dim,
+                bias=model_args.attention_qkv_bias,
+            )
+        )
+    )
+    wv = (
+        LoRALinear(
+            in_dim=model_args.dim,
+            out_dim=model_args.n_kv_heads * model_args.head_dim,
+            rank=model_args.r,  # todo
+            alpha=model_args.lora_alpha,  # todo
+            dropout=0.0,
+            use_bias=model_args.attention_qkv_bias,
+        )
+        if "v_proj" in model_args.target_modules
+        else (
+            torch.nn.Linear(
+                model_args.dim,
+                model_args.n_kv_heads * model_args.head_dim,
+                bias=model_args.attention_qkv_bias,
+            )
+        )
+    )
+
+    # todo
+    wo = torch.nn.Linear(
+        model_args.n_heads * model_args.head_dim, model_args.dim, bias=False
+    )
+
+    for layer_id in range(model_args.n_layers):
+        attention = cls(model_args, layer_id, rope, wq, wk, wv, wo)
+        transformer_block = TransformerBlock(model_args, attention)
+        layers.append(transformer_block)
+
+    # Construct transformer model.
+    return Transformer(model_args, layers, rope)
+
+
 class Llama2Model(EagerModelBase):
     def __init__(self, **kwargs):
         resource_dir = get_default_model_resource_dir(__file__)
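For orientation, a minimal sketch of how the new construct_llm helper is driven, mirroring the change at the bottom of this diff. The ModelArgs values are illustrative placeholders; in practice they come from params.json plus the adapter config, and the LoRA fields (r, lora_alpha, target_modules) are assumed to be populated only when an adapter is supplied.

import torch

from executorch.examples.models.llama.model_args import ModelArgs

# Illustrative arguments only; real values are read from params.json and
# adapter_config.json. attention_type must be a key of ATTENTION_REGISTRY.
model_args = ModelArgs(
    dim=2048,
    n_layers=16,
    n_heads=32,
    n_kv_heads=8,
    vocab_size=128256,
    r=8,                                  # assumed LoRA rank field
    lora_alpha=16,                        # assumed LoRA scaling field
    target_modules=["q_proj", "v_proj"],  # projections that get LoRALinear
)

# Parameters are created on the meta device; real weights are loaded from
# the (base + adapter) checkpoint afterwards, as __init__ does below.
with torch.device("meta"):
    model = construct_llm(model_args)  # defined in this module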
@@ -49,6 +133,10 @@ def __init__(self, **kwargs):
         # Params file.
         params_path = kwargs.get("params", None)

+        # Adapter
+        adapter_checkpoint = kwargs.get("adapter_checkpoint", None)
+        adapter_config = kwargs.get("adapter_config", None)
+
         self.use_kv_cache = kwargs.get("use_kv_cache", False)
         self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False)
         self.generate_full_logits = kwargs.get("generate_full_logits", False)
@@ -132,6 +220,22 @@ def __init__(self, **kwargs):
         with open(params_path, "r") as f:
             params = json.loads(f.read())

+        # Get adapter checkpoint and config.
+        adapter_checkpoint = {}
+        adapter_config = {}
+        adapter_checkpoint_path = kwargs.get("adapter_checkpoint", None)
+        if adapter_checkpoint_path:
+            adapter_checkpoint = torch.load(
+                adapter_checkpoint_path, map_location=device, mmap=True
+            )
+            adapter_checkpoint = convert_weights.tune_to_meta(adapter_checkpoint)
+
+            adapter_config = kwargs.get("adapter_config", None)
+            with open(adapter_config, "r") as f:
+                adapter_config = json.loads(f.read())
+
+            checkpoint.update(adapter_checkpoint)
+
         output_prune_map = None
         if self.output_prune_map_path is not None:
             with open(self.output_prune_map_path, "r") as f:
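The adapter config loaded above is splatted into ModelArgs in the next hunk (**adapter_config), and construct_llm reads r, lora_alpha, and target_modules from it. Below is a hypothetical example of what json.loads would return for such a file; the exact keys and any extra fields depend on how the adapter was exported from torchtune.

# Hypothetical contents of the file passed as kwargs["adapter_config"].
adapter_config = {
    "r": 8,                                  # LoRA rank
    "lora_alpha": 16,                        # LoRA scaling factor
    "target_modules": ["q_proj", "v_proj"],  # projections swapped for LoRALinear
}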
@@ -156,6 +260,7 @@ def __init__(self, **kwargs):
             output_prune_map=output_prune_map,
             enable_dynamic_shape=self.enable_dynamic_shape,
             **params,
+            **adapter_config,
         )

         if model_args.use_scaled_rope:
@@ -177,23 +282,7 @@ def __init__(self, **kwargs):
         # They possess all other metadata a tensor carries such as size, stride, requires_grad.
         with torch.device("meta"):
             # Model itself is loaded in default dtype, fp32.
-
-            # Construct attention layers.
-            rope = Rope(model_args)
-            if model_args.attention_type not in ATTENTION_REGISTRY:
-                raise ValueError(
-                    f"Unknown attention type: {model_args.attention_type}. "
-                    f"Available: {list(ATTENTION_REGISTRY.keys())}"
-                )
-            layers = torch.nn.ModuleList()
-            cls = ATTENTION_REGISTRY[model_args.attention_type]
-            for layer_id in range(model_args.n_layers):
-                attention = cls(model_args, layer_id, rope)
-                transformer_block = TransformerBlock(model_args, attention)
-                layers.append(transformer_block)
-
-            # Construct transformer model.
-            self.model_ = Transformer(model_args, layers, rope)
+            self.model_ = construct_llm(model_args)

         # Get checkpoint dtype.
         if checkpoint:
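End to end, a hedged sketch of instantiating the eager model with the new adapter kwargs read in __init__ above. The paths are placeholders, and the name of the base-checkpoint kwarg is assumed here rather than taken from this diff.

model = Llama2Model(
    checkpoint="/path/to/consolidated.00.pth",         # base weights (kwarg name assumed)
    params="/path/to/params.json",
    adapter_checkpoint="/path/to/adapter_model.pt",     # torchtune LoRA weights
    adapter_config="/path/to/adapter_config.json",      # rank/alpha/target_modules
    use_kv_cache=True,
)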