Commit d69f1c1

LoRA experiment notes
1 parent 64959fd commit d69f1c1

2 files changed: +77 -28 lines


labml_nn/lora/experiment.py

Lines changed: 53 additions & 13 deletions

@@ -1,3 +1,16 @@
+"""
+---
+title: Finetune GPT-2 with LoRA
+summary: This is training code with notes for fine-tuning a pre-trained GPT-2 model with LoRA.
+---
+
+# Finetune GPT-2 with [LoRA](index.html)
+
+Here's a Colab notebook for fine-tuning GPT-2 with LoRA on the Tiny Shakespeare dataset.
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/lora/experiment.ipynb)
+"""
+
 import torch
 from labml import lab, monit, tracker
 from labml.configs import BaseConfigs, option
@@ -9,19 +22,31 @@
 from labml_nn.lora.gpt2 import GPTModel
 
 
-class Configs(BaseConfigs):
+class Trainer(BaseConfigs):
+    """
+    ## Trainer configurations and the training loop
+
+    The default configs can and will be overridden when we start the experiment
+    """
     device: torch.device = DeviceConfigs()
+
+    # GPT-2 configs
     layer_norm_epsilon: float = 1e-05
     n_embed: int = 768
     n_layer: int = 12
     n_positions: int = 1024
     vocab_size: int = 50257
+
+    # Training configs
     epochs: int = 10
     batch_size: int = 32
     learning_rate: float = 1e-4
     context_len: int = 512
-    r: int = 32
 
+    # LoRA rank
+    lora_r: int = 32
+
+    # Dataset
     text: TensorDataset = "tiny_shakespeare"
     tokenizer = AutoTokenizer.from_pretrained("gpt2")
     model: GPTModel
@@ -30,10 +55,15 @@ class Configs(BaseConfigs):
     data_loader: DataLoader
 
     def _load_pretrained_weights(self):
-        hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
+        """
+        ### Load pre-trained [GPT-2 from huggingface](https://huggingface.co/openai-community/gpt2)
+        """
 
+        # Load the huggingface model and get the parameters
+        hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
         state_dict = hf_model.state_dict()
 
+        # Transformer embedding and prediction layer parameter mapping (`hf: ours`)
         mapping = {
             'transformer.wte.weight': 'token_embedding.weight',
             'transformer.wpe.weight': 'position_embedding.weight',
@@ -42,6 +72,7 @@ def _load_pretrained_weights(self):
             'lm_head.weight': 'lm_head.weight'
         }
 
+        # Mapping (`hf: ours`) of decoder layers
         for i in range(12):
             mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
             mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
@@ -56,12 +87,13 @@ def _load_pretrained_weights(self):
             mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.c_proj.weight'
             mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.c_proj.bias'
 
+        # Move the parameters based on mapping
         new_state_dict = {}
         for old_key, new_key in mapping.items():
             if old_key in state_dict:
                 new_state_dict[new_key] = state_dict[old_key]
 
-        # transpose weight matrices of convo 1d layers to use linear layers instead
+        # Hugging Face's GPT-2 uses 1D convolution layers. We need to transpose those weights since we use linear layers
         convo_layers = ([f'blocks.{i}.ffn.c_fc.weight' for i in range(12)] +
                         [f'blocks.{i}.ffn.c_proj.weight' for i in range(12)] +
                         [f'blocks.{i}.attn.c_att.weight' for i in range(12)] +
@@ -70,29 +102,37 @@ def _load_pretrained_weights(self):
         for layer in convo_layers:
             new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)
 
+        # Load our model
         self.model.load_state_dict(new_state_dict, strict=False)  # state dict does not have lora weights
 
-        del hf_model
-        del state_dict
-        del new_state_dict
-
     def initialize(self):
+        """
+        ### Initialize the model, optimizer and dataloader
+        """
+        # Initialize the model
         self.model = GPTModel(
             layer_norm_epsilon=self.layer_norm_epsilon,
             n_embd=self.n_embed,
             n_layer=self.n_layer,
             n_positions=self.n_positions,
             vocab_size=self.vocab_size,
-            r=self.r,
-            device=self.device
-        ).to(self.device)
+            r=self.lora_r,
+        )
+        self.model.to(self.device)
+        # Load pre-trained model weights
         self._load_pretrained_weights()
 
+        # Initialize the optimizer
         self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
 
+        # Initialize the data loader
         self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)
 
     def run(self):
+        """
+        ### Training loop
+        """
+
         for _ in monit.loop(self.epochs):
             for i, batch in monit.enum('Train', self.data_loader):
                 inputs = batch[0]
@@ -117,8 +157,8 @@ def run(self):
             tracker.new_line()
 
 
-@option(Configs.text)
-def tiny_shakespeare(c: Configs):
+@option(Trainer.text)
+def tiny_shakespeare(c: Trainer):
     """
     ### Tiny Shakespeare dataset
 
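
A note on the transpose step in `_load_pretrained_weights` above: Hugging Face's GPT-2 stores its attention and FFN projections in Conv1D modules whose weights have shape (in_features, out_features), whereas torch.nn.Linear expects (out_features, in_features). The following standalone sketch is not part of the commit (the 768/3072 sizes are simply the usual GPT-2 FFN dimensions, assumed here for illustration); it shows why the transposed weight gives an equivalent linear layer:

import torch
import torch.nn as nn

# Shapes of a GPT-2 FFN projection, assumed here purely for illustration
in_features, out_features = 768, 3072
conv1d_weight = torch.randn(in_features, out_features)  # HF Conv1D stores (in, out)
x = torch.randn(4, in_features)

# Hugging Face's Conv1D computes x @ W (+ b)
y_conv = x @ conv1d_weight

# An equivalent nn.Linear needs the transposed weight, as in torch.transpose(..., 0, 1) above
linear = nn.Linear(in_features, out_features, bias=False)
with torch.no_grad():
    linear.weight.copy_(conv1d_weight.t())
y_linear = linear(x)

assert torch.allclose(y_conv, y_linear, atol=1e-5)
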
labml_nn/lora/gpt2.py

Lines changed: 24 additions & 15 deletions

@@ -6,7 +6,9 @@
 class FFN(nn.Module):
     def __init__(self, dim: int, n_embed: int, r: int):
         super().__init__()
+        # lin1
         self.c_fc = Linear(n_embed, dim, r=r, bias=True)
+        # lin2
         self.c_proj = Linear(dim, n_embed, r=r, bias=True)
         self.act = nn.functional.gelu
 
@@ -25,7 +27,9 @@ def __init__(self, n_embed: int, r: int):
         self.head_dim = self.embed_dim // self.num_heads
         self.split_size = self.embed_dim
 
+        # qkv
         self.c_att = Linear(n_embed, n_embed * 3, r=r, bias=True)
+        # out
         self.c_proj = Linear(n_embed, n_embed, r=r, bias=True)
 
     def _split_heads(self, tensor, num_heads, attn_head_size):
@@ -87,7 +91,7 @@ def forward(self, hidden_states):
 
 class GPTModel(nn.Module):
     def __init__(self, layer_norm_epsilon: float, n_embd: int, n_layer: int, n_positions: int,
-                 vocab_size: int, r: int, device: torch.device):
+                 vocab_size: int, r: int):
         super().__init__()
 
         self.token_embedding = Embedding(vocab_size, n_embd, r=r)
@@ -100,22 +104,27 @@ def __init__(self, layer_norm_epsilon: float, n_embd: int, n_layer: int, n_posit
 
         self.lm_head = Linear(n_embd, vocab_size, r=r, bias=False)
 
-        self.device = device
-
-    def forward(self, input_ids):
-        batch_size, input_shape = input_ids.size()
+    def forward(self, input_ids: torch.Tensor):
+        """
+        :param input_ids: has shape `[batch_size, seq_len]`
+        """
+        batch_size, seq_len = input_ids.shape
 
-        token_embeddings = self.token_embedding(input_ids)  # B T C
-        position_ids = torch.arange(input_shape, device=self.device)  # T C
-        position_embeddings = self.position_embedding(position_ids)  # B T C
+        # Get token embeddings
+        token_embeddings = self.token_embedding(input_ids)
+        # Get position ids
+        position_ids = torch.arange(seq_len, device=input_ids.device)[None, :]
+        # Get position embeddings
+        position_embeddings = self.position_embedding(position_ids)
 
-        hidden_states = token_embeddings + position_embeddings
+        # Add position embeddings
+        x = token_embeddings + position_embeddings
 
+        # Run through transformer blocks
         for block in self.blocks:
-            hidden_states = block(hidden_states)
-
-        hidden_states = self.final_norm(hidden_states)
-
-        logits = self.lm_head(hidden_states)
+            x = block(x)
 
-        return logits
+        # Final normalization
+        x = self.final_norm(x)
+        # Get logits from projection layer
+        return self.lm_head(x)
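
The Linear and Embedding layers used throughout gpt2.py come from labml_nn.lora and take the LoRA rank r; their implementation is not shown in this commit. For orientation only, here is a rough sketch of a LoRA linear layer in the standard formulation (a hypothetical LoRALinear, not the labml class): the pre-trained weight stays frozen and only the low-rank factors A and B are trained, with the update scaled by alpha / r.

import math

import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """Sketch of a LoRA linear layer: y = x W^T + b + (alpha / r) * x A^T B^T."""

    def __init__(self, in_features: int, out_features: int, r: int, bias: bool = True, alpha: int = None):
        super().__init__()
        # Frozen pre-trained weight and bias (would be filled from the GPT-2 checkpoint)
        self.weight = nn.Parameter(torch.zeros(out_features, in_features), requires_grad=False)
        self.bias = nn.Parameter(torch.zeros(out_features), requires_grad=False) if bias else None
        # Trainable low-rank factors; B starts at zero so the layer initially matches the base model
        self.lora_a = nn.Parameter(torch.empty(r, in_features))
        self.lora_b = nn.Parameter(torch.zeros(out_features, r))
        nn.init.kaiming_uniform_(self.lora_a, a=math.sqrt(5))
        self.scaling = (alpha if alpha is not None else r) / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Frozen base projection plus the scaled low-rank update B(A x)
        y = nn.functional.linear(x, self.weight, self.bias)
        return y + self.scaling * nn.functional.linear(nn.functional.linear(x, self.lora_a), self.lora_b)

With a layer like this, load_state_dict(..., strict=False) in experiment.py fills in weight and bias from the checkpoint while leaving the LoRA factors at their initial values, which is consistent with the "state dict does not have lora weights" comment above.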

0 commit comments
