 }
-# from transformers
-class Conv1D(nn.Module):
-    """
-    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
-
-    Basically works like a linear layer but the weights are transposed.
-
-    Args:
-        nf (`int`): The number of output features.
-        nx (`int`): The number of input features.
-    """
-
-    def __init__(self, nf, nx):
-        super().__init__()
-        self.nf = nf
-        self.weight = nn.Parameter(torch.empty(nx, nf))
-        self.bias = nn.Parameter(torch.zeros(nf))
-        nn.init.normal_(self.weight, std=0.02)
-
-    def forward(self, x):
-        size_out = x.size()[:-1] + (self.nf,)
-        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-        x = x.view(size_out)
-        return x
-
-
 class HeadFFN(nn.Module):  # todo rename
     def __init__(self, dim):
         super().__init__()
-        self.c_fc = Conv1D(dim, config['n_embd'])
-        self.c_proj = Conv1D(config['n_embd'], dim)
+        self.c_fc = nn.Linear(config['n_embd'], dim)
+        self.c_proj = nn.Linear(dim, config['n_embd'])
         self.act = nn.functional.gelu

     def forward(self, hidden_states):
@@ -62,8 +36,8 @@ def __init__(self):
         self.head_dim = self.embed_dim // self.num_heads
         self.split_size = self.embed_dim

-        self.c_att = Conv1D(config['n_embd'] * 3, config['n_embd'])
-        self.c_proj = Conv1D(config['n_embd'], config['n_embd'])
+        self.c_att = nn.Linear(config['n_embd'], config['n_embd'] * 3)
+        self.c_proj = nn.Linear(config['n_embd'], config['n_embd'])

     def _split_heads(self, tensor, num_heads, attn_head_size):
         """
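
The second hunk also flips the argument order: `Conv1D` takes `(nf, nx)` while `nn.Linear` takes `(in_features, out_features)`, so both `c_att` versions project `n_embd` up to a fused width of `3 * n_embd`. A sketch of how such a fused output is typically split into query/key/value, consistent with the `split_size = embed_dim` attribute set in the hunk (the concrete `n_embd` value here is a hypothetical stand-in for `config['n_embd']`):

```python
import torch
import torch.nn as nn

n_embd = 64                            # stand-in for config['n_embd']
c_att = nn.Linear(n_embd, n_embd * 3)  # fused QKV projection, as in the diff

hidden_states = torch.randn(2, 10, n_embd)    # (batch, seq, n_embd)
qkv = c_att(hidden_states)                    # (batch, seq, 3 * n_embd)
query, key, value = qkv.split(n_embd, dim=2)  # each (batch, seq, n_embd)
```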