
Commit 03b44fc

Fix tests
1 parent 416c91b commit 03b44fc

File tree

2 files changed (+85, -40 lines)


extension/llm/modules/mha.py

Lines changed: 1 addition & 0 deletions

@@ -354,6 +354,7 @@ def forward(
         q = q.transpose(1, 2)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
+
         output = self._attention_fn(
             q,
             k,

extension/llm/modules/test/test_mha.py

Lines changed: 84 additions & 40 deletions

@@ -7,12 +7,14 @@
 import unittest

 import torch
+from executorch.exir import EdgeCompileConfig, to_edge

 from executorch.extension.llm.modules.mha import (
     MultiHeadAttention as ETMultiHeadAttention,
 )
+from executorch.runtime import Runtime
+from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE
 from torchtune.modules.attention import MultiHeadAttention as TTMultiHeadAttention
-from torchtune.modules.kv_cache import KVCache


 torch.manual_seed(0)

@@ -21,76 +23,118 @@
 class AttentionTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
-        self.embed_dim=2048
-        self.num_heads=32
-        self.num_kv_heads=8
-        self.head_dim=64
+
+        # Constants
+        self.embed_dim = 2048
+        self.num_heads = 32
+        self.num_kv_heads = 8
+        self.head_dim = 64
         self.max_seq_len = 128
+        self.rope_base = 500_000
+        self.scale_factor = 32
+
+        # Module dependency injections.
+        self.q_proj = torch.nn.Linear(
+            self.embed_dim, self.num_heads * self.head_dim, bias=False
+        )
+        self.k_proj = torch.nn.Linear(
+            self.embed_dim, self.num_kv_heads * self.head_dim, bias=False
+        )
+        self.v_proj = torch.nn.Linear(
+            self.embed_dim, self.num_kv_heads * self.head_dim, bias=False
+        )
+        self.output_proj = torch.nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.pos_embeddings = Llama3ScaledRoPE(dim=self.head_dim, max_seq_len=self.max_seq_len, base=self.rope_base, scale_factor=self.scale_factor)
+
+        # Original TorchTune reference module to test accuracy against.
         self.tt_mha = TTMultiHeadAttention(
             embed_dim=self.embed_dim,
             num_heads=self.num_heads,
             num_kv_heads=self.num_kv_heads,
             head_dim=self.head_dim,
-            q_proj=torch.nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=False),
-            k_proj=torch.nn.Linear(self.embed_dim, self.num_kv_heads * self.head_dim, bias=False),
-            v_proj=torch.nn.Linear(self.embed_dim, self.num_kv_heads * self.head_dim, bias=False),
-            output_proj=torch.nn.Linear(self.embed_dim, self.embed_dim, bias=False),
-            # pos_embeddings=rope,
+            q_proj=self.q_proj,
+            k_proj=self.k_proj,
+            v_proj=self.v_proj,
+            output_proj=self.output_proj,
+            pos_embeddings=self.pos_embeddings,
             max_seq_len=self.max_seq_len,
-            # attn_dropout=attn_dropout,
         )
+
+        # Source transformed module that we are testing.
         self.et_mha = ETMultiHeadAttention(
             embed_dim=self.embed_dim,
             num_heads=self.num_heads,
             num_kv_heads=self.num_kv_heads,
             head_dim=self.head_dim,
-            q_proj=torch.nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=False),
-            k_proj=torch.nn.Linear(self.embed_dim, self.num_kv_heads * self.head_dim, bias=False),
-            v_proj=torch.nn.Linear(self.embed_dim, self.num_kv_heads * self.head_dim, bias=False),
-            output_proj=torch.nn.Linear(self.embed_dim, self.embed_dim, bias=False),
-            # pos_embeddings=rope,
+            q_proj=self.q_proj,
+            k_proj=self.k_proj,
+            v_proj=self.v_proj,
+            output_proj=self.output_proj,
+            pos_embeddings=self.pos_embeddings,
             max_seq_len=self.max_seq_len,
-            # attn_dropout=attn_dropout,
         )

-    def test_self_attention_eager(self):
+        # Common inputs.
         seq_len = 10
-        x = torch.randn(1, seq_len, self.embed_dim)
-        et_res = self.et_mha(x, x) # Self attention.
-        tt_res = self.tt_mha(x, x) # Self attention.
-
+        self.x = torch.randn(1, seq_len, self.embed_dim)
+        seq_len_dim = torch.export.Dim("seq_len", min=1, max=100)
+        self.dynamic_shapes = (
+            {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
+            {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
+        )
+
+    def test_attention_eager(self):
+        et_res = self.et_mha(self.x, self.x) # Self attention.
+        tt_res = self.tt_mha(self.x, self.x) # Self attention.
+
         self.assertTrue(torch.allclose(et_res, tt_res))

         # TODO: KV cache.
         # self.et_mha.setup_cache(1, dtype=torch.float16, max_seq_len=20)
         # self.tt_mha.setup_cache(1, dtype=torch.float16, max_seq_len=20)
-
-        # et_res = self.et_mha(x, x) # Self attention.
-        # tt_res = self.tt_mha(x, x) # Self attention.

-        # self.assertTrue(torch.allclose(et_res, tt_res))
+        # et_res = self.et_mha(self.x, self.x) # Self attention.
+        # tt_res = self.tt_mha(self.x, self.x) # Self attention.

-    def test_self_attention_export(self):
-        seq_len = 10
-        x = torch.randn(1, seq_len, self.embed_dim)
-        seq_len_dim = torch.export.Dim("seq_len", min=1, max=100)
-        dynamic_shapes = (
-            {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
-            {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
-        )
+        # self.assertTrue(torch.allclose(et_res, tt_res))

+    def test_attention_export(self):
         # Self attention.
         et_mha_ep = torch.export.export(
             self.et_mha,
-            (x, x),
+            (self.x, self.x),
             kwargs=None,
-            dynamic_shapes=dynamic_shapes,
+            dynamic_shapes=self.dynamic_shapes,
         )
-        et_res = et_mha_ep.module()(x, x)
-        tt_res = self.tt_mha(x, x)
+        et_res = et_mha_ep.module()(self.x, self.x)
+        tt_res = self.tt_mha(self.x, self.x)
         self.assertTrue(torch.allclose(et_res, tt_res))
-
+
         # TODO: KV cache.

-    def test_cross_attention_export(self):
+    def test_attention_aoti(self):
+        # TODO.
         pass
+
+    def test_attention_executorch(self):
+        # Self attention.
+        et_mha_ep = torch.export.export(
+            self.et_mha,
+            (self.x, self.x),
+            kwargs=None,
+            dynamic_shapes=self.dynamic_shapes,
+        )
+        et_program = to_edge(
+            et_mha_ep,
+            compile_config=EdgeCompileConfig(),
+        ).to_executorch()
+        runtime = Runtime.get()
+        program = runtime.load_program(et_program.buffer)
+        method = program.load_method("forward")
+        et_res = method.execute((self.x, self.x))
+        tt_res = self.tt_mha(self.x, self.x)
+
+        self.assertTrue(torch.allclose(et_res[0], tt_res, atol=1e-06))
+
+        # TODO: KV cache.
+
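
The new test_attention_executorch test walks the full torch.export → to_edge → to_executorch → Runtime path. Below is a minimal standalone sketch of that same round trip, reusing only APIs that appear in the diff above; the smaller dimensions, the second run at a different sequence length, and the printed shape are illustrative assumptions, not part of the commit.

# Hedged sketch of the export-and-run flow exercised by test_attention_executorch.
# The module configuration here (embed_dim=256, num_heads=8, etc.) is an
# illustrative assumption, not the values used in the test.
import torch
from executorch.exir import EdgeCompileConfig, to_edge
from executorch.extension.llm.modules.mha import MultiHeadAttention as ETMultiHeadAttention
from executorch.runtime import Runtime
from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE

embed_dim, num_heads, num_kv_heads, head_dim, max_seq_len = 256, 8, 2, 32, 128

mha = ETMultiHeadAttention(
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_kv_heads=num_kv_heads,
    head_dim=head_dim,
    q_proj=torch.nn.Linear(embed_dim, num_heads * head_dim, bias=False),
    k_proj=torch.nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
    v_proj=torch.nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
    output_proj=torch.nn.Linear(embed_dim, embed_dim, bias=False),
    pos_embeddings=Llama3ScaledRoPE(dim=head_dim, max_seq_len=max_seq_len),
    max_seq_len=max_seq_len,
)

# Export with a dynamic sequence-length dimension, mirroring the test's setUp.
x = torch.randn(1, 10, embed_dim)
seq_len_dim = torch.export.Dim("seq_len", min=1, max=100)
dynamic_shapes = (
    {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
    {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
)
ep = torch.export.export(mha, (x, x), kwargs=None, dynamic_shapes=dynamic_shapes)

# Lower to an ExecuTorch program and run it through the Python runtime.
et_program = to_edge(ep, compile_config=EdgeCompileConfig()).to_executorch()
runtime = Runtime.get()
method = runtime.load_program(et_program.buffer).load_method("forward")

# A different (still in-range) sequence length exercises the dynamic dimension.
y = torch.randn(1, 24, embed_dim)
out = method.execute((y, y))[0]
print(out.shape)  # expected: torch.Size([1, 24, 256])

Running the exported program at a sequence length other than the example input is exactly what the seq_len dynamic dimension allows; a length outside the declared [1, 100] range would be rejected by the exported program.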
