@@ -27,7 +27,7 @@ def setUp(self):
         torch.manual_seed(0)
         # Constants
         self.embed_dim = 2048
-        self.num_heads = 32
+        self.num_heads = 8
         self.num_kv_heads = 8
         self.head_dim = 64
         self.max_seq_len = 128
@@ -46,7 +46,9 @@ def setUp(self):
             self.embed_dim, self.num_kv_heads * self.head_dim, bias=False
         )
         self.v_proj.weight.requires_grad = False
-        self.output_proj = torch.nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.output_proj = torch.nn.Linear(
+            self.num_heads * self.head_dim, self.embed_dim, bias=False
+        )
         self.pos_embeddings = Llama3ScaledRoPE(
             dim=self.head_dim,
             max_seq_len=self.max_seq_len,
@@ -92,6 +94,12 @@ def setUp(self):
             {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
             {0: torch.export.Dim.STATIC, 1: seq_len_dim},
         )
+        self.causal_mask = torch.tril(
+            torch.ones(
+                size=(self.max_seq_len, self.max_seq_len),
+                dtype=torch.bool,
+            )
+        )

     def test_attention_eager(self):
         et_res = self.et_mha(self.x, self.x)  # Self attention.
@@ -197,3 +205,35 @@ def test_attention_executorch(self):
         tt_res = self.tt_mha(self.x, self.x, input_pos=self.input_pos)

         assert_close(et_res[0], tt_res)
+
+    def test_attention_torch_cond_eager(self):
+        # Unlike vanilla torchtune MHA, we rewrite the `if` condition with torch.cond, so verify both give the same results.
+        # The first run of MHA provides `y` (self.x); for the second run `y` is a tensor full of nan.
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+
+        # mask
+        mask = self.causal_mask[self.input_pos, :]
+        # First run
+        et_res = self.et_mha(
+            self.x, self.x, mask=mask, input_pos=self.input_pos
+        )  # Self attention with input pos.
+        tt_res = self.tt_mha(
+            self.x, self.x, mask=mask, input_pos=self.input_pos
+        )  # Self attention with input pos.
+
+        self.assertTrue(torch.allclose(et_res, tt_res))
+
+        # Second run tests the kv cache read. Input pos is [10, 11, ..., 19].
+        next_input_pos = torch.arange(10, 20).unsqueeze(0)
+
+        empty_y = torch.full_like(self.x, torch.nan)
+        mask = self.causal_mask[next_input_pos, :]
+        et_res = self.et_mha(
+            self.x, empty_y, mask=mask, input_pos=next_input_pos
+        )  # Self attention with input pos.
+        tt_res = self.tt_mha(
+            self.x, None, mask=mask, input_pos=next_input_pos
+        )  # Self attention with input pos.
+
+        assert_close(et_res, tt_res)
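
The new test exercises the torch.cond rewrite of torchtune's `if y is None` branch in the ExecuTorch MHA: since torch.export cannot trace a Python-level None check, the caller passes a nan-filled tensor in place of None and the branch becomes a tensor-predicated torch.cond. Below is a minimal sketch of that pattern only; it is not the ExecuTorch MultiHeadAttention code, and the nan-placeholder predicate and the cached_fn/compute_fn names are assumptions made for illustration.

# Minimal, self-contained sketch of the torch.cond pattern (illustrative only;
# not the ExecuTorch MultiHeadAttention implementation).
import torch


def cached_fn(y: torch.Tensor) -> torch.Tensor:
    # "y is a nan placeholder" branch: stand-in for reading k/v from the cache.
    return torch.zeros_like(y)


def compute_fn(y: torch.Tensor) -> torch.Tensor:
    # "y is real input" branch: stand-in for projecting y and updating the cache.
    return y * 2.0


def forward(y: torch.Tensor) -> torch.Tensor:
    # Vanilla eager code would write `if y is None: ...`, which torch.export cannot trace.
    # Instead, a nan-filled tensor stands in for None and the branch is a tensor predicate.
    use_cache = torch.isnan(y).all()
    return torch.cond(use_cache, cached_fn, compute_fn, (y,))


x = torch.randn(1, 4)
print(forward(x))                               # takes compute_fn
print(forward(torch.full_like(x, torch.nan)))   # takes cached_fn

Both branches must return tensors with the same shape and dtype, which is why the test can feed a nan-filled `empty_y` to the ExecuTorch module and `None` to the torchtune module and still compare the outputs.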