Commit a666afa

Add a test to check functionality of ConvertMhaToSha
1 parent b199e88 commit a666afa

3 files changed: +102 −4 lines changed


backends/qualcomm/_passes/convert_bmm_to_matmul.py

Lines changed: 7 additions & 1 deletion
@@ -47,7 +47,13 @@ def call(self, graph_module: torch.fx.GraphModule):
         graph = graph_module.graph
         partitions = get_source_partitions(
             graph,
-            [operator.matmul, torch.matmul, torch.bmm, torch.ops.aten.matmul.default],
+            [
+                "matmul",
+                operator.matmul,
+                torch.matmul,
+                torch.bmm,
+                torch.ops.aten.matmul.default,
+            ],
         )
         for _, src_partitions in partitions.items():
             for src_partition in src_partitions:
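For context, get_source_partitions groups graph nodes by the source_fn_stack metadata that torch.export records, and depending on the PyTorch version that source may be recorded as the callable (e.g. torch.matmul) or as its string name ("matmul"), which is presumably why the string key is added here. A minimal standalone sketch (not part of this commit) that queries partitions with both forms:

import operator

import torch
from torch.fx.passes.utils.source_matcher_utils import get_source_partitions


class TinyMatmul(torch.nn.Module):
    def forward(self, a, b):
        return torch.matmul(a, b)


ep = torch.export.export(TinyMatmul(), (torch.randn(2, 3), torch.randn(3, 4)))
partitions = get_source_partitions(
    ep.graph_module.graph,
    # Same wanted-sources list as the pass above; whichever form the trace
    # recorded will match.
    ["matmul", operator.matmul, torch.matmul, torch.bmm, torch.ops.aten.matmul.default],
)
print(partitions.keys())  # expect one "matmul"-flavored partition key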

backends/qualcomm/_passes/convert_mha_to_sha.py

Lines changed: 2 additions & 2 deletions
@@ -126,7 +126,7 @@ def _get_attention_output(self, softmax):
         pattern_qk = [_is_softmax, "*", lambda x: _is_matmul(x) or _is_bmm(x)]
         qk = find_pattern(softmax, pattern_qk)
         if not qk:
-            return None, None
+            return None, None, None

         patterns_qkv = [
             _is_softmax,
@@ -139,7 +139,7 @@ def _get_attention_output(self, softmax):

         qkv = find_pattern(softmax, patterns_qkv, from_args=False)
         if qkv is None:
-            return None, None
+            return None, None, None

         permute, reshape = qkv[0][-2:]
         matmul = qkv[0][2]
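The arity fix matters because the success path of _get_attention_output yields three values; with the old two-value early returns, a caller that unpacks three names would raise ValueError on the not-found path instead of reaching its None check. A self-contained sketch of the failure mode (names illustrative, not from this file):

def get_output_sketch(found: bool):
    if not found:
        return None, None, None  # after the fix: arity matches the success path
    return "matmul", "permute", "reshape"

# With the old `return None, None`, this unpacking would raise
# "ValueError: not enough values to unpack" on the not-found path.
matmul, permute, reshape = get_output_sketch(False)
assert matmul is None  # callers can gate on the first value as before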

backends/qualcomm/tests/test_passes.py

Lines changed: 93 additions & 1 deletion
@@ -1,7 +1,15 @@
 import unittest

 import torch
-from executorch.backends.qualcomm._passes import InsertReshapeForReduceOps
+from executorch.backends.qualcomm._passes import (
+    ConvertBmmToMatmul,
+    ConvertMhaToSha,
+    InsertReshapeForReduceOps,
+    RemoveRedundancy,
+)
+
+from executorch.exir import to_edge
+from executorch.exir.dialects._ops import ops as exir_ops


 class TestPasses(unittest.TestCase):
@@ -49,6 +57,90 @@ def forward(self, x):
             torch.equal(*out, ref), f"Output mismatch: got {out}, expected {ref}"
         )

+    def test_mha_to_sha(self):
+        from executorch.backends.qualcomm.utils.utils import convert_linear_to_conv2d
+        from executorch.examples.models.llama.model_args import ModelArgs
+        from executorch.examples.qualcomm.oss_scripts.llama.masking_utils import (
+            CausalAttentionMask,
+        )
+        from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import (
+            LlamaAttention,
+        )
+
+        # Initialize model config
+        args = ModelArgs()
+        args.max_seq_len = 128
+        args.ar_len = 32
+        args.use_kv_cache = True
+        args.dim = 32
+        args.n_heads = 8
+        args.n_kv_heads = 8
+        args.n_layers = 2
+        args.head_dim = args.dim // args.n_heads
+        mod = convert_linear_to_conv2d(LlamaAttention(0, args, True))
+
+        # Prepare inputs
+        hidden_states = torch.randint(
+            low=0,
+            high=100,
+            size=(args.max_batch_size, args.ar_len, args.dim),
+            dtype=torch.float32,
+        )
+        freqs_cos = torch.randn(args.ar_len, 1)
+        freqs_sin = torch.randn(args.ar_len, 1)
+        atten_mask = CausalAttentionMask(
+            args.max_batch_size, args.ar_len, args.max_seq_len
+        )
+        k_cache = torch.zeros(
+            args.max_batch_size,
+            args.n_kv_heads,
+            args.head_dim,
+            args.max_seq_len - args.ar_len,
+        )
+
+        v_cache = torch.zeros(
+            args.max_batch_size,
+            args.n_kv_heads,
+            args.max_seq_len - args.ar_len,
+            args.head_dim,
+        )
+        sample_input = (
+            hidden_states,
+            freqs_cos,
+            freqs_sin,
+            atten_mask.mask,
+            k_cache,
+            v_cache,
+        )
+
+        # Export the module to the edge dialect
+        edge_program = to_edge(torch.export.export(mod, sample_input))
+        new_ep = edge_program.exported_program()
+
+        conv_nodes = [
+            n
+            for n in new_ep.graph.nodes
+            if n.target == exir_ops.edge.aten.convolution.default
+        ]
+        # WQ, WK, WV, O
+        self.assertTrue(len(conv_nodes) == 4, "Convolution nodes missing")
+
+        # Convert MHA to SHA
+        # A simplified version of the full pipeline, to test the core functionality
+        graph_module = RemoveRedundancy(quantization_capture=False)(
+            new_ep.graph_module
+        ).graph_module
+        graph_module = ConvertBmmToMatmul()(graph_module).graph_module
+        graph_module = ConvertMhaToSha(new_ep)(graph_module).graph_module
+
+        conv_nodes = [
+            n
+            for n in new_ep.graph.nodes
+            if n.target == exir_ops.edge.aten.convolution.default
+        ]
+        # Check graph structure: WQ, WK, WV should be converted to SHA
+        self.assertTrue(len(conv_nodes) == 25, "Convolution nodes should be split")
+

 if __name__ == "__main__":
     unittest.main()
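The expected counts follow from the attention layout: before the pass, the four projections (WQ, WK, WV, O) each lower to one convolution; afterwards, assuming ConvertMhaToSha splits only the Q/K/V projections into one single-head convolution per head and leaves the output projection fused (inferred from the test's assertion, not stated in the diff), 8 heads * 3 projections + 1 = 25. A quick sanity check of that arithmetic:

n_heads = 8  # matches args.n_heads in the test above
split_projections = 3  # WQ, WK, WV each become one conv per head
assert n_heads * split_projections + 1 == 25  # +1: the O projection stays whole

The test should also be runnable on its own, e.g. with python -m unittest backends.qualcomm.tests.test_passes from the executorch repo root, assuming the package layout resolves.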
