Commit 19c5aa1

Fixed the CI for meta's llama
1 parent e922f14 commit 19c5aa1

File tree

6 files changed: +90 additions, -4 deletions

backends/qualcomm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@
 from .annotate_quant_attrs import AnnotateQuantAttrs
 from .annotate_stack import AnnotateStack
 from .annotate_unbind import AnnotateUnbind
+from .convert_bmm_to_matmul import ConvertBmmToMatmul
 from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d
 from .convert_square_to_pow import ConvertSquareToPow
 from .decompose_any import DecomposeAny
@@ -44,6 +45,7 @@
     AnnotateQuantAttrs,
     AnnotateStack,
     AnnotateUnbind,
+    ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
     ConvertSquareToPow,
     DecomposeAny,
backends/qualcomm/_passes/convert_bmm_to_matmul.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import operator
+from collections import Counter
+from typing import List
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
+
+
+class ConvertBmmToMatmul(ExportPass):
+    """
+    Replace bmm with matmul, because bmm is equal to matmul in QNN.
+    Handle the missing quantization tag for the bmm op.
+    """
+
+    view_copy = exir_ops.edge.aten.view_copy.default
+    expand_copy = exir_ops.edge.aten.expand_copy.default
+    clone = exir_ops.edge.aten.clone.default
+    bmm = exir_ops.edge.aten.bmm.default
+    matmul = exir_ops.edge.aten.matmul.default
+    patterns = [
+        {expand_copy: 2, view_copy: 3, bmm: 1},
+        {expand_copy: 2, view_copy: 3, bmm: 1, clone: 1},
+        {bmm: 1},
+    ]
+
+    def __init__(self):
+        super(ConvertBmmToMatmul, self).__init__()
+
+    def _get_ordered_inputs(
+        self, inputs: List[torch.fx.Node], output: torch.fx.Node
+    ) -> List[torch.fx.Node]:
+        bmm_inputs = []
+        for arg in output.args:
+            while arg not in inputs:
+                arg = arg.args[0]
+            bmm_inputs.append(arg)
+        return bmm_inputs
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        partitions = get_source_partitions(
+            graph,
+            [operator.matmul, torch.matmul, torch.bmm, torch.ops.aten.matmul.default],
+        )
+        for _, src_partitions in partitions.items():
+            for src_partition in src_partitions:
+                op_cnt = Counter([n.target for n in src_partition.nodes])
+                if op_cnt not in self.patterns:
+                    raise AssertionError(
+                        "Found a new pattern that needs to be converted to a matmul op"
+                    )
+
+                inputs = src_partition.input_nodes
+                bmm_node = [n for n in src_partition.nodes if n.target == self.bmm][0]
+                output = src_partition.output_nodes[0]
+                # The order of src_partition.input_nodes is not guaranteed.
+                lhs, rhs = self._get_ordered_inputs(inputs, bmm_node)
+                with graph_module.graph.inserting_before(output):
+                    # Replace bmm with matmul, because bmm is equal to matmul in QNN.
+                    matmul_node = graph.create_node(
+                        "call_function", self.matmul, (lhs, rhs)
+                    )
+                    matmul_node.meta = output.meta
+                    for user in output.users.copy():
+                        user.replace_input_with(output, matmul_node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)

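Two small facts the pass relies on can be sanity-checked in isolation: torch.matmul computes the same result as torch.bmm on batched 3-D inputs (the equivalence QNN exploits), and collections.Counter, being a dict subclass, compares equal to a plain dict with the same counts, which is what lets the partition's op histogram be matched against the dict literals in patterns. A minimal illustrative check, not part of the commit:

import torch
from collections import Counter

# bmm and matmul agree on 3-D (batched) inputs.
a, b = torch.randn(2, 3, 4), torch.randn(2, 4, 5)
assert torch.allclose(torch.bmm(a, b), torch.matmul(a, b))

# A Counter of partition op targets compares equal to a plain dict pattern.
assert Counter(["expand", "expand", "view", "view", "view", "bmm"]) == {
    "expand": 2,
    "view": 3,
    "bmm": 1,
}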
backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
     AnnotateQuantAttrs,
     AnnotateStack,
     AnnotateUnbind,
+    ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
     ConvertSquareToPow,
     DecomposeAny,
@@ -79,6 +80,7 @@ def get_capture_program_passes():
         (AnnotateQuantAttrs, True),
         (AnnotateStack, True),
         (AnnotateUnbind, True),
+        (ConvertBmmToMatmul, False),
         (ConvertConv1dToConv2d, True),
         (DecomposeAny, True),
         (DecomposeColIm, True),

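Each entry pairs a pass with a default activation flag, so ConvertBmmToMatmul is registered but stays dormant until a caller opts in (as export_llama_lib does below). A simplified sketch of that shape; the key name and build_passes_job helper are assumptions for illustration, not the real get_capture_program_passes:

from collections import OrderedDict

QCOM_PASS_ACTIVATE_KEY = "activate"  # assumed key name, for illustration only

def build_passes_job(defaults):
    # Map each pass to a settings dict carrying its activation flag.
    return OrderedDict((p, {QCOM_PASS_ACTIVATE_KEY: on}) for p, on in defaults)

passes_job = build_passes_job([("AnnotateQuantAttrs", True), ("ConvertBmmToMatmul", False)])
passes_job["ConvertBmmToMatmul"][QCOM_PASS_ACTIVATE_KEY] = True  # opt in explicitly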
backends/qualcomm/_passes/utils.py

Lines changed: 3 additions & 0 deletions
@@ -64,6 +64,7 @@ def get_passes_dependency_for_capture_program():
         AnnotateQuantAttrs,
         AnnotateStack,
         AnnotateUnbind,
+        ConvertBmmToMatmul,
         ConvertConv1dToConv2d,
         DecomposeAny,
         DecomposeColIm,
@@ -82,11 +83,13 @@ def get_passes_dependency_for_capture_program():
     return {
         AnnotateAdaptiveAvgPool1D: [RemoveRedundancy],
         AnnotateQuantAttrs: [
+            ConvertBmmToMatmul,
             RecomposePixelUnshuffle,
             RemoveRedundancy,
         ],
         AnnotateStack: [RemoveRedundancy],
         AnnotateUnbind: [RemoveRedundancy],
+        ConvertBmmToMatmul: [RecomposePixelUnshuffle],
         DecomposeAny: [RemoveRedundancy],
         DecomposeColIm: [FoldQDQ],
         DecomposeLinalgVectorNorm: [RemoveRedundancy],

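The table maps each pass to its prerequisites: ConvertBmmToMatmul must run after RecomposePixelUnshuffle, and AnnotateQuantAttrs after ConvertBmmToMatmul, so the matmul node already exists when quant attributes are annotated. A hedged sketch (not the real scheduler, and with abridged illustrative entries) of how such a table yields a pass ordering:

from graphlib import TopologicalSorter  # Python 3.9+

# Values are predecessors, so each pass's dependencies are scheduled first.
dep_table = {
    "AnnotateQuantAttrs": ["ConvertBmmToMatmul", "RecomposePixelUnshuffle", "RemoveRedundancy"],
    "ConvertBmmToMatmul": ["RecomposePixelUnshuffle"],
    "RecomposePixelUnshuffle": ["RemoveRedundancy"],
    "RemoveRedundancy": [],
}
print(list(TopologicalSorter(dep_table).static_order()))
# e.g. ['RemoveRedundancy', 'RecomposePixelUnshuffle', 'ConvertBmmToMatmul', 'AnnotateQuantAttrs']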
backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 5 additions & 4 deletions
@@ -292,14 +292,15 @@ def annotate_matmul(node: Node, quantization_config: QuantizationConfig):
     )
 
 def annotate_index_put(node: Node, quantization_config: QuantizationConfig) -> None:
-    input = node.args[0]
+    # Avoid annotating the input node because mutable buffers will be folded during the convert_pt2e process.
     value = node.args[2]
+
     input_qspec_map = {}
-    input_qspec_map[input] = quantization_config.input_activation
-    input_qspec_map[value] = SharedQuantizationSpec((input, node))
+    input_qspec_map[value] = quantization_config.input_activation
+
     node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
         input_qspec_map=input_qspec_map,
-        output_qspec=SharedQuantizationSpec((input, node)),
+        output_qspec=SharedQuantizationSpec((value, node)),
         _annotated=True,
     )
 

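For context, an exported aten.index_put node carries args of (input, indices, values): args[0] is the mutable buffer that convert_pt2e folds (hence it is no longer annotated), and args[2] is the values tensor that now receives the activation qspec and anchors the shared output spec. A small eager-mode illustration of that argument layout; the shapes here are made up:

import torch

# index_put_(indices, values): self is the mutable buffer (e.g. a KV cache),
# indices pick the slot, values supply the new data.
cache = torch.zeros(4, 8)
pos = torch.tensor([1])
new_row = torch.ones(1, 8)
cache.index_put_((pos,), new_row)  # equivalent to cache[pos] = new_row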
examples/models/llama/export_llama_lib.py

Lines changed: 2 additions & 0 deletions
@@ -914,6 +914,7 @@ def _to_edge_and_lower_llama(  # noqa: C901
     # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm._passes`
     from executorch.backends.qualcomm._passes import (
         AnnotateStack,
+        ConvertBmmToMatmul,
         FoldQDQ,
         RecomposeRmsNorm,
         TagQuantIO,
@@ -956,6 +957,7 @@ def _to_edge_and_lower_llama(  # noqa: C901
     passes_job = get_capture_program_passes()
     dep_table = get_passes_dependency_for_capture_program()
     passes_job[AnnotateStack][QCOM_PASS_ACTIVATE_KEY] = True
+    passes_job[ConvertBmmToMatmul][QCOM_PASS_ACTIVATE_KEY] = True
     passes_job[RecomposeRmsNorm][QCOM_PASS_ACTIVATE_KEY] = True
     passes_job[TagQuantIO][QCOM_PASS_ACTIVATE_KEY] = True
     passes_job[TagQuantIO][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY][

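To see the detection step the newly activated pass performs, here is a hedged standalone sketch (the toy module and shapes are assumptions) of how get_source_partitions locates a bmm in an exported graph before ConvertBmmToMatmul swaps in matmul:

import torch
from torch.fx.passes.utils.source_matcher_utils import get_source_partitions

class Attn(torch.nn.Module):
    def forward(self, q, k):
        # A bmm of the kind the llama attention path produces.
        return torch.bmm(q, k.transpose(1, 2))

ep = torch.export.export(Attn(), (torch.randn(1, 4, 8), torch.randn(1, 4, 8)))
parts = get_source_partitions(ep.module().graph, [torch.bmm])
print(parts)  # expected: one partition wrapping the aten.bmm node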