Skip to content

Commit 1774308

Browse files
authored
Merge branch 'main' into sqrt_tensor
2 parents 56555a5 + 121714a commit 1774308

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+769
-324
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ It supports a wide range of models including LLMs (Large Language Models), CV (C
1919
Platform Support:
2020
- Operating Systems:
2121
- iOS
22-
- Mac
22+
- MacOS (ARM64)
2323
- Android
2424
- Linux
2525
- Microcontrollers

backends/arm/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ if(NOT EXECUTORCH_ROOT)
1212
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
1313
endif()
1414

15+
add_compile_options("-Wall" "-Werror")
16+
1517
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
1618

1719
set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)

backends/cadence/aot/fuse_ops.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -712,32 +712,14 @@ def _create_requantize_node(
712712
out_dtype: torch.dtype,
713713
graph: torch.fx.Graph,
714714
) -> torch.fx.Node:
715-
in_scale_tensor = graph.call_function(
716-
exir_ops.edge.aten.full.default, args=((1,), in_scale)
717-
)
718-
in_zero_point_tensor = graph.call_function(
719-
exir_ops.edge.aten.full.default,
720-
args=((1,), in_zero_point),
721-
kwargs={"dtype": torch.int32},
722-
)
723-
out_scale_tensor = graph.call_function(
724-
exir_ops.edge.aten.full.default, args=((1,), out_scale)
725-
)
726-
out_zero_point_tensor = graph.call_function(
727-
exir_ops.edge.aten.full.default,
728-
args=((1,), out_zero_point),
729-
kwargs={"dtype": torch.int32},
730-
)
731-
# cadence::requantize(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, ScalarType out_dtype) -> Tensor Y
732-
# TODO(hardiksharma): Add support for per-tensor requantize.
733715
return graph.call_function(
734-
exir_ops.edge.cadence.requantize.default,
716+
exir_ops.edge.cadence.requantize.per_tensor,
735717
args=(
736718
in_tensor,
737-
in_scale_tensor,
738-
in_zero_point_tensor,
739-
out_scale_tensor,
740-
out_zero_point_tensor,
719+
in_scale,
720+
in_zero_point,
721+
out_scale,
722+
out_zero_point,
741723
out_dtype,
742724
),
743725
)

backends/cadence/aot/remove_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ def call_operator(
447447
kwargs: dict[str, Argument],
448448
meta: NodeMetadata,
449449
) -> ProxyValue:
450-
if op != exir_ops.edge.cadence.requantize.default:
450+
if op != exir_ops.edge.cadence.requantize.per_tensor:
451451
return super().call_operator(op, args, kwargs, meta)
452452

453453
# Parse the args

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def test_force_quant_dequant_fusion(self) -> None:
306306
# Verify that dequant/quant pair was replaced with requantize.
307307
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
308308
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
309-
exir_ops.edge.cadence.requantize.default: 1,
309+
exir_ops.edge.cadence.requantize.per_tensor: 1,
310310
},
311311
)
312312

@@ -336,7 +336,7 @@ def test_no_replace_quant_permute_dequant_with_requantize(self) -> None:
336336
# quantize -> permute -> dequantize should not be replaced with requantize.
337337
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
338338
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 1,
339-
exir_ops.edge.cadence.requantize.default: 0,
339+
exir_ops.edge.cadence.requantize.per_tensor: 0,
340340
},
341341
)
342342

@@ -364,7 +364,7 @@ def test_replace_quant_view_dequant_with_requantize(self) -> None:
364364
# Verify that dequant/quant pair was replaced with requantize.
365365
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
366366
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
367-
exir_ops.edge.cadence.requantize.default: 1,
367+
exir_ops.edge.cadence.requantize.per_tensor: 1,
368368
},
369369
)
370370

@@ -390,7 +390,7 @@ def test_replace_dequant_quant_with_requantize(self) -> None:
390390
# Verify that dequant -> quant was replaced with requantize.
391391
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
392392
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
393-
exir_ops.edge.cadence.requantize.default: 1,
393+
exir_ops.edge.cadence.requantize.per_tensor: 1,
394394
},
395395
)
396396

@@ -420,7 +420,7 @@ def test_replace_dequant_permute_quant_with_requantize(self) -> None:
420420
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
421421
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
422422
exir_ops.edge.aten.permute_copy.default: 1,
423-
exir_ops.edge.cadence.requantize.default: 1,
423+
exir_ops.edge.cadence.requantize.per_tensor: 1,
424424
},
425425
)
426426

backends/cadence/aot/tests/test_reorder_ops_passes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def test_advance_branched_quantize(self) -> None:
217217
self.assertEqual(
218218
count_node(
219219
graph_module,
220-
exir_ops.edge.cadence.requantize.default,
220+
exir_ops.edge.cadence.requantize.per_tensor,
221221
),
222222
1,
223223
)

backends/qualcomm/_passes/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .annotate_quant_attrs import AnnotateQuantAttrs
99
from .annotate_stack import AnnotateStack
1010
from .annotate_unbind import AnnotateUnbind
11+
from .convert_bmm_to_matmul import ConvertBmmToMatmul
1112
from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d
1213
from .convert_square_to_pow import ConvertSquareToPow
1314
from .decompose_any import DecomposeAny
@@ -35,7 +36,6 @@
3536
from .remove_0d_tensor import Remove0DTensor
3637
from .remove_redundancy import RemoveRedundancy
3738
from .replace_arange_args import ReplaceArangeArgs
38-
from .replace_index_put_input import ReplaceIndexPutInput
3939
from .replace_inf_values import ReplaceInfValues
4040
from .tag_quant_io import TagQuantIO
4141

@@ -45,6 +45,7 @@
4545
AnnotateQuantAttrs,
4646
AnnotateStack,
4747
AnnotateUnbind,
48+
ConvertBmmToMatmul,
4849
ConvertConv1dToConv2d,
4950
ConvertSquareToPow,
5051
DecomposeAny,
@@ -72,7 +73,6 @@
7273
Remove0DTensor,
7374
RemoveRedundancy,
7475
ReplaceArangeArgs,
75-
ReplaceIndexPutInput,
7676
ReplaceInfValues,
7777
TagQuantIO,
7878
]
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Copyright (c) Qualcomm Innovation Center, Inc.
2+
# All rights reserved
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
import operator
7+
from collections import Counter
8+
from typing import List
9+
10+
import torch
11+
from executorch.exir.dialects._ops import ops as exir_ops
12+
from executorch.exir.pass_base import ExportPass, PassResult
13+
from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
14+
15+
16+
class ConvertBmmToMatmul(ExportPass):
17+
"""
18+
Replace bmm to matmul, because bmm is eqaul to matmul in QNN.
19+
Handle missing quantization tag for bmm op.
20+
"""
21+
22+
view_copy = exir_ops.edge.aten.view_copy.default
23+
expand_copy = exir_ops.edge.aten.expand_copy.default
24+
clone = exir_ops.edge.aten.clone.default
25+
bmm = exir_ops.edge.aten.bmm.default
26+
matmul = exir_ops.edge.aten.matmul.default
27+
patterns = [
28+
{expand_copy: 2, view_copy: 3, bmm: 1},
29+
{expand_copy: 2, view_copy: 3, bmm: 1, clone: 1},
30+
{bmm: 1},
31+
]
32+
33+
def __init__(self):
34+
super(ConvertBmmToMatmul, self).__init__()
35+
36+
def _get_ordered_inputs(
37+
self, inputs: List[torch.fx.Node], output: torch.fx.Node
38+
) -> List[torch.fx.Node]:
39+
bmm_inputs = []
40+
for arg in output.args:
41+
while arg not in inputs:
42+
arg = arg.args[0]
43+
bmm_inputs.append(arg)
44+
return bmm_inputs
45+
46+
def call(self, graph_module: torch.fx.GraphModule):
47+
graph = graph_module.graph
48+
partitions = get_source_partitions(
49+
graph,
50+
[operator.matmul, torch.matmul, torch.bmm, torch.ops.aten.matmul.default],
51+
)
52+
for _, src_partitions in partitions.items():
53+
for src_partition in src_partitions:
54+
op_cnt = Counter([n.target for n in src_partition.nodes])
55+
if op_cnt not in self.patterns:
56+
raise AssertionError(
57+
"Found a new pattern that needs to be converted to a linear op"
58+
)
59+
60+
inputs = src_partition.input_nodes
61+
bmm_node = [n for n in src_partition.nodes if n.target == self.bmm][0]
62+
output = src_partition.output_nodes[0]
63+
# the order of src_partition.inputs is not guaranteed.
64+
lhs, rhs = self._get_ordered_inputs(inputs, bmm_node)
65+
with graph_module.graph.inserting_before(output):
66+
# replace bmm with matmul, because bmm is equal to matmul in qnn.
67+
matmul_node = graph.create_node(
68+
"call_function", self.matmul, (lhs, rhs)
69+
)
70+
matmul_node.meta = output.meta
71+
for user in output.users.copy():
72+
user.replace_input_with(output, matmul_node)
73+
74+
graph.eliminate_dead_code()
75+
graph_module.recompile()
76+
return PassResult(graph_module, True)

backends/qualcomm/_passes/insert_io_qdq.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99

1010
from executorch.backends.qualcomm.builders.node_visitor import q_ops
1111

12-
from executorch.backends.qualcomm.builders.utils import is_parameter
12+
from executorch.backends.qualcomm.builders.utils import (
13+
is_mutable_buffer_input,
14+
is_parameter,
15+
)
1316
from executorch.backends.qualcomm.utils.constants import (
1417
QCOM_ENCODING,
1518
QCOM_QUANT_ATTRS,
@@ -124,7 +127,10 @@ def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
124127
if (
125128
n.op == "placeholder"
126129
and n.meta.get(QCOM_QUANT_ATTRS)
127-
and not is_parameter(n, self.edge_program)
130+
and (
131+
not is_parameter(n, self.edge_program)
132+
or is_mutable_buffer_input(n, self.edge_program)
133+
)
128134
):
129135
self._insert_quant_node(
130136
graph_module, n, n.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING]

backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
AnnotateQuantAttrs,
1414
AnnotateStack,
1515
AnnotateUnbind,
16+
ConvertBmmToMatmul,
1617
ConvertConv1dToConv2d,
1718
ConvertSquareToPow,
1819
DecomposeAny,
@@ -40,7 +41,6 @@
4041
Remove0DTensor,
4142
RemoveRedundancy,
4243
ReplaceArangeArgs,
43-
ReplaceIndexPutInput,
4444
ReplaceInfValues,
4545
TagQuantIO,
4646
)
@@ -80,6 +80,7 @@ def get_capture_program_passes():
8080
(AnnotateQuantAttrs, True),
8181
(AnnotateStack, True),
8282
(AnnotateUnbind, True),
83+
(ConvertBmmToMatmul, False),
8384
(ConvertConv1dToConv2d, True),
8485
(DecomposeAny, True),
8586
(DecomposeColIm, True),
@@ -92,7 +93,6 @@ def get_capture_program_passes():
9293
(RecomposeRmsNorm, False),
9394
(Remove0DTensor, True),
9495
(RemoveRedundancy, True),
95-
(ReplaceIndexPutInput, True),
9696
(TagQuantIO, False),
9797
]
9898

@@ -224,4 +224,11 @@ def transform_for_preprocess_pipeline(self, exported_program: ExportedProgram):
224224
self.add_pass(LayoutTransform(exported_program, insert_permute=True))
225225
self.add_pass(FuseConsecutiveCast())
226226
self.add_pass(FuseConsecutiveTranspose())
227-
return self._transform(exported_program.graph_module)
227+
self._transform(exported_program.graph_module)
228+
# Update inputs_to_buffers and buffers_to_mutate in graph signature for mutable buffer
229+
# Since Q/DQ nodes will be inserted at the I/O, mapping output node names to buffers would otherwise fail
230+
exported_program._graph_signature = _get_updated_graph_signature(
231+
exported_program.graph_signature,
232+
exported_program.graph_module,
233+
)
234+
return exported_program.graph_module

0 commit comments

Comments
 (0)