
Commit e922f14

Qualcomm AI Engine Direct - Delegate mutable buffer and fix the mutable buffer issue
Summary:
- Add a parameter to support mutable buffer delegation in the QNN backend
- Set the same memory address for the I/O of a mutable buffer at runtime
- Avoid annotating the input node, since mutable buffers are folded during the convert_pt2e process
- Deprecate use_legacy_export in the ExecuTorch llama export
1 parent 44d2643 commit e922f14
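As a usage sketch (not part of this diff): with these changes, delegating a model whose state lives in a mutable buffer only requires exporting as usual and leaving the new flag at its default. Everything below except QnnPartitioner and skip_mutable_buffer — the toy module, the chipset choice, and the spec helpers — is illustrative and may differ across ExecuTorch versions.

import torch
from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)
from executorch.exir import to_edge_transform_and_lower

class KVCache(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("cache", torch.zeros(1, 8))  # becomes a buffer in the exported program

    def forward(self, x):
        self.cache.add_(x)  # in-place mutation -> buffers_to_mutate entry
        return self.cache.clone()

ep = torch.export.export(KVCache(), (torch.ones(1, 8),))
compiler_specs = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,  # illustrative target
    backend_options=generate_htp_compiler_spec(use_fp16=True),
)
# skip_mutable_buffer defaults to False, so the cache is delegated to QNN
partitioner = QnnPartitioner(compiler_specs, skip_mutable_buffer=False)
edge = to_edge_transform_and_lower(ep, partitioner=[partitioner])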

File tree

22 files changed: +357 −177 lines

backends/qualcomm/_passes/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -35,7 +35,6 @@
 from .remove_0d_tensor import Remove0DTensor
 from .remove_redundancy import RemoveRedundancy
 from .replace_arange_args import ReplaceArangeArgs
-from .replace_index_put_input import ReplaceIndexPutInput
 from .replace_inf_values import ReplaceInfValues
 from .tag_quant_io import TagQuantIO

@@ -72,7 +71,6 @@
     Remove0DTensor,
     RemoveRedundancy,
     ReplaceArangeArgs,
-    ReplaceIndexPutInput,
     ReplaceInfValues,
     TagQuantIO,
 ]

backends/qualcomm/_passes/insert_io_qdq.py

Lines changed: 8 additions & 2 deletions
@@ -9,7 +9,10 @@

 from executorch.backends.qualcomm.builders.node_visitor import q_ops

-from executorch.backends.qualcomm.builders.utils import is_parameter
+from executorch.backends.qualcomm.builders.utils import (
+    is_mutable_buffer_input,
+    is_parameter,
+)
 from executorch.backends.qualcomm.utils.constants import (
     QCOM_ENCODING,
     QCOM_QUANT_ATTRS,

@@ -124,7 +127,10 @@ def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
         if (
             n.op == "placeholder"
             and n.meta.get(QCOM_QUANT_ATTRS)
-            and not is_parameter(n, self.edge_program)
+            and (
+                not is_parameter(n, self.edge_program)
+                or is_mutable_buffer_input(n, self.edge_program)
+            )
         ):
             self._insert_quant_node(
                 graph_module, n, n.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING]

backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 8 additions & 3 deletions
@@ -40,7 +40,6 @@
     Remove0DTensor,
     RemoveRedundancy,
     ReplaceArangeArgs,
-    ReplaceIndexPutInput,
     ReplaceInfValues,
     TagQuantIO,
 )

@@ -92,7 +91,6 @@ def get_capture_program_passes():
         (RecomposeRmsNorm, False),
         (Remove0DTensor, True),
         (RemoveRedundancy, True),
-        (ReplaceIndexPutInput, True),
         (TagQuantIO, False),
     ]

@@ -224,4 +222,11 @@ def transform_for_preprocess_pipeline(self, exported_program: ExportedProgram):
         self.add_pass(LayoutTransform(exported_program, insert_permute=True))
         self.add_pass(FuseConsecutiveCast())
         self.add_pass(FuseConsecutiveTranspose())
-        return self._transform(exported_program.graph_module)
+        self._transform(exported_program.graph_module)
+        # Update inputs_to_buffers and buffers_to_mutate in the graph signature for mutable buffers.
+        # Since Q/DQ nodes are inserted at the I/O, output node names would otherwise fail to map back to their buffers.
+        exported_program._graph_signature = _get_updated_graph_signature(
+            exported_program.graph_signature,
+            exported_program.graph_module,
+        )
+        return exported_program.graph_module
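For context on why the signature refresh matters: buffers_to_mutate keys on the name of the output node that produces the mutated value, so once Q/DQ nodes are appended at the I/O, the recorded names go stale. A minimal sketch of the mapping being inspected (toy module; the printed node names depend on the torch version):

import torch

class Counter(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("count", torch.zeros(1))

    def forward(self, x):
        self.count.add_(x)  # mutation recorded in the graph signature
        return self.count.clone()

ep = torch.export.export(Counter(), (torch.ones(1),))
# Maps output-node name -> buffer FQN, e.g. {'add': 'count'}; inserting a
# Q/DQ node after `add` changes the graph output, so the signature must be
# re-derived from the transformed graph_module.
print(ep.graph_signature.buffers_to_mutate)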

backends/qualcomm/_passes/replace_index_put_input.py

Lines changed: 0 additions & 54 deletions
This file was deleted.

backends/qualcomm/_passes/utils.py

Lines changed: 1 addition & 3 deletions
@@ -76,7 +76,6 @@ def get_passes_dependency_for_capture_program():
        RecomposePixelUnshuffle,
        RecomposeRmsNorm,
        RemoveRedundancy,
-       ReplaceIndexPutInput,
        TagQuantIO,
    )

@@ -103,8 +102,7 @@ def get_passes_dependency_for_capture_program():
        ],
        RecomposePixelUnshuffle: [RemoveRedundancy],
        RecomposeRmsNorm: [RemoveRedundancy],
-       ReplaceIndexPutInput: [LayoutTransform],
-       TagQuantIO: [ReplaceIndexPutInput],
+       TagQuantIO: [LayoutTransform],
    }

backends/qualcomm/builders/node_visitor.py

Lines changed: 33 additions & 11 deletions
@@ -41,6 +41,8 @@
     get_parameter,
     is_graph_input,
     is_graph_output,
+    is_mutable_buffer_input,
+    is_mutable_buffer_output,
     is_parameter,
 )

@@ -307,7 +309,9 @@ def get_tensor_type(
         node: torch.fx.Node,
         tensor_type: PyQnnWrapper.Qnn_TensorType_t,
     ) -> PyQnnWrapper.Qnn_TensorType_t:
-        is_input = is_graph_input(node, self.edge_program)
+        is_input = is_graph_input(node, self.edge_program) or is_mutable_buffer_input(
+            node, self.edge_program
+        )
         is_output = is_graph_output(node)
         # handle logic for input/output tensors
         if is_input or is_output:

@@ -352,6 +356,33 @@ def get_dynamic_dimension(self, dims):

         return dynamic_dims if any(dynamic_dims) else [], nominal_dims

+    def get_tensor_name(
+        self,
+        node: torch.fx.Node,
+        wrapper_idx: int = 0,
+    ):
+        tensor_name = f"{node.name}_{wrapper_idx}"
+        # The `input_{id}` prefix is used for sorting at runtime; due to multiple passes in
+        # qnn_preprocess, the input order between QNN and the original graph's forward function may differ.
+        # The `mutbuf_{id}` prefix is used to map the I/O of a mutable buffer at runtime.
+        # The `output_` prefix identifies a graph output at runtime, to avoid confusion with per_tensor_dump.
+        if is_mutable_buffer_input(node, self.edge_program):
+            fqn = self.edge_program.graph_signature.inputs_to_buffers[node.target]
+            position_index = list(
+                self.edge_program.graph_signature.buffers_to_mutate.values()
+            ).index(fqn)
+            tensor_name = f"input_{str(self.external_ids[node])}_mutbuf_{str(position_index)}_{tensor_name}"
+        elif is_graph_input(node, self.edge_program):
+            tensor_name = f"input_{str(self.external_ids[node])}_{tensor_name}"
+        elif is_mutable_buffer_output(node, self.edge_program):
+            position_index = list(
+                self.edge_program.graph_signature.buffers_to_mutate.keys()
+            ).index(node.name)
+            tensor_name = f"output_mutbuf_{position_index}_{tensor_name}"
+        elif is_graph_output(node):
+            tensor_name = f"output_{tensor_name}"
+        return tensor_name
+
     def define_custom_tensor_wrapper(
         self,
         node_name: str,

@@ -413,16 +444,7 @@ def define_tensor(
         if cached := nodes_to_wrappers[node_name].get(wrapper_idx, None):
             return cached

-        tensor_name = f"{tensor_source_node.name}_{wrapper_idx}"
-        if is_graph_input(tensor_source_node, self.edge_program):
-            tensor_name = (
-                "input_"
-                + str(self.external_ids[tensor_source_node])
-                + "_"
-                + tensor_name
-            )
-        if is_graph_output(tensor_source_node):
-            tensor_name = "output_" + tensor_name
+        tensor_name = self.get_tensor_name(tensor_source_node, wrapper_idx)
         dims = torch.Size([1]) if len(tensor.size()) == 0 else tensor.size()
         dynamic_dims, nominal_dims = self.get_dynamic_dimension(dims)
         tensor_type = self.get_tensor_type(tensor_source_node, tensor_type)
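To make the naming scheme concrete, here is a hedged illustration (the node names are invented) of what get_tensor_name produces and how the shared mutbuf index lets the runtime pair a mutable buffer's input with its output tensor:

import re

# Hypothetical names for a graph with one user input and one mutable buffer:
#   input_0_x_0                 ordinary graph input, external id 0
#   input_1_mutbuf_0_b_cache_0  mutable-buffer input at mutbuf position 0
#   output_mutbuf_0_aten_add_0  mutated value written back to that buffer
#   output_aten_clone_0         ordinary graph output

def mutbuf_position(tensor_name: str):
    """Extract the mutbuf index the runtime uses to pair buffer I/O."""
    m = re.search(r"mutbuf_(\d+)", tensor_name)
    return int(m.group(1)) if m else None

# The input and output of the same buffer share position 0, so the runtime
# can bind both sides to the same memory address.
assert mutbuf_position("input_1_mutbuf_0_b_cache_0") == 0
assert mutbuf_position("output_mutbuf_0_aten_add_0") == 0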

backends/qualcomm/builders/node_visitor_manager.py

Lines changed: 4 additions & 2 deletions
@@ -13,7 +13,7 @@

 from .node_visitor import NodeVisitor
 from .op_custom_op import CustomOp
-from .utils import is_graph_input, is_graph_output
+from .utils import is_graph_input, is_graph_output, is_mutable_buffer_input


 # This will hold mapping of all node names to the visitor class

@@ -39,7 +39,9 @@ def generate_node_to_external_map(
         # The order in which we visit the placeholder nodes is the same as the *args
         # order for the forward(*args) signature for this gm. Use the order of
         # the nodes as the external_id to extract the right arg from *args at runtime.
-        if is_graph_input(node, edge_program):
+        if is_graph_input(node, edge_program) or is_mutable_buffer_input(
+            node, edge_program
+        ):
             node_to_external_map[node] = len(node_to_external_map)
     for node in edge_program.graph_module.graph.nodes:
         if is_graph_output(node):

backends/qualcomm/builders/op_index_put.py

Lines changed: 6 additions & 1 deletion
@@ -1,9 +1,10 @@
 from typing import Dict

 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
-
 import torch

+from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS
+
 from .node_visitor import NodeVisitor
 from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpScatterNd, QNN_OP_PACKAGE_NAME_QTI_AISW

@@ -22,6 +23,10 @@ def define_node(
         nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
     ) -> PyQnnWrapper.PyQnnOpWrapper:
         input_node = self.get_node(node.args[0])
+        # Because args[0] of the index_put op is not annotated, fill in its quant_attrs from this node.
+        if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS):
+            quant_attrs = quant_attrs.copy()
+            input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs
         input_tensor = self.get_tensor(input_node, node)
         input_tensor_wrapper = self.define_tensor(
             input_node,

backends/qualcomm/builders/utils.py

Lines changed: 34 additions & 1 deletion
@@ -75,6 +75,20 @@ def is_graph_input(
     return tensor.op == "placeholder" and not is_parameter(tensor, edge_program)


+def is_mutable_buffer_input(
+    tensor: torch.fx.Node, edge_program: torch.export.ExportedProgram
+) -> bool:
+    """
+    Check if the given tensor is a mutable buffer input
+    Args:
+        tensor: EdgeIR Tensor that is being checked for mutable buffer input
+    """
+    if tensor.op == "placeholder" and is_buffer(edge_program, tensor):
+        fqn = edge_program.graph_signature.inputs_to_buffers[tensor.target]
+        # the buffer is mutable if it is also recorded in buffers_to_mutate
+        return fqn in edge_program.graph_signature.buffers_to_mutate.values()
+
+
 def is_graph_output(node: torch.fx.Node) -> bool:
     """
     Check if the given tensor is used as a graph output

@@ -83,14 +97,33 @@ def is_graph_output(node: torch.fx.Node) -> bool:
     Args:
         tensor: EdgeIR Tensor that is being checked for graph output
     """
     for user in node.users.keys():
-        # getitem node is skiped, check the op_skip_ops.py
+        # getitem node is skipped, check the op_skip_ops.py
         if user.op == "output" or (
             user.target.__name__ == "getitem" and is_graph_output(user)
         ):
             return True
     return False


+def is_mutable_buffer_output(
+    tensor: torch.fx.Node, edge_program: torch.export.ExportedProgram
+) -> bool:
+    """
+    Check if the given tensor is a mutable buffer output
+    Args:
+        tensor: EdgeIR Tensor that is being checked for mutable buffer output
+    """
+    return (
+        any(
+            user.op == "output"
+            or user.target.__name__ == "getitem"
+            and is_graph_output(user)
+            for user in tensor.users.keys()
+        )
+        and tensor.name in edge_program.graph_signature.buffers_to_mutate.keys()
+    )
+
+
 def is_constant(
     tensor: torch.fx.Node, edge_program: torch.export.ExportedProgram
 ) -> bool:
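A quick sanity check of what these helpers key on, assuming a toy module with one mutated and one read-only buffer (names are illustrative): only the mutated buffer's FQN shows up in buffers_to_mutate, so is_mutable_buffer_input is True for its placeholder and False for the read-only one.

import torch
from executorch.backends.qualcomm.builders.utils import is_mutable_buffer_input

class TwoBuffers(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("state", torch.zeros(2))  # mutated below -> mutable
        self.register_buffer("table", torch.ones(2))   # read-only

    def forward(self, x):
        self.state.mul_(x)
        return self.state + self.table

ep = torch.export.export(TwoBuffers(), (torch.ones(2),))
for n in ep.graph_module.graph.nodes:
    if n.op == "placeholder":
        # expected: True for the `state` placeholder, False for `table` and x
        print(n.name, is_mutable_buffer_input(n, ep))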

backends/qualcomm/partition/qnn_partitioner.py

Lines changed: 16 additions & 1 deletion
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import copy
+import logging
 from collections import defaultdict
 from typing import Any, Callable, Dict, List, Optional, Tuple

@@ -29,7 +30,7 @@
     Partitioner,
     PartitionResult,
 )
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
 from torch.export.exported_program import ExportedProgram
 from torch.fx.passes.infra.partitioner import Partition
 from torch.fx.passes.operator_support import OperatorSupportBase

@@ -42,6 +43,9 @@
 )
 from .utils import filter_fn, generate_qnn_executorch_option, get_skip_decomp_table

+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+

 class QnnOperatorSupport(OperatorSupportBase):
     def __init__(

@@ -124,6 +128,7 @@ def __init__(
         compiler_specs: List[CompileSpec],
         skip_node_id_set: set = None,
         skip_node_op_set: set = None,
+        skip_mutable_buffer: bool = False,
     ):
         self.compiler_specs_snapshot = copy.deepcopy(compiler_specs)

@@ -133,6 +138,7 @@ def __init__(
         self.partition_tags: Dict[str, DelegationSpec] = {}
         self.skip_node_id_set = set() if skip_node_id_set is None else skip_node_id_set
         self.skip_node_op_set = set() if skip_node_op_set is None else skip_node_op_set
+        self.skip_mutable_buffer = skip_mutable_buffer

     def generate_partitions(
         self, edge_program: torch.export.ExportedProgram

@@ -178,6 +184,15 @@ def partition(self, edge_program: torch.export.ExportedProgram) -> PartitionResu
         if len(partitions) != 0:
             self.tag_nodes(partitions, edge_program)
             tag_constant_data(edge_program)
+            if not self.skip_mutable_buffer:
+                logger.info(
+                    "The QNN partitioner delegates torch mutable buffers and gives them the same I/O address at runtime, "
+                    "so if your model contains mutable buffers, "
+                    "you can get better performance with skip_mutable_buffer=False. "
+                    "If you encounter accuracy issues at runtime, "
+                    "please set `skip_mutable_buffer=True` and try again."
+                )
+                tag_mutated_buffer(edge_program)
         for node in edge_program.graph_module.graph.nodes:
             if hasattr(node, "meta"):
                 # pop certain keys in meta so they do not affect the passes during compilation
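Usage note, hedged: the flag is a performance/accuracy trade-off. Delegating the buffer keeps reads and writes inside the QNN graph with shared I/O addresses (no copy back through the ExecuTorch runtime); skipping delegation is the escape hatch if the quantized round-trip of the buffer hurts accuracy. Assuming compiler_specs built as in the sketch near the top of this page:

# Default: mutable buffers are tagged via tag_mutated_buffer() and delegated.
partitioner = QnnPartitioner(compiler_specs)

# Fallback if delegated mutable buffers introduce accuracy issues:
partitioner = QnnPartitioner(compiler_specs, skip_mutable_buffer=True)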
