
Commit 4e29bc9

Milestone2.2: Optimize transposes in XNNPACK partition by removing redundant to_copy ops (#11316)
### Summary

Optimize transposes in the XNNPACK partition by adding a new remove_redundant_copy_pass that checks for dim order conversion ops that cancel each other out. The pass handles both non-quantized and quantized graphs; in the quantized case, the conversion nodes and the q/dq nodes wrapping them are removed as well. I also refactored the channels_last_tagged_reshape_pass code by modularizing some functions and adding setter/getter helpers. This change improves runtime speed and memory by not executing redundant to_copy ops that would otherwise remain in the graph.

### Test plan

Created a TestChannelsLastTaggedReshapePass class which constructs graphs containing multiple redundant to_copy ops, in different positions and in both quantized and non-quantized graphs. These redundant ops are either written explicitly or generated by other passes. I asserted their removal after the passes finished.
1 parent 3950872 commit 4e29bc9
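For intuition, here is a minimal standalone sketch (an illustration, not code from this commit) of the kind of canceling dim order round trip the new pass deletes:

```python
import torch

# Two opposite memory-format conversions cancel each other. Once
# ChannelsLastTaggedReshapePass has tagged each node's dim order, the new
# RemoveRedundantCopyPass can drop both _to_copy ops from the graph.
x = torch.randn(1, 3, 8, 8)                      # NCHW, contiguous
y = x.to(memory_format=torch.channels_last)      # to_copy 1 (NCHW -> NHWC)
z = y.to(memory_format=torch.contiguous_format)  # to_copy 2 (NHWC -> NCHW)
assert torch.equal(x, z)  # the round trip is a value-level no-op
```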

File tree

4 files changed: +432 −15 lines changed


backends/xnnpack/_passes/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -23,6 +23,9 @@
 from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass
 from executorch.backends.xnnpack._passes.fuse_batch_norm import FuseBatchNormPass
 from executorch.backends.xnnpack._passes.prelu_reshape_pass import PReLUReshapePass
+from executorch.backends.xnnpack._passes.remove_redundant_copy_pass import (
+    RemoveRedundantCopyPass,
+)
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass

 from executorch.exir.pass_base import ExportPass
@@ -65,6 +68,7 @@ def __init__(
                 Conv1dUnsqueezePass,
                 PReLUReshapePass,
                 ChannelsLastTaggedReshapePass,
+                RemoveRedundantCopyPass,
             ]
         else:
             self.passes = passes

backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py

Lines changed: 88 additions & 15 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+from enum import Enum
 from typing import Optional, Tuple

 import torch
@@ -19,6 +20,11 @@
 from executorch.exir.pass_base import PassResult


+class InputDimOrder(Enum):
+    NCHW = 1
+    NHWC = 2
+
+
 # TODO(T151254305) use subgraph_rewriter
 class ChannelsLastTaggedReshapePass(XNNPACKPass):
     """
@@ -83,17 +89,49 @@ class ChannelsLastTaggedReshapePass(XNNPACKPass):
     # is done
     PARTNER_NODE = "XNN_CHANNELS_LAST_TAGGED_RESHAPE_PARTNER_NODE"

-    def mark_as_nhwc_node(self, node: torch.fx.Node) -> None:
+    @staticmethod
+    def mark_as_nhwc_node(node: torch.fx.Node) -> None:
         node.meta[ChannelsLastTaggedReshapePass.XNN_NHWC_NODE] = True

-    def mark_as_nchw_node(self, node: torch.fx.Node) -> None:
+    @staticmethod
+    def mark_as_nchw_node(node: torch.fx.Node) -> None:
         node.meta[ChannelsLastTaggedReshapePass.XNN_NHWC_NODE] = False

-    def is_nhwc_node(self, node: torch.fx.Node) -> bool:
+    def tag_node(self, node: torch.fx.Node) -> None:
+        if node.kwargs["memory_format"] == torch.channels_last:
+            self.mark_as_nhwc_node(node)
+        else:
+            self.mark_as_nchw_node(node)
+
+    @staticmethod
+    def is_nhwc_node(node: torch.fx.Node) -> bool:
+        if is_dequant(node) and len(node.all_input_nodes) > 0:
+            quantize_node = node.args[0]
+            if len(quantize_node.all_input_nodes) > 0:
+                actual_node = quantize_node.args[0]
+                if actual_node.op == "placeholder":
+                    return not actual_node.meta["val"][0].is_contiguous()
+                else:
+                    return actual_node.meta.get(
+                        ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False
+                    )
+
         return node.meta.get(ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False)

-    def is_nchw_node(self, node: torch.fx.Node) -> bool:
-        return not self.is_nhwc_node(node)
+    @staticmethod
+    def is_nchw_node(node: torch.fx.Node) -> bool:
+        if is_dequant(node) and len(node.all_input_nodes) > 0:
+            quantize_node = node.args[0]
+            if len(quantize_node.all_input_nodes) > 0:
+                actual_node = quantize_node.args[0]
+                if actual_node.op == "placeholder":
+                    return actual_node.meta["val"][0].is_contiguous()
+                else:
+                    return not actual_node.meta.get(
+                        ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False
+                    )
+
+        return not ChannelsLastTaggedReshapePass.is_nhwc_node(node)

     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
         return node.target in self.memory_sensitive_ops_nhwc
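The tagging convention these helpers implement is simple metadata on FX nodes. A small standalone sketch (the real key is the class attribute XNN_NHWC_NODE, whose string value is defined outside this hunk; the traced module here is arbitrary):

```python
import torch
import torch.fx

XNN_NHWC_NODE = "XNN_NHWC_NODE"  # stand-in for the class attribute's actual value

gm = torch.fx.symbolic_trace(torch.nn.Conv2d(3, 8, 3))
conv = next(n for n in gm.graph.nodes if n.op == "call_module")

conv.meta[XNN_NHWC_NODE] = True                 # mark_as_nhwc_node
assert conv.meta.get(XNN_NHWC_NODE, False)      # is_nhwc_node, simple case
conv.meta[XNN_NHWC_NODE] = False                # mark_as_nchw_node
assert not conv.meta.get(XNN_NHWC_NODE, False)  # is_nchw_node, simple case
```

The dequantize special case above exists because a dq node carries no useful tag of its own: the check looks through the wrapping q/dq pair and classifies the dequantize by the node that actually produced the tensor (or by contiguity, if that producer is a placeholder).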
@@ -111,7 +149,7 @@ def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         is_nchw_constant = (
             is_param_node(self.exported_program, node)
            and (ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in node.meta)
-            and (self.is_nchw_node(node))
+            and (ChannelsLastTaggedReshapePass.is_nchw_node(node))
         )
         return is_4d and not is_nchw_constant

@@ -273,6 +311,22 @@ def insert_copy_and_assign_partner_nodes_quantization_sensitive(
         # in that case
         self.make_partners(original_input, copy_node)

+    def input_dim_order(
+        self, input_node: torch.fx.Node, input_order: InputDimOrder
+    ) -> bool:
+        if input_node.op == "placeholder":
+            return (
+                input_node.meta["val"].is_contiguous()
+                if input_order == InputDimOrder.NCHW
+                else not input_node.meta["val"].is_contiguous()
+            )
+        else:
+            return (
+                ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
+                if input_order == InputDimOrder.NCHW
+                else ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+            )
+
     def input_to_nhwc(
         self,
         graph_module: torch.fx.GraphModule,
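input_dim_order folds the two classification rules into one helper: placeholders are judged by tensor contiguity, every other node by its tag. The contiguity side can be sanity-checked directly (standalone snippet, not from the diff):

```python
import torch

# For placeholders the pass infers dim order from contiguity: a channels_last
# tensor is not contiguous in the default sense (so it reads as NHWC), while
# a freshly created tensor is contiguous (NCHW).
nchw = torch.randn(1, 3, 8, 8)
nhwc = nchw.to(memory_format=torch.channels_last)
assert nchw.is_contiguous() and not nhwc.is_contiguous()
assert nhwc.is_contiguous(memory_format=torch.channels_last)
```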
@@ -282,7 +336,7 @@
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nchw_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NCHW format so we can't use it here in NHWC format
@@ -296,7 +350,10 @@
         if input_node.op == "placeholder":
             if not input_node.meta["val"][0].is_contiguous():
                 return
-        elif self.is_nhwc_node(input_node):
+        elif ChannelsLastTaggedReshapePass.is_nhwc_node(input_node):
+            return
+
+        if self.input_dim_order(input_node, InputDimOrder.NHWC):
             return

         if not self.can_be_converted_to_nhwc(input_node):
@@ -326,6 +383,8 @@ def input_to_nhwc(
             args=(input_node,),
             memory_format=torch.channels_last,
         )
+        # Use static method for consistency
+        ChannelsLastTaggedReshapePass.mark_as_nhwc_node(input_node_nhwc)

         if is_dynamic_input:
             # Replace downstream input_nodes with NHWC node
@@ -348,7 +407,7 @@ def input_to_nchw(
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nhwc_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NHWC format so we can't use it here in NCHW format
@@ -363,7 +422,10 @@ def input_to_nchw(
         if input_node.op == "placeholder":
             if input_node.meta["val"].is_contiguous():
                 return
-        elif self.is_nchw_node(input_node):
+        elif ChannelsLastTaggedReshapePass.is_nchw_node(input_node):
+            return
+
+        if self.input_dim_order(input_node, InputDimOrder.NCHW):
             return

         if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta:
@@ -380,6 +442,7 @@ def input_to_nchw(
             args=(input_node,),
             memory_format=torch.contiguous_format,
         )
+        ChannelsLastTaggedReshapePass.mark_as_nchw_node(input_node_nchw)

         self.insert_copy_and_assign_partner_nodes_quantization_sensitive(
             graph_module=graph_module,
@@ -393,7 +456,12 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901
         original_nodes = list(graph.nodes)
         for node in original_nodes:
             if len(node.all_input_nodes) == 0:
-                # This node has no inputs so we don't need to change anything
+                # This node has no inputs so we don't need to change anything, but we still need to tag it
+                if "val" in node.meta and isinstance(node.meta["val"], torch.Tensor):
+                    if node.meta["val"].is_contiguous():
+                        self.mark_as_nchw_node(node)
+                    else:
+                        self.mark_as_nhwc_node(node)
                 continue

             # Need special case for the output node because it can have multiple output dim orders, as we can output a tuple of multiple nodes
@@ -407,10 +475,12 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901
             elif self.requires_nhwc_input(node):
                 # Nodes which enter this branch are ones that require their
                 # first input to be nhwc. This makes this node's output nhwc too
-
                 self.input_to_nhwc(graph_module, node.args[0], node)
-                for input_node in node.all_input_nodes:
-                    if input_node.op == "placeholder" and self.is_nhwc_node(input_node):
+                for input_node in node.all_input_nodes[1:]:
+                    if (
+                        input_node.op == "placeholder"
+                        and ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+                    ):
                         raise AssertionError(
                             f"Expected {input_node} to be NCHW in channels last reshape pass"
                         )
@@ -419,11 +489,14 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901
                 # The node requires nchw inputs
                 for input_node in node.all_input_nodes:
                     self.input_to_nchw(graph_module, input_node, node)
+            elif node.target == exir_ops.edge.aten._to_copy.default:
+                self.tag_node(node)
             else:
                 # The node can have inputs in any format (but all must be the
                 # same format)
                 is_or_isnt_nhwc_node = [
-                    self.is_nhwc_node(input_node) for input_node in node.all_input_nodes
+                    ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+                    for input_node in node.all_input_nodes
                 ]
                 if all(is_or_isnt_nhwc_node):
                     # All inputs are nhwc so this node's output is nhwc too
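The new tag_node branch classifies a _to_copy node by its memory_format kwarg; in miniature (a sketch with a plain function standing in for the node):

```python
import torch

# tag_node's decision rule: channels_last means NHWC; anything else,
# e.g. torch.contiguous_format, means NCHW.
def dim_order_tag(memory_format) -> str:
    return "NHWC" if memory_format == torch.channels_last else "NCHW"

assert dim_order_tag(torch.channels_last) == "NHWC"
assert dim_order_tag(torch.contiguous_format) == "NCHW"
```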
backends/xnnpack/_passes/remove_redundant_copy_pass.py

Lines changed: 158 additions & 0 deletions

@@ -0,0 +1,158 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
+    ChannelsLastTaggedReshapePass,
+)
+from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
+from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import PassResult
+
+
+class RemoveRedundantCopyPass(XNNPACKPass):
+    def _safe_remove_node(self, node, graph):
+        if len(node.users) == 0:
+            graph.erase_node(node)
+
+    def _try_remove_regular_redundant_to_copy(self, node, graph):
+        """
+        Try to remove redundant to_copy operations with the pattern
+        to_copy1 -> to_copy2, where the two copies have opposite memory formats
+        """
+        input_node = node.args[0]
+
+        # Check if input is a to_copy with the opposite memory format
+        if (
+            input_node.target == exir_ops.edge.aten._to_copy.default
+            and ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
+            != ChannelsLastTaggedReshapePass.is_nchw_node(node)
+            and len(input_node.users) == 1
+        ):  # Ensure the first copy has no other users
+            # Get the original input (before the first to_copy)
+            original_input = input_node.args[0]
+
+            # Replace all users of the second to_copy with the original input
+            for user in node.users.copy():
+                user.replace_input_with(node, original_input)
+
+            # Remove both to_copy nodes
+            self._safe_remove_node(node, graph)
+            self._safe_remove_node(input_node, graph)
+
+            return True
+        elif (
+            ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+            and ChannelsLastTaggedReshapePass.is_nhwc_node(node)
+        ) or (
+            ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
+            and ChannelsLastTaggedReshapePass.is_nchw_node(node)
+        ):
+            # Both copies produce the same dim order, so the second is a
+            # no-op: route its users to the first copy and remove it
+            for user in node.users.copy():
+                user.replace_input_with(node, input_node)
+            self._safe_remove_node(node, graph)
+            return True
+
+        return False
+
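The rewrite primitives this method relies on (replace_input_with plus erase_node) can be exercised on a toy FX graph. A standalone demo, not pass code (symbolic_trace records .to as a call_method node rather than the edge-dialect _to_copy op):

```python
import torch
import torch.fx

def f(x):
    y = x.to(memory_format=torch.channels_last)      # first copy
    z = y.to(memory_format=torch.contiguous_format)  # second, canceling copy
    return z + 1

gm = torch.fx.symbolic_trace(f)
first, second = (n for n in gm.graph.nodes if n.op == "call_method" and n.target == "to")

# Same steps as the branch above: reroute users of the second copy to the
# original input, then erase both now-unused copies.
original_input = first.args[0]
for user in list(second.users):
    user.replace_input_with(second, original_input)
gm.graph.erase_node(second)
gm.graph.erase_node(first)
gm.recompile()

x = torch.randn(1, 3, 4, 4)
assert torch.equal(gm(x), x + 1)
```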
+    def _try_remove_quantized_redundant_to_copy(self, node, graph):
+        """
+        Try to remove redundant to_copy operations in quantized graphs with the
+        pattern dq1 -> to_copy1 -> q1 -> dq2 -> to_copy2 -> q2
+        """
+        # Check if this to_copy is followed by a quantize node
+        if len(node.users) != 1:
+            return False
+        q_node = next(iter(node.users))
+        if not is_quant(q_node):
+            return False
+
+        # Check if this to_copy is preceded by a dequantize node
+        dq_node = node.args[0]
+        if not is_dequant(dq_node):
+            return False
+
+        # Get the input to the dequantize node
+        if len(dq_node.all_input_nodes) != 1:
+            return False
+
+        prev_q_node = dq_node.args[0]
+
+        # Check if there's another dequantize -> to_copy -> quantize chain
+        if not is_quant(prev_q_node) or len(prev_q_node.all_input_nodes) != 1:
+            return False
+
+        # Check if there's a to_copy before the previous quantize
+        prev_to_copy = prev_q_node.args[0]
+        if (
+            prev_to_copy.target == exir_ops.edge.aten._to_copy.default
+            and ChannelsLastTaggedReshapePass.is_nchw_node(prev_to_copy)
+            != ChannelsLastTaggedReshapePass.is_nchw_node(node)
+            and len(prev_to_copy.users) == 1
+        ):  # Ensure the first copy has no other users
+            prev_dq_node = prev_to_copy.args[0]
+            if not is_dequant(prev_dq_node) or len(prev_dq_node.all_input_nodes) != 1:
+                return False
+
+            # Get the original input (before the first to_copy)
+            original_input = prev_dq_node.args[0]
+
+            # Replace all users of the second quantize with the original input
+            for user in q_node.users.copy():
+                user.replace_input_with(q_node, original_input)
+
+            # Remove nodes safely (only if they have no other users)
+            self._safe_remove_node(q_node, graph)
+            self._safe_remove_node(node, graph)
+            self._safe_remove_node(dq_node, graph)
+            self._safe_remove_node(prev_q_node, graph)
+            self._safe_remove_node(prev_to_copy, graph)
+            self._safe_remove_node(prev_dq_node, graph)
+        elif (
+            ChannelsLastTaggedReshapePass.is_nhwc_node(prev_to_copy)
+            and ChannelsLastTaggedReshapePass.is_nhwc_node(node)
+        ) or (
+            ChannelsLastTaggedReshapePass.is_nchw_node(prev_to_copy)
+            and ChannelsLastTaggedReshapePass.is_nchw_node(node)
+        ):
+            # Remove only this to_copy and the q/dq pair around it
+            # Get the original quantized tensor (input to dq_node)
+            original_q_tensor = dq_node.args[0]
+
+            # Replace all users of q_node with the original quantized tensor
+            for user in q_node.users.copy():
+                user.replace_input_with(q_node, original_q_tensor)
+
+            self._safe_remove_node(q_node, graph)
+            self._safe_remove_node(node, graph)
+            self._safe_remove_node(dq_node, graph)
+
+        return True
+
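Numerically, the chain this method targets is an end-to-end no-op, which is why all six nodes can be dropped. A standalone sketch (made-up quantization parameters; 0.125 is a binary-exact scale, so the round trip is exact):

```python
import torch

x = torch.randn(1, 3, 8, 8)
q0 = torch.quantize_per_tensor(x, scale=0.125, zero_point=0, dtype=torch.qint8)

dq1 = q0.dequantize()                                      # dq1
c1 = dq1.to(memory_format=torch.channels_last)             # to_copy1 (NCHW -> NHWC)
q1 = torch.quantize_per_tensor(c1, 0.125, 0, torch.qint8)  # q1
dq2 = q1.dequantize()                                      # dq2
c2 = dq2.to(memory_format=torch.contiguous_format)         # to_copy2 (NHWC -> NCHW)
q2 = torch.quantize_per_tensor(c2, 0.125, 0, torch.qint8)  # q2

# With matching quantization parameters, the chain reproduces q0 exactly.
assert torch.equal(q0.int_repr(), q2.int_repr())
```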
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        original_nodes = list(graph.nodes)
+
+        for node in original_nodes:
+            if len(node.all_input_nodes) == 0:
+                continue
+
+            # Only process to_copy nodes
+            if node.target != exir_ops.edge.aten._to_copy.default:
+                continue
+
+            if is_dequant(node.args[0]):
+                self._try_remove_quantized_redundant_to_copy(node, graph)
+            else:
+                self._try_remove_regular_redundant_to_copy(node, graph)
+
+        graph_module.recompile()
+
+        # Since we are overriding "call", we need to call the parent's "call"
+        # to retrace the graph and regenerate metadata
+        graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, True)
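In the spirit of the test plan, graphs with redundant conversions are easy to manufacture and inspect before lowering. A sketch assuming a recent PyTorch (the actual TestChannelsLastTaggedReshapePass file is not shown in this diff, and the op name can vary by export pipeline):

```python
import torch

class Roundtrip(torch.nn.Module):
    def forward(self, x):
        y = x.to(memory_format=torch.channels_last)
        return y.to(memory_format=torch.contiguous_format) + 1

ep = torch.export.export(Roundtrip(), (torch.randn(1, 3, 4, 4),))
ep = ep.run_decompositions()
copies = [
    n
    for n in ep.graph.nodes
    if n.op == "call_function" and "_to_copy" in str(n.target)
]
# Expect the canceling pair here; a test would assert it is gone after
# to_edge and the XNNPACK passes (including RemoveRedundantCopyPass) run.
print(len(copies))
```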
