Commit dd07f80

Optimize transposes in XNNPACK partition
1 parent 083663b commit dd07f80

4 files changed: 274 additions, 10 deletions

backends/xnnpack/_passes/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,10 @@
     FuseBatchNormWithConvPass,
 )
 from executorch.backends.xnnpack._passes.prelu_reshape_pass import PReLUReshapePass
+
+from executorch.backends.xnnpack._passes.remove_redundant_copy_pass import (
+    RemoveRedundantCopyPass,
+)
 from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import (
     TagImplicitQDqPass,
 )
@@ -70,6 +74,7 @@ def __init__(
             Conv1dUnsqueezePass,
             PReLUReshapePass,
             ChannelsLastTaggedReshapePass,
+            RemoveRedundantCopyPass,
             TagImplicitQDqPass,
         ]
     else:

backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py

Lines changed: 53 additions & 10 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from enum import Enum
 from typing import Optional, Tuple
 
 import torch
@@ -14,6 +15,11 @@
 from executorch.exir.pass_base import PassResult
 
 
+class InputDimOrder(Enum):
+    NCHW = 1
+    NHWC = 2
+
+
 # TODO(T151254305) use subgraph_rewriter
 class ChannelsLastTaggedReshapePass(XNNPACKPass):
     """
@@ -84,11 +90,19 @@ def mark_as_nhwc_node(self, node: torch.fx.Node) -> None:
     def mark_as_nchw_node(self, node: torch.fx.Node) -> None:
         node.meta[ChannelsLastTaggedReshapePass.XNN_NHWC_NODE] = False
 
-    def is_nhwc_node(self, node: torch.fx.Node) -> bool:
+    def tag_node(self, node: torch.fx.Node) -> None:
+        if node.kwargs["memory_format"] == torch.channels_last:
+            self.mark_as_nhwc_node(node)
+        else:
+            self.mark_as_nchw_node(node)
+
+    @staticmethod
+    def is_nhwc_node(node: torch.fx.Node) -> bool:
         return node.meta.get(ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False)
 
-    def is_nchw_node(self, node: torch.fx.Node) -> bool:
-        return not self.is_nhwc_node(node)
+    @staticmethod
+    def is_nchw_node(node: torch.fx.Node) -> bool:
+        return not ChannelsLastTaggedReshapePass.is_nhwc_node(node)
 
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
         return node.target in self.memory_sensitive_ops_nhwc
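
Note: the new tag_node helper keys off the memory_format kwarg of an explicit copy and reuses the pass's existing NHWC/NCHW meta flag. A minimal sketch of that convention on a plain torch.fx trace (the string meta key below is illustrative; the real pass uses the ChannelsLastTaggedReshapePass.XNN_NHWC_NODE constant and matches edge-dialect _to_copy nodes, not Tensor.to):

    import torch
    from torch import fx

    class ToChannelsLast(torch.nn.Module):
        def forward(self, x):
            return x.to(memory_format=torch.channels_last)

    gm = fx.symbolic_trace(ToChannelsLast())
    for n in gm.graph.nodes:
        if n.op == "call_method" and n.target == "to":
            # Mirrors tag_node: a channels_last copy is tagged NHWC, anything else NCHW.
            n.meta["XNN_NHWC_NODE"] = n.kwargs.get("memory_format") == torch.channels_last
            print(n.name, n.meta["XNN_NHWC_NODE"])  # prints: to True
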
@@ -106,7 +120,7 @@ def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         is_nchw_constant = (
             is_param_node(self.exported_program, node)
             and (ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in node.meta)
-            and (self.is_nchw_node(node))
+            and (ChannelsLastTaggedReshapePass.is_nchw_node(node))
         )
         return is_4d and not is_nchw_constant
 
@@ -249,6 +263,22 @@ def insert_copy_and_assign_partner_nodes_quantization_sensitive(
         # in that case
         self.make_partners(original_input, copy_node)
 
+    def input_dim_order(
+        self, input_node: torch.fx.Node, input_order: InputDimOrder
+    ) -> bool:
+        if input_node.op == "placeholder":
+            return (
+                input_node.meta["val"].is_contiguous()
+                if input_order == InputDimOrder.NCHW
+                else not input_node.meta["val"].is_contiguous()
+            )
+        else:
+            return (
+                ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
+                if input_order == InputDimOrder.NCHW
+                else ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+            )
+
     def input_to_nhwc(
         self,
         graph_module: torch.fx.GraphModule,
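
The placeholder branch of input_dim_order relies on a standard PyTorch property: for a 4-D tensor whose channels_last strides genuinely differ from the default layout, is_contiguous() returns False, so contiguity distinguishes NCHW from NHWC inputs. A quick sanity check:

    import torch

    x = torch.randn(1, 3, 4, 4)                  # default contiguous (NCHW) layout
    y = x.to(memory_format=torch.channels_last)  # NHWC strides, same values

    print(x.is_contiguous())  # True  -> counts as NCHW
    print(y.is_contiguous())  # False -> counts as NHWC
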
@@ -258,7 +288,7 @@ def input_to_nhwc(
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nchw_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NCHW format so we can't use it here in NHWC format
@@ -275,6 +305,9 @@ def input_to_nhwc(
         elif self.is_nhwc_node(input_node):
             return
 
+        if self.input_dim_order(input_node, InputDimOrder.NHWC):
+            return
+
         if not self.can_be_converted_to_nhwc(input_node):
             raise AssertionError(
                 "Attempting to convert non-NHWC compatible node to NHWC"
@@ -302,6 +335,7 @@ def input_to_nhwc(
                 args=(input_node,),
                 memory_format=torch.channels_last,
             )
+            self.mark_as_nhwc_node(input_node_nhwc)
 
         if is_dynamic_input:
             # Replace downstream input_nodes with NHWC node
@@ -324,7 +358,7 @@ def input_to_nchw(
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nhwc_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NHWC format so we can't use it here in NCHW format
@@ -342,6 +376,9 @@ def input_to_nchw(
         elif self.is_nchw_node(input_node):
             return
 
+        if self.input_dim_order(input_node, InputDimOrder.NCHW):
+            return
+
         if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta:
             # Already has an associated NCHW node
             input_node_nchw = input_node.meta[
@@ -356,6 +393,7 @@ def input_to_nchw(
                 args=(input_node,),
                 memory_format=torch.contiguous_format,
             )
+            self.mark_as_nchw_node(input_node_nchw)
 
         self.insert_copy_and_assign_partner_nodes_quantization_sensitive(
             graph_module=graph_module,
@@ -383,10 +421,12 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
             elif self.requires_nhwc_input(node):
                 # Nodes which enter this branch are ones that require their
                 # first input to be nhwc. This makes this node's output nhwc too
-
                 self.input_to_nhwc(graph_module, node.args[0], node)
-                for input_node in node.all_input_nodes:
-                    if input_node.op == "placeholder" and self.is_nhwc_node(input_node):
+                for input_node in node.all_input_nodes[1:]:
+                    if (
+                        input_node.op == "placeholder"
+                        and ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+                    ):
                         raise AssertionError(
                             f"Expected {input_node} to be NCHW in channels last reshape pass"
                         )
@@ -395,11 +435,14 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
                 # The node requires nchw inputs
                 for input_node in node.all_input_nodes:
                     self.input_to_nchw(graph_module, input_node, node)
+            elif node.target == exir_ops.edge.aten._to_copy.default:
+                self.tag_node(node)
             else:
                 # The node can have inputs in any format (but all must be the
                 # same format)
                 is_or_isnt_nhwc_node = [
-                    self.is_nhwc_node(input_node) for input_node in node.all_input_nodes
+                    ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+                    for input_node in node.all_input_nodes
                 ]
                 if all(is_or_isnt_nhwc_node):
                     # All inputs are nhwc so this node's output is nhwc too
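
The copies this pass inserts (and now tags) only reorder memory; they never change values, which is what makes the later removal safe. For example, a Conv2d gives the same result on a channels_last input as on the default layout:

    import torch

    conv = torch.nn.Conv2d(3, 3, 3)
    x = torch.randn(1, 3, 8, 8)

    y_default = conv(x)
    y_channels_last = conv(x.to(memory_format=torch.channels_last))
    # Same math, different strides; XNNPACK simply prefers the NHWC layout.
    assert torch.allclose(y_default, y_channels_last.contiguous())
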
backends/xnnpack/_passes/remove_redundant_copy_pass.py

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
+    ChannelsLastTaggedReshapePass,
+)
+from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import PassResult
+
+
+class RemoveRedundantCopyPass(XNNPACKPass):
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        original_nodes = list(graph.nodes)
+
+        for node in original_nodes:
+            if len(node.all_input_nodes) == 0:
+                continue
+
+            # If we encounter a to_copy node, check if its input is also a to_copy node with the opposite format
+            if node.target == exir_ops.edge.aten._to_copy.default:
+                input_node = node.args[0]
+                if (
+                    input_node.target == exir_ops.edge.aten._to_copy.default
+                    and ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
+                    != ChannelsLastTaggedReshapePass.is_nchw_node(node)
+                    and len(input_node.users)
+                    == 1  # Ensure the first copy has no other users
+                ):
+                    # If we find an opposite to_copy node, remove both nodes
+                    original_input = input_node.args[0]
+
+                    for user in node.users.copy():
+                        user.replace_input_with(node, original_input)
+
+                    graph.erase_node(node)
+                    graph.erase_node(input_node)
+
+        graph_module.recompile()
+
+        # Since we are overriding "call", we need to call the parent's "call"
+        # to retrace the graph and regenerate metadata
+        graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, True)
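
As a standalone illustration of the rewrite this pass performs, here is the same cancel-opposite-copies idea applied to a plain torch.fx trace (a simplified analogue: the real pass matches edge-dialect _to_copy nodes via their NHWC/NCHW tags rather than raw Tensor.to calls):

    import torch
    from torch import fx

    class M(torch.nn.Module):
        def forward(self, x):
            y = x.to(memory_format=torch.channels_last)
            y = y.to(memory_format=torch.contiguous_format)
            return y + 1

    gm = fx.symbolic_trace(M())
    for node in list(gm.graph.nodes):
        if node.op == "call_method" and node.target == "to":
            prev = node.args[0]
            if (
                isinstance(prev, fx.Node)
                and prev.op == "call_method"
                and prev.target == "to"
                and prev.kwargs.get("memory_format") != node.kwargs.get("memory_format")
                and len(prev.users) == 1  # the first copy has no other users
            ):
                # Route users of the second copy back to the original input,
                # then drop both copies.
                node.replace_all_uses_with(prev.args[0])
                gm.graph.erase_node(node)
                gm.graph.erase_node(prev)

    gm.recompile()
    print(gm.code)  # both .to(memory_format=...) calls are gone
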
Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
+    ChannelsLastTaggedReshapePass,
+)
+from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass
+from executorch.backends.xnnpack._passes.remove_redundant_copy_pass import (
+    RemoveRedundantCopyPass,
+)
+from executorch.backends.xnnpack.test.tester import RunPasses, Tester
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import XNNPACKQuantizer
+
+
+class TestChannelsLastTaggedReshapePass(unittest.TestCase):
+    PassStage = RunPasses(
+        [
+            DimOrderOpsRevertPass,
+            ConvertToLinearPass,
+            ChannelsLastTaggedReshapePass,
+            RemoveRedundantCopyPass,
+        ]
+    )
+
+    def setUp(self):
+        torch._dynamo.reset()
+
+    def run_tester(self, module, inputs):
+        tester = Tester(
+            module.eval(),
+            inputs,
+        )
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    class ChannelsLastToContiguous(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 3)
+
+        def forward(self, x):
+            y = self.linear1(x)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            return self.conv1(y)
+
+    ChannelsLastToContiguousModule = ChannelsLastToContiguous()
+
+    class ContiguousToChannelsLast(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 3)
+
+        def forward(self, x):
+            y = self.linear1(x)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+
+            return self.conv1(y)
+
+    ContiguousToChannelsLastModule = ContiguousToChannelsLast()
+
+    class ImplicitRedundantOpRemoval(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.upsample = torch.nn.Upsample(scale_factor=2, mode="nearest")
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            y = x.to(memory_format=torch.channels_last)
+            y = self.upsample(y)
+            y = y.to(memory_format=torch.contiguous_format)
+            return self.conv(y)
+
+    ImplicitRedundantOpRemovalModule = ImplicitRedundantOpRemoval()
+
+    class QuantizableRedundantCopyModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 16, 3, padding=1)
+            self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1)
+
+        def forward(self, x):
+            x = self.conv1(x)
+
+            x = x.to(memory_format=torch.channels_last)
+            x = x.to(memory_format=torch.contiguous_format)
+            x = x.to(memory_format=torch.channels_last)
+
+            x = self.conv2(x)
+            return x
+
+    QuantizableRedundantCopyModule = QuantizableRedundantCopyModel()
+
+    def test_redundant_to_copy_op_removal(self):
+        (
+            Tester(self.ChannelsLastToContiguousModule, (torch.randn(1, 3, 6, 4),))
+            .export()
+            .to_edge()
+            .run_passes(self.PassStage)
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 2,
+                }
+            )
+            .run_method_and_compare_outputs()
+        )
+
+    def test_redundant_to_copy_op_removal_2(self):
+        (
+            Tester(self.ContiguousToChannelsLastModule, (torch.randn(1, 3, 6, 4),))
+            .export()
+            .to_edge()
+            .run_passes(self.PassStage)
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 1,
+                }
+            )
+            .run_method_and_compare_outputs()
+        )
+
+    def test_implicit_redundant_op_removal(self):
+        (
+            Tester(self.ImplicitRedundantOpRemovalModule, (torch.randn(1, 3, 3, 3),))
+            .export()
+            .to_edge()
+            .run_passes(self.PassStage)
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 2,
+                }
+            )
+            .run_method_and_compare_outputs()
+        )
+
+    def test_quantized_redundant_copy_removal(self):
+        (
+            Tester(self.QuantizableRedundantCopyModule, (torch.randn(1, 3, 32, 32),))
+            .quantize()
+            .export()
+            .to_edge()
+            .run_passes(self.PassStage)
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 2,
+                }
+            )
+            .run_method_and_compare_outputs()
+        )
