
Commit 6835f61

Optimize transposes in XNNPACK partition
1 parent 8b91628 commit 6835f61

5 files changed: +188 -17 lines changed

backends/xnnpack/_passes/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,10 @@
     FuseBatchNormWithConvPass,
 )
 from executorch.backends.xnnpack._passes.prelu_reshape_pass import PReLUReshapePass
+
+from executorch.backends.xnnpack._passes.remove_redundant_ops_pass import (
+    RemoveRedundantOpsPass,
+)
 from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import (
     TagImplicitQDqPass,
 )
@@ -70,6 +74,7 @@ def __init__(
                 Conv1dUnsqueezePass,
                 PReLUReshapePass,
                 ChannelsLastTaggedReshapePass,
+                RemoveRedundantOpsPass,
                 TagImplicitQDqPass,
             ]
         else:
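
Note on ordering (inferred from the diffs, not stated in the commit message): RemoveRedundantOpsPass is registered directly after ChannelsLastTaggedReshapePass because it reads the NHWC/NCHW tag that pass leaves in node.meta. A minimal sketch of that tagging convention, mirroring the static helpers in the next file; the literal meta-key value below is an assumption (the real code uses the class attribute ChannelsLastTaggedReshapePass.XNN_NHWC_NODE):

import torch

XNN_NHWC_NODE = "XNN_NHWC_NODE"  # assumed literal; stands in for the class attribute

def is_nhwc_node(node: torch.fx.Node) -> bool:
    # Untagged nodes default to NCHW, matching is_nchw_node below.
    return node.meta.get(XNN_NHWC_NODE, False)

def is_nchw_node(node: torch.fx.Node) -> bool:
    return not is_nhwc_node(node)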

backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py

Lines changed: 35 additions & 16 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from enum import Enum
 from typing import Optional, Tuple
 
 import torch
@@ -14,6 +15,11 @@
 from executorch.exir.pass_base import PassResult
 
 
+class InputDimOrder(Enum):
+    NCHW = 1
+    NHWC = 2
+
+
 # TODO(T151254305) use subgraph_rewriter
 class ChannelsLastTaggedReshapePass(XNNPACKPass):
     """
@@ -84,11 +90,13 @@ def mark_as_nhwc_node(self, node: torch.fx.Node) -> None:
     def mark_as_nchw_node(self, node: torch.fx.Node) -> None:
         node.meta[ChannelsLastTaggedReshapePass.XNN_NHWC_NODE] = False
 
-    def is_nhwc_node(self, node: torch.fx.Node) -> bool:
+    @staticmethod
+    def is_nhwc_node(node: torch.fx.Node) -> bool:
         return node.meta.get(ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False)
 
-    def is_nchw_node(self, node: torch.fx.Node) -> bool:
-        return not self.is_nhwc_node(node)
+    @staticmethod
+    def is_nchw_node(node: torch.fx.Node) -> bool:
+        return not ChannelsLastTaggedReshapePass.is_nhwc_node(node)
 
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
         return (
@@ -114,7 +122,7 @@ def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         is_nchw_constant = (
            is_param_node(self.exported_program, node)
            and (ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in node.meta)
-           and (self.is_nchw_node(node))
+           and (ChannelsLastTaggedReshapePass.is_nchw_node(node))
         )
         return is_4d and not is_nchw_constant
 
@@ -257,6 +265,22 @@ def insert_copy_and_assign_partner_nodes_quantization_sensitive(
         # in that case
         self.make_partners(original_input, copy_node)
 
+    def input_dim_order(
+        self, input_node: torch.fx.Node, input_order: InputDimOrder
+    ) -> bool:
+        if input_node.name == "x":
+            return (
+                input_node.meta["val"].is_contiguous()
+                if input_order == InputDimOrder.NCHW
+                else not input_node.meta["val"].is_contiguous()
+            )
+        else:
+            return (
+                ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
+                if input_order == InputDimOrder.NCHW
+                else ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+            )
+
     def input_to_nhwc(
         self,
         graph_module: torch.fx.GraphModule,
@@ -266,7 +290,7 @@
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nchw_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NCHW format so we can't use it here in NHWC format
@@ -277,10 +301,7 @@
             # serializing graph, but don't do anything else here
             self.mark_as_nhwc_node(input_node)
 
-        if input_node.name == "x":
-            if not input_node.meta["val"][0].is_contiguous():
-                return
-        elif self.is_nhwc_node(input_node):
+        if self.input_dim_order(input_node, InputDimOrder.NHWC):
             return
 
         if not self.can_be_converted_to_nhwc(input_node):
@@ -332,7 +353,7 @@ def input_to_nchw(
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nhwc_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NHWC format so we can't use it here in NCHW format
@@ -344,10 +365,7 @@
             # do anything else here
             self.mark_as_nchw_node(input_node)
 
-        if input_node.name == "x":
-            if input_node.meta["val"].is_contiguous():
-                return
-        elif self.is_nchw_node(input_node):
+        if self.input_dim_order(input_node, InputDimOrder.NCHW):
             return
 
         if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta:
@@ -391,7 +409,7 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901
                 self.input_to_nhwc(graph_module, node.args[0], node)
 
                 for input_node in node.all_input_nodes[1:]:
-                    if self.is_nhwc_node(input_node):
+                    if ChannelsLastTaggedReshapePass.is_nhwc_node(input_node):
                         raise AssertionError(
                             f"Expected {input_node} to be NCHW in channels last reshape pass"
                         )
@@ -409,7 +427,8 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901
                 # The node can have inputs in any format (but all must be the
                 # same format)
                 is_or_isnt_nhwc_node = [
-                    self.is_nhwc_node(input_node) for input_node in node.all_input_nodes
+                    ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+                    for input_node in node.all_input_nodes
                 ]
                 if all(is_or_isnt_nhwc_node):
                     # All inputs are nhwc so this node's output is nhwc too
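
The new input_dim_order helper merges two previously separate checks: the graph input (named "x") is classified by the layout of the FakeTensor in its meta["val"], while every other node is classified by the tag helpers above. (Note that the old NHWC branch indexed meta["val"][0]; the helper now reads meta["val"] directly, matching the NCHW branch.) A small self-contained illustration of the layout probe used for the graph input; the shapes are arbitrary:

import torch

x_nchw = torch.randn(1, 3, 6, 4)                       # default contiguous layout, i.e. NCHW
x_nhwc = x_nchw.to(memory_format=torch.channels_last)  # same values, NHWC strides

# is_contiguous() is the signal input_dim_order consults via meta["val"]:
assert x_nchw.is_contiguous()
assert not x_nhwc.is_contiguous()
assert x_nhwc.is_contiguous(memory_format=torch.channels_last)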
backends/xnnpack/_passes/remove_redundant_ops_pass.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
+from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ChannelsLastTaggedReshapePass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import PassResult
+
+class RemoveRedundantOpsPass(XNNPACKPass):
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        original_nodes = list(graph.nodes)
+
+        # Track the most recently visited to_copy node
+        prev = None
+        for node in original_nodes:
+            if len(node.all_input_nodes) == 0:
+                continue
+
+            # If we encounter a to_copy node, check if it is preceded by an opposite to_copy node
+            if node.target == exir_ops.edge.aten._to_copy.default:
+                if prev and ChannelsLastTaggedReshapePass.is_nchw_node(prev) != ChannelsLastTaggedReshapePass.is_nchw_node(node):
+                    # If we find an opposite to_copy node, remove both nodes
+                    prevPrev = prev.args[0]
+
+                    for user in node.users.copy():
+                        user.replace_input_with(node, prevPrev)
+
+                    graph.erase_node(node)
+                    graph.erase_node(prev)
+
+                    prev = None
+                    continue
+                prev = node
+            else:
+                prev = None
+
+        graph_module.recompile()
+
+        # Since we are overriding "call", we need to call the parent's "call"
+        # to retrace the graph and regenerate metadata
+        graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, True)
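
The rewrite relies on the fact that two adjacent _to_copy nodes with opposite layouts form an identity: users of the second copy can be rewired to the tensor feeding the first (prevPrev above) and both copies erased. A runnable sketch of that invariant at the eager-tensor level (illustrative only; the pass itself edits the FX graph):

import torch

x = torch.randn(1, 3, 6, 4)

# The pattern the pass cancels: a channels_last -> contiguous round trip.
y = x.to(memory_format=torch.channels_last).to(memory_format=torch.contiguous_format)

# Values and layout are unchanged, so the pair of copies is dead weight.
assert torch.equal(x, y) and y.is_contiguous()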

backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py

Lines changed: 0 additions & 1 deletion
@@ -147,7 +147,6 @@ def forward(self, x):
     def test_conv_linear_dim_order_swap_partitioner(self):
         self.run_tester(self.LinearConvDimSwapModule, (torch.randn(1, 3, 6, 4),))
 
-
     def test_qs8_channels_last_tagged_reshape_pass(self):
         for module, num_reshape in self.modules.items():
             (
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack._passes.remove_redundant_ops_pass import (
+    RemoveRedundantOpsPass,
+)
+from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
+    ChannelsLastTaggedReshapePass,
+)
+from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
+from executorch.backends.xnnpack.test.tester import RunPasses, Tester
+
+
+class TestChannelsLastTaggedReshapePass(unittest.TestCase):
+    PassStage = RunPasses([DimOrderOpsRevertPass,
+                           ConvertToLinearPass,
+                           ChannelsLastTaggedReshapePass,
+                           RemoveRedundantOpsPass])
+
+    def setUp(self):
+        torch._dynamo.reset()
+
+    def run_tester(self, module, inputs):
+        tester = Tester(
+            module.eval(),
+            inputs,
+        )
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    class ChannelsLastToContiguous(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 3)
+
+        def forward(self, x):
+            y = self.linear1(x)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            return self.conv1(y)
+
+    ChannelsLastToContiguousModule = ChannelsLastToContiguous()
+
+    class ContiguousToChannelsLast(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 3)
+
+        def forward(self, x):
+            y = self.linear1(x)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            y = y.to(memory_format=torch.channels_last)
+
+            return self.conv1(y)
+
+    ContiguousToChannelsLastModule = ContiguousToChannelsLast()
+
+    def test_redundant_to_copy_op_removal(self):
+        (
+            Tester(self.ChannelsLastToContiguousModule, (torch.randn(1, 3, 6, 4),))
+            .export()
+            .to_edge()
+            .run_passes(self.PassStage)
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 2,
+                }
+            )
+            .run_method_and_compare_outputs()
+        )
+
+    def test_redundant_to_copy_op_removal_2(self):
+        (
+            Tester(self.ContiguousToChannelsLastModule, (torch.randn(1, 3, 6, 4),))
+            .export()
+            .to_edge()
+            .run_passes(self.PassStage)
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 1,
+                }
+            )
+            .run_method_and_compare_outputs()
+        )
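
The expected counts appear to follow from the pass ordering: ChannelsLastTaggedReshapePass first inserts its own layout copies around the conv, then RemoveRedundantOpsPass cancels the six explicit memory-format round trips pairwise. In ChannelsLastToContiguousModule the chain ends contiguous, so the conv still needs a copy to NHWC at its input and a copy back to NCHW at its output (2 remaining); in ContiguousToChannelsLastModule the chain already ends channels-last, so only the copy after the conv survives (1 remaining).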
