
Commit 824a753

Optimize transposes in XNNPACK partition

1 parent a8e4be4

File tree

8 files changed: +237 -16 lines

backends/xnnpack/_passes/__init__.py

Lines changed: 5 additions & 0 deletions

@@ -25,6 +25,10 @@
     FuseBatchNormWithConvPass,
 )
 from executorch.backends.xnnpack._passes.prelu_reshape_pass import PReLUReshapePass
+
+from executorch.backends.xnnpack._passes.remove_redundant_ops_pass import (
+    RemoveRedundantOpsPass,
+)
 from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import (
     TagImplicitQDqPass,
 )
@@ -70,6 +74,7 @@ def __init__(
                 Conv1dUnsqueezePass,
                 PReLUReshapePass,
                 ChannelsLastTaggedReshapePass,
+                RemoveRedundantOpsPass,
                 TagImplicitQDqPass,
             ]
         else:

backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py

Lines changed: 48 additions & 12 deletions

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from enum import Enum
 from typing import Optional, Tuple
 
 import torch
@@ -14,6 +15,11 @@
 from executorch.exir.pass_base import PassResult
 
 
+class InputDimOrder(Enum):
+    NCHW = 1
+    NHWC = 2
+
+
 # TODO(T151254305) use subgraph_rewriter
 class ChannelsLastTaggedReshapePass(XNNPACKPass):
     """
@@ -84,11 +90,19 @@ def mark_as_nhwc_node(self, node: torch.fx.Node) -> None:
     def mark_as_nchw_node(self, node: torch.fx.Node) -> None:
         node.meta[ChannelsLastTaggedReshapePass.XNN_NHWC_NODE] = False
 
-    def is_nhwc_node(self, node: torch.fx.Node) -> bool:
+    def tag_node(self, node: torch.fx.Node) -> None:
+        if node.meta["val"].is_contiguous():
+            self.mark_as_nchw_node(node)
+        else:
+            self.mark_as_nhwc_node(node)
+
+    @staticmethod
+    def is_nhwc_node(node: torch.fx.Node) -> bool:
         return node.meta.get(ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False)
 
-    def is_nchw_node(self, node: torch.fx.Node) -> bool:
-        return not self.is_nhwc_node(node)
+    @staticmethod
+    def is_nchw_node(node: torch.fx.Node) -> bool:
+        return not ChannelsLastTaggedReshapePass.is_nhwc_node(node)
 
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
         return (
@@ -114,7 +128,7 @@ def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         is_nchw_constant = (
             is_param_node(self.exported_program, node)
             and (ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in node.meta)
-            and (self.is_nchw_node(node))
+            and (ChannelsLastTaggedReshapePass.is_nchw_node(node))
         )
         return is_4d and not is_nchw_constant
 
@@ -257,6 +271,22 @@ def insert_copy_and_assign_partner_nodes_quantization_sensitive(
         # in that case
         self.make_partners(original_input, copy_node)
 
+    def input_dim_order(
+        self, input_node: torch.fx.Node, input_order: InputDimOrder
+    ) -> bool:
+        if input_node.op == "placeholder":
+            return (
+                input_node.meta["val"].is_contiguous()
+                if input_order == InputDimOrder.NCHW
+                else not input_node.meta["val"].is_contiguous()
+            )
+        else:
+            return (
+                ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
+                if input_order == InputDimOrder.NCHW
+                else ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+            )
+
     def input_to_nhwc(
         self,
         graph_module: torch.fx.GraphModule,
@@ -266,7 +296,7 @@ def input_to_nhwc(
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nchw_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NCHW format so we can't use it here in NHWC format
@@ -283,6 +313,9 @@ def input_to_nhwc(
         elif self.is_nhwc_node(input_node):
             return
 
+        if self.input_dim_order(input_node, InputDimOrder.NHWC):
+            return
+
         if not self.can_be_converted_to_nhwc(input_node):
             raise AssertionError(
                 "Attempting to convert non-NHWC compatible node to NHWC"
@@ -310,6 +343,7 @@ def input_to_nhwc(
                 args=(input_node,),
                 memory_format=torch.channels_last,
             )
+            self.mark_as_nhwc_node(input_node_nhwc)
 
         if is_dynamic_input:
             # Replace downstream input_nodes with NHWC node
@@ -332,7 +366,7 @@ def input_to_nchw(
         if is_param_node(self.exported_program, input_node):
             if (
                 ChannelsLastTaggedReshapePass.XNN_NHWC_NODE in input_node.meta
-                and self.is_nhwc_node(input_node)
+                and ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
             ):
                 # This constant data tensor has been used somewhere else
                 # in NHWC format so we can't use it here in NCHW format
@@ -350,6 +384,9 @@ def input_to_nchw(
         elif self.is_nchw_node(input_node):
             return
 
+        if self.input_dim_order(input_node, InputDimOrder.NCHW):
+            return
+
         if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta:
             # Already has an associated NCHW node
             input_node_nchw = input_node.meta[
@@ -364,6 +401,7 @@ def input_to_nchw(
                 args=(input_node,),
                 memory_format=torch.contiguous_format,
             )
+            self.mark_as_nchw_node(input_node_nchw)
 
         self.insert_copy_and_assign_partner_nodes_quantization_sensitive(
             graph_module=graph_module,
@@ -391,7 +429,7 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901
                self.input_to_nhwc(graph_module, node.args[0], node)
 
                for input_node in node.all_input_nodes[1:]:
-                   if self.is_nhwc_node(input_node):
+                   if ChannelsLastTaggedReshapePass.is_nhwc_node(input_node):
                        raise AssertionError(
                            f"Expected {input_node} to be NCHW in channels last reshape pass"
                        )
@@ -401,15 +439,13 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901
                for input_node in node.all_input_nodes:
                    self.input_to_nchw(graph_module, input_node, node)
            elif node.target == exir_ops.edge.aten._to_copy.default:
-               if node.meta["val"].is_contiguous():
-                   self.mark_as_nchw_node(node)
-               else:
-                   self.mark_as_nhwc_node(node)
+               self.tag_node(node)
            else:
                # The node can have inputs in any format (but all must be the
                # same format)
                is_or_isnt_nhwc_node = [
-                   self.is_nhwc_node(input_node) for input_node in node.all_input_nodes
+                   ChannelsLastTaggedReshapePass.is_nhwc_node(input_node)
+                   for input_node in node.all_input_nodes
                ]
                if all(is_or_isnt_nhwc_node):
                    # All inputs are nhwc so this node's output is nhwc too
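
Note: the new tag_node and input_dim_order helpers key off Tensor.is_contiguous() on the node's "val" meta, which reports contiguity in the default NCHW sense. A minimal eager-mode sketch of that behavior (illustrative only, not part of this commit; tensor names are arbitrary):

import torch

# A default tensor is contiguous (NCHW); a channels_last copy of it is not,
# which is the signal the pass uses to tag nodes as NCHW vs. NHWC.
nchw = torch.randn(1, 3, 4, 4)
nhwc = nchw.to(memory_format=torch.channels_last)

assert nchw.is_contiguous()
assert not nhwc.is_contiguous()
assert nhwc.is_contiguous(memory_format=torch.channels_last)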
Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
    ChannelsLastTaggedReshapePass,
)
from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import PassResult


class RemoveRedundantOpsPass(XNNPACKPass):
    def call(self, graph_module: torch.fx.GraphModule):
        graph = graph_module.graph
        original_nodes = list(graph.nodes)

        for node in original_nodes:
            if len(node.all_input_nodes) == 0:
                continue

            # If we encounter a to_copy node, check if its input is also a to_copy node with opposite format
            if node.target == exir_ops.edge.aten._to_copy.default:
                input_node = node.args[0]
                if (
                    input_node.target == exir_ops.edge.aten._to_copy.default
                    and ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
                    != ChannelsLastTaggedReshapePass.is_nchw_node(node)
                ):
                    # If we find an opposite to_copy node, remove both nodes
                    original_input = input_node.args[0]

                    for user in node.users.copy():
                        user.replace_input_with(node, original_input)

                    graph.erase_node(node)
                    graph.erase_node(input_node)

        graph_module.recompile()

        # Since we are overriding "call", we need to call the parent's "call"
        # to retrace the graph and regenerate metadata
        graph_module = super().call(graph_module).graph_module

        return PassResult(graph_module, True)
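
For intuition, a minimal sketch of the pattern RemoveRedundantOpsPass targets, assuming the two casts lower to back-to-back aten._to_copy nodes with opposite memory formats after export (as the tests below exercise); the tensor names are illustrative:

import torch

# Two adjacent memory-format casts that undo each other; the pass erases the
# corresponding _to_copy pair and rewires users to the original input.
x = torch.randn(1, 3, 6, 4)
y = x.to(memory_format=torch.channels_last)
z = y.to(memory_format=torch.contiguous_format)

assert z.is_contiguous()
assert torch.equal(x, z)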
Lines changed: 132 additions & 0 deletions

@@ -0,0 +1,132 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch
from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
    ChannelsLastTaggedReshapePass,
)
from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass
from executorch.backends.xnnpack._passes.remove_redundant_ops_pass import (
    RemoveRedundantOpsPass,
)
from executorch.backends.xnnpack.test.tester import RunPasses, Tester
from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass


class TestChannelsLastTaggedReshapePass(unittest.TestCase):
    PassStage = RunPasses(
        [
            DimOrderOpsRevertPass,
            ConvertToLinearPass,
            ChannelsLastTaggedReshapePass,
            RemoveRedundantOpsPass,
        ]
    )

    def setUp(self):
        torch._dynamo.reset()

    def run_tester(self, module, inputs):
        tester = Tester(
            module.eval(),
            inputs,
        )
        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()

    class ChannelsLastToContiguous(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = torch.nn.Conv2d(3, 3, 3)
            self.linear1 = torch.nn.Linear(4, 3)

        def forward(self, x):
            y = self.linear1(x)
            y = y.to(memory_format=torch.channels_last)
            y = y.to(memory_format=torch.contiguous_format)
            y = y.to(memory_format=torch.channels_last)
            y = y.to(memory_format=torch.contiguous_format)
            y = y.to(memory_format=torch.channels_last)
            y = y.to(memory_format=torch.contiguous_format)
            return self.conv1(y)

    ChannelsLastToContiguousModule = ChannelsLastToContiguous()

    class ContiguousToChannelsLast(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = torch.nn.Conv2d(3, 3, 3)
            self.linear1 = torch.nn.Linear(4, 3)

        def forward(self, x):
            y = self.linear1(x)
            y = y.to(memory_format=torch.contiguous_format)
            y = y.to(memory_format=torch.channels_last)
            y = y.to(memory_format=torch.contiguous_format)
            y = y.to(memory_format=torch.channels_last)
            y = y.to(memory_format=torch.contiguous_format)
            y = y.to(memory_format=torch.channels_last)

            return self.conv1(y)

    ContiguousToChannelsLastModule = ContiguousToChannelsLast()

    class ImplicitRedundantOpRemoval(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.upsample = torch.nn.Upsample(scale_factor=2, mode="nearest")
            self.conv = torch.nn.Conv2d(3, 3, 3)

        def forward(self, x):
            y = x.to(memory_format=torch.channels_last)
            y = self.upsample(y)
            y = y.to(memory_format=torch.contiguous_format)
            return self.conv(y)

    ImplicitRedundantOpRemovalModule = ImplicitRedundantOpRemoval()

    def test_redundant_to_copy_op_removal(self):
        (
            Tester(self.ChannelsLastToContiguousModule, (torch.randn(1, 3, 6, 4),))
            .export()
            .to_edge()
            .run_passes(self.PassStage)
            .check_count(
                {
                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 2,
                }
            )
            .run_method_and_compare_outputs()
        )

    def test_redundant_to_copy_op_removal_2(self):
        (
            Tester(self.ContiguousToChannelsLastModule, (torch.randn(1, 3, 6, 4),))
            .export()
            .to_edge()
            .run_passes(self.PassStage)
            .check_count(
                {
                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 1,
                }
            )
            .run_method_and_compare_outputs()
        )

    def test_implicit_redundant_op_removal(self):
        (
            Tester(self.ImplicitRedundantOpRemovalModule, (torch.randn(1, 3, 3, 3),))
            .export()
            .to_edge()
            .run_passes(self.PassStage)
            .check_count(
                {
                    "executorch_exir_dialects_edge__ops_aten__to_copy_default": 2,
                }
            )
            .run_method_and_compare_outputs()
        )

extension/llm/tokenizers

Submodule eigen updated from a39ade4 to 7294434

third-party/ao

Submodule ao updated 100 files
