
Commit 62d52fc

digantdesai authored and facebook-github-bot committed
XNNPACK: Add support for clone
Summary:
* Partition `_clone_dim_order.default`
* Revert back to `aten.clone.default`
* Run `RemoveCloneOpsTransform` to remove redundant clones
* Lower `aten.clone.default` to an XNNPACK static transpose if any remain
* Add tests

Differential Revision: D83560001
1 parent f7c009e commit 62d52fc

File tree

7 files changed: +189 −3 lines changed

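The change works end to end as follows: a `clone` (exported as `_clone_dim_order.default`, then reverted to `aten.clone.default`) is either folded away by `RemoveCloneOpsTransform` or lowered as a static transpose inside the XNNPACK delegate. A minimal, hedged usage sketch of that flow (the module and input shapes are illustrative, not part of this commit):

    # Illustrative only: lower a model containing a memory-format clone.
    import torch
    from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
    from executorch.exir import to_edge_transform_and_lower

    class CloneThenConv(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = torch.nn.Conv2d(3, 3, 3)

        def forward(self, x):
            # The clone changes only physical layout, never values or dtype.
            return self.conv(x.clone(memory_format=torch.channels_last))

    ep = torch.export.export(CloneThenConv().eval(), (torch.randn(1, 3, 6, 6),))
    # After this commit, no aten.clone.default edge op should survive lowering.
    lowered = to_edge_transform_and_lower(ep, partitioner=[XnnpackPartitioner()])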

backends/xnnpack/_passes/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ runtime.python_library(
     deps = [
         "//caffe2:torch",
         "//executorch/backends/transforms:addmm_mm_to_linear",
+        "//executorch/backends/transforms:remove_clone_ops",
         "//executorch/backends/transforms:lib",
         "//executorch/backends/xnnpack/partition:partitioner_graphs",
         "//executorch/backends/xnnpack/serialization:xnnpack_schema",

backends/xnnpack/_passes/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -4,8 +4,11 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
 from typing import List, Optional, Type
 
+from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
+
 from executorch.backends.transforms.remove_getitem_op import RemoveGetItemPass
 
 from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
@@ -38,6 +41,9 @@
 
 from torch.export import ExportedProgram
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
 
 class XNNPACKPassManager:
     def __init__(
@@ -69,6 +75,7 @@ def __init__(
                 PReLUReshapePass,
                 ChannelsLastTaggedReshapePass,
                 RemoveRedundantCopyPass,
+                RemoveCloneOpsTransform,
             ]
         else:
             self.passes = passes
@@ -92,4 +99,6 @@ def transform(self) -> ExportedProgram:
                     f"Expecting ExportPass or ExportPass(), but got pass: {pass_} with type: {type(pass_)}"
                 )
             ep = _transform(ep, transform_pass)
+            logger.debug(f"Running {pass_.__name__} pass")
+            logger.debug(f"Transformed program: {ep}")
         return ep
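For intuition, `RemoveCloneOpsTransform` (imported above from `executorch/backends/transforms/remove_clone_ops`) strips clones that change nothing observable. A simplified FX sketch of that idea, not the actual implementation:

    # Simplified sketch only; the real pass handles more cases.
    import torch
    from torch.fx import GraphModule
    from executorch.exir.dialects._ops import ops as exir_ops

    def remove_redundant_clones(gm: GraphModule) -> GraphModule:
        for node in list(gm.graph.nodes):
            is_clone = (
                node.op == "call_function"
                and node.target == exir_ops.edge.aten.clone.default
            )
            # A clone that preserves memory format is a pure copy: rewire its
            # users to its input and drop the node.
            if is_clone and node.kwargs.get("memory_format") in (None, torch.preserve_format):
                node.replace_all_uses_with(node.args[0])
                gm.graph.erase_node(node)
        gm.recompile()
        return gm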

backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py

Lines changed: 4 additions & 1 deletion
@@ -493,7 +493,10 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
                 # The node requires nchw inputs
                 for input_node in node.all_input_nodes:
                     self.input_to_nchw(graph_module, input_node, node)
-            elif node.target == exir_ops.edge.aten._to_copy.default:
+            elif node.target in [
+                exir_ops.edge.aten._to_copy.default,
+                exir_ops.edge.aten.clone.default,
+            ]:
                 self.tag_node(node)
             else:
                 # The node can have inputs in any format (but all must be the
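Tagging `clone` alongside `_to_copy` is necessary because a clone with an explicit `memory_format` changes the tensor's physical layout even though its values are identical, so the pass must treat it as an NCHW/NHWC boundary. In eager PyTorch:

    import torch

    x = torch.randn(1, 3, 6, 6)                     # contiguous NCHW
    y = x.clone(memory_format=torch.channels_last)  # same values, NHWC storage

    assert torch.equal(x, y)                                   # logically identical
    assert y.is_contiguous(memory_format=torch.channels_last)  # relaid out in memory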

backends/xnnpack/operators/op_to_copy.py

Lines changed: 19 additions & 2 deletions
@@ -28,9 +28,7 @@
 )
 
 
-@register_node_visitor
 class ConvertMemoryFormat(NodeVisitor):
-    target = "aten._to_copy.default"
 
     def __init__(self, *args) -> None:
         super().__init__(*args)
@@ -54,6 +52,13 @@ def define_node(
         input_quant_params = QuantParams.from_inputs(input_node, self._exported_program)
         output_quant_params = QuantParams.from_outputs(node)
 
+        # Ensure input and output have the same dtype
+        input_dtype = input_node.meta["val"].dtype
+        output_dtype = node.meta["val"].dtype
+        assert (
+            input_dtype == output_dtype
+        ), f"Input dtype {input_dtype} must match output dtype {output_dtype} for {node.target}. Expected dtype to not change."
+
         permute_order = PERM_NCHW_TO_NHWC if to_channels_last else PERM_NHWC_TO_NCHW
 
         self.define_tensor(
@@ -89,3 +94,15 @@ def define_node(
             debug_handle=debug_handle,
         )
         xnn_graph.xnodes.append(ser_node)
+
+
+@register_node_visitor
+class ConvertMemoryFormatToCopy(ConvertMemoryFormat):
+
+    target = "aten._to_copy.default"
+
+
+@register_node_visitor
+class ConvertMemoryFormatClone(ConvertMemoryFormat):
+
+    target = "aten.clone.default"

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@
     CatConfig,
     CeilConfig,
     ClampConfig,
+    CloneDimOrderConfig,
     ConstantPadConfig,
     DeQuantizedPerTensorConfig,
     DivConfig,
@@ -117,4 +118,5 @@
     QuantizeAffineConfig,
     DeQuantizeAffineConfig,
     ChooseQParamsAffineConfig,
+    CloneDimOrderConfig,
 ]

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 8 additions & 0 deletions
@@ -454,6 +454,14 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT]
 
 
+class CloneDimOrderConfig(ToDimOrderCopyConfig):
+    target_name = "_clone_dim_order.default"
+
+    """
+    Similar to ToDimOrderCopyConfig, but with a different target name. The dtype should not change in either case.
+    """
+
+
 class MeanDimConfig(GenericNodePartitionerConfig):
     target_name = "mean.dim"
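The dtype constraint matters here: `CloneDimOrderConfig` inherits its checks from `ToDimOrderCopyConfig`, and the assert added in `op_to_copy.py` above backstops the same invariant at serialization time. As a rough sketch of the kind of check the config layer performs (the `check_constraints(node, ep)` hook and its use here are assumptions, not code from this commit):

    # Hypothetical sketch of a dtype guard in a partitioner config; the real
    # check is inherited from ToDimOrderCopyConfig.
    def check_constraints(self, node, ep) -> bool:
        # A clone is only a layout change; a dtype cast cannot lower to a
        # static transpose, so refuse to partition it.
        input_dtype = node.all_input_nodes[0].meta["val"].dtype
        output_dtype = node.meta["val"].dtype
        return input_dtype == output_dtype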

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+
+from executorch.backends.xnnpack.test.tester import Tester
+
+
+class TestCloneMemoryFormat(unittest.TestCase):
+    def setUp(self):
+        torch._dynamo.reset()
+
+    def run_tester(self, module, inputs):
+        tester = Tester(
+            module.eval(),
+            inputs,
+        )
+        tester.export().to_edge_transform_and_lower().check_not(
+            ["executorch_exir_dialects_edge__ops_aten_clone_default"]
+        ).to_executorch().serialize().run_method_and_compare_outputs()
+
+    class ChannelLastBeforeLinear(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(3, 3)
+
+        def forward(self, x):
+            y = x.clone(memory_format=torch.channels_last)
+            return self.linear(y)
+
+    ChannelLastBeforeLinearModule = ChannelLastBeforeLinear()
+
+    def test_channel_last_before_linear(self):
+        self.run_tester(self.ChannelLastBeforeLinearModule, (torch.randn(1, 3, 3, 3),))
+
+    class ContiguousBeforeConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            y = x.clone(memory_format=torch.contiguous_format)
+            return self.conv(y)
+
+    ContiguousBeforeConvModule = ContiguousBeforeConv()
+
+    def test_contiguous_before_conv(self):
+        self.run_tester(self.ContiguousBeforeConvModule, (torch.randn(1, 3, 6, 6),))
+
+    class CloneChannelsLastToContiguous(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            # Start with channels_last input
+            x_channels_last = x.to(memory_format=torch.channels_last)
+            # Clone to contiguous format
+            y = x_channels_last.clone(memory_format=torch.contiguous_format)
+            return self.conv(y)
+
+    CloneChannelsLastToContiguousModule = CloneChannelsLastToContiguous()
+
+    def test_clone_channels_last_to_contiguous(self):
+        self.run_tester(
+            self.CloneChannelsLastToContiguousModule, (torch.randn(1, 3, 6, 6),)
+        )
+
+    class CloneContiguousToChannelsLast(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            # Clone contiguous input to channels_last format
+            y = x.clone(memory_format=torch.channels_last)
+            return self.conv(y)
+
+    CloneContiguousToChannelsLastModule = CloneContiguousToChannelsLast()
+
+    def test_clone_contiguous_to_channels_last(self):
+        self.run_tester(
+            self.CloneContiguousToChannelsLastModule, (torch.randn(1, 3, 6, 6),)
+        )
+
+    class SimpleClone(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            # Simple clone without memory format (should default to contiguous)
+            y = x.clone()
+            return self.conv(y)
+
+    SimpleCloneModule = SimpleClone()
+
+    def test_simple_clone(self):
+        self.run_tester(self.SimpleCloneModule, (torch.randn(1, 3, 6, 6),))
+
+    class QuantizedClone(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+            self.conv2 = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            y = self.conv(x)
+            y = y.clone(memory_format=torch.contiguous_format)
+            return self.conv2(y)
+
+    QuantizedCloneModule = QuantizedClone()
+
+    def test_quantized_clone(self):
+        tester = Tester(
+            self.QuantizedCloneModule.eval(),
+            (torch.randn(1, 3, 9, 9),),
+        )
+
+        tester.quantize().export().to_edge_transform_and_lower().check_not(
+            [
+                "executorch_exir_dialects_edge__ops_aten_clone_default",
+                "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default",
+            ]
+        ).to_executorch().serialize().run_method_and_compare_outputs(qtol=1)
+
+    class ChainedClone(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            # Chain multiple clones with different memory formats
+            y = x.clone(memory_format=torch.channels_last)
+            z = y.clone(memory_format=torch.contiguous_format)
+            return self.conv(z)
+
+    ChainedCloneModule = ChainedClone()
+
+    def test_chained_clone(self):
+        self.run_tester(self.ChainedCloneModule, (torch.randn(1, 3, 6, 6),))
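Taken together, every test asserts via `check_not` that no `aten.clone.default` edge op survives `to_edge_transform_and_lower`: each clone must either be removed as redundant or absorbed into the XNNPACK delegate. The quantized variant additionally checks that the quantize op is consumed, and `qtol=1` tolerates an off-by-one difference in quantized units when comparing outputs.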
