
Commit 068ec0f

support channels last inputs in xnnpack
1 parent 9d726e8 commit 068ec0f

File tree

5 files changed: +215 -103 lines changed


backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py

Lines changed: 45 additions & 6 deletions
@@ -4,6 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import string
+from logging import FATAL
+from tokenize import String
 from typing import Optional, Tuple
 
 import torch
@@ -56,9 +59,9 @@ class ChannelsLastTaggedReshapePass(XNNPACKPass):
 
     # Set of ops that require memory format to be NCHW
     memory_sensitive_ops_nchw = {
-        "output",
         exir_ops.edge.aten.squeeze_copy.dim,
         exir_ops.edge.aten.unsqueeze_copy.default,
+        exir_ops.edge.aten.linear.default,
     }
 
     # Tag which is added to a node's meta to indicate that it uses NHWC format.
@@ -91,10 +94,20 @@ def is_nchw_node(self, node: torch.fx.Node) -> bool:
         return not self.is_nhwc_node(node)
 
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
-        return node.target in self.memory_sensitive_ops_nhwc
+        return (
+            node.target in self.memory_sensitive_ops_nhwc
+            or node.name == "output"
+            and not node.args[0][0].meta["val"].is_contiguous()
+        )
 
     def requires_nchw_inputs(self, node: torch.fx.Node) -> bool:
-        return node.target in self.memory_sensitive_ops_nchw
+        return (
+            node.target in self.memory_sensitive_ops_nchw
+            or node.name == "output"
+            and node.args[0][0]
+            .meta["val"]
+            .is_contiguous()  # Need to consider output trace so out matches
+        )
 
     def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         # There are two conditions that must be met for a node to be able to
@@ -269,8 +282,17 @@ def input_to_nhwc(
             # serializing graph, but don't do anything else here
             self.mark_as_nhwc_node(input_node)
 
-        if self.is_nhwc_node(input_node):
+        if input_node.op == "placeholder":
+            if not input_node.meta["val"][0].is_contiguous():
+                return
+        elif self.is_nhwc_node(input_node):
             return
+        # if (
+        #     self.is_nhwc_node(input_node)
+        #     or input_node.op == "placeholder"
+        #     and not input_node.meta["val"][0].is_contiguous()
+        # ):
+        #     return
 
         if not self.can_be_converted_to_nhwc(input_node):
             raise AssertionError(
@@ -333,8 +355,21 @@ def input_to_nchw(
             # do anything else here
             self.mark_as_nchw_node(input_node)
 
-        if self.is_nchw_node(input_node):
+        if input_node.op == "placeholder":
+            if input_node.meta["val"][0].is_contiguous():
+                return
+        elif self.is_nchw_node(input_node):
             return
+        # TODO
+        # meta trace happens before passes. At the end of pass, meta gets regenerated. eager mode assumes in/out stay same for conv. Linear has implicit nchw conv
+        # if (
+        #     self.is_nchw_node(
+        #         input_node
+        #     )  # This is triggering as x (placeholder) is tagged as nchw
+        #     or input_node.op == "placeholder"
+        #     and input_node.meta["val"][0].is_contiguous()
+        # ):
+        #     return
 
         if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta:
             # Already has an associated NCHW node
@@ -371,7 +406,11 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
                 # first input to be nhwc. This makes this node's output nhwc too
                 # Currently, all nodes like this should have all of their other
                 # inputs as nchw, so fail if this is not true
-                self.input_to_nhwc(graph_module, node.args[0], node)
+                if node.name == "output":
+                    self.input_to_nhwc(graph_module, node.args[0][0], node)
+                else:
+                    self.input_to_nhwc(graph_module, node.args[0], node)
+
                 for input_node in node.all_input_nodes[1:]:
                     if self.is_nhwc_node(input_node):
                         raise AssertionError(
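The pass now keys off the traced placeholder's memory format instead of assuming every graph input is NCHW: a non-contiguous (channels-last) input placeholder no longer gets an NCHW -> NHWC copy inserted, and the output node is converted based on whether its traced value is contiguous. Below is a minimal sketch of the placeholder check under a plain torch.export flow; the module and variable names are illustrative and not part of this commit (the pass itself additionally indexes into meta["val"]).

```python
import torch


class SimpleConv(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 3, 3)

    def forward(self, x):
        return self.conv(x)


# A channels-last example input makes the traced placeholder's fake tensor
# report is_contiguous() == False, which is what the pass inspects before
# deciding whether to insert a copy node.
example = torch.randn(1, 3, 8, 8).to(memory_format=torch.channels_last)
program = torch.export.export(SimpleConv().eval(), (example,))

for node in program.graph.nodes:
    if node.op == "placeholder":
        # Lifted parameter placeholders stay contiguous; the user input does not.
        print(node.name, node.meta["val"].is_contiguous())
```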

backends/xnnpack/runtime/XNNExecutor.cpp

Lines changed: 29 additions & 9 deletions
@@ -106,20 +106,24 @@ ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
         err == Error::Ok,
         Internal,
         "Failed to retrieve dim order from tensor!");
-    // ET_CHECK_OR_RETURN_ERROR(
-    //     is_contiguous_dim_order(dim_order, tensor->dim()),
-    //     Internal,
-    //     "Expecting default dim_order but got a non default dim_order tensor for external input %u",
-    //     i);
     size_t dims[XNN_MAX_TENSOR_DIMS];
     ET_CHECK_OR_RETURN_ERROR(
         num_dims <= XNN_MAX_TENSOR_DIMS,
         InvalidArgument,
         "XNNPACK backend accepts tensors with at most %d dims, but got %zu",
         XNN_MAX_TENSOR_DIMS,
         num_dims);
-    for (int d = 0; d < num_dims; ++d) {
-      dims[d] = tensor->size(d);
+
+    bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dims);
+    if (is_channels_last) {
+      dims[0] = tensor->size(0);
+      dims[1] = tensor->size(2);
+      dims[2] = tensor->size(3);
+      dims[3] = tensor->size(1);
+    } else {
+      for (int d = 0; d < num_dims; ++d) {
+        dims[d] = tensor->size(d);
+      }
     }
     status =
         xnn_reshape_external_value(runtime_.get(), ext_id, num_dims, dims);
@@ -220,8 +224,24 @@ ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const {
 
     // Convert new output shape into SizesType
     SizesType expected_output_size[kTensorDimensionLimit];
-    for (size_t d = 0; d < num_dim; ++d) {
-      expected_output_size[d] = static_cast<SizesType>(dims[d]);
+    executorch::aten::DimOrderType dim_order[kTensorDimensionLimit];
+    Error errr =
+        ET_RUNTIME_NAMESPACE::get_dim_order(*out_tensor, dim_order, num_dim);
+    ET_CHECK_OR_RETURN_ERROR(
+        errr == Error::Ok,
+        Internal,
+        "Failed to retrieve dim order from tensor!");
+
+    bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dim);
+    if (is_channels_last) {
+      expected_output_size[0] = static_cast<SizesType>(dims[0]);
+      expected_output_size[1] = static_cast<SizesType>(dims[3]);
+      expected_output_size[2] = static_cast<SizesType>(dims[1]);
+      expected_output_size[3] = static_cast<SizesType>(dims[2]);
+    } else {
+      for (size_t d = 0; d < num_dim; ++d) {
+        expected_output_size[d] = static_cast<SizesType>(dims[d]);
+      }
     }
 
     executorch::aten::ArrayRef<SizesType> output_size{
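In the runtime, tensor sizes stay in logical NCHW order while a channels-last external value is described to XNNPACK in physical NHWC order, so prepare_args permutes the sizes on the way in and resize_outputs permutes them back on the way out. A small sketch of the two index mappings above, written in Python for brevity; only the C++ in the diff is authoritative:

```python
def nchw_sizes_to_xnn_dims(sizes):
    # prepare_args: logical (N, C, H, W) -> physical (N, H, W, C)
    n, c, h, w = sizes
    return [n, h, w, c]


def xnn_dims_to_nchw_sizes(dims):
    # resize_outputs: physical (N, H, W, C) -> logical (N, C, H, W)
    n, h, w, c = dims
    return [n, c, h, w]


assert nchw_sizes_to_xnn_dims([1, 3, 8, 8]) == [1, 8, 8, 3]
assert xnn_dims_to_nchw_sizes(nchw_sizes_to_xnn_dims([1, 3, 8, 8])) == [1, 3, 8, 8]
```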

backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py

Lines changed: 119 additions & 79 deletions
@@ -7,6 +7,7 @@
 import unittest
 
 import torch
+from backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass
 from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
     ChannelsLastTaggedReshapePass,
 )
@@ -58,48 +59,87 @@ def setUp(self):
     # .run_method_and_compare_outputs()
     # )
 
-    # def test_channels_last_input_graph_transformation(self):
-    #     # Define a simple module for testing
-    #     class SimpleModule(torch.nn.Module):
-    #         def __init__(self):
-    #             super().__init__()
-    #             self.conv = torch.nn.Conv2d(3, 3, 3)
-    #         def forward(self, x):
-    #             return self.conv(x)
-    #     # Create a tester instance with NHWC input
-    #     tester = Tester(SimpleModule().eval(), (torch.randn(1, 3, 3, 3).to(memory_format=torch.channels_last),))
-    #     # Run the export and pass stages
-    #     tester.export().to_edge().run_passes(self.PassStage)
-    #     # Check the graph for expected nodes
-    #     tester.check_count({
-    #         "executorch_exir_dialects_edge__ops_aten__to_copy_default": 2, # should be 1 but its 2
-    #         "executorch_exir_dialects_edge__ops_aten_convolution_default": 1
-    #     })
-    #     tester.dump_artifact()
-
-    def test_nhwc_input(self):
-        class SimpleModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.conv = torch.nn.Conv2d(3, 3, 3)
-            def forward(self, x):
-                return self.conv(x)
-
-        tester = Tester(SimpleModule().eval(), (torch.randn(1, 3, 8, 8).to(memory_format=torch.channels_last),))
-
-        tester2 = Tester(SimpleModule().eval(), (torch.randn(1, 3, 8, 8).to(memory_format=torch.channels_last),))
-        tester2.export().to_edge().run_passes(self.PassStage).dump_artifact()
-
-
-        tester.export() \
-            .to_edge_transform_and_lower() \
-            .dump_artifact()\
-            .to_executorch() \
-            .dump_artifact()\
-            .serialize() \
-            .run_method_and_compare_outputs()
+    class LinearConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 3)
+
+        def forward(self, x):
+            y = self.linear1(x)
+            return self.conv1(y)
+
+    def test_conv_linear_dim_order_swaps_on_nhwc_input(self):
+        tester = Tester(
+            self.LinearConv().eval(),
+            (torch.randn(1, 3, 6, 4).to(memory_format=torch.channels_last),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_conv_linear_dim_order_swaps_on_nchw_input(self):
+        tester = Tester(
+            self.LinearConv().eval(),
+            (torch.randn(1, 3, 6, 4),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    class ConvLinearConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 4)
+
+        def forward(self, x):
+            y = self.conv1(x)
+            return self.linear1(y)
+
+    def test_linear_conv_dim_order_swaps_on_nhwc_input(self):
+        tester = Tester(
+            self.ConvLinearConv().eval(),
+            (torch.randn(1, 3, 6, 6).to(memory_format=torch.channels_last),),
+        )
 
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
 
+    def test_linear_conv_dim_order_swaps_on_nchw_input(self):
+        tester = Tester(
+            self.ConvLinearConv().eval(),
+            (torch.randn(1, 3, 6, 6),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    class Bilinear(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x):
+            return torch.nn.functional.interpolate(
+                x, scale_factor=2, mode="bilinear", align_corners=True
+            )
+
+    def test_nhwc_input_on_nhwc_op(self):
+        tester = Tester(
+            self.Bilinear().eval(),
+            (
+                torch.arange(8)
+                .reshape(1, 2, 2, 2)
+                .to(torch.float32)
+                .to(memory_format=torch.channels_last),
+            ),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_nchw_input_on_nhwc_op(self):
+        tester = Tester(
+            self.Bilinear().eval(),
+            (torch.arange(8).reshape(1, 2, 2, 2).to(torch.float32),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
 
     # def test_qs8_channels_last_tagged_reshape_pass(self):
     #     for module, num_reshape in self.modules.items():
@@ -190,45 +230,45 @@ def forward(self, x):
             return x
 
     # def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self):
-    # Copy #1 is for input to conv, nchw -> nhwc
-    # Copy #2 is for conv to _native_batch_norm_legit_no_training, nhwc -> nchw
-    # Copy #3 is for input to mean, nchw -> nhwc
-    # Copy #4 is for output, nhwc -> nchw
-
-    # The graph looks like:
-    # graph():
-    # %arg0_1 : [#users=1] = placeholder[target=arg0_1]
-    # %aten__to_copy_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%arg0_1,), kwargs = {memory_format: torch.channels_last})
-    # %_param_constant0 : [#users=1] = get_attr[target=_param_constant0]
-    # %_param_constant1 : [#users=1] = get_attr[target=_param_constant1]
-    # %aten_convolution_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten__to_copy_default, %_param_constant0, %_param_constant1, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
-    # %aten__to_copy_default_1 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_convolution_default,), kwargs = {memory_format: torch.contiguous_format})
-    # %_param_constant2 : [#users=1] = get_attr[target=_param_constant2]
-    # %_param_constant3 : [#users=1] = get_attr[target=_param_constant3]
-    # %_tensor_constant0 : [#users=1] = get_attr[target=_tensor_constant0]
-    # %_tensor_constant1 : [#users=1] = get_attr[target=_tensor_constant1]
-    # %aten__native_batch_norm_legit_no_training_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten__to_copy_default_1, %_param_constant2, %_param_constant3, %_tensor_constant0, %_tensor_constant1, 0.1, 1e-05), kwargs = {})
-    # %getitem : [#users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {})
-    # %aten_hardtanh_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0, 6), kwargs = {})
-    # %aten__to_copy_default_2 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_hardtanh_default,), kwargs = {memory_format: torch.channels_last})
-    # %aten_mean_dim : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten__to_copy_default_2, [-1, -2], True), kwargs = {})
-    # %aten__to_copy_default_3 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_mean_dim,), kwargs = {memory_format: torch.contiguous_format})
-    # return [aten__to_copy_default_3]
-    # (
-    #     Tester(
-    #         self.Conv2dBnHardtanhMeanSequenceModule().eval(),
-    #         (torch.randn(1, 1, 6, 6),),
-    #     )
-    #     .export()
-    #     .to_edge()
-    #     .run_passes(self.PassStage)
-    #     .check_count(
-    #         {
-    #             self.to_copy_name: 4,
-    #         }
-    #     )
-    #     .run_method_and_compare_outputs()
-    # )
+        # Copy #1 is for input to conv, nchw -> nhwc
+        # Copy #2 is for conv to _native_batch_norm_legit_no_training, nhwc -> nchw
+        # Copy #3 is for input to mean, nchw -> nhwc
+        # Copy #4 is for output, nhwc -> nchw
+
+        # The graph looks like:
+        # graph():
+        # %arg0_1 : [#users=1] = placeholder[target=arg0_1]
+        # %aten__to_copy_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%arg0_1,), kwargs = {memory_format: torch.channels_last})
+        # %_param_constant0 : [#users=1] = get_attr[target=_param_constant0]
+        # %_param_constant1 : [#users=1] = get_attr[target=_param_constant1]
+        # %aten_convolution_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten__to_copy_default, %_param_constant0, %_param_constant1, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
+        # %aten__to_copy_default_1 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_convolution_default,), kwargs = {memory_format: torch.contiguous_format})
+        # %_param_constant2 : [#users=1] = get_attr[target=_param_constant2]
+        # %_param_constant3 : [#users=1] = get_attr[target=_param_constant3]
+        # %_tensor_constant0 : [#users=1] = get_attr[target=_tensor_constant0]
+        # %_tensor_constant1 : [#users=1] = get_attr[target=_tensor_constant1]
+        # %aten__native_batch_norm_legit_no_training_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten__to_copy_default_1, %_param_constant2, %_param_constant3, %_tensor_constant0, %_tensor_constant1, 0.1, 1e-05), kwargs = {})
+        # %getitem : [#users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {})
+        # %aten_hardtanh_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0, 6), kwargs = {})
+        # %aten__to_copy_default_2 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_hardtanh_default,), kwargs = {memory_format: torch.channels_last})
+        # %aten_mean_dim : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten__to_copy_default_2, [-1, -2], True), kwargs = {})
+        # %aten__to_copy_default_3 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_mean_dim,), kwargs = {memory_format: torch.contiguous_format})
+        # return [aten__to_copy_default_3]
+        # (
+        #     Tester(
+        #         self.Conv2dBnHardtanhMeanSequenceModule().eval(),
+        #         (torch.randn(1, 1, 6, 6),),
+        #     )
+        #     .export()
+        #     .to_edge()
+        #     .run_passes(self.PassStage)
+        #     .check_count(
+        #         {
+        #             self.to_copy_name: 4,
+        #         }
+        #     )
+        #     .run_method_and_compare_outputs()
+        # )
 
     class Conv2dDynamicQuant(torch.nn.Module):
         def __init__(self):
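The new tests feed 4-D channels-last tensors directly into the lowered program. For reference, .to(memory_format=torch.channels_last) keeps the logical NCHW shape and only permutes the strides, which is why the sizes in the diffs above remain in NCHW order; a quick standalone illustration (not part of the commit):

```python
import torch

x = torch.randn(1, 3, 6, 4)
x_cl = x.to(memory_format=torch.channels_last)

print(x_cl.shape)    # torch.Size([1, 3, 6, 4]) -- logical shape unchanged
print(x_cl.stride())  # (72, 1, 12, 3) -- channel stride becomes 1 (NHWC layout)
print(x_cl.is_contiguous())                                   # False
print(x_cl.is_contiguous(memory_format=torch.channels_last))  # True
```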
