support channels last dim order in xnnpack

leafs1 · leafs1 · commit e359d50c5ec7 · 2025-05-27T15:57:47.000-07:00
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,17 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Debug CMake project",
+            "type": "lldb", // https://github.com/vadimcn/vscode-lldb
+            "request": "launch",
+            "program": "${command:cmake.launchTargetPath}",
+            "args": [
+                "--model_path=./add.pte",
+            ]
+        }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,69 @@
+{
+    "files.associations": {
+        "cstdlib": "cpp",
+        "__bit_reference": "cpp",
+        "__hash_table": "cpp",
+        "__locale": "cpp",
+        "__node_handle": "cpp",
+        "__split_buffer": "cpp",
+        "__tree": "cpp",
+        "__verbose_abort": "cpp",
+        "array": "cpp",
+        "bitset": "cpp",
+        "cctype": "cpp",
+        "charconv": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "complex": "cpp",
+        "condition_variable": "cpp",
+        "cstdarg": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "execution": "cpp",
+        "memory": "cpp",
+        "forward_list": "cpp",
+        "future": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "ios": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "list": "cpp",
+        "locale": "cpp",
+        "map": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "optional": "cpp",
+        "print": "cpp",
+        "queue": "cpp",
+        "ratio": "cpp",
+        "regex": "cpp",
+        "set": "cpp",
+        "shared_mutex": "cpp",
+        "sstream": "cpp",
+        "stack": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "string": "cpp",
+        "string_view": "cpp",
+        "typeindex": "cpp",
+        "typeinfo": "cpp",
+        "unordered_map": "cpp",
+        "unordered_set": "cpp",
+        "variant": "cpp",
+        "vector": "cpp",
+        "algorithm": "cpp",
+        "iterator": "cpp",
+        "tuple": "cpp",
+        "span": "cpp"
+    },
+    "C_Cpp.default.compilerPath": "/library/developer/commandlinetools/usr/bin/c++",
+    "python.analysis.typeCheckingMode": "off"
+}
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -104,6 +104,17 @@
                 "Windows"
             ]
         }
-    }
+    },
+    {
+      "name": "Executorch",
+      "displayName": "Executorch",
+      "description": "Sets Ninja generator, build and install directory",
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/out/build/${presetName}",
+      "cacheVariables": {
+          "CMAKE_BUILD_TYPE": "Debug",
+          "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}"
+      }
+  }
   ]
 }
diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -56,9 +56,9 @@ class ChannelsLastTaggedReshapePass(XNNPACKPass):
 
     # Set of ops that require memory format to be NCHW
     memory_sensitive_ops_nchw = {
-        "output",
         exir_ops.edge.aten.squeeze_copy.dim,
         exir_ops.edge.aten.unsqueeze_copy.default,
+        exir_ops.edge.aten.linear.default,
     }
 
     # Tag which is added to a node's meta to indicate that it uses NHWC format.
@@ -91,10 +91,20 @@ def is_nchw_node(self, node: torch.fx.Node) -> bool:
         return not self.is_nhwc_node(node)
 
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
-        return node.target in self.memory_sensitive_ops_nhwc
+        # The graph output is matched by op, since node names are not guaranteed
+        # to be "output"; it needs NHWC when its first value is non-contiguous.
+        if node.target in self.memory_sensitive_ops_nhwc:
+            return True
+        return node.op == "output" and not node.args[0][0].meta["val"].is_contiguous()
 
     def requires_nchw_inputs(self, node: torch.fx.Node) -> bool:
-        return node.target in self.memory_sensitive_ops_nchw
+        # Ops in memory_sensitive_ops_nchw always need NCHW input. The graph
+        # output needs it when the first value it returns is contiguous. The
+        # output is matched by op because node names are not guaranteed to
+        # be "output".
+        if node.target in self.memory_sensitive_ops_nchw:
+            return True
+        return node.op == "output" and node.args[0][0].meta["val"].is_contiguous()
 
     def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         # There are two conditions that must be met for a node to be able to
@@ -269,7 +279,10 @@ def input_to_nhwc(
             # serializing graph, but don't do anything else here
             self.mark_as_nhwc_node(input_node)
 
-        if self.is_nhwc_node(input_node):
+        if input_node.op == "placeholder":
+            if not input_node.meta["val"][0].is_contiguous():
+                return
+        elif self.is_nhwc_node(input_node):
             return
 
         if not self.can_be_converted_to_nhwc(input_node):
@@ -333,7 +346,10 @@ def input_to_nchw(
             # do anything else here
             self.mark_as_nchw_node(input_node)
 
-        if self.is_nchw_node(input_node):
+        if input_node.op == "placeholder":
+            if input_node.meta["val"][0].is_contiguous():
+                return
+        elif self.is_nchw_node(input_node):
             return
 
         if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta:
@@ -371,7 +387,11 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
                 # first input to be nhwc. This makes this node's output nhwc too
                 # Currently, all nodes like this should have all of their other
                 # inputs as nchw, so fail if this is not true
-                self.input_to_nhwc(graph_module, node.args[0], node)
+                if node.name == "output":
+                    self.input_to_nhwc(graph_module, node.args[0][0], node)
+                else:
+                    self.input_to_nhwc(graph_module, node.args[0], node)
+
                 for input_node in node.all_input_nodes[1:]:
                     if self.is_nhwc_node(input_node):
                         raise AssertionError(
diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp
@@ -106,20 +106,24 @@ ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
           err == Error::Ok,
           Internal,
           "Failed to retrieve dim order from tensor!");
-      ET_CHECK_OR_RETURN_ERROR(
-          is_contiguous_dim_order(dim_order, tensor->dim()),
-          Internal,
-          "Expecting default dim_order but got a non default dim_order tensor for external input %u",
-          i);
       size_t dims[XNN_MAX_TENSOR_DIMS];
       ET_CHECK_OR_RETURN_ERROR(
           num_dims <= XNN_MAX_TENSOR_DIMS,
           InvalidArgument,
           "XNNPACK backend accepts tensors with at most %d dims, but got %zu",
           XNN_MAX_TENSOR_DIMS,
           num_dims);
-      for (int d = 0; d < num_dims; ++d) {
-        dims[d] = tensor->size(d);
+
+      bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dims);
+      if (is_channels_last) {
+        dims[0] = tensor->size(0);
+        dims[1] = tensor->size(2);
+        dims[2] = tensor->size(3);
+        dims[3] = tensor->size(1);
+      } else {
+        for (int d = 0; d < num_dims; ++d) {
+          dims[d] = tensor->size(d);
+        }
       }
       status =
           xnn_reshape_external_value(runtime_.get(), ext_id, num_dims, dims);
@@ -220,8 +224,24 @@ ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const {
 
     // Convert new output shape into SizesType
     SizesType expected_output_size[kTensorDimensionLimit];
-    for (size_t d = 0; d < num_dim; ++d) {
-      expected_output_size[d] = static_cast<SizesType>(dims[d]);
+    executorch::aten::DimOrderType dim_order[kTensorDimensionLimit];
+    Error errr =
+          ET_RUNTIME_NAMESPACE::get_dim_order(*out_tensor, dim_order, num_dim);
+      ET_CHECK_OR_RETURN_ERROR(
+          errr == Error::Ok,
+          Internal,
+          "Failed to retrieve dim order from tensor!");
+
+    bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dim);
+    if (is_channels_last) {
+      expected_output_size[0] = static_cast<SizesType>(dims[0]);
+      expected_output_size[1] = static_cast<SizesType>(dims[3]);
+      expected_output_size[2] = static_cast<SizesType>(dims[1]);
+      expected_output_size[3] = static_cast<SizesType>(dims[2]);
+    } else {
+      for (size_t d = 0; d < num_dim; ++d) {
+        expected_output_size[d] = static_cast<SizesType>(dims[d]);
+      }
     }
 
     executorch::aten::ArrayRef<SizesType> output_size{
diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py
@@ -7,6 +7,7 @@
 import unittest
 
 import torch
+from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass
 from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
     ChannelsLastTaggedReshapePass,
 )
@@ -58,6 +59,88 @@ def test_fp32_channels_last_tagged_reshape_pass(self):
                 .run_method_and_compare_outputs()
             )
 
+    class LinearConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)
+            self.linear1 = torch.nn.Linear(in_features=4, out_features=3)
+
+        def forward(self, x):
+            # Linear is applied first; the convolution consumes its result.
+            return self.conv1(self.linear1(x))
+
+    def test_conv_linear_dim_order_swaps_on_nhwc_input(self):
+        """Export, lower, serialize and compare LinearConv on a channels-last input."""
+        module = self.LinearConv().eval()
+        nhwc_input = torch.randn(1, 3, 6, 4).to(memory_format=torch.channels_last)
+
+        lowered = Tester(module, (nhwc_input,)).export().to_edge_transform_and_lower()
+        lowered.to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_conv_linear_dim_order_swaps_on_nchw_input(self):
+        """Export, lower, serialize and compare LinearConv on a contiguous input."""
+        module = self.LinearConv().eval()
+        nchw_input = torch.randn(1, 3, 6, 4)
+
+        lowered = Tester(module, (nchw_input,)).export().to_edge_transform_and_lower()
+        lowered.to_executorch().serialize().run_method_and_compare_outputs()
+
+    class ConvLinearConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)
+            self.linear1 = torch.nn.Linear(in_features=4, out_features=4)
+
+        def forward(self, x):
+            # Only two ops run here: the convolution first, then Linear.
+            return self.linear1(self.conv1(x))
+
+    def test_linear_conv_dim_order_swaps_on_nhwc_input(self):
+        """Export, lower, serialize and compare ConvLinearConv on a channels-last input."""
+        module = self.ConvLinearConv().eval()
+        nhwc_input = torch.randn(1, 3, 6, 6).to(memory_format=torch.channels_last)
+
+        lowered = Tester(module, (nhwc_input,)).export().to_edge_transform_and_lower()
+        lowered.to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_linear_conv_dim_order_swaps_on_nchw_input(self):
+        """Export, lower, serialize and compare ConvLinearConv on a contiguous input."""
+        module = self.ConvLinearConv().eval()
+        nchw_input = torch.randn(1, 3, 6, 6)
+
+        lowered = Tester(module, (nchw_input,)).export().to_edge_transform_and_lower()
+        lowered.to_executorch().serialize().run_method_and_compare_outputs()
+
+    class Bilinear(torch.nn.Module):
+        """Bilinear interpolation with scale_factor=2 and align_corners=True."""
+
+        def forward(self, x):
+            resize = torch.nn.functional.interpolate
+            return resize(
+                x, scale_factor=2, mode="bilinear", align_corners=True
+            )
+
+    def test_nhwc_input_on_nhwc_op(self):
+        """Feed a channels-last input to the Bilinear module.
+
+        The input holds 0..7 as float32 in a (1, 2, 2, 2) tensor; the model is
+        exported, lowered, serialized and its outputs compared.
+        """
+        module = self.Bilinear().eval()
+        values = torch.arange(8).reshape(1, 2, 2, 2).to(torch.float32)
+        nhwc_input = values.to(memory_format=torch.channels_last)
+
+        lowered = Tester(module, (nhwc_input,)).export().to_edge_transform_and_lower()
+        lowered.to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_nchw_input_on_nhwc_op(self):
+        """Feed a contiguous (default layout) input to the Bilinear module."""
+        module = self.Bilinear().eval()
+        nchw_input = torch.arange(8).reshape(1, 2, 2, 2).to(torch.float32)
+
+        lowered = Tester(module, (nchw_input,)).export().to_edge_transform_and_lower()
+        lowered.to_executorch().serialize().run_method_and_compare_outputs()
+
     def test_qs8_channels_last_tagged_reshape_pass(self):
         for module, num_reshape in self.modules.items():
             (
diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py
@@ -31,6 +31,7 @@
 )
 from executorch.exir.backend.backend_api import validation_disabled
 from executorch.exir.backend.partitioner import Partitioner
+from executorch.exir.dim_order_utils import get_memory_format
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 
 from executorch.exir.print_program import pretty_print, print_program
@@ -533,10 +534,13 @@ def fn(x):
         # create random tensor inputs with the shapes given above:
         random_inputs = []
         for arg_idx in range(len(self.example_inputs)):
+            memFormat = get_memory_format(
+                list(self.example_inputs[arg_idx].dim_order())
+            )
             random_inputs.append(
-                torch.randn(input_shapes[arg_idx]).to(
-                    dtype=self.example_inputs[arg_idx].dtype
-                )
+                torch.randn(input_shapes[arg_idx])
+                .to(dtype=self.example_inputs[arg_idx].dtype)
+                .to(memory_format=memFormat)
             )
 
         yield tuple(random_inputs)
diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py
@@ -145,9 +145,6 @@ def preprocess(
 
         node_to_external_map = generate_node_to_external_map(ep, graph_module)
 
-        # Make sure all inputs are contiguous_format or NCHW or default dim order
-        assert_default_dim_order(graph_module)
-
         # TODO retrace the graph module to lift the new params may have
         # been added to the graph in passes
 
diff --git a/cmake_wrapper.sh b/cmake_wrapper.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Activate the .venv located next to this script (not a user-specific path), then forward all arguments to cmake.
+source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.venv/bin/activate"
+cmake "$@"

Original file line number	Diff line number	Diff line change
`@@ -104,6 +104,17 @@`
`104`	`104`	`"Windows"`
`105`	`105`	`]`
`106`	`106`	`}`
`107`		`- }`
	`107`	`+ },`
	`108`	`+ {`
	`109`	`+ "name": "Executorch",`
	`110`	`+ "displayName": "Executorch",`
	`111`	`+ "description": "Sets Ninja generator, build and install directory",`
	`112`	`+ "generator": "Ninja",`
	`113`	`+ "binaryDir": "${sourceDir}/out/build/${presetName}",`
	`114`	`+ "cacheVariables": {`
	`115`	`+ "CMAKE_BUILD_TYPE": "Debug",`
	`116`	`+ "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}"`
	`117`	`+ }`
	`118`	`+ }`
`108`	`119`	`]`
`109`	`120`	`}`