diff --git a/.Package.swift/.watchman-cookie-madragna-mac-84335-443 b/.Package.swift/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/.ci/.watchman-cookie-madragna-mac-84335-443 b/.ci/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/.github/.watchman-cookie-madragna-mac-84335-443 b/.github/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000000..e08db314050 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,26 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Debug CMake project", + "type": "lldb", // https://github.com/vadimcn/vscode-lldb + "request": "launch", + "program": "${command:cmake.launchTargetPath}", + "args": [ + "--model_path=./add.pte", + ] + }, + { + "name": "Debug python proj", + "type": "debugpy", + "request": "launch", + "module": "unittest", + "args": [ + "./backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py" + ] + }, + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000000..4139c8edeba --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,83 @@ +{ + "files.associations": { + "cstdlib": "cpp", + "__bit_reference": "cpp", + "__hash_table": "cpp", + "__locale": "cpp", + "__node_handle": "cpp", + "__split_buffer": "cpp", + "__tree": "cpp", + "__verbose_abort": "cpp", + "array": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "charconv": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "complex": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "execution": "cpp", + "memory": "cpp", + "forward_list": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "ios": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "list": "cpp", + "locale": "cpp", + "map": "cpp", + "mutex": "cpp", + "new": "cpp", + "optional": "cpp", + "print": "cpp", + "queue": "cpp", + "ratio": "cpp", + "regex": "cpp", + "set": "cpp", + "shared_mutex": "cpp", + "sstream": "cpp", + "stack": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "string": "cpp", + "string_view": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "variant": "cpp", + "vector": "cpp", + "algorithm": "cpp", + "iterator": "cpp", + "tuple": "cpp", + "span": "cpp", + "*.inc": "cpp", + "alignedvector3": "cpp" + }, + "C_Cpp.default.compilerPath": "/library/developer/commandlinetools/usr/bin/c++", + "python.analysis.typeCheckingMode": "off", + "python.testing.unittestArgs": [ + "-v", + "-s", + "./backends", + "-p", + "test_*.py" + ], + "python.testing.pytestEnabled": true, + "python.testing.unittestEnabled": false, + "python.testing.pytestArgs": [ + "." + ] +} diff --git a/backends/.watchman-cookie-madragna-mac-84335-443 b/backends/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/backends/apple/coreml/executorchcoreml.cpython-312-darwin.so b/backends/apple/coreml/executorchcoreml.cpython-312-darwin.so new file mode 100755 index 00000000000..dd956d400d3 Binary files /dev/null and b/backends/apple/coreml/executorchcoreml.cpython-312-darwin.so differ diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py index 768df1f4f04..8f4ee4a30f5 100644 --- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py @@ -4,6 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import string +from logging import FATAL +from tokenize import String from typing import Optional, Tuple import torch @@ -56,9 +59,9 @@ class ChannelsLastTaggedReshapePass(XNNPACKPass): # Set of ops that require memory format to be NCHW memory_sensitive_ops_nchw = { - "output", exir_ops.edge.aten.squeeze_copy.dim, exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.linear.default, } # Tag which is added to a node's meta to indicate that it uses NHWC format. @@ -91,10 +94,20 @@ def is_nchw_node(self, node: torch.fx.Node) -> bool: return not self.is_nhwc_node(node) def requires_nhwc_input(self, node: torch.fx.Node) -> bool: - return node.target in self.memory_sensitive_ops_nhwc + return ( + node.target in self.memory_sensitive_ops_nhwc + or node.name == "output" + and not node.args[0][0].meta["val"].is_contiguous() + ) def requires_nchw_inputs(self, node: torch.fx.Node) -> bool: - return node.target in self.memory_sensitive_ops_nchw + return ( + node.target in self.memory_sensitive_ops_nchw + or node.name == "output" + and node.args[0][0] + .meta["val"] + .is_contiguous() # Need to consider output trace so out matches + ) def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool: # There are two conditions that must be met for a node to be able to @@ -269,8 +282,17 @@ def input_to_nhwc( # serializing graph, but don't do anything else here self.mark_as_nhwc_node(input_node) - if self.is_nhwc_node(input_node): + if input_node.op == "placeholder": + if not input_node.meta["val"][0].is_contiguous(): + return + elif self.is_nhwc_node(input_node): return + # if ( + # self.is_nhwc_node(input_node) + # or input_node.op == "placeholder" + # and not input_node.meta["val"][0].is_contiguous() + # ): + # return if not self.can_be_converted_to_nhwc(input_node): raise AssertionError( @@ -333,8 +355,21 @@ def input_to_nchw( # do anything else here self.mark_as_nchw_node(input_node) - if self.is_nchw_node(input_node): + if input_node.op == "placeholder": + if input_node.meta["val"][0].is_contiguous(): + return + elif self.is_nchw_node(input_node): return + # TODO + # meta trace happens before passes. At the end of pass, meta gets regenerated. eager mode assumes in/out stay same for conv. Linear has implicit nchw conv + # if ( + # self.is_nchw_node( + # input_node + # ) # This is triggering as x (placeholder) is tagged as nchw + # or input_node.op == "placeholder" + # and input_node.meta["val"][0].is_contiguous() + # ): + # return if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta: # Already has an associated NCHW node @@ -371,7 +406,11 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901 # first input to be nhwc. This makes this node's output nhwc too # Currently, all nodes like this should have all of their other # inputs as nchw, so fail if this is not true - self.input_to_nhwc(graph_module, node.args[0], node) + if node.name == "output": + self.input_to_nhwc(graph_module, node.args[0][0], node) + else: + self.input_to_nhwc(graph_module, node.args[0], node) + for input_node in node.all_input_nodes[1:]: if self.is_nhwc_node(input_node): raise AssertionError( diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 68cb4b4d885..c870476d65d 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -106,11 +106,6 @@ ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { err == Error::Ok, Internal, "Failed to retrieve dim order from tensor!"); - ET_CHECK_OR_RETURN_ERROR( - is_contiguous_dim_order(dim_order, tensor->dim()), - Internal, - "Expecting default dim_order but got a non default dim_order tensor for external input %u", - i); size_t dims[XNN_MAX_TENSOR_DIMS]; ET_CHECK_OR_RETURN_ERROR( num_dims <= XNN_MAX_TENSOR_DIMS, @@ -118,8 +113,17 @@ ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { "XNNPACK backend accepts tensors with at most %d dims, but got %zu", XNN_MAX_TENSOR_DIMS, num_dims); - for (int d = 0; d < num_dims; ++d) { - dims[d] = tensor->size(d); + + bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dims); + if (is_channels_last) { + dims[0] = tensor->size(0); + dims[1] = tensor->size(2); + dims[2] = tensor->size(3); + dims[3] = tensor->size(1); + } else { + for (int d = 0; d < num_dims; ++d) { + dims[d] = tensor->size(d); + } } status = xnn_reshape_external_value(runtime_.get(), ext_id, num_dims, dims); @@ -220,8 +224,24 @@ ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const { // Convert new output shape into SizesType SizesType expected_output_size[kTensorDimensionLimit]; - for (size_t d = 0; d < num_dim; ++d) { - expected_output_size[d] = static_cast(dims[d]); + executorch::aten::DimOrderType dim_order[kTensorDimensionLimit]; + Error errr = + ET_RUNTIME_NAMESPACE::get_dim_order(*out_tensor, dim_order, num_dim); + ET_CHECK_OR_RETURN_ERROR( + errr == Error::Ok, + Internal, + "Failed to retrieve dim order from tensor!"); + + bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dim); + if (is_channels_last) { + expected_output_size[0] = static_cast(dims[0]); + expected_output_size[1] = static_cast(dims[3]); + expected_output_size[2] = static_cast(dims[1]); + expected_output_size[3] = static_cast(dims[2]); + } else { + for (size_t d = 0; d < num_dim; ++d) { + expected_output_size[d] = static_cast(dims[d]); + } } executorch::aten::ArrayRef output_size{ diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py index a00209f4ea6..b475a031f80 100644 --- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py +++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py @@ -7,6 +7,7 @@ import unittest import torch +from backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ( ChannelsLastTaggedReshapePass, ) @@ -43,41 +44,123 @@ def setUp(self): ) dynamic_quant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_tensor" - def test_fp32_channels_last_tagged_reshape_pass(self): - for module, num_reshape in self.modules.items(): - ( - Tester(module, (torch.randn(1, 1, 6, 6),)) - .export() - .to_edge() - .run_passes(self.PassStage) - .check_count( - { - self.to_copy_name: num_reshape, - } - ) - .run_method_and_compare_outputs() + # def test_fp32_channels_last_tagged_reshape_pass(self): + # for module, num_reshape in self.modules.items(): + # ( + # Tester(module, (torch.randn(1, 1, 6, 6),)) + # .export() + # .to_edge() + # .run_passes(self.PassStage) + # .check_count( + # { + # self.to_copy_name: num_reshape, + # } + # ) + # .run_method_and_compare_outputs() + # ) + + class LinearConv(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 3, 3) + self.linear1 = torch.nn.Linear(4, 3) + + def forward(self, x): + y = self.linear1(x) + return self.conv1(y) + + def test_conv_linear_dim_order_swaps_on_nhwc_input(self): + tester = Tester( + self.LinearConv().eval(), + (torch.randn(1, 3, 6, 4).to(memory_format=torch.channels_last),), + ) + + tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs() + + def test_conv_linear_dim_order_swaps_on_nchw_input(self): + tester = Tester( + self.LinearConv().eval(), + (torch.randn(1, 3, 6, 4),), + ) + + tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs() + + class ConvLinearConv(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 3, 3) + self.linear1 = torch.nn.Linear(4, 4) + + def forward(self, x): + y = self.conv1(x) + return self.linear1(y) + + def test_linear_conv_dim_order_swaps_on_nhwc_input(self): + tester = Tester( + self.ConvLinearConv().eval(), + (torch.randn(1, 3, 6, 6).to(memory_format=torch.channels_last),), + ) + + tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs() + + def test_linear_conv_dim_order_swaps_on_nchw_input(self): + tester = Tester( + self.ConvLinearConv().eval(), + (torch.randn(1, 3, 6, 6),), + ) + + tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs() + + class Bilinear(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.nn.functional.interpolate( + x, scale_factor=2, mode="bilinear", align_corners=True ) - def test_qs8_channels_last_tagged_reshape_pass(self): - for module, num_reshape in self.modules.items(): + def test_nhwc_input_on_nhwc_op(self): + tester = Tester( + self.Bilinear().eval(), ( - Tester(module, (torch.randn(1, 1, 6, 6),)) - .quantize() - .export() - .to_edge() - .run_passes(self.PassStage) - .check( - [ - self.quant_name, - self.dequant_name, - self.to_copy_name, - self.quant_name, - self.dequant_name, - ] - * num_reshape - ) - .run_method_and_compare_outputs() - ) + torch.arange(8) + .reshape(1, 2, 2, 2) + .to(torch.float32) + .to(memory_format=torch.channels_last), + ), + ) + + tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs() + + def test_nchw_input_on_nhwc_op(self): + tester = Tester( + self.Bilinear().eval(), + (torch.arange(8).reshape(1, 2, 2, 2).to(torch.float32),), + ) + + tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs() + + # def test_qs8_channels_last_tagged_reshape_pass(self): + # for module, num_reshape in self.modules.items(): + # ( + # Tester(module, (torch.randn(1, 1, 6, 6),)) + # .quantize() + # .export() + # .to_edge() + # .run_passes(self.PassStage) + # .check( + # [ + # self.quant_name, + # self.dequant_name, + # self.to_copy_name, + # self.quant_name, + # self.dequant_name, + # ] + # * num_reshape + # ) + # .run_method_and_compare_outputs() + # ) class ConvRelu(torch.nn.Module): def __init__(self): @@ -88,39 +171,39 @@ def __init__(self): def forward(self, x): return self.relu(self.conv(x)) - def test_fp32_channels_last_tagged_reshape_pass_conv_relu(self): - ( - Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),)) - .export() - .to_edge() - .run_passes(self.PassStage) - .check( - [self.to_copy_name, self.conv_name, self.relu_name, self.to_copy_name] - ) - .run_method_and_compare_outputs() - ) + # def test_fp32_channels_last_tagged_reshape_pass_conv_relu(self): + # ( + # Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),)) + # .export() + # .to_edge() + # .run_passes(self.PassStage) + # .check( + # [self.to_copy_name, self.conv_name, self.relu_name, self.to_copy_name] + # ) + # .run_method_and_compare_outputs() + # ) - def test_qs8_channels_last_tagged_reshape_pass_conv_relu(self): - ( - Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),)) - .quantize() - .export() - .to_edge() - .run_passes(self.PassStage) - .check( - [ - self.to_copy_name, - self.quant_name, - self.dequant_name, - self.conv_name, - self.relu_name, - self.quant_name, - self.dequant_name, - self.to_copy_name, - ] - ) - .run_method_and_compare_outputs() - ) + # def test_qs8_channels_last_tagged_reshape_pass_conv_relu(self): + # ( + # Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),)) + # .quantize() + # .export() + # .to_edge() + # .run_passes(self.PassStage) + # .check( + # [ + # self.to_copy_name, + # self.quant_name, + # self.dequant_name, + # self.conv_name, + # self.relu_name, + # self.quant_name, + # self.dequant_name, + # self.to_copy_name, + # ] + # ) + # .run_method_and_compare_outputs() + # ) class Conv2dBnHardtanhMeanSequenceModule(torch.nn.Module): def __init__(self): @@ -146,46 +229,46 @@ def forward(self, x): x = torch.mean(x, (-1, -2), keepdim=True) return x - def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self): - # Copy #1 is for input to conv, nchw -> nhwc - # Copy #2 is for conv to _native_batch_norm_legit_no_training, nhwc -> nchw - # Copy #3 is for input to mean, nchw -> nhwc - # Copy #4 is for output, nhwc -> nchw - - # The graph looks like: - # graph(): - # %arg0_1 : [#users=1] = placeholder[target=arg0_1] - # %aten__to_copy_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%arg0_1,), kwargs = {memory_format: torch.channels_last}) - # %_param_constant0 : [#users=1] = get_attr[target=_param_constant0] - # %_param_constant1 : [#users=1] = get_attr[target=_param_constant1] - # %aten_convolution_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten__to_copy_default, %_param_constant0, %_param_constant1, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) - # %aten__to_copy_default_1 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_convolution_default,), kwargs = {memory_format: torch.contiguous_format}) - # %_param_constant2 : [#users=1] = get_attr[target=_param_constant2] - # %_param_constant3 : [#users=1] = get_attr[target=_param_constant3] - # %_tensor_constant0 : [#users=1] = get_attr[target=_tensor_constant0] - # %_tensor_constant1 : [#users=1] = get_attr[target=_tensor_constant1] - # %aten__native_batch_norm_legit_no_training_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten__to_copy_default_1, %_param_constant2, %_param_constant3, %_tensor_constant0, %_tensor_constant1, 0.1, 1e-05), kwargs = {}) - # %getitem : [#users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {}) - # %aten_hardtanh_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0, 6), kwargs = {}) - # %aten__to_copy_default_2 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_hardtanh_default,), kwargs = {memory_format: torch.channels_last}) - # %aten_mean_dim : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten__to_copy_default_2, [-1, -2], True), kwargs = {}) - # %aten__to_copy_default_3 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_mean_dim,), kwargs = {memory_format: torch.contiguous_format}) - # return [aten__to_copy_default_3] - ( - Tester( - self.Conv2dBnHardtanhMeanSequenceModule().eval(), - (torch.randn(1, 1, 6, 6),), - ) - .export() - .to_edge() - .run_passes(self.PassStage) - .check_count( - { - self.to_copy_name: 4, - } - ) - .run_method_and_compare_outputs() - ) + # def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self): + # Copy #1 is for input to conv, nchw -> nhwc + # Copy #2 is for conv to _native_batch_norm_legit_no_training, nhwc -> nchw + # Copy #3 is for input to mean, nchw -> nhwc + # Copy #4 is for output, nhwc -> nchw + + # The graph looks like: + # graph(): + # %arg0_1 : [#users=1] = placeholder[target=arg0_1] + # %aten__to_copy_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%arg0_1,), kwargs = {memory_format: torch.channels_last}) + # %_param_constant0 : [#users=1] = get_attr[target=_param_constant0] + # %_param_constant1 : [#users=1] = get_attr[target=_param_constant1] + # %aten_convolution_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten__to_copy_default, %_param_constant0, %_param_constant1, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) + # %aten__to_copy_default_1 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_convolution_default,), kwargs = {memory_format: torch.contiguous_format}) + # %_param_constant2 : [#users=1] = get_attr[target=_param_constant2] + # %_param_constant3 : [#users=1] = get_attr[target=_param_constant3] + # %_tensor_constant0 : [#users=1] = get_attr[target=_tensor_constant0] + # %_tensor_constant1 : [#users=1] = get_attr[target=_tensor_constant1] + # %aten__native_batch_norm_legit_no_training_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten__to_copy_default_1, %_param_constant2, %_param_constant3, %_tensor_constant0, %_tensor_constant1, 0.1, 1e-05), kwargs = {}) + # %getitem : [#users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {}) + # %aten_hardtanh_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0, 6), kwargs = {}) + # %aten__to_copy_default_2 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_hardtanh_default,), kwargs = {memory_format: torch.channels_last}) + # %aten_mean_dim : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten__to_copy_default_2, [-1, -2], True), kwargs = {}) + # %aten__to_copy_default_3 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_mean_dim,), kwargs = {memory_format: torch.contiguous_format}) + # return [aten__to_copy_default_3] + # ( + # Tester( + # self.Conv2dBnHardtanhMeanSequenceModule().eval(), + # (torch.randn(1, 1, 6, 6),), + # ) + # .export() + # .to_edge() + # .run_passes(self.PassStage) + # .check_count( + # { + # self.to_copy_name: 4, + # } + # ) + # .run_method_and_compare_outputs() + # ) class Conv2dDynamicQuant(torch.nn.Module): def __init__(self): @@ -195,28 +278,28 @@ def __init__(self): def forward(self, x): return self.conv(x) - def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None: - ( - Tester(self.Conv2dDynamicQuant().eval(), (torch.randn(1, 3, 8, 8),)) - .quantize( - Quantize( - quantization_config=get_symmetric_quantization_config( - is_dynamic=True - ) - ) - ) - .export() - .to_edge() - .run_passes(self.PassStage) - .check( - [ - self.to_copy_name, - self.choose_qparams_name, - self.dynamic_quant_name, - self.dequant_name, - self.conv_name, - self.to_copy_name, - ] - ) - .run_method_and_compare_outputs() - ) + # def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None: + # ( + # Tester(self.Conv2dDynamicQuant().eval(), (torch.randn(1, 3, 8, 8),)) + # .quantize( + # Quantize( + # quantization_config=get_symmetric_quantization_config( + # is_dynamic=True + # ) + # ) + # ) + # .export() + # .to_edge() + # .run_passes(self.PassStage) + # .check( + # [ + # self.to_copy_name, + # self.choose_qparams_name, + # self.dynamic_quant_name, + # self.dequant_name, + # self.conv_name, + # self.to_copy_name, + # ] + # ) + # .run_method_and_compare_outputs() + # ) diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index fa8edd3e03c..0ed9434d807 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -31,6 +31,7 @@ ) from executorch.exir.backend.backend_api import validation_disabled from executorch.exir.backend.partitioner import Partitioner +from executorch.exir.dim_order_utils import get_memory_format from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.print_program import pretty_print, print_program @@ -533,10 +534,13 @@ def fn(x): # create random tensor inputs with the shapes given above: random_inputs = [] for arg_idx in range(len(self.example_inputs)): + memFormat = get_memory_format( + list(self.example_inputs[arg_idx].dim_order()) + ) random_inputs.append( - torch.randn(input_shapes[arg_idx]).to( - dtype=self.example_inputs[arg_idx].dtype - ) + torch.randn(input_shapes[arg_idx]) + .to(dtype=self.example_inputs[arg_idx].dtype) + .to(memory_format=memFormat) ) yield tuple(random_inputs) diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index 84cdfd69a48..d8892b179cf 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -145,9 +145,6 @@ def preprocess( node_to_external_map = generate_node_to_external_map(ep, graph_module) - # Make sure all inputs are contiguous_format or NCHW or default dim order - assert_default_dim_order(graph_module) - # TODO retrace the graph module to lift the new params may have # been added to the graph in passes diff --git a/cmake_wrapper.sh b/cmake_wrapper.sh new file mode 100755 index 00000000000..ec904b42cf1 --- /dev/null +++ b/cmake_wrapper.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +source /Users/madragna/executorch/.venv/bin/activate +cmake "$@" diff --git a/codegen/.watchman-cookie-madragna-mac-84335-443 b/codegen/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/configurations/.watchman-cookie-madragna-mac-84335-443 b/configurations/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/data/bin/__init__.py b/data/bin/__init__.py new file mode 100644 index 00000000000..0c9d60e0498 --- /dev/null +++ b/data/bin/__init__.py @@ -0,0 +1,44 @@ +# This file should be written to the wheel package as +# `executorch/data/bin/__init__.py`. +# +# Setuptools will expect to be able to say something like `from +# executorch.data.bin import mybin; mybin()` for each entry listed in the +# [project.scripts] section of pyproject.toml. This file makes the `mybin()` +# function execute the binary at `executorch/data/bin/mybin` and exit with that +# binary's exit status. + +import subprocess +import os +import sys +import types + +# This file should live in the target `bin` directory. +_bin_dir = os.path.join(os.path.dirname(__file__)) + +def _find_executable_files_under(dir): + """Lists all executable files in the given directory.""" + bin_names = [] + for filename in os.listdir(dir): + filepath = os.path.join(dir, filename) + if os.path.isfile(filepath) and os.access(filepath, os.X_OK): + # Remove .exe suffix on windows. + filename_without_ext = os.path.splitext(filename)[0] + bin_names.append(filename_without_ext) + return bin_names + +# The list of binaries to create wrapper functions for. +_bin_names = _find_executable_files_under(_bin_dir) + +# We'll define functions named after each binary. Make them importable. +__all__ = _bin_names + +def _run(name): + """Runs the named binary, which should live under _bin_dir. + + Exits the current process with the return code of the subprocess. + """ + raise SystemExit(subprocess.call([os.path.join(_bin_dir, name)] + sys.argv[1:], close_fds=False)) + +# Define a function named after each of the binaries. +for bin_name in _bin_names: + exec(f"def {bin_name}(): _run('{bin_name}')") diff --git a/data/bin/flatc b/data/bin/flatc new file mode 100755 index 00000000000..a4ec7b2bac1 Binary files /dev/null and b/data/bin/flatc differ diff --git a/devtools/.watchman-cookie-madragna-mac-84335-443 b/devtools/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/devtools/bundled_program/serialize/bundled_program_schema.fbs b/devtools/bundled_program/serialize/bundled_program_schema.fbs new file mode 100644 index 00000000000..b37164a410d --- /dev/null +++ b/devtools/bundled_program/serialize/bundled_program_schema.fbs @@ -0,0 +1,97 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// +// See README.md before modifying this file. +// + +include "scalar_type.fbs"; + +namespace bundled_program_flatbuffer; + +// Identifier of a valid bundled program schema. +file_identifier "BP08"; +// Extension of written files. +file_extension "bpte"; + +// Reason for basic struct: union value type can only be table/struct/string +table Int { + int_val:long; +} + +table Bool { + bool_val:bool; +} + +table Double { + double_val:double; +} + +// All information we need to bundle for a tensor EValue input. +table Tensor { + // The scalar type of Tensor + scalar_type: executorch_flatbuffer.ScalarType; + // The target sizes of the tensor. + sizes: [int]; + // The contents of the corresponding input tensor. + data: [ubyte] (force_align: 16); + dim_order:[ubyte]; +} + +union ValueUnion { + Tensor, + Int, + Bool, + Double, +} + +// Abstraction for BundledMethodTestCase values +table Value { + val: ValueUnion; +} + +// A single test for a method. The provided inputs should produce the +// expected outputs. +table BundledMethodTestCase { + // The inputs to provide to the method. The number and types of inputs must + // match the schema of the method under test. + inputs: [Value]; + + // The expected outputs generated while running the model in eager mode using + // the inputs provided. Its length should be equal to the length of program + // outputs. + expected_outputs: [Value]; +} + +// Collection of test cases for a program method. +table BundledMethodTestSuite { + // The name of the method to test; e.g., "forward" for the forward() method + // of an nn.Module. This name match a method defined by the ExecuTorch + // program. + method_name: string; + + // Individual test cases for the method. + test_cases: [BundledMethodTestCase]; +} + + +// Executorch program bunlded with data for verification. +table BundledProgram { + // Schema version. + version:uint; + + // Test sets to run against the program. + // Each BundledMethodTestSuite should be used for the method of program sharing same name. + method_test_suites: [BundledMethodTestSuite]; + + // The binary data of a serialized Executorch program. + // The following `force_align` may sliently override any larger force_align + // used in the program. Therefore, to keep the data (including constant + // tensor, delegate data, etc, see schema.fbs for more info) in the + // executorch program keeps the same alignment as original no matter how + // the program schema changes, we need to make the force_align here the max + // one around all kinds of force_align in the current and future program + // schema, so we use the 32 as the force_align here. + program: [ubyte] (force_align: 32); +} + +root_type BundledProgram; diff --git a/devtools/bundled_program/serialize/scalar_type.fbs b/devtools/bundled_program/serialize/scalar_type.fbs new file mode 100644 index 00000000000..e9c830b972c --- /dev/null +++ b/devtools/bundled_program/serialize/scalar_type.fbs @@ -0,0 +1,43 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// +// See README.md before modifying this file. +// + +namespace executorch_flatbuffer; + +// The scalar data type. +// Must match executorch/runtime/core/portable_type/tensor_impl.h +enum ScalarType : byte { + BYTE = 0, + CHAR = 1, + SHORT = 2, + INT = 3, + LONG = 4, + HALF = 5, + FLOAT = 6, + DOUBLE = 7, + BOOL = 11, + QINT8 = 12, + QUINT8 = 13, + QINT32 = 14, + QUINT4X2 = 16, + QUINT2X4 = 17, + BITS16 = 22, + FLOAT8E5M2 = 23, + FLOAT8E4M3FN = 24, + FLOAT8E5M2FNUZ = 25, + FLOAT8E4M3FNUZ = 26, + UINT16 = 27, + UINT32 = 28, + UINT64 = 29, + // Types currently not implemented. + // COMPLEXHALF = 8, + // COMPLEXFLOAT = 9, + // COMPLEXDOUBLE = 10, + // BFLOAT16 = 15, + // BITS1x8 = 18, + // BITS2x4 = 19, + // BITS4x2 = 20, + // BITS8 = 21, +} diff --git a/docs/.watchman-cookie-madragna-mac-84335-443 b/docs/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/examples/.watchman-cookie-madragna-mac-84335-443 b/examples/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/exir/.watchman-cookie-madragna-mac-84335-443 b/exir/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/exir/_serialize/program.fbs b/exir/_serialize/program.fbs new file mode 100644 index 00000000000..7308cc63199 --- /dev/null +++ b/exir/_serialize/program.fbs @@ -0,0 +1,489 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// +// See README.md before modifying this file. +// + +include "scalar_type.fbs"; + +namespace executorch_flatbuffer; + +// Identifier of a valid executor schema. +file_identifier "ET12"; +// Extension of written files. +file_extension "pte"; + +// Table that contains the metadata about how +// to unflatten the flattened input/output from compiler +table ContainerMetadata { + encoded_inp_str: string; + encoded_out_str: string; +} + +table Null {} + +// Contains information relevant to the allocation of non-constant +// buffer data (e.g. from tensors). +// This refers to where the buffer needs to be placed in an existing +// memory and at what offset from its base address. +table AllocationDetails { + memory_id: uint; // ID of the memory where this data needs to be placed. + + // Offset in bytes relative to the start of the memory area indicated by + // memory_id. + // + // Originally this field was a single 32-bit uint, but we need 64 bits for + // larger models. To preserve backwards compatibility, the high bits are + // managed in a separate 32-bit field. Users should combine the two fields + // to get the full 64-bit offset. + memory_offset_low: uint; // Least significant 32 bits + memory_offset_high: uint; // Most significant 32 bits. Defaults to zero. +} + +// Indicates the types of shape a Tensor may have, from the point +// of view of their dynamism. +enum TensorShapeDynamism : byte { + // Static shape. Memory is allocated by the compiler. + STATIC = 0, + // Dynamic shape but with an upper bound. + // Memory is allocated by the compiler. + DYNAMIC_BOUND = 1, + // Dynamic shape without upper bound. + // Memory allocation is handled by the runtime. + DYNAMIC_UNBOUND = 2, +} + +// Indicates where a tensor is stored. +enum TensorDataLocation : byte { + // Stored in a segment of the PTE file. + SEGMENT = 0, + // Stored outside of the PTE file. + EXTERNAL = 1, +} + +// Table to put additional information about tensors in that is not applicable +// to the vast majority of tensors in the vast majority of programs. +table ExtraTensorInfo { + // [Optional] Specifies the SubsegmentOffsets in + // program.mutable_data_segments that specifies where the data is located in. + // If not present and the data is located in a segment, then the data is in + // index zero. + mutable_data_segments_idx: uint64; + + // [Optional] The unique name of the tensor. e.g. 'mod.linear.weight' + fully_qualified_name: string; + + // [Optional] Specifies where the tensor's data is stored. + // - SEGMENT (default): Data is stored in a segment. + // - EXTERNAL: Data is stored outside of the PTE file. fully_qualified_name + // must be non-empty, and is used as a key to find the tensor's external + // data. Tensor.data_buffer_idx is ignored. + location: TensorDataLocation; +} + +table Tensor { + scalar_type: ScalarType; + + // Offset in scalar_type elements (e.g., multiples of 4 bytes for an int + // scalar type) from the beginning of the tensor buffer to the beginning of + // the actual data. Currently, the runtime only supports a value of zero. + storage_offset: int; + + sizes: [int]; + + // Specifies in what order the dimensions are laid out in memory (from outer + // to inner). + // + // For example, given a rank 3 Tensor of size (3, 5, 2). If we name + // dimensions: [row, column, batch], then a dim_order of: + // - (2, 0, 1) represents a [batch, row, column] ordering where "column" is + // the innermost dimension, then comes "row", and the outermost dimension is + // "batch". + // - (0, 2, 1) represents a [row, batch, column] ordering where "column" is + // the innermost dimension, then comes "batch", and the outermost dimension + // is "row". + dim_order: [ubyte]; + + // out of scope M1 + requires_grad: bool; + + // Overall, a Tensor is either constant or mutable. At method load time + // constant tensors receive a dataptr into the serialized program. Mutable + // tensors can either receive a pointer from the heirarchical allocator or a + // nullptr if they will receive a data pointer at execution time (inputs + // and control flow placeholders can be like this). Mutable tensors may or + // may not also have an initial value in the serialized program. + // + // In summary: + // data_buffer_idx > 0, allocation_info = Null: Tensor is a constant. + // data_buffer_idx = 0, allocation_info = Non Null: Tensor is mutable and + // will receive a dataptr at method load time. + // data_buffer_idx = 0, allocation_info = Null: Tensor is mutable and + // will receive a dataptr at input time or during execution. + // data_buffer_idx > 0, allocation_info = Non Null: Tensor is mutable and + // will receive a dataptr at method load time, and has an initial state. + // + // Tensor data is stored inline if program.constant_buffer is null. Otherwise + // it is in a segment. If this tensor's allocation_info is null then the + // tensor data location is specified by program.constant_segment. If the + // allocation_info is non_null then the data is somewhere in + // program.mutable_data_segments. If tensor_info is Null, then the data is + // in program.mutable_data_segments[0] otherwise if tensor_info is non-null + // then the mutable_data_segment index is specified by + // tensor_info.mutable_data_segments_index. + data_buffer_idx: uint; + + // [Optional] preallocation details for non-constants (null otherwise). + allocation_info: AllocationDetails; + + // May not be needed. + layout: byte; + + // Determines the type of the tensor's shape, from the point of view of its + // dynamic or not behavior, and consequently how the allocation of the + // underlying memory is handled, and also how to interpret the sizes and + // strides fields. + // 1. dynamism == STATIC: sizes field represents the static shape of + // the tensor. + // 2. dynamism == DYNAMIC_BOUND: sizes field represents the upper bound shape + // of the tensor. Each dimension of the tensor at runtime should never + // exceed the corresponding dimension of the upper bound shape. + // + // 3. dynamism == DYNAMIC_UNBOUND: the stored sizes field can be ignored since + // shape is fully dynamic. + shape_dynamism: TensorShapeDynamism; + + // [Optional] Additional information about the Tensor that is not applicable + // to most tensors. + extra_tensor_info: ExtraTensorInfo; +} + +table Int { + int_val: long; +} + +table Bool { + bool_val: bool; +} + +table Double { + double_val: double; +} + +table String { + string_val: string; +} + +table IntList { + items: [long]; +} + +table DoubleList { + items: [double]; +} + +table BoolList { + items: [bool]; +} + +// Unlike primitive lists, tensor lists have mutable members and aliasing behavior when +// elements are added to them. To match this aliasing behavior, the runtime tensor list is +// serialized by serializing its elements into the ExecutionPlan.values array, and then +// serializing their corresponding indices into TensorList.items. +table TensorList { + items: [int]; // EValue indices. +} + +// Similar to TensorList except the indices can also point to None. +table OptionalTensorList { + items: [int]; +} + +// Supported values in Executorch kernels, Enums are serialized as ints. +union KernelTypes { + Null, + Int, + Bool, + Double, + Tensor, + String, + IntList, + DoubleList, + BoolList, + TensorList, + OptionalTensorList, +} + +// Abstraction for program values. A subset of types supported in core pytorch kernels. +table EValue { + val: KernelTypes; +} + +table Operator { + // Operator registry and lookup is uniquely identified by its name, and overload name. + // TODO(larryliu): is there a more efficient way to represent this + name: string; + overload: string; +} + +table KernelCall { + // Index to the operators table in the program. + op_index: int; + + // Indexes to the (values) required by the operation (in and out). + args: [int]; +} + +table DelegateCall { + // Index to the delegates table in the program. + delegate_index: int; + + // Indexes to the (values) required by the delegates (in and out). + args: [int]; +} + +table MoveCall { + // Index into the values table of the evalue we are moving from + move_from: int; + + // Index into the values table of the evalue we are moving into + move_to: int; +} + +table JumpFalseCall { + // Index into the values table of boolean that specifies whether or not to jump + cond_value_index: int; + + // Value to set the executor program counter if the jump occurs + destination_instruction: int; +} + +table FreeCall { + // Index into values table of the tensor whose underlying data blob is being freed + value_index: int; +} + +union InstructionArguments { + KernelCall, + DelegateCall, + MoveCall, + JumpFalseCall, + FreeCall, +} + +// Basic unit of execution +table Instruction { + instr_args: InstructionArguments; +} + +table Frame { + // For storing the frame to print stacktraces + filename: string; // Name of the file in which the instruction exists + lineno: int; // Line number at which the instruction was called + name: string; // Name of the function the instruction was called from + context: string; // Source code of the instruction +} + +table FrameList { + // For storing the frames to print stacktraces + items: [Frame]; +} + +// Indicates where a piece of data is stored. +enum DataLocation : byte { + // Stored directly in the flatbuffer. + INLINE = 0, + // Stored in a segment. + SEGMENT = 1, +} + +// Indicates where the delegate data is stored +table BackendDelegateDataReference { + // Indicates which list to index into: + // INLINE -> Program.backend_delegate_data + // SEGMENT -> Program.segments + location: DataLocation; + + // The index into the list indicated by the location. + index: uint; +} + +table CompileSpec { + // One compile spec. There are can be multiple specs for one method + key: string; // like max_value + value: [ubyte]; // like 4, or other types based on needs. +} + +table BackendDelegate { + // Used to resolve the delegate backend classes, for example, "TCE0", "TCE1", etc. + // This string is also used in to_backend. + id: string; + + // A binary blob (from a subgraph) as an output of preprocessing. Will be + // provided to the backend code at init time. Can be very large, on the + // order of 10-100MB. + processed: BackendDelegateDataReference; + + // The compilation spec for the lowered module's forward function + // Example: [CompileSpec["max_value", 4]] + compile_specs: [CompileSpec]; +} + +// A sequence of blocking instructions to be executed in order. The +// abstraction is not currently leveraged, all current programs are 1 chain. +// We are leaving chains as part of the program definition for future use cases +// around graph level async where different threads will be represented as +// seperate chains. +table Chain { + // Indices of the values that are (non-static) inputs into this Chain. + inputs: [int]; + + // Indices of the values that are outputs out of this Chain. + outputs: [int]; + + // List of instructions to be executed in order. + instructions: [Instruction]; + + // Optional list of frames for each instruction. + // The backend config must have 'emit_stacktrace' set to true to emit + stacktrace: [FrameList]; +} + +table ExecutionPlan { + + // Name of a method on the nn.Module that was traced to create this program. + name: string; + + // Type meta data for input/output to the execution plan + container_meta_type: ContainerMetadata; + + // A list of all values used in this execution plan. + values: [EValue]; + + // Indices to the 'Evalues' that are inputs to this execution plan. + // This list contains only the non-constant tensors (i.e. not part of + // the saved program). + inputs: [int]; + + // Indices to the 'Evalues' that are outputs of this execution plan. + // This signals a lifespan that goes beyond the execution. + outputs: [int]; + + // List of Chains of kernels. + chains: [Chain]; + + // Operators used in this execution plan + operators: [Operator]; + + // A list of delegates and each is a special instance of execution, the same level of chains. + delegates: [BackendDelegate]; + + // List of buffer sizes for non_constant memory allocations. (Think neural net activations) + // A list instead of a single buffer to account for complex memory hierarchies. + // TODO(jakeszwe, razy): How to reconcile this with the ability for the hierarchical memory allocator + // to be id based instead of index based. + // Runtime should use the len(constant_buffer) as the ground truth of the + // constants memory buffer size, and ignore non_const_buffer_sizes[0]. + non_const_buffer_sizes: [int64]; + +} + +// Constant tensor data stored directly in the flatbuffer. +table Buffer { + // During serialization, this alignment may be rewritten to a larger value. + // The magic "@executorch-tensor-alignment" comment tells EXIR which lines to + // patch. + storage: [ubyte] (force_align: 16); // @executorch-tensor-alignment +} + +// Delegate data stored directly in the flatbuffer. This is a different type +// than Buffer because tensors and delegates can have different alignment +// requirements. +table BackendDelegateInlineData { + // During serialization, this alignment may be rewritten to a larger value. + // The magic "@executorch-delegate-alignment" comment tells EXIR which lines + // to patch. + data: [ubyte] (force_align: 16); // @executorch-delegate-alignment +} + +// Describes a contiguous piece of data that lives outside of the flatbuffer data, +// typically appended afterwards in the file. The "extended header" in the file, +// when present, points to the segment base offset. +table DataSegment { + // Segment offsets are relative to the segment base offset provided in + // the extended file header. Segments will typically be aligned in a + // way to make it possible to use mmap() to load them. + offset: uint64; + + // The size in bytes of valid data starting at the offset. The segment + // data may be followed by padding before the segment that follows it, + // to make it easier to use mmap(). + size: uint64; +} + +// Describes data offsets into a particular segment +table SubsegmentOffsets { + // Index of the segment in Program.segments + segment_index: uint; + + // Each element is an offset in bytes into the data of the segment pointed to + // by segment_index. Offsets must be aligned to @executorch-tensor-alignment. + offsets: [uint64]; +} + +// Attributes a name to data referenced by Program.segments. Used when data is +// referenced by multiple users, in cases where indices are not guaranteed to +// be consistent across the users. +table NamedData { + // The unique id of the data blob. + key: string; + + // Index of the segment in Program.segments. + segment_index: uint32; +} + +table Program { + // Schema version. + version: uint; + + // List of ExecutionPlans that make up the program. Each ExecutionPlan corresponds with a + // different entry point into the model. + execution_plan: [ExecutionPlan]; + + // Tables of constant data, used for constant Values (e.g.data field of weight tensors). + // Each constant is assigned an index into the table which are each individually aligned. + // 0 index is reserved to be pointed to by non-constant Tensors. + // If this field is non-empty, constant_segment.offsets must be empty. + // DEPRECATED: After D61996249 on 2024-09-05, no new PTE files will use this field. + constant_buffer: [Buffer]; + + // List of delegate data. Pointed to by BackendDelegateDataReference. + backend_delegate_data: [BackendDelegateInlineData]; + + // List of data segments that follow the Program data in this file, sorted by + // offset. Elements in this schema can refer to these segments by index. + segments: [DataSegment]; + + // Describes the offsets of each constant tensor, relative to the segment + // offset. If constant_segment.offsets field is non-empty, constant_buffer + // must be empty. constant_segment.offsets[0] is reserved to be pointed to by + // non-constant Tensors. + constant_segment: SubsegmentOffsets; + + // [Optional] Describes the offsets into various segments for each mutable + // tensor. Only mutable tensors with a meaningful initial state are + // serialized here (for example weights that will be trained on-device as + // opposed to just layer activations). Seperate from the constant_segment to + // reduce peak memory usage by letting us read directly from the PTE file + // into the mutable tensor, as opposed to loading the .pte data into + // constant memory, copying it over, and then being unable to release the + // constant segment. No two elements should point to the same segment. + mutable_data_segments: [SubsegmentOffsets]; + + // [Optional] List of blobs keyed by a unique name. Note that multiple + // 'NamedData' entries could point to the same segment index. Stored in + // segments attached to the PTE file. + named_data: [NamedData]; +} + +root_type Program; diff --git a/exir/_serialize/scalar_type.fbs b/exir/_serialize/scalar_type.fbs new file mode 100644 index 00000000000..e9c830b972c --- /dev/null +++ b/exir/_serialize/scalar_type.fbs @@ -0,0 +1,43 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// +// See README.md before modifying this file. +// + +namespace executorch_flatbuffer; + +// The scalar data type. +// Must match executorch/runtime/core/portable_type/tensor_impl.h +enum ScalarType : byte { + BYTE = 0, + CHAR = 1, + SHORT = 2, + INT = 3, + LONG = 4, + HALF = 5, + FLOAT = 6, + DOUBLE = 7, + BOOL = 11, + QINT8 = 12, + QUINT8 = 13, + QINT32 = 14, + QUINT4X2 = 16, + QUINT2X4 = 17, + BITS16 = 22, + FLOAT8E5M2 = 23, + FLOAT8E4M3FN = 24, + FLOAT8E5M2FNUZ = 25, + FLOAT8E4M3FNUZ = 26, + UINT16 = 27, + UINT32 = 28, + UINT64 = 29, + // Types currently not implemented. + // COMPLEXHALF = 8, + // COMPLEXFLOAT = 9, + // COMPLEXDOUBLE = 10, + // BFLOAT16 = 15, + // BITS1x8 = 18, + // BITS2x4 = 19, + // BITS4x2 = 20, + // BITS8 = 21, +} diff --git a/export/.watchman-cookie-madragna-mac-84335-443 b/export/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/extension/.watchman-cookie-madragna-mac-84335-443 b/extension/.watchman-cookie-madragna-mac-84335-443 new file mode 100755 index 00000000000..e69de29bb2d diff --git a/extension/llm/custom_ops/libcustom_ops_aot_lib.dylib b/extension/llm/custom_ops/libcustom_ops_aot_lib.dylib new file mode 100755 index 00000000000..eadf818636c Binary files /dev/null and b/extension/llm/custom_ops/libcustom_ops_aot_lib.dylib differ diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index 57eb76d71d6..9ceef562d5c 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit 57eb76d71d6dde5396520c7d35142eb868994e06 +Subproject commit 9ceef562d5c941eb6aea5476c768d0419962bc0c diff --git a/extension/pybindings/_portable_lib.cpython-312-darwin.so b/extension/pybindings/_portable_lib.cpython-312-darwin.so new file mode 100755 index 00000000000..78a5e8dcba6 Binary files /dev/null and b/extension/pybindings/_portable_lib.cpython-312-darwin.so differ diff --git a/include/executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h b/include/executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h new file mode 100644 index 00000000000..16b71594eb3 --- /dev/null +++ b/include/executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//===----------------------------------------------------------------------===// +/// \file extension/kernel_util/make_boxed_from_unboxed_functor.h +/// Defines a template that can be used to create a boxed version of an unboxed +/// functor. +/// Example usage: +/// ``` +/// Tensor& +/// my_op(KernelRuntimeContext& ctx, const Tensor& self, const Tensor& other, +/// Tensor& out) +/// { +/// // ... +/// return out; +/// } +/// +/// Kernel my_kernel = Kernel::make_boxed_kernel("my_ns::my_op", +/// EXECUTORCH_FN(my_op)); +/// static auto res = register_kernels({my_kernel}); +/// ``` +/// Or simply: +/// ``` +/// EXECUTORCH_LIBRARY(my_ns, "my_op", my_op); +/// ``` +/// +/// The trick here is to convert each EValue to inferred argument type. This +/// uses a lot of C++17 features. +//===----------------------------------------------------------------------===// + +#pragma once +#if __cplusplus < 201703L +#error "This header requires C++17" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace runtime { +class KernelRuntimeContext; // Forward declaration +} // namespace runtime +} // namespace executorch + +namespace executorch { +namespace extension { + +// This extension has a lot of generic internal names like "size"; use a unique +// internal namespace to avoid conflicts with other extensions. +namespace kernel_util_internal { + +template +struct decay_if_not_tensor final { + using type = std::decay_t; +}; +template <> +struct decay_if_not_tensor final { + using type = executorch::aten::Tensor&; +}; +template <> +struct decay_if_not_tensor final { + using type = const executorch::aten::Tensor&; +}; + +template +struct evalue_to_arg final { + static T call(executorch::runtime::EValue& v) { + return std::move(v).to(); + } +}; + +template <> +struct evalue_to_arg final { + static executorch::aten::Tensor& call(executorch::runtime::EValue& v) { + return v.toTensor(); + } +}; + +template <> +struct evalue_to_arg final { + static const executorch::aten::Tensor& call(executorch::runtime::EValue& v) { + return v.toTensor(); + } +}; + +template +struct evalue_to_arg> final { + static executorch::aten::optional call(executorch::runtime::EValue& v) { + return v.toOptional(); + } +}; + +template +struct evalue_to_arg>> + final { + static executorch::aten::ArrayRef> call( + executorch::runtime::EValue& v) { + return v.toListOptionalTensor(); + } +}; + +template +void call_functor_with_args_from_stack( + ::executorch::runtime::KernelRuntimeContext& ctx, + executorch::runtime::EValue** stack, + std::index_sequence, + typelist*) { + (*Functor::func_ptr())( + ctx, + evalue_to_arg::type>::call( + *stack[evalue_arg_indices])...); +} + +} // namespace kernel_util_internal + +/** + * WrapUnboxedIntoFunctor: Given a function pointer, wrap it into a functor that + * takes EValues as input and returns void. The wrapped functor will unbox all + * inputs and forward them to unboxed kernel. + */ +template +struct WrapUnboxedIntoFunctor { + static_assert( + kernel_util_internal::is_compile_time_function_pointer::value, + "Can't handle function other than EXECUTORCH_FN"); + using TrueType = typename FuncType::FuncType; + using ReturnType = typename kernel_util_internal::infer_function_traits_t< + TrueType>::return_type; + using ArgsType = typename kernel_util_internal::infer_function_traits_t< + TrueType>::parameter_types; + // check if the first argument is KernelRuntimeContext, if so, remove it + static constexpr bool first_arg_is_context = std::is_same< + ::executorch::runtime::KernelRuntimeContext, + std::remove_reference_t< + kernel_util_internal::head_with_default_t>>::value; + using ContextRemovedArgsType = std::conditional_t< + first_arg_is_context, + kernel_util_internal::drop_if_nonempty_t, + ArgsType>; + + static void call( + ::executorch::runtime::KernelRuntimeContext& ctx, + executorch::runtime::EValue** stack) { + constexpr size_t num_inputs = + kernel_util_internal::size::value; + return kernel_util_internal::call_functor_with_args_from_stack( + ctx, + stack, + std::make_index_sequence(), + static_cast(nullptr)); + } +}; + +template +static executorch::runtime::Kernel make_boxed_kernel( + const char* name, + FuncType) { + return executorch::runtime::Kernel( + name, WrapUnboxedIntoFunctor::call); +} + +} // namespace extension +} // namespace executorch + +// Inspired from C10_CONCATENATE +#define ET_CONCATENATE_IMPL(s1, s2) s1##s2 +#define ET_CONCATENATE(s1, s2) ET_CONCATENATE_IMPL(s1, s2) +#define ET_UID __LINE__ + +#define EXECUTORCH_LIBRARY(ns, op_name, func) \ + _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, ET_UID) + +#define _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, uid) \ + static auto ET_CONCATENATE(res_##ns##_, uid) = \ + ::executorch::runtime::register_kernel( \ + ::executorch::extension::make_boxed_kernel( \ + #ns "::" op_name, EXECUTORCH_FN(func))) + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::make_boxed_kernel; +using ::executorch::extension::WrapUnboxedIntoFunctor; +} // namespace executor +} // namespace torch diff --git a/include/executorch/extension/kernel_util/meta_programming.h b/include/executorch/extension/kernel_util/meta_programming.h new file mode 100644 index 00000000000..027568fe687 --- /dev/null +++ b/include/executorch/extension/kernel_util/meta_programming.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#if __cplusplus < 201703L +#error "This header requires C++17" +#endif + +#include +#include +#include +#include +#include + +namespace executorch { +namespace extension { +// This extension has a lot of generic internal names like "size"; use a unique +// internal namespace to avoid conflicts with other extensions. +namespace kernel_util_internal { + +// Check if a given type is a function +template +struct is_function_type : std::false_type {}; +template +struct is_function_type : std::true_type {}; +template +using is_function_type_t = typename is_function_type::type; + +// A compile-time wrapper around a function pointer +template +struct CompileTimeFunctionPointer final { + static_assert( + is_function_type::value, + "EXECUTORCH_FN can only wrap function types."); + using FuncType = FuncType_; + + static constexpr FuncType* func_ptr() { + return func_ptr_; + } +}; + +// Check if a given type is a compile-time function pointer +template +struct is_compile_time_function_pointer : std::false_type {}; +template +struct is_compile_time_function_pointer< + CompileTimeFunctionPointer> : std::true_type {}; + +#define EXECUTORCH_FN_TYPE(func) \ + ::executorch::extension::kernel_util_internal::CompileTimeFunctionPointer< \ + std::remove_pointer_t>, \ + func> +#define EXECUTORCH_FN(func) EXECUTORCH_FN_TYPE(func)() + +/** + * strip_class: helper to remove the class type from pointers to `operator()`. + */ +template +struct strip_class {}; +template +struct strip_class { + using type = Result(Args...); +}; +template +struct strip_class { + using type = Result(Args...); +}; +template +using strip_class_t = typename strip_class::type; + +/** + * Access information about result type or arguments from a function type. + * Example: + * using A = function_traits::return_type // A == int + * using A = function_traits::parameter_types::tuple_type + * // A == tuple + */ +template +struct function_traits { + static_assert( + !std::is_same::value, + "In function_traits, Func must be a plain function type."); +}; +template +struct function_traits { + using func_type = Result(Args...); + using return_type = Result; + using parameter_types = typelist; + static constexpr auto number_of_parameters = sizeof...(Args); +}; + +/** + * infer_function_traits: creates a `function_traits` type for a simple + * function (pointer) or functor (lambda/struct). Currently does not support + * class methods. + */ +template +struct infer_function_traits { + using type = function_traits>; +}; +template +struct infer_function_traits { + using type = function_traits; +}; +template +struct infer_function_traits { + using type = function_traits; +}; +template +using infer_function_traits_t = typename infer_function_traits::type; + +} // namespace kernel_util_internal +} // namespace extension +} // namespace executorch diff --git a/include/executorch/extension/kernel_util/type_list.h b/include/executorch/extension/kernel_util/type_list.h new file mode 100644 index 00000000000..300cbfcb7cb --- /dev/null +++ b/include/executorch/extension/kernel_util/type_list.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/// +/// \file runtime/kernel/type_list.h +/// Forked from pytorch/c10/util/TypeList.h +/// \brief Utilities for working with type lists. +#pragma once +#if __cplusplus < 201703L +#error "This header requires C++17" +#endif + +#include +#include +#include +#include + +namespace executorch { +namespace extension { +// This extension has a lot of generic internal names like "size"; use a unique +// internal namespace to avoid conflicts with other extensions. +namespace kernel_util_internal { + +/** + * Type holding a list of types for compile time type computations + * constexpr size_t num = size>::value; + * static_assert(num == 2, ""); + */ +template +struct false_t : std::false_type {}; + +template +struct typelist final { + public: + typelist() = delete; // not for instantiation +}; +template +struct size final { + static_assert( + false_t::value, + "In typelist::size, T must be typelist<...>."); +}; +template +struct size> final { + static constexpr size_t value = sizeof...(Types); +}; + +/** + * is_instantiation_of is true_type iff I is a template instantiation of T + * (e.g. vector is an instantiation of vector) Example: + * is_instantiation_of_t> // true + * is_instantiation_of_t> // true + * is_instantiation_of_t> // false + */ +template