diff --git a/.Package.swift/.watchman-cookie-madragna-mac-84335-443 b/.Package.swift/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/.ci/.watchman-cookie-madragna-mac-84335-443 b/.ci/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/.github/.watchman-cookie-madragna-mac-84335-443 b/.github/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000000..e08db314050
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,26 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Debug CMake project",
+            "type": "lldb", // https://github.com/vadimcn/vscode-lldb
+            "request": "launch",
+            "program": "${command:cmake.launchTargetPath}",
+            "args": [
+                "--model_path=./add.pte",
+            ]
+        },
+        {
+            "name": "Debug python proj",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "unittest",
+            "args": [
+                "./backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py"
+            ]
+        },
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000000..4139c8edeba
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,83 @@
+{
+    "files.associations": {
+        "cstdlib": "cpp",
+        "__bit_reference": "cpp",
+        "__hash_table": "cpp",
+        "__locale": "cpp",
+        "__node_handle": "cpp",
+        "__split_buffer": "cpp",
+        "__tree": "cpp",
+        "__verbose_abort": "cpp",
+        "array": "cpp",
+        "bitset": "cpp",
+        "cctype": "cpp",
+        "charconv": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "complex": "cpp",
+        "condition_variable": "cpp",
+        "cstdarg": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "execution": "cpp",
+        "memory": "cpp",
+        "forward_list": "cpp",
+        "future": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "ios": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "list": "cpp",
+        "locale": "cpp",
+        "map": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "optional": "cpp",
+        "print": "cpp",
+        "queue": "cpp",
+        "ratio": "cpp",
+        "regex": "cpp",
+        "set": "cpp",
+        "shared_mutex": "cpp",
+        "sstream": "cpp",
+        "stack": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "string": "cpp",
+        "string_view": "cpp",
+        "typeindex": "cpp",
+        "typeinfo": "cpp",
+        "unordered_map": "cpp",
+        "unordered_set": "cpp",
+        "variant": "cpp",
+        "vector": "cpp",
+        "algorithm": "cpp",
+        "iterator": "cpp",
+        "tuple": "cpp",
+        "span": "cpp",
+        "*.inc": "cpp",
+        "alignedvector3": "cpp"
+    },
+    "C_Cpp.default.compilerPath": "/library/developer/commandlinetools/usr/bin/c++",
+    "python.analysis.typeCheckingMode": "off",
+    "python.testing.unittestArgs": [
+        "-v",
+        "-s",
+        "./backends",
+        "-p",
+        "test_*.py"
+    ],
+    "python.testing.pytestEnabled": true,
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestArgs": [
+        "."
+    ]
+}
diff --git a/backends/.watchman-cookie-madragna-mac-84335-443 b/backends/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/backends/apple/coreml/executorchcoreml.cpython-312-darwin.so b/backends/apple/coreml/executorchcoreml.cpython-312-darwin.so
new file mode 100755
index 00000000000..dd956d400d3
Binary files /dev/null and b/backends/apple/coreml/executorchcoreml.cpython-312-darwin.so differ
diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index 768df1f4f04..8f4ee4a30f5 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -4,6 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import string
+from logging import FATAL
+from tokenize import String
 from typing import Optional, Tuple
 
 import torch
@@ -56,9 +59,9 @@ class ChannelsLastTaggedReshapePass(XNNPACKPass):
 
     # Set of ops that require memory format to be NCHW
     memory_sensitive_ops_nchw = {
-        "output",
         exir_ops.edge.aten.squeeze_copy.dim,
         exir_ops.edge.aten.unsqueeze_copy.default,
+        exir_ops.edge.aten.linear.default,
     }
 
     # Tag which is added to a node's meta to indicate that it uses NHWC format.
@@ -91,10 +94,20 @@ def is_nchw_node(self, node: torch.fx.Node) -> bool:
         return not self.is_nhwc_node(node)
 
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
-        return node.target in self.memory_sensitive_ops_nhwc
+        return (
+            node.target in self.memory_sensitive_ops_nhwc
+            or node.name == "output"
+            and not node.args[0][0].meta["val"].is_contiguous()
+        )
 
     def requires_nchw_inputs(self, node: torch.fx.Node) -> bool:
-        return node.target in self.memory_sensitive_ops_nchw
+        return (
+            node.target in self.memory_sensitive_ops_nchw
+            or node.name == "output"
+            and node.args[0][0]
+            .meta["val"]
+            .is_contiguous()  # Need to consider output trace so out matches
+        )
 
     def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         # There are two conditions that must be met for a node to be able to
@@ -269,8 +282,17 @@ def input_to_nhwc(
             # serializing graph, but don't do anything else here
             self.mark_as_nhwc_node(input_node)
 
-        if self.is_nhwc_node(input_node):
+        if input_node.op == "placeholder":
+            if not input_node.meta["val"][0].is_contiguous():
+                return
+        elif self.is_nhwc_node(input_node):
             return
+        # if (
+        #     self.is_nhwc_node(input_node)
+        #     or input_node.op == "placeholder"
+        #     and not input_node.meta["val"][0].is_contiguous()
+        # ):
+        #     return
 
         if not self.can_be_converted_to_nhwc(input_node):
             raise AssertionError(
@@ -333,8 +355,21 @@ def input_to_nchw(
             # do anything else here
             self.mark_as_nchw_node(input_node)
 
-        if self.is_nchw_node(input_node):
+        if input_node.op == "placeholder":
+            if input_node.meta["val"][0].is_contiguous():
+                return
+        elif self.is_nchw_node(input_node):
             return
+        # TODO
+        # meta trace happens before passes. At the end of pass, meta gets regenerated. eager mode assumes in/out stay same for conv. Linear has implicit nchw conv
+        # if (
+        #     self.is_nchw_node(
+        #         input_node
+        #     )  # This is triggering as x (placeholder) is tagged as nchw
+        #     or input_node.op == "placeholder"
+        #     and input_node.meta["val"][0].is_contiguous()
+        # ):
+        #     return
 
         if ChannelsLastTaggedReshapePass.PARTNER_NODE in input_node.meta:
             # Already has an associated NCHW node
@@ -371,7 +406,11 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
                 # first input to be nhwc. This makes this node's output nhwc too
                 # Currently, all nodes like this should have all of their other
                 # inputs as nchw, so fail if this is not true
-                self.input_to_nhwc(graph_module, node.args[0], node)
+                if node.name == "output":
+                    self.input_to_nhwc(graph_module, node.args[0][0], node)
+                else:
+                    self.input_to_nhwc(graph_module, node.args[0], node)
+
                 for input_node in node.all_input_nodes[1:]:
                     if self.is_nhwc_node(input_node):
                         raise AssertionError(
diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp
index 68cb4b4d885..c870476d65d 100644
--- a/backends/xnnpack/runtime/XNNExecutor.cpp
+++ b/backends/xnnpack/runtime/XNNExecutor.cpp
@@ -106,11 +106,6 @@ ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
           err == Error::Ok,
           Internal,
           "Failed to retrieve dim order from tensor!");
-      ET_CHECK_OR_RETURN_ERROR(
-          is_contiguous_dim_order(dim_order, tensor->dim()),
-          Internal,
-          "Expecting default dim_order but got a non default dim_order tensor for external input %u",
-          i);
       size_t dims[XNN_MAX_TENSOR_DIMS];
       ET_CHECK_OR_RETURN_ERROR(
           num_dims <= XNN_MAX_TENSOR_DIMS,
@@ -118,8 +113,17 @@ ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
           "XNNPACK backend accepts tensors with at most %d dims, but got %zu",
           XNN_MAX_TENSOR_DIMS,
           num_dims);
-      for (int d = 0; d < num_dims; ++d) {
-        dims[d] = tensor->size(d);
+
+      bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dims);
+      if (is_channels_last) {
+        dims[0] = tensor->size(0);
+        dims[1] = tensor->size(2);
+        dims[2] = tensor->size(3);
+        dims[3] = tensor->size(1);
+      } else {
+        for (int d = 0; d < num_dims; ++d) {
+          dims[d] = tensor->size(d);
+        }
       }
       status =
           xnn_reshape_external_value(runtime_.get(), ext_id, num_dims, dims);
@@ -220,8 +224,24 @@ ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const {
 
     // Convert new output shape into SizesType
     SizesType expected_output_size[kTensorDimensionLimit];
-    for (size_t d = 0; d < num_dim; ++d) {
-      expected_output_size[d] = static_cast<SizesType>(dims[d]);
+    executorch::aten::DimOrderType dim_order[kTensorDimensionLimit];
+    Error errr =
+          ET_RUNTIME_NAMESPACE::get_dim_order(*out_tensor, dim_order, num_dim);
+      ET_CHECK_OR_RETURN_ERROR(
+          errr == Error::Ok,
+          Internal,
+          "Failed to retrieve dim order from tensor!");
+
+    bool is_channels_last = executorch::runtime::is_channels_last_dim_order(dim_order, num_dim);
+    if (is_channels_last) {
+      expected_output_size[0] = static_cast<SizesType>(dims[0]);
+      expected_output_size[1] = static_cast<SizesType>(dims[3]);
+      expected_output_size[2] = static_cast<SizesType>(dims[1]);
+      expected_output_size[3] = static_cast<SizesType>(dims[2]);
+    } else {
+      for (size_t d = 0; d < num_dim; ++d) {
+        expected_output_size[d] = static_cast<SizesType>(dims[d]);
+      }
     }
 
     executorch::aten::ArrayRef<SizesType> output_size{
diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py
index a00209f4ea6..b475a031f80 100644
--- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py
+++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py
@@ -7,6 +7,7 @@
 import unittest
 
 import torch
+from backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass
 from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
     ChannelsLastTaggedReshapePass,
 )
@@ -43,41 +44,123 @@ def setUp(self):
     )
     dynamic_quant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_tensor"
 
-    def test_fp32_channels_last_tagged_reshape_pass(self):
-        for module, num_reshape in self.modules.items():
-            (
-                Tester(module, (torch.randn(1, 1, 6, 6),))
-                .export()
-                .to_edge()
-                .run_passes(self.PassStage)
-                .check_count(
-                    {
-                        self.to_copy_name: num_reshape,
-                    }
-                )
-                .run_method_and_compare_outputs()
+    # def test_fp32_channels_last_tagged_reshape_pass(self):
+    #     for module, num_reshape in self.modules.items():
+    #         (
+    #             Tester(module, (torch.randn(1, 1, 6, 6),))
+    #             .export()
+    #             .to_edge()
+    #             .run_passes(self.PassStage)
+    #             .check_count(
+    #                 {
+    #                     self.to_copy_name: num_reshape,
+    #                 }
+    #             )
+    #             .run_method_and_compare_outputs()
+    #         )
+
+    class LinearConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 3)
+
+        def forward(self, x):
+            y = self.linear1(x)
+            return self.conv1(y)
+
+    def test_conv_linear_dim_order_swaps_on_nhwc_input(self):
+        tester = Tester(
+            self.LinearConv().eval(),
+            (torch.randn(1, 3, 6, 4).to(memory_format=torch.channels_last),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_conv_linear_dim_order_swaps_on_nchw_input(self):
+        tester = Tester(
+            self.LinearConv().eval(),
+            (torch.randn(1, 3, 6, 4),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    class ConvLinearConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 4)
+
+        def forward(self, x):
+            y = self.conv1(x)
+            return self.linear1(y)
+
+    def test_linear_conv_dim_order_swaps_on_nhwc_input(self):
+        tester = Tester(
+            self.ConvLinearConv().eval(),
+            (torch.randn(1, 3, 6, 6).to(memory_format=torch.channels_last),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_linear_conv_dim_order_swaps_on_nchw_input(self):
+        tester = Tester(
+            self.ConvLinearConv().eval(),
+            (torch.randn(1, 3, 6, 6),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    class Bilinear(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x):
+            return torch.nn.functional.interpolate(
+                x, scale_factor=2, mode="bilinear", align_corners=True
             )
 
-    def test_qs8_channels_last_tagged_reshape_pass(self):
-        for module, num_reshape in self.modules.items():
+    def test_nhwc_input_on_nhwc_op(self):
+        tester = Tester(
+            self.Bilinear().eval(),
             (
-                Tester(module, (torch.randn(1, 1, 6, 6),))
-                .quantize()
-                .export()
-                .to_edge()
-                .run_passes(self.PassStage)
-                .check(
-                    [
-                        self.quant_name,
-                        self.dequant_name,
-                        self.to_copy_name,
-                        self.quant_name,
-                        self.dequant_name,
-                    ]
-                    * num_reshape
-                )
-                .run_method_and_compare_outputs()
-            )
+                torch.arange(8)
+                .reshape(1, 2, 2, 2)
+                .to(torch.float32)
+                .to(memory_format=torch.channels_last),
+            ),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    def test_nchw_input_on_nhwc_op(self):
+        tester = Tester(
+            self.Bilinear().eval(),
+            (torch.arange(8).reshape(1, 2, 2, 2).to(torch.float32),),
+        )
+
+        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+
+    # def test_qs8_channels_last_tagged_reshape_pass(self):
+    #     for module, num_reshape in self.modules.items():
+    #         (
+    #             Tester(module, (torch.randn(1, 1, 6, 6),))
+    #             .quantize()
+    #             .export()
+    #             .to_edge()
+    #             .run_passes(self.PassStage)
+    #             .check(
+    #                 [
+    #                     self.quant_name,
+    #                     self.dequant_name,
+    #                     self.to_copy_name,
+    #                     self.quant_name,
+    #                     self.dequant_name,
+    #                 ]
+    #                 * num_reshape
+    #             )
+    #             .run_method_and_compare_outputs()
+    #         )
 
     class ConvRelu(torch.nn.Module):
         def __init__(self):
@@ -88,39 +171,39 @@ def __init__(self):
         def forward(self, x):
             return self.relu(self.conv(x))
 
-    def test_fp32_channels_last_tagged_reshape_pass_conv_relu(self):
-        (
-            Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),))
-            .export()
-            .to_edge()
-            .run_passes(self.PassStage)
-            .check(
-                [self.to_copy_name, self.conv_name, self.relu_name, self.to_copy_name]
-            )
-            .run_method_and_compare_outputs()
-        )
+    # def test_fp32_channels_last_tagged_reshape_pass_conv_relu(self):
+    #     (
+    #         Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),))
+    #         .export()
+    #         .to_edge()
+    #         .run_passes(self.PassStage)
+    #         .check(
+    #             [self.to_copy_name, self.conv_name, self.relu_name, self.to_copy_name]
+    #         )
+    #         .run_method_and_compare_outputs()
+    #     )
 
-    def test_qs8_channels_last_tagged_reshape_pass_conv_relu(self):
-        (
-            Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),))
-            .quantize()
-            .export()
-            .to_edge()
-            .run_passes(self.PassStage)
-            .check(
-                [
-                    self.to_copy_name,
-                    self.quant_name,
-                    self.dequant_name,
-                    self.conv_name,
-                    self.relu_name,
-                    self.quant_name,
-                    self.dequant_name,
-                    self.to_copy_name,
-                ]
-            )
-            .run_method_and_compare_outputs()
-        )
+    # def test_qs8_channels_last_tagged_reshape_pass_conv_relu(self):
+    #     (
+    #         Tester(self.ConvRelu().eval(), (torch.randn(1, 1, 6, 6),))
+    #         .quantize()
+    #         .export()
+    #         .to_edge()
+    #         .run_passes(self.PassStage)
+    #         .check(
+    #             [
+    #                 self.to_copy_name,
+    #                 self.quant_name,
+    #                 self.dequant_name,
+    #                 self.conv_name,
+    #                 self.relu_name,
+    #                 self.quant_name,
+    #                 self.dequant_name,
+    #                 self.to_copy_name,
+    #             ]
+    #         )
+    #         .run_method_and_compare_outputs()
+    #     )
 
     class Conv2dBnHardtanhMeanSequenceModule(torch.nn.Module):
         def __init__(self):
@@ -146,46 +229,46 @@ def forward(self, x):
             x = torch.mean(x, (-1, -2), keepdim=True)
             return x
 
-    def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self):
-        # Copy #1 is for input to conv, nchw -> nhwc
-        # Copy #2 is for conv to _native_batch_norm_legit_no_training, nhwc -> nchw
-        # Copy #3 is for input to mean, nchw -> nhwc
-        # Copy #4 is for output, nhwc -> nchw
-
-        # The graph looks like:
-        # graph():
-        #     %arg0_1 : [#users=1] = placeholder[target=arg0_1]
-        #     %aten__to_copy_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%arg0_1,), kwargs = {memory_format: torch.channels_last})
-        #     %_param_constant0 : [#users=1] = get_attr[target=_param_constant0]
-        #     %_param_constant1 : [#users=1] = get_attr[target=_param_constant1]
-        #     %aten_convolution_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten__to_copy_default, %_param_constant0, %_param_constant1, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
-        #     %aten__to_copy_default_1 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_convolution_default,), kwargs = {memory_format: torch.contiguous_format})
-        #     %_param_constant2 : [#users=1] = get_attr[target=_param_constant2]
-        #     %_param_constant3 : [#users=1] = get_attr[target=_param_constant3]
-        #     %_tensor_constant0 : [#users=1] = get_attr[target=_tensor_constant0]
-        #     %_tensor_constant1 : [#users=1] = get_attr[target=_tensor_constant1]
-        #     %aten__native_batch_norm_legit_no_training_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten__to_copy_default_1, %_param_constant2, %_param_constant3, %_tensor_constant0, %_tensor_constant1, 0.1, 1e-05), kwargs = {})
-        #     %getitem : [#users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {})
-        #     %aten_hardtanh_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0, 6), kwargs = {})
-        #     %aten__to_copy_default_2 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_hardtanh_default,), kwargs = {memory_format: torch.channels_last})
-        #     %aten_mean_dim : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten__to_copy_default_2, [-1, -2], True), kwargs = {})
-        #     %aten__to_copy_default_3 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_mean_dim,), kwargs = {memory_format: torch.contiguous_format})
-        #     return [aten__to_copy_default_3]
-        (
-            Tester(
-                self.Conv2dBnHardtanhMeanSequenceModule().eval(),
-                (torch.randn(1, 1, 6, 6),),
-            )
-            .export()
-            .to_edge()
-            .run_passes(self.PassStage)
-            .check_count(
-                {
-                    self.to_copy_name: 4,
-                }
-            )
-            .run_method_and_compare_outputs()
-        )
+    # def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self):
+    # Copy #1 is for input to conv, nchw -> nhwc
+    # Copy #2 is for conv to _native_batch_norm_legit_no_training, nhwc -> nchw
+    # Copy #3 is for input to mean, nchw -> nhwc
+    # Copy #4 is for output, nhwc -> nchw
+
+    # The graph looks like:
+    # graph():
+    #     %arg0_1 : [#users=1] = placeholder[target=arg0_1]
+    #     %aten__to_copy_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%arg0_1,), kwargs = {memory_format: torch.channels_last})
+    #     %_param_constant0 : [#users=1] = get_attr[target=_param_constant0]
+    #     %_param_constant1 : [#users=1] = get_attr[target=_param_constant1]
+    #     %aten_convolution_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten__to_copy_default, %_param_constant0, %_param_constant1, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
+    #     %aten__to_copy_default_1 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_convolution_default,), kwargs = {memory_format: torch.contiguous_format})
+    #     %_param_constant2 : [#users=1] = get_attr[target=_param_constant2]
+    #     %_param_constant3 : [#users=1] = get_attr[target=_param_constant3]
+    #     %_tensor_constant0 : [#users=1] = get_attr[target=_tensor_constant0]
+    #     %_tensor_constant1 : [#users=1] = get_attr[target=_tensor_constant1]
+    #     %aten__native_batch_norm_legit_no_training_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten__to_copy_default_1, %_param_constant2, %_param_constant3, %_tensor_constant0, %_tensor_constant1, 0.1, 1e-05), kwargs = {})
+    #     %getitem : [#users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {})
+    #     %aten_hardtanh_default : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0, 6), kwargs = {})
+    #     %aten__to_copy_default_2 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_hardtanh_default,), kwargs = {memory_format: torch.channels_last})
+    #     %aten_mean_dim : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten__to_copy_default_2, [-1, -2], True), kwargs = {})
+    #     %aten__to_copy_default_3 : [#users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._to_copy.default](args = (%aten_mean_dim,), kwargs = {memory_format: torch.contiguous_format})
+    #     return [aten__to_copy_default_3]
+    # (
+    #     Tester(
+    #         self.Conv2dBnHardtanhMeanSequenceModule().eval(),
+    #         (torch.randn(1, 1, 6, 6),),
+    #     )
+    #     .export()
+    #     .to_edge()
+    #     .run_passes(self.PassStage)
+    #     .check_count(
+    #         {
+    #             self.to_copy_name: 4,
+    #         }
+    #     )
+    #     .run_method_and_compare_outputs()
+    # )
 
     class Conv2dDynamicQuant(torch.nn.Module):
         def __init__(self):
@@ -195,28 +278,28 @@ def __init__(self):
         def forward(self, x):
             return self.conv(x)
 
-    def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None:
-        (
-            Tester(self.Conv2dDynamicQuant().eval(), (torch.randn(1, 3, 8, 8),))
-            .quantize(
-                Quantize(
-                    quantization_config=get_symmetric_quantization_config(
-                        is_dynamic=True
-                    )
-                )
-            )
-            .export()
-            .to_edge()
-            .run_passes(self.PassStage)
-            .check(
-                [
-                    self.to_copy_name,
-                    self.choose_qparams_name,
-                    self.dynamic_quant_name,
-                    self.dequant_name,
-                    self.conv_name,
-                    self.to_copy_name,
-                ]
-            )
-            .run_method_and_compare_outputs()
-        )
+    # def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None:
+    #     (
+    #         Tester(self.Conv2dDynamicQuant().eval(), (torch.randn(1, 3, 8, 8),))
+    #         .quantize(
+    #             Quantize(
+    #                 quantization_config=get_symmetric_quantization_config(
+    #                     is_dynamic=True
+    #                 )
+    #             )
+    #         )
+    #         .export()
+    #         .to_edge()
+    #         .run_passes(self.PassStage)
+    #         .check(
+    #             [
+    #                 self.to_copy_name,
+    #                 self.choose_qparams_name,
+    #                 self.dynamic_quant_name,
+    #                 self.dequant_name,
+    #                 self.conv_name,
+    #                 self.to_copy_name,
+    #             ]
+    #         )
+    #         .run_method_and_compare_outputs()
+    #     )
diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py
index fa8edd3e03c..0ed9434d807 100644
--- a/backends/xnnpack/test/tester/tester.py
+++ b/backends/xnnpack/test/tester/tester.py
@@ -31,6 +31,7 @@
 )
 from executorch.exir.backend.backend_api import validation_disabled
 from executorch.exir.backend.partitioner import Partitioner
+from executorch.exir.dim_order_utils import get_memory_format
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 
 from executorch.exir.print_program import pretty_print, print_program
@@ -533,10 +534,13 @@ def fn(x):
         # create random tensor inputs with the shapes given above:
         random_inputs = []
         for arg_idx in range(len(self.example_inputs)):
+            memFormat = get_memory_format(
+                list(self.example_inputs[arg_idx].dim_order())
+            )
             random_inputs.append(
-                torch.randn(input_shapes[arg_idx]).to(
-                    dtype=self.example_inputs[arg_idx].dtype
-                )
+                torch.randn(input_shapes[arg_idx])
+                .to(dtype=self.example_inputs[arg_idx].dtype)
+                .to(memory_format=memFormat)
             )
 
         yield tuple(random_inputs)
diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py
index 84cdfd69a48..d8892b179cf 100644
--- a/backends/xnnpack/xnnpack_preprocess.py
+++ b/backends/xnnpack/xnnpack_preprocess.py
@@ -145,9 +145,6 @@ def preprocess(
 
         node_to_external_map = generate_node_to_external_map(ep, graph_module)
 
-        # Make sure all inputs are contiguous_format or NCHW or default dim order
-        assert_default_dim_order(graph_module)
-
         # TODO retrace the graph module to lift the new params may have
         # been added to the graph in passes
 
diff --git a/cmake_wrapper.sh b/cmake_wrapper.sh
new file mode 100755
index 00000000000..ec904b42cf1
--- /dev/null
+++ b/cmake_wrapper.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+source /Users/madragna/executorch/.venv/bin/activate
+cmake "$@"
diff --git a/codegen/.watchman-cookie-madragna-mac-84335-443 b/codegen/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/configurations/.watchman-cookie-madragna-mac-84335-443 b/configurations/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/data/bin/__init__.py b/data/bin/__init__.py
new file mode 100644
index 00000000000..0c9d60e0498
--- /dev/null
+++ b/data/bin/__init__.py
@@ -0,0 +1,44 @@
+# This file should be written to the wheel package as
+# `executorch/data/bin/__init__.py`.
+#
+# Setuptools will expect to be able to say something like `from
+# executorch.data.bin import mybin; mybin()` for each entry listed in the
+# [project.scripts] section of pyproject.toml. This file makes the `mybin()`
+# function execute the binary at `executorch/data/bin/mybin` and exit with that
+# binary's exit status.
+
+import subprocess
+import os
+import sys
+import types
+
+# This file should live in the target `bin` directory.
+_bin_dir = os.path.join(os.path.dirname(__file__))
+
+def _find_executable_files_under(dir):
+    """Lists all executable files in the given directory."""
+    bin_names = []
+    for filename in os.listdir(dir):
+        filepath = os.path.join(dir, filename)
+        if os.path.isfile(filepath) and os.access(filepath, os.X_OK):
+            # Remove .exe suffix on windows.
+            filename_without_ext = os.path.splitext(filename)[0]
+            bin_names.append(filename_without_ext)
+    return bin_names
+
+# The list of binaries to create wrapper functions for.
+_bin_names = _find_executable_files_under(_bin_dir)
+
+# We'll define functions named after each binary. Make them importable.
+__all__ = _bin_names
+
+def _run(name):
+    """Runs the named binary, which should live under _bin_dir.
+
+    Exits the current process with the return code of the subprocess.
+    """
+    raise SystemExit(subprocess.call([os.path.join(_bin_dir, name)] + sys.argv[1:], close_fds=False))
+
+# Define a function named after each of the binaries.
+for bin_name in _bin_names:
+    exec(f"def {bin_name}(): _run('{bin_name}')")
diff --git a/data/bin/flatc b/data/bin/flatc
new file mode 100755
index 00000000000..a4ec7b2bac1
Binary files /dev/null and b/data/bin/flatc differ
diff --git a/devtools/.watchman-cookie-madragna-mac-84335-443 b/devtools/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/devtools/bundled_program/serialize/bundled_program_schema.fbs b/devtools/bundled_program/serialize/bundled_program_schema.fbs
new file mode 100644
index 00000000000..b37164a410d
--- /dev/null
+++ b/devtools/bundled_program/serialize/bundled_program_schema.fbs
@@ -0,0 +1,97 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+//
+// See README.md before modifying this file.
+//
+
+include "scalar_type.fbs";
+
+namespace bundled_program_flatbuffer;
+
+// Identifier of a valid bundled program schema.
+file_identifier "BP08";
+// Extension of written files.
+file_extension "bpte";
+
+// Reason for basic struct: union value type can only be table/struct/string
+table Int {
+  int_val:long;
+}
+
+table Bool {
+  bool_val:bool;
+}
+
+table Double {
+  double_val:double;
+}
+
+// All information we need to bundle for a tensor EValue input.
+table Tensor {
+  // The scalar type of Tensor
+  scalar_type: executorch_flatbuffer.ScalarType;
+  // The target sizes of the tensor.
+  sizes: [int];
+  // The contents of the corresponding input tensor.
+  data: [ubyte] (force_align: 16);
+  dim_order:[ubyte];
+}
+
+union ValueUnion {
+  Tensor,
+  Int,
+  Bool,
+  Double,
+}
+
+// Abstraction for BundledMethodTestCase values
+table Value {
+  val: ValueUnion;
+}
+
+// A single test for a method. The provided inputs should produce the
+// expected outputs.
+table BundledMethodTestCase {
+  // The inputs to provide to the method. The number and types of inputs must
+  // match the schema of the method under test.
+  inputs: [Value];
+
+  // The expected outputs generated while running the model in eager mode using
+  // the inputs provided. Its length should be equal to the length of program
+  // outputs.
+  expected_outputs: [Value];
+}
+
+// Collection of test cases for a program method.
+table BundledMethodTestSuite {
+  // The name of the method to test; e.g., "forward" for the forward() method
+  // of an nn.Module. This name match a method defined by the ExecuTorch
+  // program.
+  method_name: string;
+
+  // Individual test cases for the method.
+  test_cases: [BundledMethodTestCase];
+}
+
+
+// Executorch program bunlded with data for verification.
+table BundledProgram {
+  // Schema version.
+  version:uint;
+
+  // Test sets to run against the program.
+  // Each BundledMethodTestSuite should be used for the method of program sharing same name.
+  method_test_suites: [BundledMethodTestSuite];
+
+  // The binary data of a serialized Executorch program.
+  // The following `force_align` may sliently override any larger force_align
+  // used in the program. Therefore, to keep the data (including constant
+  // tensor, delegate data, etc, see schema.fbs for more info) in the
+  // executorch program keeps the same alignment as original no matter how
+  // the program schema changes, we need to make the force_align here the max
+  // one around all kinds of force_align in the current and future program
+  // schema, so we use the 32 as the force_align here.
+  program: [ubyte] (force_align: 32);
+}
+
+root_type BundledProgram;
diff --git a/devtools/bundled_program/serialize/scalar_type.fbs b/devtools/bundled_program/serialize/scalar_type.fbs
new file mode 100644
index 00000000000..e9c830b972c
--- /dev/null
+++ b/devtools/bundled_program/serialize/scalar_type.fbs
@@ -0,0 +1,43 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+//
+// See README.md before modifying this file.
+//
+
+namespace executorch_flatbuffer;
+
+// The scalar data type.
+// Must match executorch/runtime/core/portable_type/tensor_impl.h
+enum ScalarType : byte {
+  BYTE = 0,
+  CHAR = 1,
+  SHORT = 2,
+  INT = 3,
+  LONG = 4,
+  HALF = 5,
+  FLOAT = 6,
+  DOUBLE = 7,
+  BOOL = 11,
+  QINT8 = 12,
+  QUINT8 = 13,
+  QINT32 = 14,
+  QUINT4X2 = 16,
+  QUINT2X4 = 17,
+  BITS16 = 22,
+  FLOAT8E5M2 = 23,
+  FLOAT8E4M3FN = 24,
+  FLOAT8E5M2FNUZ = 25,
+  FLOAT8E4M3FNUZ = 26,
+  UINT16 = 27,
+  UINT32 = 28,
+  UINT64 = 29,
+  // Types currently not implemented.
+  // COMPLEXHALF = 8,
+  // COMPLEXFLOAT = 9,
+  // COMPLEXDOUBLE = 10,
+  // BFLOAT16 = 15,
+  // BITS1x8 = 18,
+  // BITS2x4 = 19,
+  // BITS4x2 = 20,
+  // BITS8 = 21,
+}
diff --git a/docs/.watchman-cookie-madragna-mac-84335-443 b/docs/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/examples/.watchman-cookie-madragna-mac-84335-443 b/examples/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/exir/.watchman-cookie-madragna-mac-84335-443 b/exir/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/exir/_serialize/program.fbs b/exir/_serialize/program.fbs
new file mode 100644
index 00000000000..7308cc63199
--- /dev/null
+++ b/exir/_serialize/program.fbs
@@ -0,0 +1,489 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+//
+// See README.md before modifying this file.
+//
+
+include "scalar_type.fbs";
+
+namespace executorch_flatbuffer;
+
+// Identifier of a valid executor schema.
+file_identifier "ET12";
+// Extension of written files.
+file_extension "pte";
+
+// Table that contains the metadata about how
+// to unflatten the flattened input/output from compiler
+table ContainerMetadata {
+  encoded_inp_str: string;
+  encoded_out_str: string;
+}
+
+table Null {}
+
+// Contains information relevant to the allocation of non-constant
+// buffer data (e.g. from tensors).
+// This refers to where the buffer needs to be placed in an existing
+// memory and at what offset from its base address.
+table AllocationDetails {
+  memory_id: uint;  // ID of the memory where this data needs to be placed.
+
+  // Offset in bytes relative to the start of the memory area indicated by
+  // memory_id.
+  //
+  // Originally this field was a single 32-bit uint, but we need 64 bits for
+  // larger models. To preserve backwards compatibility, the high bits are
+  // managed in a separate 32-bit field. Users should combine the two fields
+  // to get the full 64-bit offset.
+  memory_offset_low: uint;  // Least significant 32 bits
+  memory_offset_high: uint;  // Most significant 32 bits. Defaults to zero.
+}
+
+// Indicates the types of shape a Tensor may have, from the point
+// of view of their dynamism.
+enum TensorShapeDynamism : byte {
+  // Static shape. Memory is allocated by the compiler.
+  STATIC = 0,
+  // Dynamic shape but with an upper bound.
+  // Memory is allocated by the compiler.
+  DYNAMIC_BOUND = 1,
+  // Dynamic shape without upper bound.
+  // Memory allocation is handled by the runtime.
+  DYNAMIC_UNBOUND = 2,
+}
+
+// Indicates where a tensor is stored.
+enum TensorDataLocation : byte {
+  // Stored in a segment of the PTE file.
+  SEGMENT = 0,
+  // Stored outside of the PTE file.
+  EXTERNAL = 1,
+}
+
+// Table to put additional information about tensors in that is not applicable
+// to the vast majority of tensors in the vast majority of programs.
+table ExtraTensorInfo {
+  // [Optional] Specifies the SubsegmentOffsets in
+  //  program.mutable_data_segments that specifies where the data is located in.
+  //  If not present and the data is located in a segment, then the data is in
+  //  index zero.
+  mutable_data_segments_idx: uint64;
+
+  // [Optional] The unique name of the tensor. e.g. 'mod.linear.weight'
+  fully_qualified_name: string;
+
+  // [Optional] Specifies where the tensor's data is stored.
+  // - SEGMENT (default): Data is stored in a segment.
+  // - EXTERNAL: Data is stored outside of the PTE file. fully_qualified_name
+  //   must be non-empty, and is used as a key to find the tensor's external
+  //   data. Tensor.data_buffer_idx is ignored.
+  location: TensorDataLocation;
+}
+
+table Tensor {
+  scalar_type: ScalarType;
+
+  // Offset in scalar_type elements (e.g., multiples of 4 bytes for an int
+  // scalar type) from the beginning of the tensor buffer to the beginning of
+  // the actual data. Currently, the runtime only supports a value of zero.
+  storage_offset: int;
+
+  sizes: [int];
+
+  // Specifies in what order the dimensions are laid out in memory (from outer
+  // to inner).
+  //
+  // For example, given a rank 3 Tensor of size (3, 5, 2). If we name
+  // dimensions: [row, column, batch], then a dim_order of:
+  // - (2, 0, 1) represents a [batch, row, column] ordering where "column" is
+  //   the innermost dimension, then comes "row", and the outermost dimension is
+  //   "batch".
+  // - (0, 2, 1) represents a [row, batch, column] ordering where "column" is
+  //   the innermost dimension, then comes "batch", and the outermost dimension
+  //   is "row".
+  dim_order: [ubyte];
+
+  // out of scope M1
+  requires_grad: bool;
+
+  // Overall, a Tensor is either constant or mutable. At method load time
+  //  constant tensors receive a dataptr into the serialized program. Mutable
+  //  tensors can either receive a pointer from the heirarchical allocator or a
+  //  nullptr if they will receive a data pointer at execution time (inputs
+  //  and control flow placeholders can be like this). Mutable tensors may or
+  //  may not also have an initial value in the serialized program.
+  //
+  // In summary:
+  //   data_buffer_idx > 0, allocation_info = Null: Tensor is a constant.
+  //   data_buffer_idx = 0, allocation_info = Non Null: Tensor is mutable and
+  //     will receive a dataptr at method load time.
+  //   data_buffer_idx = 0, allocation_info = Null: Tensor is mutable and
+  //     will receive a dataptr at input time or during execution.
+  //   data_buffer_idx > 0, allocation_info = Non Null: Tensor is mutable and
+  //     will receive a dataptr at method load time, and has an initial state.
+  //
+  // Tensor data is stored inline if program.constant_buffer is null. Otherwise
+  //  it is in a segment. If this tensor's allocation_info is null then the
+  //  tensor data location is specified by program.constant_segment. If the
+  //  allocation_info is non_null then the data is somewhere in
+  //  program.mutable_data_segments. If tensor_info is Null, then the data is
+  //  in program.mutable_data_segments[0] otherwise if tensor_info is non-null
+  //  then the mutable_data_segment index is specified by
+  //  tensor_info.mutable_data_segments_index.
+  data_buffer_idx: uint;
+
+  // [Optional] preallocation details for non-constants (null otherwise).
+  allocation_info: AllocationDetails;
+
+  // May not be needed.
+  layout: byte;
+
+  // Determines the type of the tensor's shape, from the point of view of its
+  // dynamic or not behavior, and consequently how the allocation of the
+  // underlying memory is handled, and also how to interpret the sizes and
+  // strides fields.
+  // 1. dynamism == STATIC: sizes field represents the static shape of
+  //    the tensor.
+  // 2. dynamism == DYNAMIC_BOUND: sizes field represents the upper bound shape
+  //    of the tensor. Each dimension of the tensor at runtime should never
+  //    exceed the corresponding dimension of the upper bound shape.
+  //
+  // 3. dynamism == DYNAMIC_UNBOUND: the stored sizes field can be ignored since
+  //    shape is fully dynamic.
+  shape_dynamism: TensorShapeDynamism;
+
+  // [Optional] Additional information about the Tensor that is not applicable
+  // to most tensors.
+  extra_tensor_info: ExtraTensorInfo;
+}
+
+table Int {
+  int_val: long;
+}
+
+table Bool {
+  bool_val: bool;
+}
+
+table Double {
+  double_val: double;
+}
+
+table String {
+  string_val: string;
+}
+
+table IntList {
+  items: [long];
+}
+
+table DoubleList {
+  items: [double];
+}
+
+table BoolList {
+  items: [bool];
+}
+
+// Unlike primitive lists, tensor lists have mutable members and aliasing behavior when
+// elements are added to them. To match this aliasing behavior, the runtime tensor list is
+// serialized by serializing its elements into the ExecutionPlan.values array, and then
+// serializing their corresponding indices into TensorList.items.
+table TensorList {
+  items: [int];  // EValue indices.
+}
+
+// Similar to TensorList except the indices can also point to None.
+table OptionalTensorList {
+  items: [int];
+}
+
+// Supported values in Executorch kernels, Enums are serialized as ints.
+union KernelTypes {
+  Null,
+  Int,
+  Bool,
+  Double,
+  Tensor,
+  String,
+  IntList,
+  DoubleList,
+  BoolList,
+  TensorList,
+  OptionalTensorList,
+}
+
+// Abstraction for program values. A subset of types supported in core pytorch kernels.
+table EValue {
+  val: KernelTypes;
+}
+
+table Operator {
+  // Operator registry and lookup is uniquely identified by its name, and overload name.
+  // TODO(larryliu): is there a more efficient way to represent this
+  name: string;
+  overload: string;
+}
+
+table KernelCall {
+  // Index to the operators table in the program.
+  op_index: int;
+
+  // Indexes to the (values) required by the operation (in and out).
+  args: [int];
+}
+
+table DelegateCall {
+  // Index to the delegates table in the program.
+  delegate_index: int;
+
+  // Indexes to the (values) required by the delegates (in and out).
+  args: [int];
+}
+
+table MoveCall {
+  // Index into the values table of the evalue we are moving from
+  move_from: int;
+
+  // Index into the values table of the evalue we are moving into
+  move_to: int;
+}
+
+table JumpFalseCall {
+  // Index into the values table of boolean that specifies whether or not to jump
+  cond_value_index: int;
+
+  // Value to set the executor program counter if the jump occurs
+  destination_instruction: int;
+}
+
+table FreeCall {
+  // Index into values table of the tensor whose underlying data blob is being freed
+  value_index: int;
+}
+
+union InstructionArguments {
+  KernelCall,
+  DelegateCall,
+  MoveCall,
+  JumpFalseCall,
+  FreeCall,
+}
+
+// Basic unit of execution
+table Instruction {
+  instr_args: InstructionArguments;
+}
+
+table Frame {
+  // For storing the frame to print stacktraces
+  filename: string;  // Name of the file in which the instruction exists
+  lineno: int;       // Line number at which the instruction was called
+  name: string;      // Name of the function the instruction was called from
+  context: string;   // Source code of the instruction
+}
+
+table FrameList {
+  // For storing the frames to print stacktraces
+  items: [Frame];
+}
+
+// Indicates where a piece of data is stored.
+enum DataLocation : byte {
+  // Stored directly in the flatbuffer.
+  INLINE = 0,
+  // Stored in a segment.
+  SEGMENT = 1,
+}
+
+// Indicates where the delegate data is stored
+table BackendDelegateDataReference {
+  // Indicates which list to index into:
+  //     INLINE -> Program.backend_delegate_data
+  //     SEGMENT -> Program.segments
+  location: DataLocation;
+
+  // The index into the list indicated by the location.
+  index: uint;
+}
+
+table CompileSpec {
+  // One compile spec. There are can be multiple specs for one method
+  key: string; // like max_value
+  value: [ubyte]; // like 4, or other types based on needs.
+}
+
+table BackendDelegate {
+  // Used to resolve the delegate backend classes, for example, "TCE0", "TCE1", etc.
+  // This string is also used in to_backend.
+  id: string;
+
+  // A binary blob (from a subgraph) as an output of preprocessing. Will be
+  // provided to the backend code at init time. Can be very large, on the
+  // order of 10-100MB.
+  processed: BackendDelegateDataReference;
+
+  // The compilation spec for the lowered module's forward function
+  // Example: [CompileSpec["max_value", 4]]
+  compile_specs: [CompileSpec];
+}
+
+// A sequence of blocking instructions to be executed in order. The
+// abstraction is not currently leveraged, all current programs are 1 chain.
+// We are leaving chains as part of the program definition for future use cases
+// around graph level async where different threads will be represented as
+// seperate chains.
+table Chain {
+  // Indices of the values that are (non-static) inputs into this Chain.
+  inputs: [int];
+
+  // Indices of the values that are outputs out of this Chain.
+  outputs: [int];
+
+  // List of instructions to be executed in order.
+  instructions: [Instruction];
+
+  // Optional list of frames for each instruction.
+  // The backend config must have 'emit_stacktrace' set to true to emit
+  stacktrace: [FrameList];
+}
+
+table ExecutionPlan {
+
+  // Name of a method on the nn.Module that was traced to create this program.
+  name: string;
+
+  // Type meta data for input/output to the execution plan
+  container_meta_type: ContainerMetadata;
+
+  // A list of all values used in this execution plan.
+  values: [EValue];
+
+  // Indices to the 'Evalues' that are inputs to this execution plan.
+  // This list contains only the non-constant tensors (i.e. not part of
+  // the saved program).
+  inputs: [int];
+
+  // Indices to the 'Evalues' that are outputs of this execution plan.
+  // This signals a lifespan that goes beyond the execution.
+  outputs: [int];
+
+  // List of Chains of kernels.
+  chains: [Chain];
+
+  // Operators used in this execution plan
+  operators: [Operator];
+
+  // A list of delegates and each is a special instance of execution, the same level of chains.
+  delegates: [BackendDelegate];
+
+  // List of buffer sizes for non_constant memory allocations. (Think neural net activations)
+  // A list instead of a single buffer to account for complex memory hierarchies.
+  // TODO(jakeszwe, razy): How to reconcile this with the ability for the hierarchical memory allocator
+  // to be id based instead of index based.
+  // Runtime should use the len(constant_buffer) as the ground truth of the
+  // constants memory buffer size, and ignore non_const_buffer_sizes[0].
+  non_const_buffer_sizes: [int64];
+
+}
+
+// Constant tensor data stored directly in the flatbuffer.
+table Buffer {
+  // During serialization, this alignment may be rewritten to a larger value.
+  // The magic "@executorch-tensor-alignment" comment tells EXIR which lines to
+  // patch.
+  storage: [ubyte] (force_align: 16);  // @executorch-tensor-alignment
+}
+
+// Delegate data stored directly in the flatbuffer. This is a different type
+// than Buffer because tensors and delegates can have different alignment
+// requirements.
+table BackendDelegateInlineData {
+  // During serialization, this alignment may be rewritten to a larger value.
+  // The magic "@executorch-delegate-alignment" comment tells EXIR which lines
+  // to patch.
+  data: [ubyte] (force_align: 16);  // @executorch-delegate-alignment
+}
+
+// Describes a contiguous piece of data that lives outside of the flatbuffer data,
+// typically appended afterwards in the file. The "extended header" in the file,
+// when present, points to the segment base offset.
+table DataSegment {
+  // Segment offsets are relative to the segment base offset provided in
+  // the extended file header. Segments will typically be aligned in a
+  // way to make it possible to use mmap() to load them.
+  offset: uint64;
+
+  // The size in bytes of valid data starting at the offset. The segment
+  // data may be followed by padding before the segment that follows it,
+  // to make it easier to use mmap().
+  size: uint64;
+}
+
+// Describes data offsets into a particular segment
+table SubsegmentOffsets {
+  // Index of the segment in Program.segments
+  segment_index: uint;
+
+  // Each element is an offset in bytes into the data of the segment pointed to
+  // by segment_index. Offsets must be aligned to @executorch-tensor-alignment.
+  offsets: [uint64];
+}
+
+// Attributes a name to data referenced by Program.segments. Used when data is
+// referenced by multiple users, in cases where indices are not guaranteed to
+// be consistent across the users.
+table NamedData {
+  // The unique id of the data blob.
+  key: string;
+
+  // Index of the segment in Program.segments.
+  segment_index: uint32;
+}
+
+table Program {
+  // Schema version.
+  version: uint;
+
+  // List of ExecutionPlans that make up the program. Each ExecutionPlan corresponds with a
+  // different entry point into the model.
+  execution_plan: [ExecutionPlan];
+
+  // Tables of constant data, used for constant Values (e.g.data field of weight tensors).
+  // Each constant is assigned an index into the table which are each individually aligned.
+  // 0 index is reserved to be pointed to by non-constant Tensors.
+  // If this field is non-empty, constant_segment.offsets must be empty.
+  // DEPRECATED: After D61996249 on 2024-09-05, no new PTE files will use this field.
+  constant_buffer: [Buffer];
+
+  // List of delegate data. Pointed to by BackendDelegateDataReference.
+  backend_delegate_data: [BackendDelegateInlineData];
+
+  // List of data segments that follow the Program data in this file, sorted by
+  // offset. Elements in this schema can refer to these segments by index.
+  segments: [DataSegment];
+
+  // Describes the offsets of each constant tensor, relative to the segment
+  // offset. If constant_segment.offsets field is non-empty, constant_buffer
+  // must be empty. constant_segment.offsets[0] is reserved to be pointed to by
+  // non-constant Tensors.
+  constant_segment: SubsegmentOffsets;
+
+  // [Optional] Describes the offsets into various segments for each mutable
+  // tensor. Only mutable tensors with a meaningful initial state are
+  // serialized here (for example weights that will be trained on-device as
+  // opposed to just layer activations). Seperate from the constant_segment to
+  // reduce peak memory usage by letting us read directly from the PTE file
+  // into the mutable tensor, as opposed to loading the .pte data into
+  // constant memory, copying it over, and then being unable to release the
+  // constant segment. No two elements should point to the same segment.
+  mutable_data_segments: [SubsegmentOffsets];
+
+  // [Optional] List of blobs keyed by a unique name. Note that multiple
+  // 'NamedData' entries could point to the same segment index. Stored in
+  // segments attached to the PTE file.
+  named_data: [NamedData];
+}
+
+root_type Program;
diff --git a/exir/_serialize/scalar_type.fbs b/exir/_serialize/scalar_type.fbs
new file mode 100644
index 00000000000..e9c830b972c
--- /dev/null
+++ b/exir/_serialize/scalar_type.fbs
@@ -0,0 +1,43 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+//
+// See README.md before modifying this file.
+//
+
+namespace executorch_flatbuffer;
+
+// The scalar data type.
+// Must match executorch/runtime/core/portable_type/tensor_impl.h
+enum ScalarType : byte {
+  BYTE = 0,
+  CHAR = 1,
+  SHORT = 2,
+  INT = 3,
+  LONG = 4,
+  HALF = 5,
+  FLOAT = 6,
+  DOUBLE = 7,
+  BOOL = 11,
+  QINT8 = 12,
+  QUINT8 = 13,
+  QINT32 = 14,
+  QUINT4X2 = 16,
+  QUINT2X4 = 17,
+  BITS16 = 22,
+  FLOAT8E5M2 = 23,
+  FLOAT8E4M3FN = 24,
+  FLOAT8E5M2FNUZ = 25,
+  FLOAT8E4M3FNUZ = 26,
+  UINT16 = 27,
+  UINT32 = 28,
+  UINT64 = 29,
+  // Types currently not implemented.
+  // COMPLEXHALF = 8,
+  // COMPLEXFLOAT = 9,
+  // COMPLEXDOUBLE = 10,
+  // BFLOAT16 = 15,
+  // BITS1x8 = 18,
+  // BITS2x4 = 19,
+  // BITS4x2 = 20,
+  // BITS8 = 21,
+}
diff --git a/export/.watchman-cookie-madragna-mac-84335-443 b/export/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/extension/.watchman-cookie-madragna-mac-84335-443 b/extension/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/extension/llm/custom_ops/libcustom_ops_aot_lib.dylib b/extension/llm/custom_ops/libcustom_ops_aot_lib.dylib
new file mode 100755
index 00000000000..eadf818636c
Binary files /dev/null and b/extension/llm/custom_ops/libcustom_ops_aot_lib.dylib differ
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index 57eb76d71d6..9ceef562d5c 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit 57eb76d71d6dde5396520c7d35142eb868994e06
+Subproject commit 9ceef562d5c941eb6aea5476c768d0419962bc0c
diff --git a/extension/pybindings/_portable_lib.cpython-312-darwin.so b/extension/pybindings/_portable_lib.cpython-312-darwin.so
new file mode 100755
index 00000000000..78a5e8dcba6
Binary files /dev/null and b/extension/pybindings/_portable_lib.cpython-312-darwin.so differ
diff --git a/include/executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h b/include/executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h
new file mode 100644
index 00000000000..16b71594eb3
--- /dev/null
+++ b/include/executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+//===----------------------------------------------------------------------===//
+/// \file extension/kernel_util/make_boxed_from_unboxed_functor.h
+/// Defines a template that can be used to create a boxed version of an unboxed
+/// functor.
+/// Example usage:
+/// ```
+/// Tensor&
+/// my_op(KernelRuntimeContext& ctx, const Tensor& self, const Tensor& other,
+///       Tensor& out)
+/// {
+///   // ...
+///   return out;
+/// }
+///
+/// Kernel my_kernel = Kernel::make_boxed_kernel("my_ns::my_op",
+///   EXECUTORCH_FN(my_op));
+/// static auto res = register_kernels({my_kernel});
+/// ```
+/// Or simply:
+/// ```
+/// EXECUTORCH_LIBRARY(my_ns, "my_op", my_op);
+/// ```
+///
+/// The trick here is to convert each EValue to inferred argument type. This
+/// uses a lot of C++17 features.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#if __cplusplus < 201703L
+#error "This header requires C++17"
+#endif
+
+#include <executorch/extension/kernel_util/meta_programming.h>
+#include <executorch/extension/kernel_util/type_list.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/operator_registry.h>
+#include <cstdlib>
+#include <memory>
+#include <type_traits>
+#include <typeinfo>
+
+namespace executorch {
+namespace runtime {
+class KernelRuntimeContext; // Forward declaration
+} // namespace runtime
+} // namespace executorch
+
+namespace executorch {
+namespace extension {
+
+// This extension has a lot of generic internal names like "size"; use a unique
+// internal namespace to avoid conflicts with other extensions.
+namespace kernel_util_internal {
+
+template <class T>
+struct decay_if_not_tensor final {
+  using type = std::decay_t<T>;
+};
+template <>
+struct decay_if_not_tensor<executorch::aten::Tensor&> final {
+  using type = executorch::aten::Tensor&;
+};
+template <>
+struct decay_if_not_tensor<const executorch::aten::Tensor&> final {
+  using type = const executorch::aten::Tensor&;
+};
+
+template <class T>
+struct evalue_to_arg final {
+  static T call(executorch::runtime::EValue& v) {
+    return std::move(v).to<T>();
+  }
+};
+
+template <>
+struct evalue_to_arg<executorch::aten::Tensor&> final {
+  static executorch::aten::Tensor& call(executorch::runtime::EValue& v) {
+    return v.toTensor();
+  }
+};
+
+template <>
+struct evalue_to_arg<const executorch::aten::Tensor&> final {
+  static const executorch::aten::Tensor& call(executorch::runtime::EValue& v) {
+    return v.toTensor();
+  }
+};
+
+template <class T>
+struct evalue_to_arg<executorch::aten::optional<T>> final {
+  static executorch::aten::optional<T> call(executorch::runtime::EValue& v) {
+    return v.toOptional<T>();
+  }
+};
+
+template <class T>
+struct evalue_to_arg<executorch::aten::ArrayRef<executorch::aten::optional<T>>>
+    final {
+  static executorch::aten::ArrayRef<executorch::aten::optional<T>> call(
+      executorch::runtime::EValue& v) {
+    return v.toListOptionalTensor();
+  }
+};
+
+template <class Functor, size_t... evalue_arg_indices, typename... ArgTypes>
+void call_functor_with_args_from_stack(
+    ::executorch::runtime::KernelRuntimeContext& ctx,
+    executorch::runtime::EValue** stack,
+    std::index_sequence<evalue_arg_indices...>,
+    typelist<ArgTypes...>*) {
+  (*Functor::func_ptr())(
+      ctx,
+      evalue_to_arg<typename decay_if_not_tensor<ArgTypes>::type>::call(
+          *stack[evalue_arg_indices])...);
+}
+
+} // namespace kernel_util_internal
+
+/**
+ * WrapUnboxedIntoFunctor: Given a function pointer, wrap it into a functor that
+ * takes EValues as input and returns void. The wrapped functor will unbox all
+ * inputs and forward them to unboxed kernel.
+ */
+template <class FuncType>
+struct WrapUnboxedIntoFunctor {
+  static_assert(
+      kernel_util_internal::is_compile_time_function_pointer<FuncType>::value,
+      "Can't handle function other than EXECUTORCH_FN");
+  using TrueType = typename FuncType::FuncType;
+  using ReturnType = typename kernel_util_internal::infer_function_traits_t<
+      TrueType>::return_type;
+  using ArgsType = typename kernel_util_internal::infer_function_traits_t<
+      TrueType>::parameter_types;
+  // check if the first argument is KernelRuntimeContext, if so, remove it
+  static constexpr bool first_arg_is_context = std::is_same<
+      ::executorch::runtime::KernelRuntimeContext,
+      std::remove_reference_t<
+          kernel_util_internal::head_with_default_t<void, ArgsType>>>::value;
+  using ContextRemovedArgsType = std::conditional_t<
+      first_arg_is_context,
+      kernel_util_internal::drop_if_nonempty_t<ArgsType, 1>,
+      ArgsType>;
+
+  static void call(
+      ::executorch::runtime::KernelRuntimeContext& ctx,
+      executorch::runtime::EValue** stack) {
+    constexpr size_t num_inputs =
+        kernel_util_internal::size<ContextRemovedArgsType>::value;
+    return kernel_util_internal::call_functor_with_args_from_stack<FuncType>(
+        ctx,
+        stack,
+        std::make_index_sequence<num_inputs>(),
+        static_cast<ContextRemovedArgsType*>(nullptr));
+  }
+};
+
+template <typename FuncType>
+static executorch::runtime::Kernel make_boxed_kernel(
+    const char* name,
+    FuncType) {
+  return executorch::runtime::Kernel(
+      name, WrapUnboxedIntoFunctor<FuncType>::call);
+}
+
+} // namespace extension
+} // namespace executorch
+
+// Inspired from C10_CONCATENATE
+#define ET_CONCATENATE_IMPL(s1, s2) s1##s2
+#define ET_CONCATENATE(s1, s2) ET_CONCATENATE_IMPL(s1, s2)
+#define ET_UID __LINE__
+
+#define EXECUTORCH_LIBRARY(ns, op_name, func) \
+  _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, ET_UID)
+
+#define _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, uid) \
+  static auto ET_CONCATENATE(res_##ns##_, uid) =         \
+      ::executorch::runtime::register_kernel(            \
+          ::executorch::extension::make_boxed_kernel(    \
+              #ns "::" op_name, EXECUTORCH_FN(func)))
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::make_boxed_kernel;
+using ::executorch::extension::WrapUnboxedIntoFunctor;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/extension/kernel_util/meta_programming.h b/include/executorch/extension/kernel_util/meta_programming.h
new file mode 100644
index 00000000000..027568fe687
--- /dev/null
+++ b/include/executorch/extension/kernel_util/meta_programming.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#if __cplusplus < 201703L
+#error "This header requires C++17"
+#endif
+
+#include <executorch/extension/kernel_util/type_list.h>
+#include <cstdlib>
+#include <memory>
+#include <type_traits>
+#include <typeinfo>
+
+namespace executorch {
+namespace extension {
+// This extension has a lot of generic internal names like "size"; use a unique
+// internal namespace to avoid conflicts with other extensions.
+namespace kernel_util_internal {
+
+// Check if a given type is a function
+template <class T>
+struct is_function_type : std::false_type {};
+template <class Result, class... Args>
+struct is_function_type<Result(Args...)> : std::true_type {};
+template <class T>
+using is_function_type_t = typename is_function_type<T>::type;
+
+// A compile-time wrapper around a function pointer
+template <class FuncType_, FuncType_* func_ptr_>
+struct CompileTimeFunctionPointer final {
+  static_assert(
+      is_function_type<FuncType_>::value,
+      "EXECUTORCH_FN can only wrap function types.");
+  using FuncType = FuncType_;
+
+  static constexpr FuncType* func_ptr() {
+    return func_ptr_;
+  }
+};
+
+// Check if a given type is a compile-time function pointer
+template <class T>
+struct is_compile_time_function_pointer : std::false_type {};
+template <class FuncType, FuncType* func_ptr>
+struct is_compile_time_function_pointer<
+    CompileTimeFunctionPointer<FuncType, func_ptr>> : std::true_type {};
+
+#define EXECUTORCH_FN_TYPE(func)                                             \
+  ::executorch::extension::kernel_util_internal::CompileTimeFunctionPointer< \
+      std::remove_pointer_t<std::remove_reference_t<decltype(func)>>,        \
+      func>
+#define EXECUTORCH_FN(func) EXECUTORCH_FN_TYPE(func)()
+
+/**
+ * strip_class: helper to remove the class type from pointers to `operator()`.
+ */
+template <typename T>
+struct strip_class {};
+template <typename Class, typename Result, typename... Args>
+struct strip_class<Result (Class::*)(Args...)> {
+  using type = Result(Args...);
+};
+template <typename Class, typename Result, typename... Args>
+struct strip_class<Result (Class::*)(Args...) const> {
+  using type = Result(Args...);
+};
+template <typename T>
+using strip_class_t = typename strip_class<T>::type;
+
+/**
+ * Access information about result type or arguments from a function type.
+ * Example:
+ * using A = function_traits<int (float, double)>::return_type // A == int
+ * using A = function_traits<int (float, double)>::parameter_types::tuple_type
+ * // A == tuple<float, double>
+ */
+template <class Func>
+struct function_traits {
+  static_assert(
+      !std::is_same<Func, Func>::value,
+      "In function_traits<Func>, Func must be a plain function type.");
+};
+template <class Result, class... Args>
+struct function_traits<Result(Args...)> {
+  using func_type = Result(Args...);
+  using return_type = Result;
+  using parameter_types = typelist<Args...>;
+  static constexpr auto number_of_parameters = sizeof...(Args);
+};
+
+/**
+ * infer_function_traits: creates a `function_traits` type for a simple
+ * function (pointer) or functor (lambda/struct). Currently does not support
+ * class methods.
+ */
+template <typename Functor>
+struct infer_function_traits {
+  using type = function_traits<strip_class_t<decltype(&Functor::operator())>>;
+};
+template <typename Result, typename... Args>
+struct infer_function_traits<Result (*)(Args...)> {
+  using type = function_traits<Result(Args...)>;
+};
+template <typename Result, typename... Args>
+struct infer_function_traits<Result(Args...)> {
+  using type = function_traits<Result(Args...)>;
+};
+template <typename T>
+using infer_function_traits_t = typename infer_function_traits<T>::type;
+
+} // namespace kernel_util_internal
+} // namespace extension
+} // namespace executorch
diff --git a/include/executorch/extension/kernel_util/type_list.h b/include/executorch/extension/kernel_util/type_list.h
new file mode 100644
index 00000000000..300cbfcb7cb
--- /dev/null
+++ b/include/executorch/extension/kernel_util/type_list.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+///
+/// \file runtime/kernel/type_list.h
+/// Forked from pytorch/c10/util/TypeList.h
+/// \brief Utilities for working with type lists.
+#pragma once
+#if __cplusplus < 201703L
+#error "This header requires C++17"
+#endif
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+namespace executorch {
+namespace extension {
+// This extension has a lot of generic internal names like "size"; use a unique
+// internal namespace to avoid conflicts with other extensions.
+namespace kernel_util_internal {
+
+/**
+ * Type holding a list of types for compile time type computations
+ *     constexpr size_t num = size<typelist<int, double>>::value;
+ *     static_assert(num == 2, "");
+ */
+template <class... T>
+struct false_t : std::false_type {};
+
+template <class... Items>
+struct typelist final {
+ public:
+  typelist() = delete; // not for instantiation
+};
+template <class TypeList>
+struct size final {
+  static_assert(
+      false_t<TypeList>::value,
+      "In typelist::size<T>, T must be typelist<...>.");
+};
+template <class... Types>
+struct size<typelist<Types...>> final {
+  static constexpr size_t value = sizeof...(Types);
+};
+
+/**
+ * is_instantiation_of<T, I> is true_type iff I is a template instantiation of T
+ * (e.g. vector<int> is an instantiation of vector) Example:
+ *    is_instantiation_of_t<vector, vector<int>> // true
+ *    is_instantiation_of_t<pair, pair<int, string>> // true
+ *    is_instantiation_of_t<vector, pair<int, string>> // false
+ */
+template <template <class...> class Template, class T>
+struct is_instantiation_of : std::false_type {};
+template <template <class...> class Template, class... Args>
+struct is_instantiation_of<Template, Template<Args...>> : std::true_type {};
+template <template <class...> class Template, class T>
+using is_instantiation_of_t = typename is_instantiation_of<Template, T>::type;
+
+/// Base template.
+template <size_t Index, class TypeList>
+struct element final {
+  static_assert(
+      false_t<TypeList>::value,
+      "In typelist::element<T>, the T argument must be typelist<...>.");
+};
+/// Successful case, we have reached the zero index and can "return" the head
+/// type.
+template <class Head, class... Tail>
+struct element<0, typelist<Head, Tail...>> {
+  using type = Head;
+};
+/// Error case, we have an index but ran out of types! It will only be selected
+/// if `Ts...` is actually empty!
+template <size_t Index, class... Ts>
+struct element<Index, typelist<Ts...>> {
+  static_assert(
+      Index < sizeof...(Ts),
+      "Index is out of bounds in typelist::element");
+};
+/// Shave off types until we hit the <0, Head, Tail...> or <Index> case.
+template <size_t Index, class Head, class... Tail>
+struct element<Index, typelist<Head, Tail...>>
+    : element<Index - 1, typelist<Tail...>> {};
+/// Convenience alias.
+template <size_t Index, class TypeList>
+using element_t = typename element<Index, TypeList>::type;
+
+/**
+ * Returns the first element of a type list, or the specified default if the
+ * type list is empty. Example: int  ==  head_t<bool, typelist<int, string>>
+ *   bool  ==  head_t<bool, typelist<>>
+ */
+template <class Default, class TypeList>
+struct head_with_default final {
+  using type = Default;
+};
+template <class Default, class Head, class... Tail>
+struct head_with_default<Default, typelist<Head, Tail...>> final {
+  using type = Head;
+};
+template <class Default, class TypeList>
+using head_with_default_t = typename head_with_default<Default, TypeList>::type;
+
+/**
+ * Take/drop a number of arguments from a typelist.
+ * Example:
+ *   typelist<int, string> == take_t<typelist<int, string, bool>, 2>
+ *   typelist<bool> == drop_t<typelist<int, string, bool>, 2>
+ */
+template <class TypeList, size_t offset, class IndexSequence>
+struct take_elements final {};
+template <class TypeList, size_t offset, size_t... Indices>
+struct take_elements<TypeList, offset, std::index_sequence<Indices...>> final {
+  using type = typelist<typename element<offset + Indices, TypeList>::type...>;
+};
+
+/**
+ * Like drop, but returns an empty list rather than an assertion error if `num`
+ * is larger than the size of the TypeList.
+ * Example:
+ *   typelist<> == drop_if_nonempty_t<typelist<string, bool>, 2>
+ *   typelist<> == drop_if_nonempty_t<typelist<int, string, bool>, 3>
+ */
+template <class TypeList, size_t num>
+struct drop_if_nonempty final {
+  static_assert(
+      is_instantiation_of<typelist, TypeList>::value,
+      "In typelist::drop<T, num>, the T argument must be typelist<...>.");
+  using type = typename take_elements<
+      TypeList,
+      std::min(num, size<TypeList>::value),
+      std::make_index_sequence<
+          size<TypeList>::value - std::min(num, size<TypeList>::value)>>::type;
+};
+template <class TypeList, size_t num>
+using drop_if_nonempty_t = typename drop_if_nonempty<TypeList, num>::type;
+
+} // namespace kernel_util_internal
+} // namespace extension
+} // namespace executorch
diff --git a/include/executorch/extension/tensor/tensor.h b/include/executorch/extension/tensor/tensor.h
new file mode 100644
index 00000000000..80a41018a20
--- /dev/null
+++ b/include/executorch/extension/tensor/tensor.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// Umbrella header for the Tensor extension.
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
diff --git a/include/executorch/extension/tensor/tensor_accessor.h b/include/executorch/extension/tensor/tensor_accessor.h
new file mode 100644
index 00000000000..271516b3390
--- /dev/null
+++ b/include/executorch/extension/tensor/tensor_accessor.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/result.h>
+
+namespace executorch {
+namespace extension {
+namespace internal {
+
+/**
+ * Base class template storing the underlying data with size and stride helpers.
+ * Inherited by TensorAccessor<> which requires specialization on rank.
+ */
+template <typename T, ssize_t N>
+class TensorAccessorBase {
+ public:
+  /// Returns the size of the underlying tensor at the given dimension.
+  executorch::aten::SizesType size(ssize_t i) const {
+    ET_CHECK_MSG(
+        i < dim_ && i >= 0,
+        "Dimension outside of [0, %zd], got %zd",
+        dim_ - 1,
+        i);
+    return sizes_[i];
+  }
+
+  /// Returns the stride of the underlying tensor at the given dimension.
+  executorch::aten::StridesType stride(ssize_t i) const {
+    ET_CHECK_MSG(
+        i < dim_ && i >= 0,
+        "Dimension outside of [0, %zd], got %zd",
+        dim_ - 1,
+        i);
+    return strides_[i];
+  }
+
+ protected:
+  TensorAccessorBase(
+      T* data,
+      const executorch::aten::SizesType* sizes,
+      const executorch::aten::StridesType* strides,
+      ssize_t dim)
+      : data_(data), sizes_(sizes), strides_(strides), dim_(dim) {}
+
+  T* data_;
+  const executorch::aten::SizesType* sizes_;
+  const executorch::aten::StridesType* strides_;
+  ssize_t dim_;
+};
+
+} // namespace internal
+
+/**
+ * TensorAccessor template with data type and rank as template parameters. No
+ * public constructors, can only be created using make_tensor_accessor from a
+ * given executorch::aten::Tensor. Use operator[] to index and obtain a lower
+ * rank accessor or the underlying scalar value.
+ */
+template <typename T, ssize_t N>
+class TensorAccessor : public internal::TensorAccessorBase<T, N> {
+ public:
+  /**
+   * Index into the the outer most dimension.
+   *
+   * @param i Index.
+   * @return If N > 1, a TensorAccessor with N-1 dimensions. If N == 1, a
+   * reference to the underlying scalar. Refer to the TensorAccessor<T, 1>
+   * specialization.
+   */
+  TensorAccessor<T, N - 1> operator[](ssize_t i) {
+    return TensorAccessor<T, N - 1>(
+        this->data_ + this->strides_[0] * i,
+        this->sizes_ + 1,
+        this->strides_ + 1,
+        N - 1);
+  }
+
+  /**
+   * Index into the the outer most dimension.
+   *
+   * @param i Index.
+   * @return If N > 1, a constant TensorAccessor with N-1 dimensions. If N == 1,
+   * a constant reference to the underlying scalar. Refer to the
+   * TensorAccessor<T, 1> specialization.
+   */
+  const TensorAccessor<T, N - 1> operator[](ssize_t i) const {
+    return TensorAccessor<T, N - 1>(
+        this->data_ + this->strides_[0] * i,
+        this->sizes_ + 1,
+        this->strides_ + 1,
+        N - 1);
+  }
+
+ private:
+  TensorAccessor(
+      T* data,
+      const executorch::aten::SizesType* sizes,
+      const executorch::aten::StridesType* strides,
+      ssize_t dim)
+      : internal::TensorAccessorBase<T, N>(data, sizes, strides, dim) {}
+
+  template <typename T2, ssize_t N2>
+  friend class TensorAccessor;
+
+  template <typename T2, ssize_t N2>
+  friend executorch::runtime::Result<TensorAccessor<T2, N2>>
+  make_tensor_accessor(const executorch::aten::Tensor& t);
+};
+
+/**
+ * TensorAccessor specialization for N == 1, where operator[] returns a
+ * reference to the underlying scalar.
+ */
+template <typename T>
+class TensorAccessor<T, 1> : public internal::TensorAccessorBase<T, 1> {
+ public:
+  /**
+   * Index into the the outer most dimension.
+   *
+   * @param i Index.
+   * @return Reference to the underlying scalar.
+   */
+  T& operator[](ssize_t i) {
+    return this->data_[this->strides_[0] * i];
+  }
+
+  /**
+   * Index into the the outer most dimension.
+   *
+   * @param i Index.
+   * @return Constant reference to the underlying scalar.
+   */
+  const T& operator[](ssize_t i) const {
+    return this->data_[this->strides_[0] * i];
+  }
+
+ private:
+  TensorAccessor(
+      T* data,
+      const executorch::aten::SizesType* sizes,
+      const executorch::aten::StridesType* strides,
+      ssize_t dim)
+      : internal::TensorAccessorBase<T, 1>(data, sizes, strides, dim) {}
+
+  template <typename T2, ssize_t N2>
+  friend class TensorAccessor;
+
+  template <typename T2, ssize_t N2>
+  friend executorch::runtime::Result<TensorAccessor<T2, N2>>
+  make_tensor_accessor(const executorch::aten::Tensor& t);
+};
+
+/**
+ * Creates a TensorAccessor<T, N> from the given tensor. The number of dimension
+ * N and the data type T's size must match those of the input tensor. For
+ * Executorch tensors, non-trivial dimension order is not supported.
+ *
+ * @param tensor Origin tensor. The TensorImpl inside must outlive the returned
+ * TensorAccessor.
+ * @return TensorAccessor of the input tensor.
+ * @retval Error::InvalidArgument Mismatch on data type or number of dimensions.
+ * @retval Error::NotSupported Input tensor has non-trivial dimension onrder.
+ */
+template <typename T, ssize_t N>
+executorch::runtime::Result<TensorAccessor<T, N>> make_tensor_accessor(
+    const executorch::aten::Tensor& tensor) {
+  static_assert(
+      N > 0,
+      "TensorAccessor is used for indexing tensors, for scalar use *_data_ptr<T>()");
+
+  if (N != tensor.dim()) {
+    ET_LOG(
+        Error,
+        "Expecting %zd dimensions but tensor has %zd.",
+        static_cast<ssize_t>(N),
+        static_cast<ssize_t>(tensor.dim()));
+    return executorch::runtime::Error::InvalidArgument;
+  }
+
+  if (sizeof(T) != tensor.element_size()) {
+    ET_LOG(
+        Error,
+        "Size of data type template argument (%zd) not equal to tensor element size (%zd)",
+        static_cast<ssize_t>(sizeof(T)),
+        static_cast<ssize_t>(tensor.element_size()));
+    return executorch::runtime::Error::InvalidArgument;
+  }
+
+#ifndef USE_ATEN_LIB
+  auto dim_order = tensor.dim_order();
+  for (ssize_t i = 0; i < dim_order.size(); i++) {
+    if (dim_order[i] != i) {
+      ET_LOG(Error, "Non-trival dim_order not supported.");
+      return executorch::runtime::Error::NotSupported;
+    }
+  }
+#endif
+
+  T* ptr = nullptr;
+  if constexpr (std::is_const_v<T>) {
+    ptr = tensor.const_data_ptr<T>();
+  } else {
+    ptr = tensor.mutable_data_ptr<T>();
+  }
+  return TensorAccessor<T, N>(
+      ptr, tensor.sizes().data(), tensor.strides().data(), N);
+}
+
+} // namespace extension
+} // namespace executorch
diff --git a/include/executorch/extension/tensor/tensor_ptr.h b/include/executorch/extension/tensor/tensor_ptr.h
new file mode 100644
index 00000000000..3259bdbaf2b
--- /dev/null
+++ b/include/executorch/extension/tensor/tensor_ptr.h
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+
+namespace executorch {
+namespace extension {
+
+/**
+ * A smart pointer type for managing the lifecycle of a Tensor.
+ */
+using TensorPtr = std::shared_ptr<executorch::aten::Tensor>;
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A pointer to the data buffer.
+ * @param dim_order A vector specifying the order of dimensions.
+ * @param strides A vector specifying the strides of the tensor.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param deleter A custom deleter function for managing the lifetime of the
+ * data buffer. If provided, this deleter will be called when the managed Tensor
+ * object is destroyed.
+ * @return A TensorPtr that manages the newly created Tensor.
+ */
+TensorPtr make_tensor_ptr(
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    std::vector<executorch::aten::DimOrderType> dim_order,
+    std::vector<executorch::aten::StridesType> strides,
+    const executorch::aten::ScalarType type =
+        executorch::aten::ScalarType::Float,
+    const executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    std::function<void(void*)> deleter = nullptr);
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A pointer to the data buffer.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param deleter A custom deleter function for managing the lifetime of the
+ * data buffer. If provided, this deleter will be called when the managed Tensor
+ * object is destroyed.
+ * @return A TensorPtr that manages the newly created Tensor.
+ */
+inline TensorPtr make_tensor_ptr(
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    const executorch::aten::ScalarType type =
+        executorch::aten::ScalarType::Float,
+    const executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    std::function<void(void*)> deleter = nullptr) {
+  return make_tensor_ptr(
+      std::move(sizes), data, {}, {}, type, dynamism, std::move(deleter));
+}
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * This template overload is specialized for cases where the tensor data is
+ * provided as a vector. The scalar type is automatically deduced from the
+ * vector's data type. If the specified `type` differs from the deduced type of
+ * the vector's elements, and casting is allowed, the data will be cast to the
+ * specified `type`. This allows for flexible creation of tensors with data
+ * vectors of one type and a different scalar type.
+ *
+ * @tparam T The C++ type of the tensor elements, deduced from the vector.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A vector containing the tensor's data.
+ * @param dim_order A vector specifying the order of dimensions.
+ * @param strides A vector specifying the strides of each dimension.
+ * @param type The scalar type of the tensor elements. If it differs from the
+ * deduced type, the data will be cast to this type if allowed.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorPtr that manages the newly created TensorImpl.
+ */
+template <
+    typename T = float,
+    executorch::aten::ScalarType deduced_type =
+        runtime::CppTypeToScalarType<T>::value>
+inline TensorPtr make_tensor_ptr(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<T> data,
+    std::vector<executorch::aten::DimOrderType> dim_order = {},
+    std::vector<executorch::aten::StridesType> strides = {},
+    executorch::aten::ScalarType type = deduced_type,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type != deduced_type) {
+    ET_CHECK_MSG(
+        runtime::canCast(deduced_type, type),
+        "Cannot cast deduced type to specified type.");
+    std::vector<uint8_t> casted_data(data.size() * runtime::elementSize(type));
+    ET_SWITCH_REALHBBF16_TYPES(type, nullptr, "make_tensor_ptr", CTYPE, [&] {
+      std::transform(
+          data.begin(),
+          data.end(),
+          reinterpret_cast<CTYPE*>(casted_data.data()),
+          [](const T& val) { return static_cast<CTYPE>(val); });
+    });
+    const auto raw_data_ptr = casted_data.data();
+    auto data_ptr =
+        std::make_shared<std::vector<uint8_t>>(std::move(casted_data));
+    return make_tensor_ptr(
+        std::move(sizes),
+        raw_data_ptr,
+        std::move(dim_order),
+        std::move(strides),
+        type,
+        dynamism,
+        [data_ptr = std::move(data_ptr)](void*) {});
+  }
+  const auto raw_data_ptr = data.data();
+  auto data_ptr = std::make_shared<std::vector<T>>(std::move(data));
+  return make_tensor_ptr(
+      std::move(sizes),
+      raw_data_ptr,
+      std::move(dim_order),
+      std::move(strides),
+      type,
+      dynamism,
+      [data_ptr = std::move(data_ptr)](void*) {});
+}
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * This template overload is specialized for cases where the tensor data is
+ * provided as a vector. The scalar type is automatically deduced from the
+ * vector's data type. If the specified `type` differs from the deduced type of
+ * the vector's elements, and casting is allowed, the data will be cast to the
+ * specified `type`. This allows for flexible creation of tensors with data
+ * vectors of one type and a different scalar type.
+ *
+ * @tparam T The C++ type of the tensor elements, deduced from the vector.
+ * @param data A vector containing the tensor's data.
+ * @param type The scalar type of the tensor elements. If it differs from the
+ * deduced type, the data will be cast to this type if allowed.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorPtr that manages the newly created TensorImpl.
+ */
+template <
+    typename T = float,
+    executorch::aten::ScalarType deduced_type =
+        runtime::CppTypeToScalarType<T>::value>
+inline TensorPtr make_tensor_ptr(
+    std::vector<T> data,
+    executorch::aten::ScalarType type = deduced_type,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  std::vector<executorch::aten::SizesType> sizes{
+      executorch::aten::SizesType(data.size())};
+  return make_tensor_ptr(
+      std::move(sizes), std::move(data), {0}, {1}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * This template overload is specialized for cases where the tensor data is
+ * provided as an initializer list. The scalar type is automatically deduced
+ * from the initializer list's data type. If the specified `type` differs from
+ * the deduced type of the initializer list's elements, and casting is allowed,
+ * the data will be cast to the specified `type`. This allows for flexible
+ * creation of tensors with data vectors of one type and a different scalar
+ * type.
+ *
+ * @tparam T The C++ type of the tensor elements, deduced from the initializer
+ * list.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param list An initializer list containing the tensor's data.
+ * @param dim_order A vector specifying the order of dimensions.
+ * @param strides A vector specifying the strides of each dimension.
+ * @param type The scalar type of the tensor elements. If it differs from the
+ * deduced type, the data will be cast to this type if allowed.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorPtr that manages the newly created TensorImpl.
+ */
+template <
+    typename T = float,
+    executorch::aten::ScalarType deduced_type =
+        runtime::CppTypeToScalarType<T>::value>
+inline TensorPtr make_tensor_ptr(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::initializer_list<T> list,
+    std::vector<executorch::aten::DimOrderType> dim_order = {},
+    std::vector<executorch::aten::StridesType> strides = {},
+    executorch::aten::ScalarType type = deduced_type,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return make_tensor_ptr(
+      std::move(sizes),
+      std::vector<T>(std::move(list)),
+      std::move(dim_order),
+      std::move(strides),
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * This template overload allows creating a Tensor from an initializer list
+ * of data. The scalar type is automatically deduced from the type of the
+ * initializer list's elements. If the specified `type` differs from
+ * the deduced type of the initializer list's elements, and casting is allowed,
+ * the data will be cast to the specified `type`. This allows for flexible
+ * creation of tensors with data vectors of one type and a different scalar
+ * type.
+ *
+ * @tparam T The C++ type of the tensor elements, deduced from the initializer
+ * list.
+ * @param list An initializer list containing the tensor's data.
+ * @param type The scalar type of the tensor elements. If it differs from the
+ * deduced type, the data will be cast to this type if allowed.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorPtr that manages the newly created TensorImpl.
+ */
+template <
+    typename T = float,
+    executorch::aten::ScalarType deduced_type =
+        runtime::CppTypeToScalarType<T>::value>
+inline TensorPtr make_tensor_ptr(
+    std::initializer_list<T> list,
+    executorch::aten::ScalarType type = deduced_type,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  std::vector<executorch::aten::SizesType> sizes{
+      executorch::aten::SizesType(list.size())};
+  return make_tensor_ptr(
+      std::move(sizes), std::move(list), {0}, {1}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr that manages a Tensor with a single scalar value.
+ *
+ * @tparam T The C++ type of the scalar value.
+ * @param value The scalar value to be used for the Tensor.
+ * @return A TensorPtr that manages the newly created TensorImpl.
+ */
+template <typename T>
+inline TensorPtr make_tensor_ptr(T value) {
+  return make_tensor_ptr({}, std::vector<T>{value});
+}
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * This overload accepts a raw memory buffer stored in a std::vector<uint8_t>
+ * and a scalar type to interpret the data. The vector is managed, and the
+ * memory's lifetime is tied to the TensorImpl.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A vector containing the raw memory for the tensor's data.
+ * @param dim_order A vector specifying the order of dimensions.
+ * @param strides A vector specifying the strides of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorPtr managing the newly created Tensor.
+ */
+TensorPtr make_tensor_ptr(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<uint8_t> data,
+    std::vector<executorch::aten::DimOrderType> dim_order,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr that manages a Tensor with the specified properties.
+ *
+ * This overload accepts a raw memory buffer stored in a std::vector<uint8_t>
+ * and a scalar type to interpret the data. The vector is managed, and the
+ * memory's lifetime is tied to the TensorImpl.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A vector containing the raw memory for the tensor's data.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorPtr managing the newly created Tensor.
+ */
+inline TensorPtr make_tensor_ptr(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<uint8_t> data,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return make_tensor_ptr(
+      std::move(sizes), std::move(data), {}, {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr to manage a new Tensor with the same properties
+ * as the given Tensor, sharing the same data without owning it.
+ *
+ * @param tensor The Tensor whose properties are used to create a new TensorPtr.
+ * @return A new TensorPtr managing a Tensor with the same properties as the
+ * original.
+ */
+inline TensorPtr make_tensor_ptr(const executorch::aten::Tensor& tensor) {
+  return make_tensor_ptr(
+      std::vector<executorch::aten::SizesType>(
+          tensor.sizes().begin(), tensor.sizes().end()),
+      tensor.mutable_data_ptr(),
+#ifndef USE_ATEN_LIB
+      std::vector<executorch::aten::DimOrderType>(
+          tensor.dim_order().begin(), tensor.dim_order().end()),
+      std::vector<executorch::aten::StridesType>(
+          tensor.strides().begin(), tensor.strides().end()),
+      tensor.scalar_type(),
+      tensor.shape_dynamism()
+#else // USE_ATEN_LIB
+      {},
+      std::vector<executorch::aten::StridesType>(
+          tensor.strides().begin(), tensor.strides().end()),
+      tensor.scalar_type()
+#endif // USE_ATEN_LIB
+  );
+}
+
+/**
+ * Creates a TensorPtr that manages a new Tensor with the same properties
+ * as the given Tensor, but with a copy of the data owned by the returned
+ * TensorPtr, or nullptr if the original data is null.
+ *
+ * @param tensor The Tensor to clone.
+ * @return A new TensorPtr that manages a Tensor with the same properties as the
+ * original but with copied data.
+ */
+TensorPtr clone_tensor_ptr(const executorch::aten::Tensor& tensor);
+
+/**
+ * Creates a new TensorPtr by cloning the given TensorPtr, copying the
+ * underlying data.
+ *
+ * @param tensor The TensorPtr to clone.
+ * @return A new TensorPtr that manages a Tensor with the same properties as the
+ * original but with copied data.
+ */
+inline TensorPtr clone_tensor_ptr(const TensorPtr& tensor) {
+  return clone_tensor_ptr(*tensor);
+}
+
+/**
+ * Resizes the Tensor managed by the provided TensorPtr to the new sizes.
+ *
+ * @param tensor A TensorPtr managing the Tensor to resize.
+ * @param sizes A vector representing the new sizes for each dimension.
+ * @return Error::Ok on success, or an appropriate error code on failure.
+ */
+ET_NODISCARD
+runtime::Error resize_tensor_ptr(
+    TensorPtr& tensor,
+    const std::vector<executorch::aten::SizesType>& sizes);
+
+} // namespace extension
+} // namespace executorch
diff --git a/include/executorch/extension/tensor/tensor_ptr_maker.h b/include/executorch/extension/tensor/tensor_ptr_maker.h
new file mode 100644
index 00000000000..eb3745d34e2
--- /dev/null
+++ b/include/executorch/extension/tensor/tensor_ptr_maker.h
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/extension/tensor/tensor_ptr.h>
+
+namespace executorch {
+namespace extension {
+
+/**
+ * A helper class for creating TensorPtr instances from raw data and tensor
+ * properties. Note that the TensorPtr created by this class does not own the
+ * data, so the data must outlive the TensorPtr.
+ *
+ * TensorPtrMaker provides a fluent interface for specifying various tensor
+ * properties, such as type, sizes, data pointer, dimension order, strides, and
+ * shape dynamism. The final tensor is created by invoking make_tensor_ptr() or
+ * by converting TensorPtrMaker to TensorPtr.
+ */
+class TensorPtrMaker final {
+ public:
+  // This class may have non-copyable members in the future.
+  TensorPtrMaker(const TensorPtrMaker&) = delete;
+  TensorPtrMaker& operator=(const TensorPtrMaker&) = delete;
+  // But it is movable.
+  TensorPtrMaker(TensorPtrMaker&&) = default;
+  TensorPtrMaker& operator=(TensorPtrMaker&&) = default;
+
+  /**
+   * Sets the scalar type of the tensor elements.
+   *
+   * @param type The scalar type (e.g., float, int, bool).
+   * @return Rvalue to this TensorPtrMaker for method chaining.
+   */
+  TensorPtrMaker&& type(executorch::aten::ScalarType type) {
+    type_ = type;
+    return std::move(*this);
+  }
+
+  /**
+   * Sets the order of dimensions in memory.
+   *
+   * @param dim_order A vector specifying the dimension order.
+   * @return Rvalue to this TensorPtrMaker for method chaining.
+   */
+  TensorPtrMaker&& dim_order(
+      std::vector<executorch::aten::DimOrderType> dim_order) {
+    dim_order_ = std::move(dim_order);
+    return std::move(*this);
+  }
+
+  /**
+   * Sets the strides for each dimension of the tensor.
+   *
+   * @param strides A vector specifying the stride for each dimension.
+   * @return Rvalue to this TensorPtrMaker for method chaining.
+   */
+  TensorPtrMaker&& strides(std::vector<executorch::aten::StridesType> strides) {
+    strides_ = std::move(strides);
+    return std::move(*this);
+  }
+
+  /**
+   * Sets the shape dynamism of the tensor.
+   *
+   * @param dynamism Specifies whether the tensor's shape is static, dynamic, or
+   * bounded.
+   * @return Rvalue to this TensorPtrMaker for method chaining.
+   */
+  TensorPtrMaker&& dynamism(executorch::aten::TensorShapeDynamism dynamism) {
+    dynamism_ = dynamism;
+    return std::move(*this);
+  }
+
+  /**
+   * Sets a custom deleter function to manage the lifetime of the data buffer.
+   *
+   * @param deleter A function that will be called to delete the data buffer
+   * when the Tensor object managed by the TensorPtr is destroyed. Explicitly
+   * consuming an rvalue to avoid unnecessary copies when the deleter is a
+   * lambda that has captured some state.
+   * @return Rvalue to this TensorPtrMaker for method chaining.
+   */
+  TensorPtrMaker&& deleter(std::function<void(void*)>&& deleter) {
+    deleter_ = std::move(deleter);
+    return std::move(*this);
+  }
+
+  /**
+   * Creates and returns a TensorPtr instance using the properties set in this
+   * TensorPtrMaker.
+   *
+   * @return A TensorPtr instance that manages the newly created Tensor.
+   */
+  TensorPtr make_tensor_ptr() && {
+    return ::executorch::extension::make_tensor_ptr(
+        std::move(sizes_),
+        data_,
+        std::move(dim_order_),
+        std::move(strides_),
+        type_,
+        dynamism_,
+        std::move(deleter_));
+  }
+
+  /**
+   * Implicit conversion operator to create a TensorPtr.
+   *
+   * @return A TensorPtr instance that manages the newly created Tensor.
+   */
+  operator TensorPtr() && {
+    return std::move(*this).make_tensor_ptr();
+  }
+
+ private:
+  TensorPtrMaker(
+      void* data,
+      std::vector<executorch::aten::SizesType> sizes,
+      executorch::aten::ScalarType type)
+      : sizes_(std::move(sizes)), data_(data), type_(type) {}
+
+ private:
+  // The following properties are required to create a Tensor.
+  friend TensorPtrMaker for_blob(
+      void* data,
+      std::vector<executorch::aten::SizesType> sizes,
+      executorch::aten::ScalarType type);
+
+ private:
+  std::vector<executorch::aten::SizesType> sizes_;
+  std::vector<executorch::aten::StridesType> strides_;
+  std::vector<executorch::aten::DimOrderType> dim_order_;
+  std::function<void(void*)> deleter_ = nullptr;
+  void* data_ = nullptr;
+  executorch::aten::ScalarType type_ = executorch::aten::ScalarType::Float;
+  executorch::aten::TensorShapeDynamism dynamism_ =
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND;
+};
+
+/**
+ * Creates a TensorPtrMaker instance for building a TensorPtr from a raw data
+ * pointer and tensor sizes.
+ *
+ * The TensorPtrMaker returned by this function allows for further customization
+ * of the tensor's properties, such as data type, dimension order, strides, and
+ * shape dynamism, before finalizing the TensorPtr creation.
+ *
+ * @param data A pointer to the raw data to be used by the tensor. It must
+ * outlive the TensorPtr created by this function.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @return A TensorPtrMaker instance for creating a TensorPtr.
+ */
+inline TensorPtrMaker for_blob(
+    void* data,
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float) {
+  return TensorPtrMaker(data, std::move(sizes), type);
+}
+
+/**
+ * Creates a TensorPtr from a raw data pointer and tensor sizes, with an
+ * optional dynamism setting.
+ *
+ * This function provides a convenient way to create a tensor from existing
+ * data, with the option to specify whether the tensor's shape is static or
+ * dynamic.
+ *
+ * @param data A pointer to the raw data used by the tensor. The data must
+ * outlive the TensorPtr created by this function.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr from_blob(
+    void* data,
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return for_blob(data, std::move(sizes), type)
+      .dynamism(dynamism)
+      .make_tensor_ptr();
+}
+
+/**
+ * Creates a TensorPtr from a raw data pointer, tensor sizes, and strides, with
+ * an optional dynamism setting.
+ *
+ * This function allows for the creation of a tensor from existing data, with
+ * the option to specify custom strides for each dimension and whether the
+ * tensor’s shape is static, dynamic, or bounded.
+ *
+ * @param data A pointer to the raw data used by the tensor. The data must
+ * outlive the TensorPtr created by this function.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static, dynamic, or
+ * bounded.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr from_blob(
+    void* data,
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return for_blob(data, std::move(sizes), type)
+      .strides(std::move(strides))
+      .dynamism(dynamism)
+      .make_tensor_ptr();
+}
+
+/**
+ * Creates a TensorPtr from a raw data pointer and tensor sizes, with an
+ * optional dynamism setting.
+ *
+ * This function is a convenient way to create a tensor from existing data, with
+ * the option to specify whether the tensor's shape is static, dynamic, or
+ * bounded.
+ *
+ * @param data A pointer to the raw data to be used by the tensor. It must
+ * outlive the TensorPtr created by this function.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param deleter A function to delete the data when it's no longer needed.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance that manages the newly created Tensor.
+ */
+inline TensorPtr from_blob(
+    void* data,
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type,
+    std::function<void(void*)>&& deleter,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return for_blob(data, std::move(sizes), type)
+      .deleter(std::move(deleter))
+      .dynamism(dynamism)
+      .make_tensor_ptr();
+}
+
+/**
+ * Creates a TensorPtr from a raw data pointer, tensor sizes, and strides, with
+ * an optional dynamism setting.
+ *
+ * This function allows for the creation of a tensor from existing data, with
+ * the option to specify custom strides for each dimension and whether the
+ * tensor's shape is static, dynamic, or bounded.
+ *
+ * @param data A pointer to the raw data to be used by the tensor. It must
+ * outlive the TensorPtr created by this function.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param deleter A function to delete the data when it's no longer needed.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance that manages the newly created Tensor.
+ */
+inline TensorPtr from_blob(
+    void* data,
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type,
+    std::function<void(void*)>&& deleter,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return for_blob(data, std::move(sizes), type)
+      .strides(std::move(strides))
+      .deleter(std::move(deleter))
+      .dynamism(dynamism)
+      .make_tensor_ptr();
+}
+
+/**
+ * Creates a TensorPtr with the specified sizes, strides, and properties.
+ *
+ * This function allocates memory for the tensor elements but does not
+ * initialize them with any specific values. The tensor is created with the
+ * specified strides.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr empty_strided(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates an empty TensorPtr with the same size and properties as the given
+ * tensor.
+ *
+ * This function allocates memory for the tensor elements but does not
+ * initialize them with any specific values.
+ *
+ * @param other A reference to another tensor, whose size and properties are
+ * used.
+ * @param type The scalar type of the tensor elements. If not provided, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr empty_like(
+    const TensorPtr& other,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Undefined,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == executorch::aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return empty_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates an empty TensorPtr with the specified sizes and properties.
+ *
+ * This function allocates memory for the tensor elements but does not
+ * initialize them with any specific values.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr empty(
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return empty_strided(std::move(sizes), {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with the specified value.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param fill_value The value to fill the tensor with.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr full_strided(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::Scalar fill_value,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with the specified value, with the same size and
+ * properties as another tensor.
+ *
+ * @param other A reference to another tensor, whose size and properties will be
+ * used.
+ * @param fill_value The value to fill the tensor with.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr full_like(
+    const TensorPtr& other,
+    executorch::aten::Scalar fill_value,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Undefined,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == executorch::aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return full_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      fill_value,
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with the specified value.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param fill_value The value used to fill the tensor.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr full(
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::Scalar fill_value,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_strided(std::move(sizes), {}, fill_value, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr holding a scalar value.
+ *
+ * @param value The scalar value for the tensor.
+ * @param type The scalar type of the tensor elements.
+ * @return A TensorPtr instance managing the newly created scalar Tensor.
+ */
+inline TensorPtr scalar_tensor(
+    executorch::aten::Scalar value,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float) {
+  return full({}, value, type);
+}
+
+/**
+ * Creates a TensorPtr filled with ones, with the same size and properties as
+ * another tensor.
+ *
+ * @param other A reference to another tensor, whose size and properties are
+ * used.
+ * @param type The scalar type of the tensor elements. If not provided, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr ones_like(
+    const TensorPtr& other,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Undefined,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_like(other, 1, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with ones.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr ones(
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full(std::move(sizes), 1, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with zeros, with the same size and properties as
+ * another tensor.
+ *
+ * @param other A reference to another tensor, whose size and properties will be
+ * used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the `other` tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr zeros_like(
+    const TensorPtr& other,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Undefined,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_like(other, 0, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with zeros.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr zeros(
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full(std::move(sizes), 0, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ **/
+TensorPtr rand_strided(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param other A reference to another tensor, whose size and properties will be
+ * used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr rand_like(
+    const TensorPtr& other,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Undefined,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == executorch::aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return rand_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr rand(
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return rand_strided(std::move(sizes), {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr randn_strided(
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param other A reference to another tensor, whose size and properties will be
+ * used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randn_like(
+    const TensorPtr& other,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Undefined,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == executorch::aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return randn_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randn(
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return randn_strided(std::move(sizes), {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr randint_strided(
+    int64_t low,
+    int64_t high,
+    std::vector<executorch::aten::SizesType> sizes,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Int,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param other A reference to another tensor, whose size and properties will be
+ * used.
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randint_like(
+    const TensorPtr& other,
+    int64_t low,
+    int64_t high,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Undefined,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == executorch::aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return randint_strided(
+      low,
+      high,
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random integer values within the specified
+ * range.
+ *
+ * @param low The inclusive lower bound of the random values.
+ * @param high The exclusive upper bound of the random values.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randint(
+    int64_t low,
+    int64_t high,
+    std::vector<executorch::aten::SizesType> sizes,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Int,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return randint_strided(low, high, std::move(sizes), {}, type, dynamism);
+}
+
+} // namespace extension
+} // namespace executorch
diff --git a/include/executorch/extension/threadpool/cpuinfo_utils.h b/include/executorch/extension/threadpool/cpuinfo_utils.h
new file mode 100644
index 00000000000..d559738b728
--- /dev/null
+++ b/include/executorch/extension/threadpool/cpuinfo_utils.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cpuinfo.h>
+
+namespace executorch::extension::cpuinfo {
+
+uint32_t get_num_performant_cores();
+
+} // namespace executorch::extension::cpuinfo
+
+namespace torch::executorch::cpuinfo { // DEPRECATED
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces. Note that threadpool incorrectly used
+// the namespace `torch::executorch` instead of `torch::executor`.
+using ::executorch::extension::cpuinfo::get_num_performant_cores; // DEPRECATED
+} // namespace torch::executorch::cpuinfo
diff --git a/include/executorch/extension/threadpool/threadpool.h b/include/executorch/extension/threadpool/threadpool.h
new file mode 100644
index 00000000000..15133befef6
--- /dev/null
+++ b/include/executorch/extension/threadpool/threadpool.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <mutex>
+
+#include <pthreadpool.h>
+
+namespace executorch::extension::threadpool {
+
+class ThreadPool final {
+ public:
+  explicit ThreadPool(size_t thread_count = 0);
+  ~ThreadPool() = default;
+
+  // Make threadpool non copyable
+  // Non-copyable: threadpool cannot be copied because it will
+  // effectively require cloning of threadpool.
+  // Cloning can be done by just calling create_thread_pool.
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  // Make threadpool non-movable.
+  ThreadPool(ThreadPool&&) = delete;
+  ThreadPool& operator=(ThreadPool&&) = delete;
+
+  size_t get_thread_count() const;
+
+  /**
+   * INTERNAL: Resets the threadpool by creating a new threadpool with requested
+   * # of threads. This is not a thread safe call. When calling this method,
+   * threads of the threadpool might be doing some work. Some other code may
+   * also be holding on to the threadpool pointer, that is no longer valid. This
+   * is a private API, which will later be replaced by something that allows
+   * creating of threadpool with requested size and use such a threadpool with
+   * backend delegates, custom ops or optimized lib.
+   */
+  [[deprecated("This API is experimental and may change without notice.")]]
+  bool _unsafe_reset_threadpool(uint32_t num_threads);
+
+  /**
+   * Run, in parallel, function fn(task_id) over task_id in range [0, range).
+   * This function is blocking.  All input is processed by the time it returns.
+   * NoThreadPoolGuard (see threadpool_guard.h) can used to disable use of
+   * multiple threads with the scope of the guard When NoThreadPoolGuard is not
+   * used all calls to run method are serialized.
+   */
+  void run(const std::function<void(size_t)>& fn, size_t range);
+
+ private:
+  friend pthreadpool_t get_pthreadpool();
+
+ private:
+  // This mutex is used inside get_thread_count API but it is not really needed
+  // since data members of ThreadPool objects are not really mutable.
+  // TODO(kimishpatel): Figure out if we will allow set_num_threads API, in
+  // which case this mutex will be useful. Otherwise remove it.
+  mutable std::mutex mutex_;
+  std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool_;
+};
+
+/**
+ * Returns the singleton instance of ThreadPool for ATen/TH multithreading.
+ */
+ThreadPool* get_threadpool();
+
+/**
+ * Returns the underlying pthreadpool instance used by the implementation of
+ * ThreadPool returned by `get_threadpool()`. Only for use in external libraries
+ * so as to unify threading across internal (i.e. ATen, etc.) and external (e.g.
+ * NNPACK, QNNPACK, XNNPACK) use cases.
+ */
+pthreadpool_t get_pthreadpool();
+
+} // namespace executorch::extension::threadpool
+
+namespace torch::executorch::threadpool { // DEPRECATED
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces. Note that threadpool incorrectly used
+// the namespace `torch::executorch` instead of `torch::executor`.
+using ::executorch::extension::threadpool::get_pthreadpool; // DEPRECATED
+using ::executorch::extension::threadpool::get_threadpool; // DEPRECATED
+using ::executorch::extension::threadpool::ThreadPool; // DEPRECATED
+} // namespace torch::executorch::threadpool
diff --git a/include/executorch/extension/threadpool/threadpool_guard.h b/include/executorch/extension/threadpool/threadpool_guard.h
new file mode 100644
index 00000000000..9fe1d8af737
--- /dev/null
+++ b/include/executorch/extension/threadpool/threadpool_guard.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+namespace executorch::extension::threadpool {
+
+// A RAII, thread local (!) guard that enables or disables guard upon
+// construction, and sets it back to the original value upon destruction.
+struct NoThreadPoolGuard {
+  static bool is_enabled();
+  static void set_enabled(bool enabled);
+
+  NoThreadPoolGuard() : prev_mode_(NoThreadPoolGuard::is_enabled()) {
+    NoThreadPoolGuard::set_enabled(true);
+  }
+  ~NoThreadPoolGuard() {
+    NoThreadPoolGuard::set_enabled(prev_mode_);
+  }
+
+ private:
+  const bool prev_mode_;
+};
+
+} // namespace executorch::extension::threadpool
+
+namespace torch::executorch::threadpool { // DEPRECATED
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces. Note that threadpool incorrectly used
+// the namespace `torch::executorch` instead of `torch::executor`.
+using ::executorch::extension::threadpool::NoThreadPoolGuard; // DEPRECATED
+} // namespace torch::executorch::threadpool
diff --git a/include/executorch/runtime/core/array_ref.h b/include/executorch/runtime/core/array_ref.h
new file mode 100644
index 00000000000..a23509e8698
--- /dev/null
+++ b/include/executorch/runtime/core/array_ref.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// removed llvm-specific functionality
+// removed some implicit const -> non-const conversions that rely on
+// complicated std::enable_if meta-programming
+// removed a bunch of slice variants for simplicity...
+// remove constructors and operators for std::vector
+// removed some prevention of accidental assignments from temporary that
+// required std::enable_if meta-programming
+// removed reverse iterator
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+
+#include <c10/util/irange.h>
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Represents a constant reference to an array (0 or more elements
+ * consecutively in memory), i.e. a start pointer and a length.  It allows
+ * various APIs to take consecutive elements easily and conveniently.
+ *
+ * This class does not own the underlying data, it is expected to be used in
+ * situations where the data resides in some other buffer, whose lifetime
+ * extends past that of the ArrayRef. For this reason, it is not in general
+ * safe to store an ArrayRef.
+ *
+ * Span and ArrayRef are extrememly similar with the difference being ArrayRef
+ * views a list of constant elements and Span views a list of mutable elements.
+ * Clients should decide between the two based on if the list elements for their
+ * use case should be mutable.
+ *
+ * This is intended to be trivially copyable, so it should be passed by
+ * value.
+ */
+template <typename T>
+class ArrayRef final {
+ public:
+  using iterator = const T*;
+  using const_iterator = const T*;
+  using size_type = size_t;
+  using value_type = T;
+
+ private:
+  /// The start of the array, in an external buffer.
+  const T* Data;
+
+  /// The number of elements.
+  size_type Length;
+
+ public:
+  /// @name Constructors
+  /// @{
+
+  /// Construct an empty ArrayRef.
+  /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {}
+
+  /// Construct a ArrayRef from a single element. Implicitly convert element
+  /// type. It is aligned with PyTorch's c10::ArrayRef.
+  /* implicit */ constexpr ArrayRef(const T& OneElt)
+      : Data(&OneElt), Length(1) {}
+
+  /// Construct a ArrayRef from a pointer and length.
+  ArrayRef(const T* data, size_t length) : Data(data), Length(length) {
+    ET_DCHECK(Data != nullptr || Length == 0);
+  }
+
+  /// Construct a ArrayRef from a range.
+  ArrayRef(const T* begin, const T* end) : Data(begin), Length(end - begin) {}
+
+  /// Construct an ArrayRef from a std::array
+  template <size_t N>
+  /* implicit */ constexpr ArrayRef(const std::array<T, N>& Arr)
+      : Data(Arr.data()), Length(N) {}
+
+  /// Construct a ArrayRef from a C array.
+  template <size_t N>
+  /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {}
+
+  /// @}
+  /// @name Simple Operations
+  /// @{
+
+  constexpr iterator begin() const {
+    return Data;
+  }
+  constexpr iterator end() const {
+    return Data + Length;
+  }
+
+  // These are actually the same as iterator, since ArrayRef only
+  // gives you const iterators.
+  constexpr const_iterator cbegin() const {
+    return Data;
+  }
+  constexpr const_iterator cend() const {
+    return Data + Length;
+  }
+
+  /// empty - Check if the array is empty.
+  constexpr bool empty() const {
+    return Length == 0;
+  }
+
+  constexpr const T* data() const {
+    return Data;
+  }
+
+  /// size - Get the array size.
+  constexpr size_t size() const {
+    return Length;
+  }
+
+  /// front - Get the first element.
+  const T& front() const {
+    // ArrayRef: attempted to access front() of empty list
+    ET_CHECK(!empty());
+    return Data[0];
+  }
+
+  /// back - Get the last element.
+  const T& back() const {
+    // ArrayRef: attempted to access back() of empty list
+    ET_CHECK(!empty());
+    return Data[Length - 1];
+  }
+
+  /// equals - Check for element-wise equality.
+  bool equals(ArrayRef RHS) const {
+    if (Length != RHS.Length) {
+      return false;
+    }
+    for (const auto i : c10::irange(this->Length)) {
+      if (Data[i] != RHS.Data[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// slice(n, m) - Take M elements of the array starting at element N
+  ArrayRef<T> slice(size_t N, size_t M) const {
+    // cant slice longer then the array
+    ET_CHECK(N + M <= size());
+    return ArrayRef<T>(data() + N, M);
+  }
+
+  /// slice(n) - Chop off the first N elements of the array.
+  constexpr ArrayRef<T> slice(size_t N) const {
+    return slice(N, size() - N);
+  }
+
+  /// @}
+  /// @name Operator Overloads
+  /// @{
+  constexpr const T& operator[](size_t Index) const {
+    return Data[Index];
+  }
+
+  /// Vector compatibility
+  const T& at(size_t Index) const {
+    // invalid index
+    ET_CHECK(Index < Length);
+    return Data[Index];
+  }
+
+  /// @}
+};
+
+/// @name ArrayRef Convenience constructors
+/// @{
+
+/// Construct an ArrayRef from a single element.
+template <typename T>
+ArrayRef<T> makeArrayRef(const T& OneElt) {
+  return OneElt;
+}
+
+/// Construct an ArrayRef from a pointer and length.
+template <typename T>
+ArrayRef<T> makeArrayRef(const T* data, size_t length) {
+  return ArrayRef<T>(data, length);
+}
+
+/// Construct an ArrayRef from a range.
+template <typename T>
+ArrayRef<T> makeArrayRef(const T* begin, const T* end) {
+  return ArrayRef<T>(begin, end);
+}
+
+/// Construct an ArrayRef from a std::array.
+template <typename T, std::size_t N>
+ArrayRef<T> makeArrayRef(const std::array<T, N>& Arr) {
+  return Arr;
+}
+
+/// Construct an ArrayRef from an ArrayRef (no-op) (const)
+template <typename T>
+ArrayRef<T> makeArrayRef(const ArrayRef<T>& Vec) {
+  return Vec;
+}
+
+/// Construct an ArrayRef from an ArrayRef (no-op)
+template <typename T>
+ArrayRef<T>& makeArrayRef(ArrayRef<T>& Vec) {
+  return Vec;
+}
+
+/// Construct an ArrayRef from a C array.
+template <typename T, size_t N>
+ArrayRef<T> makeArrayRef(const T (&Arr)[N]) {
+  return ArrayRef<T>(Arr);
+}
+
+// WARNING: Template instantiation will NOT be willing to do an implicit
+// conversions to get you to an ArrayRef, which is why we need so
+// many overloads.
+
+template <typename T>
+bool operator==(ArrayRef<T> a1, ArrayRef<T> a2) {
+  return a1.equals(a2);
+}
+
+template <typename T>
+bool operator!=(ArrayRef<T> a1, ArrayRef<T> a2) {
+  return !a1.equals(a2);
+}
+
+using IntArrayRef = ArrayRef<int64_t>;
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::ArrayRef;
+using ::executorch::runtime::IntArrayRef;
+using ::executorch::runtime::makeArrayRef;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/data_loader.h b/include/executorch/runtime/core/data_loader.h
new file mode 100644
index 00000000000..3dda5516908
--- /dev/null
+++ b/include/executorch/runtime/core/data_loader.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+#include <executorch/runtime/core/freeable_buffer.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/compiler.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Loads from a data source.
+ *
+ * See //executorch/extension/data_loader for common implementations.
+ */
+class DataLoader {
+ public:
+  /**
+   * Describes the content of the segment.
+   */
+  struct SegmentInfo {
+    /**
+     * Represents the purpose of the segment.
+     */
+    enum class Type {
+      /**
+       * Data for the actual program.
+       */
+      Program,
+      /**
+       * Holds constant tensor data.
+       */
+      Constant,
+      /**
+       * Data used for initializing a backend.
+       */
+      Backend,
+      /**
+       * Data used for initializing mutable tensors.
+       */
+      Mutable,
+      /**
+       * Data used for initializing external tensors.
+       */
+      External,
+    };
+
+    /// Type of the segment.
+    Type segment_type;
+
+    /// Index of the segment within the segment list. Undefined for program
+    /// segments.
+    size_t segment_index;
+
+    /// An optional, null-terminated string describing the segment. For
+    /// `Backend` segments, this is the backend ID. Null for other segment
+    /// types.
+    const char* descriptor;
+
+    SegmentInfo() = default;
+
+    explicit SegmentInfo(
+        Type segment_type_,
+        size_t segment_index_ = 0,
+        const char* descriptor_ = nullptr)
+        : segment_type(segment_type_),
+          segment_index(segment_index_),
+          descriptor(descriptor_) {}
+  };
+
+  virtual ~DataLoader() = default;
+
+  /**
+   * Loads data from the underlying data source.
+   *
+   * NOTE: This must be thread-safe. If this call modifies common state, the
+   * implementation must do its own locking.
+   *
+   * @param offset The byte offset in the data source to start loading from.
+   * @param size The number of bytes to load.
+   * @param segment_info Information about the segment being loaded.
+   *
+   * @returns a `FreeableBuffer` that owns the loaded data.
+   */
+  ET_NODISCARD virtual Result<FreeableBuffer>
+  load(size_t offset, size_t size, const SegmentInfo& segment_info) const = 0;
+
+  /**
+   * Loads data from the underlying data source into the provided buffer.
+   *
+   * NOTE: This must be thread-safe. If this call modifies common state, the
+   * implementation must do its own locking.
+   *
+   * @param offset The byte offset in the data source to start loading from.
+   * @param size The number of bytes to load.
+   * @param segment_info Information about the segment being loaded.
+   * @param buffer The buffer to load data into. Must point to at least `size`
+   * bytes of memory.
+   *
+   * @returns an Error indicating if the load was successful.
+   */
+  ET_NODISCARD virtual Error load_into(
+      size_t offset,
+      size_t size,
+      const SegmentInfo& segment_info,
+      void* buffer) const {
+    // Using a stub implementation here instead of pure virtual to expand the
+    // data_loader interface in a backwards compatible way.
+    (void)buffer;
+    (void)offset;
+    (void)size;
+    (void)segment_info;
+    ET_LOG(Error, "load_into() not implemented for this data loader.");
+    return Error::NotImplemented;
+  }
+
+  /**
+   * Returns the length of the underlying data source, typically the file size.
+   */
+  ET_NODISCARD virtual Result<size_t> size() const = 0;
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::DataLoader;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/defines.h b/include/executorch/runtime/core/defines.h
new file mode 100644
index 00000000000..ee471268264
--- /dev/null
+++ b/include/executorch/runtime/core/defines.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Contains preprocessor definitions used by ExecuTorch core.
+ */
+
+#pragma once
+
+// Enable ET_ENABLE_ENUM_STRINGS by default. This option gates inclusion of
+// enum string names and can be disabled by explicitly setting it to 0.
+#ifndef ET_ENABLE_ENUM_STRINGS
+#define ET_ENABLE_ENUM_STRINGS 1
+#endif
diff --git a/include/executorch/runtime/core/error.h b/include/executorch/runtime/core/error.h
new file mode 100644
index 00000000000..73e343a5c45
--- /dev/null
+++ b/include/executorch/runtime/core/error.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * ExecuTorch Error declarations.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch {
+namespace runtime {
+
+// Alias error code integral type to minimal platform width (32-bits for now).
+typedef uint32_t error_code_t;
+
+/**
+ * ExecuTorch Error type.
+ */
+enum class Error : error_code_t {
+  /*
+   * System errors.
+   */
+
+  /// Status indicating a successful operation.
+  Ok = 0x00,
+
+  /// An internal error occurred.
+  Internal = 0x01,
+
+  /// Status indicating the executor is in an invalid state for a target
+  /// operation
+  InvalidState = 0x2,
+
+  /// Status indicating there are no more steps of execution to run
+  EndOfMethod = 0x03,
+
+  /*
+   * Logical errors.
+   */
+
+  /// Operation is not supported in the current context.
+  NotSupported = 0x10,
+
+  /// Operation is not yet implemented.
+  NotImplemented = 0x11,
+
+  /// User provided an invalid argument.
+  InvalidArgument = 0x12,
+
+  /// Object is an invalid type for the operation.
+  InvalidType = 0x13,
+
+  /// Operator(s) missing in the operator registry.
+  OperatorMissing = 0x14,
+
+  /*
+   * Resource errors.
+   */
+
+  /// Requested resource could not be found.
+  NotFound = 0x20,
+
+  /// Could not allocate the requested memory.
+  MemoryAllocationFailed = 0x21,
+
+  /// Could not access a resource.
+  AccessFailed = 0x22,
+
+  /// Error caused by the contents of a program.
+  InvalidProgram = 0x23,
+
+  /// Error caused by the contents of external data.
+  InvalidExternalData = 0x24,
+
+  /// Does not have enough resources to perform the requested operation.
+  OutOfResources = 0x25,
+
+  /*
+   * Delegate errors.
+   */
+
+  /// Init stage: Backend receives an incompatible delegate version.
+  DelegateInvalidCompatibility = 0x30,
+  /// Init stage: Backend fails to allocate memory.
+  DelegateMemoryAllocationFailed = 0x31,
+  /// Execute stage: The handle is invalid.
+  DelegateInvalidHandle = 0x32,
+
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::Error;
+using ::executorch::runtime::error_code_t;
+} // namespace executor
+} // namespace torch
+
+/**
+ * If cond__ is false, log the specified message and return the specified Error
+ * from the current function, which must be of return type
+ * executorch::runtime::Error.
+ *
+ * @param[in] cond__ The condition to be checked, asserted as true.
+ * @param[in] error__ Error enum value to return without the `Error::` prefix,
+ * like `InvalidArgument`.
+ * @param[in] message__ Format string for the log error message.
+ * @param[in] ... Optional additional arguments for the format string.
+ */
+#define ET_CHECK_OR_RETURN_ERROR(cond__, error__, message__, ...) \
+  {                                                               \
+    if (!(cond__)) {                                              \
+      ET_LOG(Error, message__, ##__VA_ARGS__);                    \
+      return ::executorch::runtime::Error::error__;               \
+    }                                                             \
+  }
+
+/**
+ * A convenience macro to be used in utility functions that check whether input
+ * tensor(s) are valid, which are expected to return a boolean. Checks whether
+ * `cond` is true; if not, log the failed check with `message` and return false.
+ *
+ * @param[in] cond the condition to check
+ * @param[in] message an additional message to log with `cond`
+ */
+#define ET_CHECK_OR_RETURN_FALSE(cond__, message__, ...)                      \
+  {                                                                           \
+    if (!(cond__)) {                                                          \
+      ET_LOG(Error, "Check failed (%s): " message__, #cond__, ##__VA_ARGS__); \
+      return false;                                                           \
+    }                                                                         \
+  }
+
+/**
+ * If error__ is not Error::Ok, optionally log a message and return the error
+ * from the current function, which must be of return type
+ * executorch::runtime::Error.
+ *
+ * @param[in] error__ Error enum value asserted to be Error::Ok.
+ * @param[in] ... Optional format string for the log error message and its
+ * arguments.
+ */
+#define ET_CHECK_OK_OR_RETURN_ERROR(error__, ...) \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR(error__, ##__VA_ARGS__)
+
+// Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR(...) \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_SELECT(    \
+      __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) \
+  (__VA_ARGS__)
+
+/**
+ * Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
+ * This macro selects the correct version of
+ * ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR based on the number of arguments passed.
+ * It uses a trick with the preprocessor to count the number of arguments and
+ * then selects the appropriate macro.
+ *
+ * The macro expansion uses __VA_ARGS__ to accept any number of arguments and
+ * then appends them to ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_, followed by the
+ * count of arguments. The count is determined by the macro
+ * ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_SELECT which takes the arguments and
+ * passes them along with a sequence of numbers (2, 1). The preprocessor then
+ * matches this sequence to the correct number of arguments provided.
+ *
+ * If two arguments are passed, ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2 is
+ * selected, suitable for cases where an error code and a custom message are
+ * provided. If only one argument is passed,
+ * ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_1 is selected, which is used for cases
+ * with just an error code.
+ *
+ * Usage:
+ * ET_CHECK_OK_OR_RETURN_ERROR(error_code); // Calls v1
+ * ET_CHECK_OK_OR_RETURN_ERROR(error_code, "Error message", ...); // Calls v2
+ */
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_SELECT( \
+    _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_##N
+
+// Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_1(error__)   \
+  do {                                                    \
+    const auto et_error__ = (error__);                    \
+    if (et_error__ != ::executorch::runtime::Error::Ok) { \
+      return et_error__;                                  \
+    }                                                     \
+  } while (0)
+
+// Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2(error__, message__, ...) \
+  do {                                                                  \
+    const auto et_error__ = (error__);                                  \
+    if (et_error__ != ::executorch::runtime::Error::Ok) {               \
+      ET_LOG(Error, message__, ##__VA_ARGS__);                          \
+      return et_error__;                                                \
+    }                                                                   \
+  } while (0)
+
+// Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_3 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_4 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_5 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_6 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_7 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_8 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_9 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
+#define ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_10 \
+  ET_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
diff --git a/include/executorch/runtime/core/evalue.h b/include/executorch/runtime/core/evalue.h
new file mode 100644
index 00000000000..500c2bc4f0e
--- /dev/null
+++ b/include/executorch/runtime/core/evalue.h
@@ -0,0 +1,563 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/tag.h>
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace runtime {
+
+struct EValue;
+
+namespace internal {
+
+// Tensor gets proper reference treatment because its expensive to copy in aten
+// mode, all other types are just copied.
+template <typename T>
+struct evalue_to_const_ref_overload_return {
+  using type = T;
+};
+
+template <>
+struct evalue_to_const_ref_overload_return<executorch::aten::Tensor> {
+  using type = const executorch::aten::Tensor&;
+};
+
+template <typename T>
+struct evalue_to_ref_overload_return {
+  using type = T;
+};
+
+template <>
+struct evalue_to_ref_overload_return<executorch::aten::Tensor> {
+  using type = executorch::aten::Tensor&;
+};
+
+} // namespace internal
+
+/*
+ * Helper class used to correlate EValues in the executor table, with the
+ * unwrapped list of the proper type. Because values in the runtime's values
+ * table can change during execution, we cannot statically allocate list of
+ * objects at deserialization. Imagine the serialized list says index 0 in the
+ * value table is element 2 in the list, but during execution the value in
+ * element 2 changes (in the case of tensor this means the TensorImpl* stored in
+ * the tensor changes). To solve this instead they must be created dynamically
+ * whenever they are used.
+ */
+template <typename T>
+class BoxedEvalueList {
+ public:
+  BoxedEvalueList() = default;
+  /*
+   * Wrapped_vals is a list of pointers into the values table of the runtime
+   * whose destinations correlate with the elements of the list, unwrapped_vals
+   * is a container of the same size whose serves as memory to construct the
+   * unwrapped vals.
+   */
+  BoxedEvalueList(EValue** wrapped_vals, T* unwrapped_vals, int size)
+      : wrapped_vals_(wrapped_vals, size), unwrapped_vals_(unwrapped_vals) {}
+  /*
+   * Constructs and returns the list of T specified by the EValue pointers
+   */
+  executorch::aten::ArrayRef<T> get() const;
+
+ private:
+  // Source of truth for the list
+  executorch::aten::ArrayRef<EValue*> wrapped_vals_;
+  // Same size as wrapped_vals
+  mutable T* unwrapped_vals_;
+};
+
+template <>
+executorch::aten::ArrayRef<executorch::aten::optional<executorch::aten::Tensor>>
+BoxedEvalueList<executorch::aten::optional<executorch::aten::Tensor>>::get()
+    const;
+
+// Aggregate typing system similar to IValue only slimmed down with less
+// functionality, no dependencies on atomic, and fewer supported types to better
+// suit embedded systems (ie no intrusive ptr)
+struct EValue {
+  union Payload {
+    // When in ATen mode at::Tensor is not trivially copyable, this nested union
+    // lets us handle tensor as a special case while leaving the rest of the
+    // fields in a simple state instead of requiring a switch on tag everywhere.
+    union TriviallyCopyablePayload {
+      TriviallyCopyablePayload() : as_int(0) {}
+      // Scalar supported through these 3 types
+      int64_t as_int;
+      double as_double;
+      bool as_bool;
+      // TODO(jakeszwe): convert back to pointers to optimize size of this
+      // struct
+      executorch::aten::ArrayRef<char> as_string;
+      executorch::aten::ArrayRef<double> as_double_list;
+      executorch::aten::ArrayRef<bool> as_bool_list;
+      BoxedEvalueList<int64_t> as_int_list;
+      BoxedEvalueList<executorch::aten::Tensor> as_tensor_list;
+      BoxedEvalueList<executorch::aten::optional<executorch::aten::Tensor>>
+          as_list_optional_tensor;
+    } copyable_union;
+
+    // Since a Tensor just holds a TensorImpl*, there's no value to use Tensor*
+    // here.
+    executorch::aten::Tensor as_tensor;
+
+    Payload() {}
+    ~Payload() {}
+  };
+
+  // Data storage and type tag
+  Payload payload;
+  Tag tag;
+
+  // Basic ctors and assignments
+  EValue(const EValue& rhs) : EValue(rhs.payload, rhs.tag) {}
+
+  EValue(EValue&& rhs) noexcept : tag(rhs.tag) {
+    moveFrom(std::move(rhs));
+  }
+
+  EValue& operator=(EValue&& rhs) & noexcept {
+    if (&rhs == this) {
+      return *this;
+    }
+
+    destroy();
+    moveFrom(std::move(rhs));
+    return *this;
+  }
+
+  EValue& operator=(EValue const& rhs) & {
+    // Define copy assignment through copy ctor and move assignment
+    *this = EValue(rhs);
+    return *this;
+  }
+
+  ~EValue() {
+    destroy();
+  }
+
+  /****** None Type ******/
+  EValue() : tag(Tag::None) {
+    payload.copyable_union.as_int = 0;
+  }
+
+  bool isNone() const {
+    return tag == Tag::None;
+  }
+
+  /****** Int Type ******/
+  /*implicit*/ EValue(int64_t i) : tag(Tag::Int) {
+    payload.copyable_union.as_int = i;
+  }
+
+  bool isInt() const {
+    return tag == Tag::Int;
+  }
+
+  int64_t toInt() const {
+    ET_CHECK_MSG(isInt(), "EValue is not an int.");
+    return payload.copyable_union.as_int;
+  }
+
+  /****** Double Type ******/
+  /*implicit*/ EValue(double d) : tag(Tag::Double) {
+    payload.copyable_union.as_double = d;
+  }
+
+  bool isDouble() const {
+    return tag == Tag::Double;
+  }
+
+  double toDouble() const {
+    ET_CHECK_MSG(isDouble(), "EValue is not a Double.");
+    return payload.copyable_union.as_double;
+  }
+
+  /****** Bool Type ******/
+  /*implicit*/ EValue(bool b) : tag(Tag::Bool) {
+    payload.copyable_union.as_bool = b;
+  }
+
+  bool isBool() const {
+    return tag == Tag::Bool;
+  }
+
+  bool toBool() const {
+    ET_CHECK_MSG(isBool(), "EValue is not a Bool.");
+    return payload.copyable_union.as_bool;
+  }
+
+  /****** Scalar Type ******/
+  /// Construct an EValue using the implicit value of a Scalar.
+  /*implicit*/ EValue(executorch::aten::Scalar s) {
+    if (s.isIntegral(false)) {
+      tag = Tag::Int;
+      payload.copyable_union.as_int = s.to<int64_t>();
+    } else if (s.isFloatingPoint()) {
+      tag = Tag::Double;
+      payload.copyable_union.as_double = s.to<double>();
+    } else if (s.isBoolean()) {
+      tag = Tag::Bool;
+      payload.copyable_union.as_bool = s.to<bool>();
+    } else {
+      ET_CHECK_MSG(false, "Scalar passed to EValue is not initialized.");
+    }
+  }
+
+  bool isScalar() const {
+    return tag == Tag::Int || tag == Tag::Double || tag == Tag::Bool;
+  }
+
+  executorch::aten::Scalar toScalar() const {
+    // Convert from implicit value to Scalar using implicit constructors.
+
+    if (isDouble()) {
+      return toDouble();
+    } else if (isInt()) {
+      return toInt();
+    } else if (isBool()) {
+      return toBool();
+    } else {
+      ET_CHECK_MSG(false, "EValue is not a Scalar.");
+    }
+  }
+
+  /****** Tensor Type ******/
+  /*implicit*/ EValue(executorch::aten::Tensor t) : tag(Tag::Tensor) {
+    // When built in aten mode, at::Tensor has a non trivial constructor
+    // destructor, so regular assignment to a union field is UB. Instead we must
+    // go through placement new (which causes a refcount bump).
+    new (&payload.as_tensor) executorch::aten::Tensor(t);
+  }
+
+  // Template constructor that allows construction from types that can be
+  // dereferenced to produce a type that EValue can be implicitly constructed
+  // from.
+  template <
+      typename T,
+      typename = typename std::enable_if<std::is_convertible<
+          decltype(*std::forward<T>(std::declval<T>())), // declval to simulate
+                                                         // forwarding
+          EValue>::value>::type>
+  /*implicit*/ EValue(T&& value) {
+    ET_CHECK_MSG(value != nullptr, "Pointer is null.");
+    // Note that this ctor does not initialize this->tag directly; it is set by
+    // moving in the new value.
+    moveFrom(*std::forward<T>(value));
+  }
+
+  // Delete constructor for raw pointers to ensure they cannot be used.
+  template <typename T>
+  explicit EValue(T* value) = delete;
+
+  bool isTensor() const {
+    return tag == Tag::Tensor;
+  }
+
+  executorch::aten::Tensor toTensor() && {
+    ET_CHECK_MSG(isTensor(), "EValue is not a Tensor.");
+    auto res = std::move(payload.as_tensor);
+    clearToNone();
+    return res;
+  }
+
+  executorch::aten::Tensor& toTensor() & {
+    ET_CHECK_MSG(isTensor(), "EValue is not a Tensor.");
+    return payload.as_tensor;
+  }
+
+  const executorch::aten::Tensor& toTensor() const& {
+    ET_CHECK_MSG(isTensor(), "EValue is not a Tensor.");
+    return payload.as_tensor;
+  }
+
+  /****** String Type ******/
+  /*implicit*/ EValue(const char* s, size_t size) : tag(Tag::String) {
+    payload.copyable_union.as_string =
+        executorch::aten::ArrayRef<char>(s, size);
+  }
+
+  bool isString() const {
+    return tag == Tag::String;
+  }
+
+  executorch::aten::string_view toString() const {
+    ET_CHECK_MSG(isString(), "EValue is not a String.");
+    return executorch::aten::string_view(
+        payload.copyable_union.as_string.data(),
+        payload.copyable_union.as_string.size());
+  }
+
+  /****** Int List Type ******/
+  /*implicit*/ EValue(BoxedEvalueList<int64_t> i) : tag(Tag::ListInt) {
+    payload.copyable_union.as_int_list = i;
+  }
+
+  bool isIntList() const {
+    return tag == Tag::ListInt;
+  }
+
+  executorch::aten::ArrayRef<int64_t> toIntList() const {
+    ET_CHECK_MSG(isIntList(), "EValue is not an Int List.");
+    return payload.copyable_union.as_int_list.get();
+  }
+
+  /****** Bool List Type ******/
+  /*implicit*/ EValue(executorch::aten::ArrayRef<bool> b) : tag(Tag::ListBool) {
+    payload.copyable_union.as_bool_list = b;
+  }
+
+  bool isBoolList() const {
+    return tag == Tag::ListBool;
+  }
+
+  executorch::aten::ArrayRef<bool> toBoolList() const {
+    ET_CHECK_MSG(isBoolList(), "EValue is not a Bool List.");
+    return payload.copyable_union.as_bool_list;
+  }
+
+  /****** Double List Type ******/
+  /*implicit*/ EValue(executorch::aten::ArrayRef<double> d)
+      : tag(Tag::ListDouble) {
+    payload.copyable_union.as_double_list = d;
+  }
+
+  bool isDoubleList() const {
+    return tag == Tag::ListDouble;
+  }
+
+  executorch::aten::ArrayRef<double> toDoubleList() const {
+    ET_CHECK_MSG(isDoubleList(), "EValue is not a Double List.");
+    return payload.copyable_union.as_double_list;
+  }
+
+  /****** Tensor List Type ******/
+  /*implicit*/ EValue(BoxedEvalueList<executorch::aten::Tensor> t)
+      : tag(Tag::ListTensor) {
+    payload.copyable_union.as_tensor_list = t;
+  }
+
+  bool isTensorList() const {
+    return tag == Tag::ListTensor;
+  }
+
+  executorch::aten::ArrayRef<executorch::aten::Tensor> toTensorList() const {
+    ET_CHECK_MSG(isTensorList(), "EValue is not a Tensor List.");
+    return payload.copyable_union.as_tensor_list.get();
+  }
+
+  /****** List Optional Tensor Type ******/
+  /*implicit*/ EValue(
+      BoxedEvalueList<executorch::aten::optional<executorch::aten::Tensor>> t)
+      : tag(Tag::ListOptionalTensor) {
+    payload.copyable_union.as_list_optional_tensor = t;
+  }
+
+  bool isListOptionalTensor() const {
+    return tag == Tag::ListOptionalTensor;
+  }
+
+  executorch::aten::ArrayRef<
+      executorch::aten::optional<executorch::aten::Tensor>>
+  toListOptionalTensor() const {
+    return payload.copyable_union.as_list_optional_tensor.get();
+  }
+
+  /****** ScalarType Type ******/
+  executorch::aten::ScalarType toScalarType() const {
+    ET_CHECK_MSG(isInt(), "EValue is not a ScalarType.");
+    return static_cast<executorch::aten::ScalarType>(
+        payload.copyable_union.as_int);
+  }
+
+  /****** MemoryFormat Type ******/
+  executorch::aten::MemoryFormat toMemoryFormat() const {
+    ET_CHECK_MSG(isInt(), "EValue is not a MemoryFormat.");
+    return static_cast<executorch::aten::MemoryFormat>(
+        payload.copyable_union.as_int);
+  }
+
+  /****** Layout Type ******/
+  executorch::aten::Layout toLayout() const {
+    ET_CHECK_MSG(isInt(), "EValue is not a Layout.");
+    return static_cast<executorch::aten::Layout>(payload.copyable_union.as_int);
+  }
+
+  /****** Device Type ******/
+  executorch::aten::Device toDevice() const {
+    ET_CHECK_MSG(isInt(), "EValue is not a Device.");
+    return executorch::aten::Device(
+        static_cast<executorch::aten::DeviceType>(
+            payload.copyable_union.as_int),
+        -1);
+  }
+
+  template <typename T>
+  T to() &&;
+  template <typename T>
+  typename internal::evalue_to_const_ref_overload_return<T>::type to() const&;
+  template <typename T>
+  typename internal::evalue_to_ref_overload_return<T>::type to() &;
+
+  /**
+   * Converts the EValue to an optional object that can represent both T and
+   * an uninitialized state.
+   */
+  template <typename T>
+  inline executorch::aten::optional<T> toOptional() const {
+    if (this->isNone()) {
+      return executorch::aten::nullopt;
+    }
+    return this->to<T>();
+  }
+
+ private:
+  // Pre cond: the payload value has had its destructor called
+  void clearToNone() noexcept {
+    payload.copyable_union.as_int = 0;
+    tag = Tag::None;
+  }
+
+  // Shared move logic
+  void moveFrom(EValue&& rhs) noexcept {
+    if (rhs.isTensor()) {
+      new (&payload.as_tensor)
+          executorch::aten::Tensor(std::move(rhs.payload.as_tensor));
+      rhs.payload.as_tensor.~Tensor();
+    } else {
+      payload.copyable_union = rhs.payload.copyable_union;
+    }
+    tag = rhs.tag;
+    rhs.clearToNone();
+  }
+
+  // Destructs stored tensor if there is one
+  void destroy() {
+    // Necessary for ATen tensor to refcount decrement the intrusive_ptr to
+    // tensorimpl that got a refcount increment when we placed it in the evalue,
+    // no-op if executorch tensor #ifdef could have a
+    // minor performance bump for a code maintainability hit
+    if (isTensor()) {
+      payload.as_tensor.~Tensor();
+    } else if (isTensorList()) {
+      for (auto& tensor : toTensorList()) {
+        tensor.~Tensor();
+      }
+    } else if (isListOptionalTensor()) {
+      for (auto& optional_tensor : toListOptionalTensor()) {
+        optional_tensor.~optional();
+      }
+    }
+  }
+
+  EValue(const Payload& p, Tag t) : tag(t) {
+    if (isTensor()) {
+      new (&payload.as_tensor) executorch::aten::Tensor(p.as_tensor);
+    } else {
+      payload.copyable_union = p.copyable_union;
+    }
+  }
+};
+
+#define EVALUE_DEFINE_TO(T, method_name)                                       \
+  template <>                                                                  \
+  inline T EValue::to<T>()&& {                                                 \
+    return static_cast<T>(std::move(*this).method_name());                     \
+  }                                                                            \
+  template <>                                                                  \
+  inline ::executorch::runtime::internal::evalue_to_const_ref_overload_return< \
+      T>::type                                                                 \
+  EValue::to<T>() const& {                                                     \
+    typedef ::executorch::runtime::internal::                                  \
+        evalue_to_const_ref_overload_return<T>::type return_type;              \
+    return static_cast<return_type>(this->method_name());                      \
+  }                                                                            \
+  template <>                                                                  \
+  inline ::executorch::runtime::internal::evalue_to_ref_overload_return<       \
+      T>::type                                                                 \
+  EValue::to<T>()& {                                                           \
+    typedef ::executorch::runtime::internal::evalue_to_ref_overload_return<    \
+        T>::type return_type;                                                  \
+    return static_cast<return_type>(this->method_name());                      \
+  }
+
+EVALUE_DEFINE_TO(executorch::aten::Scalar, toScalar)
+EVALUE_DEFINE_TO(int64_t, toInt)
+EVALUE_DEFINE_TO(bool, toBool)
+EVALUE_DEFINE_TO(double, toDouble)
+EVALUE_DEFINE_TO(executorch::aten::string_view, toString)
+EVALUE_DEFINE_TO(executorch::aten::ScalarType, toScalarType)
+EVALUE_DEFINE_TO(executorch::aten::MemoryFormat, toMemoryFormat)
+EVALUE_DEFINE_TO(executorch::aten::Layout, toLayout)
+EVALUE_DEFINE_TO(executorch::aten::Device, toDevice)
+// Tensor and Optional Tensor
+EVALUE_DEFINE_TO(
+    executorch::aten::optional<executorch::aten::Tensor>,
+    toOptional<executorch::aten::Tensor>)
+EVALUE_DEFINE_TO(executorch::aten::Tensor, toTensor)
+
+// IntList and Optional IntList
+EVALUE_DEFINE_TO(executorch::aten::ArrayRef<int64_t>, toIntList)
+EVALUE_DEFINE_TO(
+    executorch::aten::optional<executorch::aten::ArrayRef<int64_t>>,
+    toOptional<executorch::aten::ArrayRef<int64_t>>)
+
+// DoubleList and Optional DoubleList
+EVALUE_DEFINE_TO(executorch::aten::ArrayRef<double>, toDoubleList)
+EVALUE_DEFINE_TO(
+    executorch::aten::optional<executorch::aten::ArrayRef<double>>,
+    toOptional<executorch::aten::ArrayRef<double>>)
+
+// BoolList and Optional BoolList
+EVALUE_DEFINE_TO(executorch::aten::ArrayRef<bool>, toBoolList)
+EVALUE_DEFINE_TO(
+    executorch::aten::optional<executorch::aten::ArrayRef<bool>>,
+    toOptional<executorch::aten::ArrayRef<bool>>)
+
+// TensorList and Optional TensorList
+EVALUE_DEFINE_TO(
+    executorch::aten::ArrayRef<executorch::aten::Tensor>,
+    toTensorList)
+EVALUE_DEFINE_TO(
+    executorch::aten::optional<
+        executorch::aten::ArrayRef<executorch::aten::Tensor>>,
+    toOptional<executorch::aten::ArrayRef<executorch::aten::Tensor>>)
+
+// List of Optional Tensor
+EVALUE_DEFINE_TO(
+    executorch::aten::ArrayRef<
+        executorch::aten::optional<executorch::aten::Tensor>>,
+    toListOptionalTensor)
+#undef EVALUE_DEFINE_TO
+
+template <typename T>
+executorch::aten::ArrayRef<T> BoxedEvalueList<T>::get() const {
+  for (typename executorch::aten::ArrayRef<T>::size_type i = 0;
+       i < wrapped_vals_.size();
+       i++) {
+    ET_CHECK(wrapped_vals_[i] != nullptr);
+    unwrapped_vals_[i] = wrapped_vals_[i]->template to<T>();
+  }
+  return executorch::aten::ArrayRef<T>{unwrapped_vals_, wrapped_vals_.size()};
+}
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::BoxedEvalueList;
+using ::executorch::runtime::EValue;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/event_tracer.h b/include/executorch/runtime/core/event_tracer.h
new file mode 100644
index 00000000000..5bcdd0cfb1f
--- /dev/null
+++ b/include/executorch/runtime/core/event_tracer.h
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/array_ref.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/platform.h>
+#include <stdlib.h>
+#include <cstdint>
+
+#pragma once
+
+namespace executorch {
+namespace runtime {
+
+/// Represents an allocator id returned by track_allocator.
+typedef uint32_t AllocatorID;
+/// Represents the chain id that will be passed in by the user during
+/// event logging.
+typedef int32_t ChainID;
+/// Represents the debug handle that is generally associated with each
+/// op executed in the runtime.
+typedef uint32_t DebugHandle;
+// Represents the delegate debug id that is generally associated with each
+// delegate event.
+typedef int32_t DelegateDebugIntId;
+
+/// Default id's for chain id and debug handle.
+constexpr ChainID kUnsetChainId = -1;
+constexpr DebugHandle kUnsetDebugHandle = 0;
+constexpr DelegateDebugIntId kUnsetDelegateDebugIntId = -1;
+// Default bundled input index to indicate that it hasn't been set yet.
+constexpr int kUnsetBundledInputIndex = -1;
+
+/// Different types of delegate debug identifiers that are supported currently.
+enum class DelegateDebugIdType {
+  /// Default value, indicates that it's not a delegate event.
+  kNone,
+  /// Indicates a delegate event logged using an integer delegate debug
+  /// identifier.
+  kInt,
+  /// Indicates a delegate event logged using a string delegate debug
+  /// identifier i.e. the delegate debug id is a pointer to a string table
+  /// managed by the class implementing EventTracer functionality.
+  kStr
+};
+
+/// Indicates the type of the EValue that was logged. These values could be
+/// serialized and should not be changed.
+enum class LoggedEValueType {
+  /// Intermediate output from an operator.
+  kIntermediateOutput = 0,
+  /// Output at the program level. This is essentially the output
+  /// of the model.
+  kProgramOutput = 1,
+};
+
+/// Indicates the level of event tracer debug logging. Verbosity of the logging
+/// increases as we go down the enum list.
+enum class EventTracerDebugLogLevel {
+  /// No logging.
+  kNoLogging,
+  /// When set to this only the program level outputs will be logged.
+  kProgramOutputs,
+  /// When set to this all intermediate outputs and program level outputs
+  /// will be logged.
+  kIntermediateOutputs,
+};
+
+/**
+ * EventTracerFilterBase is an abstract base class that provides an interface
+ * for filtering events based on their name or delegate debug index.
+ * Derived classes should implement the filter method to define specific
+ * filtering logic.
+ */
+class EventTracerFilterBase {
+ public:
+  /**
+   * Filters events based on the given name or delegate debug index.
+   *
+   * Note that only one of either the name or delegate_debug_index should be
+   * passed in.
+   *
+   * @param[in] name A pointer to a string representing the `name` of the
+   * event. If `delegate_debug_index` is not set to kUnsetDebugHandle, `name`
+   * should be set to nullptr.
+   *
+   * @param[in] delegate_debug_index A DebugHandle representing the debug index
+   * of the delegate. If `name` is not nullptr, this should be set to
+   * kUnsetDebugHandle.
+   *
+   * @return A Result<bool> indicating whether the event matches the filter
+   * criteria.
+   *         - True if the event matches the filter.
+   *         - False if the event does not match or is unknown.
+   *         - An error code if an error occurs during filtering.
+   */
+  virtual Result<bool> filter(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index) = 0;
+
+  /**
+   * Virtual destructor for the EventTracerFilterBase class.
+   * Ensures proper cleanup of derived class objects.
+   */
+  virtual ~EventTracerFilterBase() = default;
+};
+
+/**
+ * Indicates the level of profiling that should be enabled. Profiling
+ * events will be logged in increasing order of verbosity as we go down the
+ * enum list. Thus it is important to keep the enum values in the right order.
+ */
+enum class EventTracerProfilingLevel {
+  /// No operator profiling.
+  kProfileMethodOnly,
+  /// All profiling events enabled.
+  kProfileAllEvents,
+};
+
+/**
+ * This is the struct which should be returned when a profiling event is
+ * started. This is used to uniquely identify that profiling event and will be
+ * required to be passed into the end_profiling call to signal that the event
+ * identified by this struct has completed.
+ *
+ * TODO(gasoonjia): Now this struct is mix-used for both delegate and
+ *non-delegate events. In the future we should separate them into two different
+ *structs: EventTracerEntry for non-delegate events holding DebugHandle, and
+ *DelegateEventTracerEntry for delegate events holding DelegateDebugIntId.
+ **/
+struct EventTracerEntry {
+  /// An event id to uniquely identify this event that was generated during a
+  /// call to start the tracking of an event.
+  int64_t event_id;
+  /// The chain to which this event belongs to.
+  ChainID chain_id;
+  /// The debug handle corresponding to this event.
+  DebugHandle debug_handle;
+  /// The time at which this event was started to be tracked.
+  et_timestamp_t start_time;
+  /// When delegate_event_id_type != DelegateDebugIdType::kNone it indicates
+  /// that event_id represents a delegate event. If delegate_event_id_type is:
+  /// 1) kInt then event_id contains an integer delegate debug id.
+  /// 2) kStr then event_id contains a string table index into a string table
+  /// maintained by the class implementing EventTracer functionality that will
+  /// give us the string identifier of this delegate event. For more details
+  /// refer to the DelegateMappingBuilder library present in
+  /// executorch/exir/backend/utils.py.
+  DelegateDebugIdType delegate_event_id_type;
+};
+/**
+ * EventTracer is a class that users can inherit and implement to
+ * log/serialize/stream etc. the profiling and debugging events that are
+ * generated at runtime for a model. An example of this is the ETDump
+ * implementation in the devtools codebase that serializes these events to a
+ * flatbuffer.
+ */
+class EventTracer {
+ public:
+  /**
+   * Start a new event block (can consist of profiling and/or debugging events.)
+   * identified by this name. A block is conceptually a set of events that we
+   * want to group together. e.g. all the events that occur during the call to
+   * execute() (i.e. model inference) could be categorized as a block.
+   *
+   * @param[in] name A human readable identifier for the event block. Users
+   * calling this interface do not need to keep the memory pointed to by this
+   * pointer around. The string must be copied over into internal memory during
+   * this call.
+   */
+  virtual void create_event_block(const char* name) = 0;
+
+  /**
+   * Start the profiling of the event identified by name and debug_handle.
+   * The user can pass in a chain_id and debug_handle to this call, or leave
+   * them empty (default values) which would then result in the chain_id and
+   * debug handle stored within (set by set_chain_debug_handle) this class to be
+   * used.
+   * @param[in] name Human readable name for the profiling event. Users calling
+   * this interface do not need to keep the memory pointed to by this pointer
+   * around. The string must be copied over into internal memory during this
+   * call.
+   * @param[in] chain_id The id of the chain to which this event belongs to. If
+   * kUnsetChainId is passed in the chain_id and kUnsetDebugHandle for
+   * debug_handle then the values stored in the class internally for these
+   * properties will be used.
+   * @param[in] debug_handle Debug handle generated ahead-of-time during model
+   * compilation.
+   *
+   * @return Returns an instance of EventTracerEntry which should be passed back
+   * into the end_profiling() call.
+   */
+  virtual EventTracerEntry start_profiling(
+      const char* name,
+      ChainID chain_id = kUnsetChainId,
+      DebugHandle debug_handle = kUnsetDebugHandle) = 0;
+
+  /**
+   * Start the profiling of a delegate event. Similar to start_profiling it will
+   * return an instance of EventTracerEntry that contains the details of this
+   * event.
+   *
+   * @param[in] name Human readable name for the delegate event. This name has
+   * to be the same name that was passed in during the Debug delegate mapping
+   * generation in the export/ahead-of-time process. If indices and not names
+   * are used by this delegate to identify ops executed in the backend then
+   * nullptr can be passed in. Users calling this interface do not need to keep
+   * the memory pointed to by this pointer around. The string must be copied
+   * over into internal memory during this call.
+   * @param[in] delegate_debug_index The id of the delegate event. If string
+   * based names are used by this delegate to identify ops executed in the
+   * backend then kUnsetDebugHandle should be passed in here.
+   */
+  virtual EventTracerEntry start_profiling_delegate(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index) = 0;
+
+  /**
+   * Signal the end of the delegate profiling event contained in
+   * event_tracer_entry. Users also have the option to log some some free-from
+   * string based metadata along with this.
+   *
+   * @param[in] event_tracer_entry The EventTracerEntry returned by a call to
+   * start_profiling_delegate().
+   * @param[in] metadata Optional data relevant to the execution that the user
+   * wants to log along with this event. Pointer to metadata doesn't need to be
+   * valid after the call to this function. The contents and format of the data
+   * are transparent to the event tracer. It will just pipe along the data and
+   * make it available for the user again in the post-processing stage.
+   * @param[in] metadata_len Length of the metadata buffer.
+   */
+  virtual void end_profiling_delegate(
+      EventTracerEntry event_tracer_entry,
+      const void* metadata = nullptr,
+      size_t metadata_len = 0) = 0;
+
+  /**
+   * Some delegates get access to the profiling details only after the complete
+   * graph has been executed. This interface is to support such use cases. It
+   * can be called in a loop etc. to log any number of profiling events that are
+   * part of this delegate.
+   *
+   * @param[in] name Human readable name for the delegate event. This name has
+   * to be the same name that was passed in during the Debug delegate mapping
+   * generation in the export/ahead-of-time process. If indices and not names
+   * are used by this delegate to identify ops executed in the backend then
+   * nullptr can be passed in. Users calling this interface do not need to keep
+   * the memory pointed to by this pointer around. The string must be copied
+   * over into internal memory during this call.
+   * @param[in] delegate_debug_index The id of the delegate event. If string
+   * based names are used by this delegate to identify ops executed in the
+   * backend then kUnsetDebugHandle should be passed in here.
+   * @param[in] start_time The timestamp when the delegate event started.
+   * @param[in] end_time The timestamp when the delegate event finished.
+   * @param[in] metadata Optional data relevant to the execution that the user
+   * wants to log along with this event. Pointer to metadata doesn't need to be
+   * valid after the call to this function. The contents and format of the data
+   * are transparent to the event tracer. It will just pipe along the data and
+   * make it available for the user again in the post-processing stage.
+   * @param[in] metadata_len Length of the metadata buffer.
+   */
+  virtual void log_profiling_delegate(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index,
+      et_timestamp_t start_time,
+      et_timestamp_t end_time,
+      const void* metadata = nullptr,
+      size_t metadata_len = 0) = 0;
+
+  /**
+   * End the profiling of the event identified by prof_entry
+   *
+   * @param[in] prof_entry Value returned by a call to start_profiling
+   */
+  virtual void end_profiling(EventTracerEntry prof_entry) = 0;
+
+  /**
+   * Track this allocation done via a MemoryAllocator which had profiling
+   * enabled on it.
+   *
+   * @param[in] id Allocator id generated by a call to track_allocator.
+   * @param[in] size The size of the allocation done, in bytes.
+   */
+  virtual void track_allocation(AllocatorID id, size_t size) = 0;
+
+  /**
+   * Generate an allocator id for this memory allocator that will be used in the
+   * future to identify all the allocations done by this allocator.
+   *
+   * @param[in] name Human readable name for the allocator. Users calling
+   * this interface do not need to keep the memory pointed to by this pointer
+   * around. The string should be copied over into internal memory during this
+   * call.
+   *
+   * @return Identifier to uniquely identify this allocator.
+   */
+  virtual AllocatorID track_allocator(const char* name) = 0;
+
+  /**
+   * Log an evalue during the execution of the model. This is useful for
+   * debugging purposes. Model outputs are a special case of this and will
+   * be logged with the output bool enabled.
+   *
+   * Users of this should refer to the chain_id and debug_handle to get the
+   * context for these evalues and their corresponding op.
+   *
+   * @param[in] evalue The value to be logged.
+   * @param[in] evalue_type Indicates what type of output this is logging e.g.
+   * an intermediate output, program output etc.
+   * @return A Result<bool> indicating the status of the logging operation.
+   *         - True if the evalue output was successfully logged.
+   *         - An error code if an error occurs during logging.
+   */
+  virtual Result<bool> log_evalue(
+      const EValue& evalue,
+      LoggedEValueType evalue_type) = 0;
+
+  /**
+   * Log an intermediate tensor output from a delegate.
+   *
+   * @param[in] name Human readable name for the delegate event. This name has
+   * to be the same name that was passed in during the Debug delegate mapping
+   * generation in the export/ahead-of-time process. If indices and not names
+   * are used by this delegate to identify ops executed in the backend then
+   * nullptr can be passed in. Users calling this interface do not need to keep
+   * the memory pointed to by this pointer around. The string must be copied
+   * over into internal memory during this call.
+   * @param[in] delegate_debug_index The id of the delegate event. If string
+   * based names are used by this delegate to identify ops executed in the
+   * backend then kUnsetDebugHandle should be passed in here.
+   * @param[in] output The tensor type output to be logged.
+   * @return A Result<bool> indicating the status of the logging operation.
+   *         - True if the tensor type output was successfully logged.
+   *         - False if the tensor type output was filtered out and not logged.
+   *         - An error code if an error occurs during logging.
+   */
+  virtual Result<bool> log_intermediate_output_delegate(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index,
+      const executorch::aten::Tensor& output) = 0;
+
+  /**
+   * Log an intermediate tensor array output from a delegate.
+   *
+   * @param[in] name Human readable name for the delegate event. This name has
+   * to be the same name that was passed in during the Debug delegate mapping
+   * generation in the export/ahead-of-time process. If indices and not names
+   * are used by this delegate to identify ops executed in the backend then
+   * nullptr can be passed in. Users calling this interface do not need to keep
+   * the memory pointed to by this pointer around. The string must be copied
+   * over into internal memory during this call.
+   * @param[in] delegate_debug_index The id of the delegate event. If string
+   * based names are used by this delegate to identify ops executed in the
+   * backend then kUnsetDebugHandle should be passed in here.
+   * @param[in] output The tensor array type output to be logged.
+   * @return A Result<bool> indicating the status of the logging operation.
+   *         - True if the tensor array type output was successfully logged.
+   *         - False if the tensor array type output was filtered out and not
+   * logged.
+   *         - An error code if an error occurs during logging.
+   */
+  virtual Result<bool> log_intermediate_output_delegate(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index,
+      const ArrayRef<executorch::aten::Tensor> output) = 0;
+
+  /**
+   * Log an intermediate int output from a delegate.
+   *
+   * @param[in] name Human readable name for the delegate event. This name has
+   * to be the same name that was passed in during the Debug delegate mapping
+   * generation in the export/ahead-of-time process. If indices and not names
+   * are used by this delegate to identify ops executed in the backend then
+   * nullptr can be passed in. Users calling this interface do not need to keep
+   * the memory pointed to by this pointer around. The string must be copied
+   * over into internal memory during this call.
+   * @param[in] delegate_debug_index The id of the delegate event. If string
+   * based names are used by this delegate to identify ops executed in the
+   * backend then kUnsetDebugHandle should be passed in here.
+   * @param[in] output The int type output to be logged.
+   * @return A Result<bool> indicating the status of the logging operation.
+   *         - True if the int type output was successfully logged.
+   *         - False if the int type output was filtered out and not logged.
+   *         - An error code if an error occurs during logging.
+   */
+  virtual Result<bool> log_intermediate_output_delegate(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index,
+      const int& output) = 0;
+
+  /**
+   * Log an intermediate bool output from a delegate.
+   *
+   * @param[in] name Human readable name for the delegate event. This name has
+   * to be the same name that was passed in during the Debug delegate mapping
+   * generation in the export/ahead-of-time process. If indices and not names
+   * are used by this delegate to identify ops executed in the backend then
+   * nullptr can be passed in. Users calling this interface do not need to keep
+   * the memory pointed to by this pointer around. The string must be copied
+   * over into internal memory during this call.
+   * @param[in] delegate_debug_index The id of the delegate event. If string
+   * based names are used by this delegate to identify ops executed in the
+   * backend then kUnsetDebugHandle should be passed in here.
+   * @param[in] output The bool type output to be logged.
+   * @return A Result<bool> indicating the status of the logging operation.
+   *         - True if the bool type output was successfully logged.
+   *         - False if the bool type output was filtered out and not logged.
+   *         - An error code if an error occurs during logging.
+   */
+  virtual Result<bool> log_intermediate_output_delegate(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index,
+      const bool& output) = 0;
+
+  /**
+   * Log an intermediate double output from a delegate.
+   *
+   * @param[in] name Human readable name for the delegate event. This name has
+   * to be the same name that was passed in during the Debug delegate mapping
+   * generation in the export/ahead-of-time process. If indices and not names
+   * are used by this delegate to identify ops executed in the backend then
+   * nullptr can be passed in. Users calling this interface do not need to keep
+   * the memory pointed to by this pointer around. The string must be copied
+   * over into internal memory during this call.
+   * @param[in] delegate_debug_index The id of the delegate event. If string
+   * based names are used by this delegate to identify ops executed in the
+   * backend then kUnsetDebugHandle should be passed in here.
+   * @param[in] output The double type output to be logged.
+   * @return A Result<bool> indicating the status of the logging operation.
+   *         - True if the double type output was successfully logged.
+   *         - False if the double type output was filtered out and not logged.
+   *         - An error code if an error occurs during logging.
+   */
+  virtual Result<bool> log_intermediate_output_delegate(
+      const char* name,
+      DelegateDebugIntId delegate_debug_index,
+      const double& output) = 0;
+
+  /**
+   * Set the filter of event tracer for delegation intermediate outputs.
+   */
+  virtual void set_delegation_intermediate_output_filter(
+      EventTracerFilterBase* event_tracer_filter) = 0;
+
+  /**
+   * Helper function to set the chain id ands debug handle. Users have two
+   * options, the first is that they can directly pass in the chain id and debug
+   * handle to start_profiling or they can explicitly set them through this
+   * helper before calling start_profiling.
+   *
+   * The reason this helper exists is to
+   * solve a specific problem. We want to do profiling logging inside the
+   * codegen layer which calls the kernels. The problem though is that the
+   * codegen layer doesn't have access to these ids when calling
+   * start_profiling.
+   *
+   * Users should ideally use these within a RAII scope interface to make sure
+   * that these values are unset after the end_profiling call. If non-default
+   * values are passed into the start_profiling call they will always be given
+   * precedence over the values set by this interface.
+   *
+   * So what we do is call this helper in method.cpp before
+   * we hit the codegen layer and in the codegen layer we do a start_profiling
+   * call without passing in a chain_id or debug_handle. This ensures that the
+   * values set via this helper are the ones associated with that call.
+   *
+   * @param[in] chain_id Chain id of the current instruction being exectuted.
+   * @param[in] debug_handle Debug handle of the current instruction being
+   * executed. In this context debug handle and instruction id are the same
+   * thing.
+   */
+  void set_chain_debug_handle(ChainID chain_id, DebugHandle debug_handle) {
+    chain_id_ = chain_id;
+    debug_handle_ = debug_handle;
+  }
+
+  /**
+   * When running a program wrapped in a bundled program, log the bundled input
+   * index of the current bundled input being tested out on this method.
+   * If users want to unset the index back to the default value, they can call
+   * this method with kUnsetBundledInputIndex.
+   *
+   * @param[in] bundled_input_index Index of the current input being tested
+   */
+  void set_bundled_input_index(int bundled_input_index) {
+    bundled_input_index_ = bundled_input_index;
+  }
+
+  /**
+   * Return the current bundled input index.
+   */
+  int bundled_input_index() {
+    return bundled_input_index_;
+  }
+
+  /**
+   * Set the level of event tracer debug logging that is desired.
+   *
+   */
+  void set_event_tracer_debug_level(EventTracerDebugLogLevel log_level) {
+    event_tracer_debug_level_ = log_level;
+  }
+
+  /**
+   * Return the current level of event tracer debug logging.
+   */
+  EventTracerDebugLogLevel event_tracer_debug_level() {
+    return event_tracer_debug_level_;
+  }
+
+  /**
+   * Set the level of event tracer profiling that is desired.
+   */
+  void set_event_tracer_profiling_level(
+      EventTracerProfilingLevel profiling_level) {
+    event_tracer_profiling_level_ = profiling_level;
+  }
+
+  /**
+   * Return the current level of event tracer profiling.
+   */
+  EventTracerProfilingLevel event_tracer_profiling_level() {
+    return event_tracer_profiling_level_;
+  }
+
+  /**
+   * Return the current status of intermediate outputs logging mode.
+   */
+  bool intermediate_outputs_logging_status() {
+    return log_intermediate_tensors_;
+  }
+
+  /**
+   * Get the current chain id.
+   *
+   * @return Current chain id.
+   */
+  ChainID current_chain_id() {
+    return chain_id_;
+  }
+
+  /**
+   * Get the current debug handle.
+   *
+   * @return Current debug handle.
+   */
+  DebugHandle current_debug_handle() {
+    return debug_handle_;
+  }
+
+  virtual ~EventTracer() {}
+
+ protected:
+  ChainID chain_id_ = kUnsetChainId;
+  DebugHandle debug_handle_ = kUnsetDebugHandle;
+  bool event_tracer_enable_debugging_ = false;
+  bool log_intermediate_tensors_ = false;
+  int bundled_input_index_ = kUnsetBundledInputIndex;
+  EventTracerDebugLogLevel event_tracer_debug_level_ =
+      EventTracerDebugLogLevel::kNoLogging;
+  EventTracerProfilingLevel event_tracer_profiling_level_ =
+      EventTracerProfilingLevel::kProfileAllEvents;
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::AllocatorID;
+using ::executorch::runtime::ChainID;
+using ::executorch::runtime::DebugHandle;
+using ::executorch::runtime::DelegateDebugIdType;
+using ::executorch::runtime::DelegateDebugIntId;
+using ::executorch::runtime::EventTracer;
+using ::executorch::runtime::EventTracerDebugLogLevel;
+using ::executorch::runtime::EventTracerEntry;
+using ::executorch::runtime::kUnsetBundledInputIndex;
+using ::executorch::runtime::kUnsetChainId;
+using ::executorch::runtime::kUnsetDebugHandle;
+using ::executorch::runtime::kUnsetDelegateDebugIntId;
+using ::executorch::runtime::LoggedEValueType;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/event_tracer_hooks.h b/include/executorch/runtime/core/event_tracer_hooks.h
new file mode 100644
index 00000000000..cd74b447ca8
--- /dev/null
+++ b/include/executorch/runtime/core/event_tracer_hooks.h
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/event_tracer.h>
+
+/**
+ * @file
+ *
+ * This file contains the hooks that are inserted across various parts of the
+ * core runtime code to call into the EventTracer class for logging of profiling
+ * and debugging events. Any calls made to the EventTracer from the runtime must
+ * be made via these hooks.
+ * Users shouldn't directly add these hooks in their code and it's meant only
+ * for usage in ExecuTorch internal code.
+ *
+ * The benefit of defining these hooks is that we can easily control whether or
+ * not we want to compile in the EventTracer code based on the status of the
+ * ET_EVENT_TRACER_ENABLED flag.
+ *
+ * TODO(dbort): Make this a private header of runtime/executor. It only contains
+ * runtime-internal functions and should not be part of the public set of
+ * headers.
+ */
+
+namespace executorch {
+namespace ET_RUNTIME_NAMESPACE {
+namespace internal {
+
+/**
+ * This class enables scope based profiling where needed using RAII for
+ * operators only. If operator profiling is disabled then this class is a no-op.
+ */
+class EventTracerProfileOpScope final {
+ public:
+  EventTracerProfileOpScope(EventTracer* event_tracer, const char* name) {
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_ = event_tracer;
+    if (event_tracer_ == nullptr) {
+      return;
+    }
+    if (event_tracer_->event_tracer_profiling_level() >
+        executorch::runtime::EventTracerProfilingLevel::kProfileMethodOnly) {
+      event_entry_ = event_tracer->start_profiling(name);
+    }
+#else //! ET_EVENT_TRACER_ENABLED
+    (void)event_tracer;
+    (void)name;
+#endif
+  }
+
+  ~EventTracerProfileOpScope() {
+#ifdef ET_EVENT_TRACER_ENABLED
+    if (event_tracer_ == nullptr) {
+      return;
+    }
+    if (event_tracer_->event_tracer_profiling_level() >
+        executorch::runtime::EventTracerProfilingLevel::kProfileMethodOnly) {
+      event_tracer_->end_profiling(event_entry_);
+    }
+#endif
+  }
+
+ private:
+#ifdef ET_EVENT_TRACER_ENABLED
+  EventTracer* event_tracer_;
+  EventTracerEntry event_entry_;
+#endif
+};
+
+using EventTracerProfileScope = EventTracerProfileOpScope;
+
+/**
+ * This class enables scope based profiling where needed using RAII.
+ * Profiling will be started when the object is created and will end
+ * when the object goes out of scope. This is specifically intended to
+ * be used for profiling methods in the runtime.
+ */
+class EventTracerProfileMethodScope final {
+ public:
+  EventTracerProfileMethodScope(EventTracer* event_tracer, const char* name) {
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_ = event_tracer;
+    if (event_tracer_ == nullptr) {
+      return;
+    }
+    event_entry_ = event_tracer->start_profiling(name);
+#else //! ET_EVENT_TRACER_ENABLED
+    (void)event_tracer;
+    (void)name;
+#endif
+  }
+
+  ~EventTracerProfileMethodScope() {
+#ifdef ET_EVENT_TRACER_ENABLED
+    if (event_tracer_ == nullptr) {
+      return;
+    }
+    event_tracer_->end_profiling(event_entry_);
+#endif
+  }
+
+ private:
+#ifdef ET_EVENT_TRACER_ENABLED
+  EventTracer* event_tracer_;
+  EventTracerEntry event_entry_;
+#endif
+};
+
+/**
+ * This class helps us set and then clear out the chain id and debug handle
+ * values stored in the event tracer class using RAII. This is typically called
+ * in the executor loop before entering the codegen layer to configure the chain
+ * id and debug handle of the current instruction being executed.
+ * After we return from the kernel execution we can then reset the chain id and
+ * debug handle to defaults when this object goes out of scope.
+ */
+class EventTracerProfileInstructionScope final {
+ public:
+  EventTracerProfileInstructionScope(
+      EventTracer* event_tracer,
+      ChainID chain_idx,
+      DebugHandle debug_handle) {
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_ = event_tracer;
+    if (event_tracer_ == nullptr) {
+      return;
+    }
+    event_tracer_->set_chain_debug_handle(chain_idx, debug_handle);
+#else //! ET_EVENT_TRACER_ENABLED
+    (void)event_tracer;
+    (void)chain_idx;
+    (void)debug_handle;
+#endif
+  }
+
+  ~EventTracerProfileInstructionScope() {
+#ifdef ET_EVENT_TRACER_ENABLED
+    if (event_tracer_ == nullptr) {
+      return;
+    }
+    event_tracer_->set_chain_debug_handle(kUnsetChainId, kUnsetDebugHandle);
+#endif
+  }
+
+ private:
+#ifdef ET_EVENT_TRACER_ENABLED
+  EventTracer* event_tracer_;
+#endif
+};
+
+inline bool event_tracer_enabled() {
+#ifdef ET_EVENT_TRACER_ENABLED
+  return true;
+#else //! ET_EVENT_TRACER_ENABLED
+  return false;
+#endif
+}
+/**
+ * Create a new event block with the specified name. Any events logged
+ * after this will be associated with this new event block.
+ */
+inline void event_tracer_create_event_block(
+    EventTracer* event_tracer,
+    char const* name) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    event_tracer->create_event_block(name);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)name;
+#endif
+}
+
+/**
+ * Explicitly mark the beginning of a new profiling event. This returns
+ * an instance of an EventTracerEntry object that the user needs to keep
+ * around and pass into the corresponding event_tracer_end_profiling_event
+ * call.
+ */
+inline EventTracerEntry event_tracer_begin_profiling_event(
+    EventTracer* event_tracer,
+    char const* name) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    return event_tracer->start_profiling(name);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)name;
+#endif
+  // There is no active tracer; this value will be ignored.
+  return EventTracerEntry();
+}
+
+/**
+ * Mark the end of a profiling event passing in the entry token
+ * returned by a previous call to ET_EVENT_TRACER_BEGIN_PROFILING_EVENT.
+ */
+inline void event_tracer_end_profiling_event(
+    EventTracer* event_tracer,
+    EventTracerEntry event) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    event_tracer->end_profiling(event);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)event;
+#endif
+}
+
+/**
+ * Start the tracking of the allocator represented by this name and returns
+ * an AllocatorID that will be used to track all subsequent allocations done by
+ * this allocator.
+ */
+inline AllocatorID event_tracer_track_allocator(
+    EventTracer* event_tracer,
+    const char* name) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    return event_tracer->track_allocator(name);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)name;
+#endif
+  // There is no active tracer; this value will be ignored.
+  return 0;
+}
+
+/// Log the allocation event done via the allocator represented by id.
+inline void event_tracer_track_allocation(
+    EventTracer* event_tracer,
+    AllocatorID id,
+    size_t size) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    event_tracer->track_allocation(id, size);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)id;
+  (void)size;
+#endif
+}
+
+/// Log an intermediate value.
+inline void event_tracer_log_evalue(EventTracer* event_tracer, EValue& evalue) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    if (event_tracer->event_tracer_debug_level() >=
+        EventTracerDebugLogLevel::kIntermediateOutputs) {
+      event_tracer->log_evalue(evalue, LoggedEValueType::kIntermediateOutput);
+    }
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)evalue;
+#endif
+}
+
+/// Log a program output.
+inline void event_tracer_log_evalue_output(
+    EventTracer* event_tracer,
+    const EValue& evalue) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  /*
+   * If debugging via event tracer is enabled but intermediate output logging is
+   * disabled then we want to only log the outputs.
+   */
+  if (event_tracer) {
+    if (event_tracer->event_tracer_debug_level() >=
+        EventTracerDebugLogLevel::kProgramOutputs) {
+      event_tracer->log_evalue(evalue, LoggedEValueType::kProgramOutput);
+    }
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)evalue;
+#endif
+}
+
+// Set the bundled input index of the current bundled input being used by the
+// method.
+inline void event_tracer_set_bundled_input_index(
+    EventTracer* event_tracer,
+    int bundled_input_index) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    event_tracer->set_bundled_input_index(bundled_input_index);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer;
+  (void)bundled_input_index;
+#endif
+}
+
+} // namespace internal
+} // namespace ET_RUNTIME_NAMESPACE
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+namespace internal {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    event_tracer_begin_profiling_event;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    event_tracer_create_event_block;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    event_tracer_end_profiling_event;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::event_tracer_log_evalue;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    event_tracer_log_evalue_output;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    event_tracer_set_bundled_input_index;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    event_tracer_track_allocation;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    event_tracer_track_allocator;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    EventTracerProfileInstructionScope;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::
+    EventTracerProfileMethodScope;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::EventTracerProfileOpScope;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::EventTracerProfileScope;
+
+} // namespace internal
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/event_tracer_hooks_delegate.h b/include/executorch/runtime/core/event_tracer_hooks_delegate.h
new file mode 100644
index 00000000000..b2369fc216d
--- /dev/null
+++ b/include/executorch/runtime/core/event_tracer_hooks_delegate.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/event_tracer.h>
+
+/**
+ * @file
+ *
+ * This file contains the hooks that can be used by runtime delegate backend
+ * authors to log profiling and debugging events from backend code. In order to
+ * use these hooks delegate authors would have needed to generate a delegate
+ * debug identifier mapping using the DelegateMappingBuilder library present in
+ * executorch/exir/backend/utils.py. The delegate debug identifiers generated by
+ * that library are the ones that need to be passed to these hooks to log
+ * events. Using any other identifiers will cause post-processing of the events
+ * data to not properly link back to the nodes in the original lowered graph.
+ *
+ * The benefit of defining these hooks is that we can easily control whether or
+ * not we want to compile in the EventTracer code based on the status of the
+ * ET_EVENT_TRACER_ENABLED flag.
+ */
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Start the profiling of a delegate event. Similar to start_profiling it will
+ * return an instance of EventTracerEntry that contains the details of this
+ * event. Can be left in production code as these hooks compile conditionally.
+ *
+ * @param[in] event_tracer The event tracer instance that is doing the logging.
+ * @param[in] name Human readable name for the delegate event. This name has
+ * to be the same name that was passed in during the Debug delegate mapping
+ * generation in the export/ahead-of-time process. If indices and not names
+ * are used by this delegate to identify ops executed in the backend then
+ * nullptr can be passed in. Users calling this interface do not need to keep
+ * the memory pointed to by this pointer around. The string must be copied over
+ * into internal memory during this call.
+ * @param[in] delegate_debug_id The id of the delegate event. If string
+ * based names are used by this delegate to identify ops executed in the
+ * backend then kUnsetDebugHandle should be passed in here.
+ */
+inline EventTracerEntry event_tracer_start_profiling_delegate(
+    EventTracer* event_tracer,
+    const char* name,
+    DebugHandle delegate_debug_id) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    return event_tracer->start_profiling_delegate(name, delegate_debug_id);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)name;
+  (void)delegate_debug_id;
+#endif
+  // There is no active tracer; this value will be ignored.
+  return EventTracerEntry();
+}
+
+/**
+ * Signal the end of the delegate profiling event contained in
+ * event_tracer_entry. Users also have the option to log some some free-from
+ * string based metadata along with this. Can be left in production code as
+ * these hooks compile conditionally.
+ *
+ * @param[in] event_tracer The event tracer instance that is doing the logging.
+ * @param[in] event_tracer_entry The EventTracerEntry returned by a call to
+ * start_profiling_delegate().
+ * @param[in] metadata Optional data relevant to the execution that the user
+ * wants to log along with this event. Pointer to metadata doesn't need to be
+ * valid after the call to this function. The contents and format of the data
+ * are transparent to the event tracer. It will just pipe along the data and
+ * make it available for the user again in the post-processing stage.
+ * @param[in] metadata_len Length of the metadata buffer.
+ */
+inline void event_tracer_end_profiling_delegate(
+    EventTracer* event_tracer,
+    EventTracerEntry event_tracer_entry,
+    const void* metadata = nullptr,
+    size_t metadata_len = 0) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    event_tracer->end_profiling_delegate(
+        event_tracer_entry, metadata, metadata_len);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)event_tracer_entry;
+  (void)metadata;
+  (void)metadata_len;
+#endif
+}
+
+/**
+ * Some delegates get access to the profiling details only after the complete
+ * graph has been executed. This interface is to support such use cases. It
+ * can be called in a loop etc. to log any number of profiling events that are
+ * part of this delegate. Can be left in production code as these hooks
+ * compile conditionally.
+ *
+ * @param[in] event_tracer The event tracer instance that is doing the logging.
+ * @param[in] name Human readable name for the delegate event. This name has
+ * to be the same name that was passed in during the Debug delegate mapping
+ * generation in the export/ahead-of-time process. If indices and not names
+ * are used by this delegate to identify ops executed in the backend then
+ * nullptr can be passed in. Users calling this interface do not need to keep
+ * the memory pointed to by this pointer around. The string must
+ * be copied over into internal memory during this call.
+ * @param[in] delegate_debug_id The id of the delegate event. If string
+ * based names are used by this delegate to identify ops executed in the
+ * backend then -1 should be passed in here.
+ * @param[in] start_time The timestamp when the delegate event started.
+ * @param[in] end_time The timestamp when the delegate event finished.
+ * @param[in] metadata Optional data relevant to the execution that the user
+ * wants to log along with this event. Pointer to metadata doesn't need to be
+ * valid after the call to this function. The contents and format of the data
+ * are transparent to the event tracer. It will just pipe along the data and
+ * make it available for the user again in the post-processing stage.
+ * @param[in] metadata_len Length of the metadata buffer.
+ */
+inline void event_tracer_log_profiling_delegate(
+    EventTracer* event_tracer,
+    const char* name,
+    DebugHandle delegate_debug_id,
+    et_timestamp_t start_time,
+    et_timestamp_t end_time,
+    const void* metadata = nullptr,
+    size_t metadata_len = 0) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    event_tracer->log_profiling_delegate(
+        name, delegate_debug_id, start_time, end_time, metadata, metadata_len);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)name;
+  (void)delegate_debug_id;
+  (void)start_time;
+  (void)end_time;
+  (void)metadata;
+  (void)metadata_len;
+#endif
+}
+
+/**
+ * This templated interfaces can be called in a loop etc. to log any number of
+ * debug events that are part of this delegate. Supported values types are int,
+ * bool, double, tensor and array of tensors. Can be left in production code as
+ * these hooks compile conditionally.
+ *
+ * @param[in] event_tracer The event tracer instance that is doing the logging.
+ * @param[in] name Human readable name for the delegate event. This name has
+ * to be the same name that was passed in during the Debug delegate mapping
+ * generation in the export/ahead-of-time process. If indices and not names
+ * are used by this delegate to identify ops executed in the backend then
+ * nullptr can be passed in. Users calling this interface do not need to keep
+ * the memory pointed to by this pointer around. The string must
+ * be copied over into internal memory during this call.
+ * @param[in] delegate_debug_id The id of the delegate event. If string
+ * based names are used by this delegate to identify ops executed in the
+ * backend then -1 should be passed in here.
+ * @param[in] output The output to be logged.
+ */
+template <typename T>
+inline void event_tracer_log_output_delegate(
+    EventTracer* event_tracer,
+    const char* name,
+    DebugHandle delegate_debug_id,
+    const T& output) {
+#ifdef ET_EVENT_TRACER_ENABLED
+  if (event_tracer) {
+    static_assert(
+        std::is_same<T, int>::value || std::is_same<T, bool>::value ||
+            std::is_same<T, double>::value ||
+            std::is_same<T, executorch::aten::Tensor>::value ||
+            std::is_same<T, ArrayRef<executorch::aten::Tensor>>::value,
+        "Unsupported type for intermediate output");
+    event_tracer->log_intermediate_output_delegate(
+        name, delegate_debug_id, output);
+  }
+#else //! ET_EVENT_TRACER_ENABLED
+  (void)name;
+  (void)delegate_debug_id;
+  (void)output;
+#endif
+}
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::event_tracer_end_profiling_delegate;
+using ::executorch::runtime::event_tracer_log_output_delegate;
+using ::executorch::runtime::event_tracer_log_profiling_delegate;
+using ::executorch::runtime::event_tracer_start_profiling_delegate;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/exec_aten/exec_aten.h b/include/executorch/runtime/core/exec_aten/exec_aten.h
new file mode 100644
index 00000000000..10075ab5920
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/exec_aten.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/tensor_shape_dynamism.h> // @manual
+#include <executorch/runtime/platform/compiler.h>
+#ifdef USE_ATEN_LIB
+#include <ATen/Tensor.h> // @manual
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h> // @manual
+#include <c10/core/Layout.h> // @manual
+#include <c10/core/MemoryFormat.h> // @manual
+#include <c10/core/Scalar.h> // @manual
+#include <c10/util/ArrayRef.h> // @manual
+#include <c10/util/BFloat16-math.h> // @manual
+#include <c10/util/BFloat16.h> // @manual
+#include <c10/util/Half.h> // @manual
+#include <c10/util/Optional.h> // @manual
+#include <c10/util/complex.h> // @manual
+#include <c10/util/qint32.h> // @manual
+#include <c10/util/qint8.h> // @manual
+#include <c10/util/quint2x4.h> // @manual
+#include <c10/util/quint4x2.h> // @manual
+#include <c10/util/quint8.h> // @manual
+#include <c10/util/string_view.h> // @manual
+#include <torch/torch.h>
+#else // use executor
+#include <executorch/runtime/core/array_ref.h> // @manual
+#include <executorch/runtime/core/portable_type/bfloat16.h> // @manual
+#include <executorch/runtime/core/portable_type/bfloat16_math.h> // @manual
+#include <executorch/runtime/core/portable_type/complex.h> // @manual
+#include <executorch/runtime/core/portable_type/device.h> // @manual
+#include <executorch/runtime/core/portable_type/half.h> // @manual
+#include <executorch/runtime/core/portable_type/optional.h> // @manual
+#include <executorch/runtime/core/portable_type/qint_types.h> // @manual
+#include <executorch/runtime/core/portable_type/scalar.h> // @manual
+#include <executorch/runtime/core/portable_type/scalar_type.h> // @manual
+#include <executorch/runtime/core/portable_type/string_view.h> // @manual
+#include <executorch/runtime/core/portable_type/tensor.h> // @manual
+#include <executorch/runtime/core/portable_type/tensor_options.h> // @manual
+
+#endif
+
+/**
+ * This hack is for separating out ATen mode vs non-ATen mode. In ATen mode,
+ * we use the ATen types directly. In non-ATen mode, we use the portable types.
+ * To avoid duplicate symbols and/or duplicate operator registration, when a
+ * user depends on both the ATen mode and non-ATen mode versions of the
+ * ExecuTorch library.
+ */
+#ifndef ET_RUNTIME_NAMESPACE
+#if defined(USE_ATEN_LIB)
+#define ET_RUNTIME_NAMESPACE runtime::aten
+#else
+#define ET_RUNTIME_NAMESPACE runtime
+#endif
+#endif
+
+namespace executorch {
+namespace aten {
+
+using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism;
+
+#ifdef USE_ATEN_LIB
+
+using Tensor = at::Tensor;
+using TensorList = at::TensorList;
+using TensorImpl = at::TensorImpl;
+using string_view = std::string_view;
+template <typename T>
+using ArrayRef = c10::ArrayRef<T>;
+template <typename T>
+using optional = std::optional<T>;
+using nullopt_t = std::nullopt_t;
+using std::nullopt;
+using ScalarType = at::ScalarType;
+using Scalar = c10::Scalar;
+using MemoryFormat = c10::MemoryFormat;
+using SizesType = int64_t;
+using DimOrderType = uint8_t;
+using StridesType = int64_t;
+using Device = c10::Device;
+using DeviceType = c10::DeviceType;
+using Layout = c10::Layout;
+
+// Custom types that map to ScalarType
+using Half = c10::Half;
+template <typename T>
+using complex = c10::complex<T>;
+using qint8 = c10::qint8;
+using quint8 = c10::quint8;
+using qint32 = c10::qint32;
+using BFloat16 = c10::BFloat16;
+using quint4x2 = c10::quint4x2;
+using quint2x4 = c10::quint2x4;
+using IntArrayRef = at::IntArrayRef;
+
+template <typename T>
+using OptionalArrayRef = c10::OptionalArrayRef<T>;
+using OptionalIntArrayRef = OptionalArrayRef<int64_t>;
+
+inline ssize_t compute_numel(const SizesType* sizes, ssize_t dim) {
+  return static_cast<ssize_t>(
+      c10::multiply_integers(c10::ArrayRef<SizesType>(sizes, dim)));
+}
+
+#undef ET_PRI_TENSOR_SIZE
+#define ET_PRI_TENSOR_SIZE PRId64
+
+#undef ET_PRI_TENSOR_DIM
+#define ET_PRI_TENSOR_DIM PRId64
+
+#undef ET_PRI_TENSOR_NUMEL
+#define ET_PRI_TENSOR_NUMEL PRId64
+
+#undef ET_PRI_SIZES_AND_STRIDES
+#define ET_PRI_SIZES_AND_STRIDES PRId64
+
+#else // Use executor types
+
+using Tensor = torch::executor::Tensor;
+using TensorImpl = torch::executor::TensorImpl;
+using string_view = torch::executor::string_view;
+template <typename T>
+using ArrayRef = torch::executor::ArrayRef<T>;
+template <typename T>
+using optional = torch::executor::optional<T>;
+using nullopt_t = torch::executor::nullopt_t;
+// NOLINTNEXTLINE(facebook-hte-NamespaceScopedStaticDeclaration)
+using std::nullopt;
+using ScalarType = torch::executor::ScalarType;
+using TensorList = ArrayRef<Tensor>;
+using Scalar = torch::executor::Scalar;
+using MemoryFormat = torch::executor::MemoryFormat;
+using SizesType = torch::executor::Tensor::SizesType;
+using DimOrderType = torch::executor::Tensor::DimOrderType;
+using StridesType = torch::executor::Tensor::StridesType;
+using Device = torch::executor::Device;
+using DeviceType = torch::executor::DeviceType;
+using Layout = torch::executor::Layout;
+
+// Custom types that map to ScalarType
+using Half = torch::executor::Half;
+template <typename T>
+using complex = torch::executor::complex<T>;
+using qint8 = torch::executor::qint8;
+using quint8 = torch::executor::quint8;
+using qint32 = torch::executor::qint32;
+using BFloat16 = torch::executor::BFloat16;
+using quint4x2 = torch::executor::quint4x2;
+using quint2x4 = torch::executor::quint2x4;
+
+using IntArrayRef = torch::executor::IntArrayRef;
+
+template <typename T>
+using OptionalArrayRef =
+    torch::executor::optional<torch::executor::ArrayRef<T>>;
+using OptionalIntArrayRef = OptionalArrayRef<int64_t>;
+
+using torch::executor::compute_numel;
+
+#endif // Use ExecuTorch types
+
+} // namespace aten
+} // namespace executorch
+
+// DEPRECATED: The exec_aten:: namespace is deprecated. Use executorch::aten::
+// instead.
+namespace exec_aten = executorch::aten;
+
+namespace torch {
+namespace executor {
+using TensorList = ::executorch::aten::TensorList;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/exec_aten/testing_util/tensor_factory.h b/include/executorch/runtime/core/exec_aten/testing_util/tensor_factory.h
new file mode 100644
index 00000000000..1e29b220251
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/testing_util/tensor_factory.h
@@ -0,0 +1,1132 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/tensor_shape_dynamism.h>
+#include <executorch/runtime/platform/assert.h>
+
+#ifdef USE_ATEN_LIB
+#include <ATen/ATen.h>
+#else // !USE_ATEN_LIB
+#include <memory>
+#include <numeric>
+#include <vector>
+#endif // !USE_ATEN_LIB
+
+namespace executorch {
+namespace runtime {
+namespace testing {
+
+namespace internal {
+
+/**
+ * Returns the number of elements in the tensor, given the dimension
+ * sizes, assuming contiguous data.
+ */
+inline size_t sizes_to_numel(const std::vector<int32_t>& sizes) {
+  size_t n = 1;
+  for (auto s : sizes) {
+    n *= s;
+  }
+  return n;
+}
+
+/**
+ * Check if given strides is legal under given sizes. In the `make` function,
+ * the `strides` shall ensure:
+ *  - a. strides.size() == sizes.size()
+ *  - b. all strides are positive.
+ *  - c. All underlying data be accessed.
+ *  - d. All legal indexes can access an underlying data.
+ *  - e. No two indexes access a same data.
+ *  - f. No out of bounds data can be accessed.
+ *
+ * @param[in] sizes The sizes of the dimensions of the Tensor.
+ * @param[in] strides The desired strides for creating new tensor.
+ * @return The strides is legal or not
+ */
+
+inline bool check_strides(
+    const std::vector<int32_t> sizes,
+    const std::vector<executorch::aten::StridesType> strides) {
+  if (sizes.size() != strides.size()) {
+    // The length of stride vector shall equal to size vector.
+    return false;
+  }
+
+  if (strides.size() == 0) {
+    // Both sizes and strides are empty vector. Legal!
+    return true;
+  }
+
+  // Check if input non-empty strides is legal. The defination of legal is in
+  // the comment above function. To check it, we first reformat the strides into
+  // contiguous style, in where the strides should be sorted from high to low.
+  // Then rearrange the size based on same transformation. After that, we can
+  // check if strides[i] == strides[i + 1] * sizes[i + 1] for all i in
+  // [0, sizes.size() - 1) and strides[sizes.size() - 1] == 1
+
+  // Get the mapping between current strides and sorted strides (from high to
+  // low, if equal then check if correspond size is 1 or 0 in same dimension)
+  // e.g. a = tensor(3, 2, 1).permute(2, 1, 0), a.size() == (1, 2, 3) and
+  // a.strides == (1, 1, 2). We want to sort create a mapping to make the
+  // sorted_stride as (2, 1, 1) while sorted_size == (3, 2, 1)
+  std::vector<std::int32_t> sorted_idx(sizes.size());
+  for (size_t i = 0; i < sizes.size(); i++) {
+    sorted_idx[i] = i;
+  }
+  std::sort(
+      sorted_idx.begin(),
+      sorted_idx.end(),
+      [&](const int32_t& a, const int32_t& b) {
+        if (strides[a] != strides[b]) {
+          return strides[a] > strides[b];
+        } else {
+          // When strides equal to each other, put the index whose
+          // coresponding size equal to 0 or 1 to the right. Update the rule to
+          // the following comparsion to circumvent strict weak ordering.
+          return (sizes[a] ? sizes[a] : 1) > (sizes[b] ? sizes[b] : 1);
+        }
+      });
+
+  // Use the mapping to rearrange the sizes and strides
+  std::vector<std::int32_t> sorted_sizes(sizes.size());
+  std::vector<std::int32_t> sorted_strides(sizes.size());
+  for (size_t i = 0; i < sizes.size(); i++) {
+    sorted_sizes[i] = sizes[sorted_idx[i]] == 0 ? 1 : sizes[sorted_idx[i]];
+    sorted_strides[i] = strides[sorted_idx[i]];
+  }
+
+  // All strides should be positive. We have sorted it mainly based on strides,
+  // so sorted_strides[-1] has lowest value.
+  if (sorted_strides[strides.size() - 1] <= 0) {
+    return false;
+  }
+
+  // Check if strides is legal
+  bool legal = sorted_strides[strides.size() - 1] == 1;
+  for (size_t i = 0; i < strides.size() - 1 && legal; i++) {
+    legal = legal &&
+        (sorted_strides[i] == sorted_strides[i + 1] * sorted_sizes[i + 1]);
+  }
+
+  return legal;
+}
+
+/**
+ * Check that a given dim order array is valid. A dim order array is valid if
+ * each value from 0 to sizes.size() - 1 appears exactly once in the dim_order
+ * array.
+ */
+inline bool check_dim_order(
+    const std::vector<int32_t>& sizes,
+    const std::vector<uint8_t>& dim_order) {
+  if (sizes.size() != dim_order.size()) {
+    return false;
+  }
+  size_t gauss_sum = 0;
+  std::vector<int> count(dim_order.size(), 0);
+  for (int i = 0; i < dim_order.size(); i++) {
+    if (dim_order[i] >= sizes.size()) {
+      return false;
+    }
+    gauss_sum += static_cast<size_t>(dim_order[i]) + 1;
+  }
+  // Use the gaussian sum to verify each dim appears exactly once
+  size_t expected_sum = (sizes.size() * (sizes.size() + 1)) / 2;
+  if (gauss_sum != expected_sum) {
+    return false;
+  }
+
+  return true;
+}
+
+inline std::vector<executorch::aten::StridesType> strides_from_dim_order(
+    const std::vector<int32_t>& sizes,
+    const std::vector<uint8_t>& dim_order) {
+  bool legal = check_dim_order(sizes, dim_order);
+  ET_CHECK_MSG(legal, "The input dim_order variable is illegal.");
+
+  size_t ndim = sizes.size();
+  std::vector<executorch::aten::StridesType> strides(ndim);
+  strides[dim_order[ndim - 1]] = 1;
+  for (int i = ndim - 2; i >= 0; --i) {
+    uint8_t cur_dim = dim_order[i];
+    uint8_t next_dim = dim_order[i + 1];
+    strides[cur_dim] = (!sizes[next_dim]) ? strides[next_dim]
+                                          : strides[next_dim] * sizes[next_dim];
+  }
+  return strides;
+}
+
+inline std::vector<uint8_t> channels_last_dim_order(size_t dims) {
+  ET_CHECK_MSG(
+      dims >= 4 && dims <= 5,
+      "Channels last dim order only valid for 4-dim and 5-dim tensors!");
+
+  std::vector<uint8_t> dim_order(dims);
+  // Channels is always assigned to dim 1
+  dim_order[dims - 1] = 1;
+
+  dim_order[0] = 0;
+  int d = 1;
+  while (d < dims - 1) {
+    dim_order[d] = d + 1;
+    d++;
+  }
+  return dim_order;
+}
+
+} // namespace internal
+
+#ifdef USE_ATEN_LIB
+
+// Note that this USE_ATEN_LIB section uses ATen-specific namespaces instead of
+// exec_aten because we know that we're working with ATen, and many of these
+// names aren't mapped into executorch::aten::.
+
+namespace internal {
+
+// This wrapper lets us override the C type associated with some ScalarType
+// values while using the defaults for everything else.
+template <c10::ScalarType DTYPE>
+struct ScalarTypeToCppTypeWrapper {
+  using ctype = typename c10::impl::ScalarTypeToCPPTypeT<DTYPE>;
+};
+
+// Use a C type of `uint8_t` instead of `bool`. The C type will be used to
+// declare a `std::vector<CTYPE>`, and `std::vector<bool>` is often optimized to
+// store a single bit per entry instead of using an array of separate `bool`
+// elements. Since the tensor data will point into the vector, it needs to use
+// one byte per element.
+template <>
+struct ScalarTypeToCppTypeWrapper<c10::ScalarType::Bool> {
+  using ctype = uint8_t;
+};
+
+} // namespace internal
+
+template <at::ScalarType DTYPE>
+class TensorFactory {
+ public:
+  /*
+   * The C types that backs the associated DTYPE. E.g., `float` for
+   * `ScalarType::Float`.
+   */
+
+  /**
+   * Used for the vector provided to the factory functions. May differ
+   * from the type usually associate with the ScalarType.
+   *
+   * Used for the vector<> parameters passed to the factory functions.
+   */
+  using ctype = typename internal::ScalarTypeToCppTypeWrapper<DTYPE>::ctype;
+
+  /**
+   * The official C type for the scalar type. Used when accessing elements
+   * of a constructed Tensor.
+   */
+  using true_ctype = typename c10::impl::ScalarTypeToCPPTypeT<DTYPE>;
+
+  TensorFactory() = default;
+
+  /**
+   * Returns a new Tensor with the specified shape, data and stride.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data that the Tensor should be initialized with. The
+   *     size of this vector must be equal to the product of the elements of
+   *     `sizes`.
+   * @param[in] strides The strides for each dimensions of the Tensor. If empty
+   *     or not specificed, the function will return a contiguous tensor based
+   *     on data and size. If not, the strides shall follow the rules:
+   *            - a. strides.size() == sizes.size().
+   *            - b. all strides are positive.
+   *            - c. All underlying data be accessed.
+   *            - d. All legal indexes can access an underlying data.
+   *            - e. No two indexes access a same data.
+   *            - f. No out of bounds data can be accessed.
+   *
+   * @return A new Tensor with the specified shape and data.
+   */
+  at::Tensor make(
+      const std::vector<int32_t>& sizes,
+      const std::vector<ctype>& data,
+      const std::vector<executorch::aten::StridesType> strides = {},
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto expected_numel = internal::sizes_to_numel(sizes);
+    ET_CHECK_MSG(
+        expected_numel == data.size(),
+        "Number of data elements %zd "
+        "does not match expected number of elements %zd",
+        data.size(),
+        expected_numel);
+
+    at::Tensor t;
+    if (strides.empty()) {
+      t = zeros(sizes);
+    } else {
+      bool legal = internal::check_strides(sizes, strides);
+      ET_CHECK_MSG(legal, "The input strides variable is illegal.");
+
+      t = empty_strided(sizes, strides);
+    }
+    if (t.nbytes() > 0) {
+      std::transform(
+          data.begin(), data.end(), t.template data<true_ctype>(), [](auto x) {
+            return static_cast<true_ctype>(x);
+          });
+    }
+    return t;
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, data and dim order.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data that the Tensor should be initialized with. The
+   *     size of this vector must be equal to the product of the elements of
+   *     `sizes`.
+   * @param[in] dim_order The dim order describing how tensor memory is laid
+   * out. If empty or not specificed, the function will use a contiguous dim
+   * order of {0, 1, 2, 3, ...}
+   *
+   * @return A new Tensor with the specified shape and data.
+   */
+  at::Tensor make_with_dimorder(
+      const std::vector<int32_t>& sizes,
+      const std::vector<ctype>& data,
+      const std::vector<uint8_t> dim_order = {},
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto expected_numel = internal::sizes_to_numel(sizes);
+    ET_CHECK_MSG(
+        expected_numel == data.size(),
+        "Number of data elements %zd "
+        "does not match expected number of elements %zd",
+        data.size(),
+        expected_numel);
+
+    at::Tensor t;
+    if (dim_order.empty()) {
+      t = zeros(sizes);
+    } else {
+      auto strides = internal::strides_from_dim_order(sizes, dim_order);
+      t = empty_strided(sizes, strides);
+    }
+    if (t.nbytes() > 0) {
+      std::transform(
+          data.begin(), data.end(), t.template data<true_ctype>(), [](auto x) {
+            return static_cast<true_ctype>(x);
+          });
+    }
+    return t;
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape and data in channels last
+   * memory layout.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data that the Tensor should be initialized with. The
+   *     size of this vector must be equal to the product of the elements of
+   *     `sizes`.
+   *
+   * @return A new Tensor with the specified shape and data.
+   */
+  at::Tensor make_channels_last(
+      const std::vector<int32_t>& sizes,
+      const std::vector<ctype>& data,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    return make_with_dimorder(
+        sizes, data, internal::channels_last_dim_order(sizes.size()), dynamism);
+  }
+
+  /**
+   * Given data in contiguous memory format, returns a new Tensor with the
+   * specified shape and the same data but in channels last memory format.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data in contiguous memory format that the Tensor should
+   * be initialized with. The size of this vector must be equal to the product
+   * of the elements of `sizes`.
+   *
+   * @return A new Tensor with the specified shape and data in channls last
+   * memory format.
+   */
+  at::Tensor channels_last_like(
+      const at::Tensor& input,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    ET_CHECK_MSG(
+        input.sizes().size() == 4, "Only 4D tensors can be channels last");
+
+    const std::vector<int32_t> sizes(
+        input.sizes().begin(), input.sizes().end());
+
+    std::vector<uint8_t> contiguous_dim_order(sizes.size());
+    for (uint8_t i = 0; i < sizes.size(); i++) {
+      contiguous_dim_order[i] = i;
+    }
+    std::vector<executorch::aten::StridesType> contiguous_strides =
+        internal::strides_from_dim_order(sizes, contiguous_dim_order);
+
+    for (int32_t i = 0; i < input.dim(); i++) {
+      ET_CHECK_MSG(
+          input.strides()[i] == contiguous_strides[i],
+          "Input tensor is not contiguous");
+    }
+
+    int32_t N = sizes[0];
+    int32_t C = sizes[1];
+    int32_t H = sizes[2];
+    int32_t W = sizes[3];
+
+    std::vector<ctype> contiguous_data(
+        input.data_ptr<ctype>(), input.data_ptr<ctype>() + input.numel());
+    std::vector<ctype> channels_last_data(
+        N * C * H * W); // Create a new blob with the same total size to contain
+                        // channels_last data
+    for (int32_t n = 0; n < N; ++n) {
+      for (int32_t c = 0; c < C; ++c) {
+        for (int32_t h = 0; h < H; ++h) {
+          for (int32_t w = 0; w < W; ++w) {
+            // Calculate the index in the original blob
+            int32_t old_index = ((n * C + c) * H + h) * W + w;
+            // Calculate the index in the new blob
+            int32_t new_index = ((n * H + h) * W + w) * C + c;
+            // Copy the data
+            channels_last_data[new_index] = contiguous_data[old_index];
+          }
+        }
+      }
+    }
+
+    return make_with_dimorder(
+        sizes,
+        channels_last_data,
+        internal::channels_last_dim_order(sizes.size()),
+        dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing contiguous
+   * data will all elements set to `value`.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] value The value of all elements of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  at::Tensor full(
+      const std::vector<int32_t>& sizes,
+      ctype value,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto sizes64 = vec_32_to_64(sizes);
+    return at::full(at::IntArrayRef(sizes64), value, at::dtype(DTYPE));
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing channels-last
+   * contiguous data will all elements set to `value`.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] value The value of all elements of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  at::Tensor full_channels_last(
+      const std::vector<int32_t>& sizes,
+      ctype value,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto sizes64 = vec_32_to_64(sizes);
+    return at::full(at::IntArrayRef(sizes64), value, at::dtype(DTYPE))
+        .to(at::MemoryFormat::ChannelsLast);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing channels-last
+   * contiguous data with all `0` elements.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  at::Tensor zeros(
+      const std::vector<int32_t>& sizes,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto sizes64 = vec_32_to_64(sizes);
+    return at::zeros(at::IntArrayRef(sizes64), at::dtype(DTYPE));
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing contiguous data
+   * with all `0` elements.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  at::Tensor zeros_channels_last(
+      const std::vector<int32_t>& sizes,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto sizes64 = vec_32_to_64(sizes);
+    return at::zeros(at::IntArrayRef(sizes64), at::dtype(DTYPE))
+        .to(at::MemoryFormat::ChannelsLast);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing contiguous data
+   * with all `1` elements.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  at::Tensor ones(
+      const std::vector<int32_t>& sizes,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto sizes64 = vec_32_to_64(sizes);
+    return at::ones(at::IntArrayRef(sizes64), at::dtype(DTYPE));
+  }
+
+  /**
+   * Returns a new Tensor with the same shape as the input tensor, containing
+   * contiguous data with all `0` elements.
+   *
+   * @param[in] input The tensor that supplies the shape of the new Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  at::Tensor zeros_like(
+      const at::Tensor& input,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    std::vector<int64_t> sizes64 = {input.sizes().begin(), input.sizes().end()};
+    return at::full(at::IntArrayRef(sizes64), 0, at::dtype(DTYPE));
+  }
+
+  /**
+   * Returns a new Tensor with the same shape as the input tensor, containing
+   * contiguous data with all `1` elements.
+   *
+   * @param[in] input The tensor that supplies the shape of the new Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  at::Tensor ones_like(
+      const at::Tensor& input,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    std::vector<int64_t> sizes64 = {input.sizes().begin(), input.sizes().end()};
+    return at::full(at::IntArrayRef(sizes64), 1, at::dtype(DTYPE));
+  }
+
+ private:
+  /// Copies an int32_t vector into a new int64_t vector.
+  static std::vector<int64_t> vec_32_to_64(const std::vector<int32_t>& in) {
+    std::vector<int64_t> out{};
+    out.reserve(in.size());
+    for (auto i : in) {
+      out.push_back(i);
+    }
+    return out;
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape and stride.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] strides The strides for each dimensions of the Tensor
+   * @return A new Tensor with the specified shape and strides.
+   */
+  at::Tensor empty_strided(
+      const std::vector<int32_t>& sizes,
+      const std::vector<executorch::aten::StridesType>& strides,
+      ET_UNUSED TensorShapeDynamism dynamism =
+          TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    auto sizes64 = vec_32_to_64(sizes);
+    return at::empty_strided(
+        sizes64,
+        strides,
+        DTYPE,
+        /*layout_opt=*/at::Layout::Strided,
+        /*device_opt=*/at::Device(at::DeviceType::CPU),
+        /*pin_memory_opt=*/false);
+  }
+};
+
+#else // !USE_ATEN_LIB
+
+namespace {
+/*
+ * Dimension order represents how dimensions are laid out in memory,
+ * starting from the inner-most to the outer-most dimension.
+ * Thus, the conversion from strides is done by sorting the strides
+ * from larger to smaller since the dimension with the largest stride
+ * is the outer-most and the dimension with the smallest stride is the
+ inner-most.
+ * For example, tensor with sizes = (3, 5, 2) and strides = (5, 1, 15), implies
+ * dimension order of (2, 0, 1), because 2nd dimension has the biggest stride of
+ 15,
+ * followed by 0th dimension with stride of 5 and then innermost dimension being
+ the 1st
+ * dimension with size of 1. This order of (2, 0, 1) can be obtained
+ * by sorting strides from large to smaller.
+
+ * When strides do not convey dimension order unambiguously, dimension order
+ * returned is dependent on stability of sort. We employ stable sort to preserve
+ * original order. Thus when strides = (4, 3, 1, 1) returned value is (0, 1, 2,
+ 3)
+ * Another example is: sizes = (1, 3, 1, 1) with strides = (3, 1, 3, 3),
+ returned
+ * value is (0, 2, 3, 1)
+*/
+// copied from
+// https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
+// TODO: Add assert for strides[i] != 0 because strides of 0 is really used,
+// by pytorch/aten, to convey broadcasting dim.
+
+inline std::vector<uint8_t> dim_order_from_stride(
+    const std::vector<int32_t>& v) {
+  std::vector<uint8_t> indices(v.size());
+  std::iota(indices.begin(), indices.end(), 0);
+  stable_sort(indices.begin(), indices.end(), [&v](size_t i1, size_t i2) {
+    return v[i1] > v[i2];
+  });
+  return indices;
+}
+
+inline void validate_strides(
+    const std::vector<int32_t>& sizes,
+    const std::vector<int32_t>& strides) {
+  if (sizes.size() != strides.size()) {
+    ET_CHECK_MSG(false, "Stride and sizes are not equal in length");
+  }
+  for (const auto& s : strides) {
+    if (s == 0) {
+      ET_CHECK_MSG(false, "Stride value of 0 is not supported");
+    }
+  }
+  // No two dimensions can have same stride value
+  for (int32_t i = 0; i < strides.size(); ++i) {
+    for (int32_t j = i + 1; j < strides.size(); ++j) {
+      if ((sizes[i] == 0) || (sizes[j] == 0) ||
+          ((sizes[i] == 1) || (sizes[j] == 1))) {
+        continue;
+      }
+      if ((strides[i] == strides[j])) {
+        ET_CHECK_MSG(
+            false,
+            "Stride value and size dont comply at index %d."
+            " strides[%d]: %d, strides[%d] = %d, sizes[%d] = %d, sizes[%d] = %d",
+            i,
+            i,
+            strides[i],
+            j,
+            strides[j],
+            i,
+            sizes[i],
+            j,
+            sizes[j]);
+      }
+    }
+  }
+}
+
+} // namespace
+
+// Note that this !USE_ATEN_LIB section uses ExecuTorch-specific namespaces
+// instead of exec_aten to make it clear that we're dealing with ETensor, and
+// because many of these names aren't mapped into executorch::aten::.
+
+namespace internal {
+
+// This wrapper lets us override the C type associated with some ScalarType
+// values while using the defaults for everything else.
+template <torch::executor::ScalarType DTYPE>
+struct ScalarTypeToCppTypeWrapper {
+  using ctype =
+      typename ::executorch::runtime::ScalarTypeToCppType<DTYPE>::type;
+};
+
+// Use a C type of `uint8_t` instead of `bool`. The C type will be used to
+// declare a `std::vector<CTYPE>`, and `std::vector<bool>` is often optimized to
+// store a single bit per entry instead of using an array of separate `bool`
+// elements. Since the tensor data will point into the vector, it needs to use
+// one byte per element.
+template <>
+struct ScalarTypeToCppTypeWrapper<torch::executor::ScalarType::Bool> {
+  using ctype = uint8_t;
+};
+
+// Use a C type of `uint16_t` instead of `Bits16` to simplify code reuse when
+// testing multiple integer types.
+template <>
+struct ScalarTypeToCppTypeWrapper<torch::executor::ScalarType::Bits16> {
+  using ctype = uint16_t;
+};
+
+// Use a C type of `uint16_t` instead of `UInt16` to simplify code reuse when
+// testing multiple integer types.
+template <>
+struct ScalarTypeToCppTypeWrapper<torch::executor::ScalarType::UInt16> {
+  using ctype = uint16_t;
+};
+
+// To allow implicit conversion between simple types to `ctype`
+#define SPECIALIZE_ScalarTypeToCppTypeWrapper(CTYPE, DTYPE)               \
+  template <>                                                             \
+  struct ScalarTypeToCppTypeWrapper<torch::executor::ScalarType::DTYPE> { \
+    using ctype = typename CTYPE::underlying;                             \
+  };
+
+ET_FORALL_QINT_TYPES(SPECIALIZE_ScalarTypeToCppTypeWrapper)
+
+#undef SPECIALIZE_ScalarTypeToCppTypeWrapper
+
+} // namespace internal
+
+/**
+ * A helper class for creating Tensors, simplifying memory management.
+ *
+ * NOTE: A given TensorFactory instance owns the memory pointed to by all
+ * Tensors that it creates, and must live longer than those Tensors.
+ *
+ * Example:
+ * @code{.cpp}
+ * // A factory instance will create Tensors of a single dtype.
+ * TensorFactory<ScalarType::Int> tf;
+ *
+ * // You can create more factories if you need tensors of multiple
+ * // dtypes.
+ * TensorFactory<ScalarType::Float> tf_float;
+ *
+ * // The factory will copy the vectors provided to it, letting callers provide
+ * // inline literals.
+ * Tensor t1 = tf.make(
+ *     {2, 2}, // sizes
+ *     {1, 2, 3, 4}); // data
+ *
+ * // There are helpers for creating Tensors with all 1 or 0 elements.
+ * Tensor z = tf.zeros({2, 2});
+ * Tensor o = tf_float.ones({2, 2});
+ *
+ * // Sometimes it's helpful to share parameters.
+ * std::vector<int32_t> sizes = {2, 2};
+ * Tensor t3 = tf.make(sizes, {1, 2, 3, 4});
+ * Tensor t4 = tf.ones(sizes);
+ *
+ * // But remember that the inputs are copied, so providing the same data vector
+ * // to two Tensors will not share the same underlying data.
+ * std::vector<int> data = {1, 2, 3, 4};
+ * Tensor t5 = tf.make(sizes, data);
+ * Tensor t6 = tf.make(sizes, data);
+ * t5.mutable_data_ptr<int>()[0] = 99;
+ * EXPECT_NE(t5, t6);
+ * @endcode
+ *
+ * @tparam DTYPE The dtype of Tensors created by this factory, as a ScalarType
+ *     value like `ScalarType::Int`.
+ */
+template <torch::executor::ScalarType DTYPE>
+class TensorFactory {
+ public:
+  /**
+   * The C type that backs the associated DTYPE. E.g., `float` for
+   * `ScalarType::Float`.
+   */
+  using ctype = typename internal::ScalarTypeToCppTypeWrapper<DTYPE>::ctype;
+
+  /**
+   * The official C type for the scalar type. Used when accessing elements
+   * of a constructed Tensor.
+   */
+  using true_ctype =
+      typename executorch::runtime::ScalarTypeToCppType<DTYPE>::type;
+
+  TensorFactory() = default;
+
+  /**
+   * Returns a new Tensor with the specified shape, data and stride.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data that the Tensor should be initialized with. The
+   *     size of this vector must be equal to the product of the elements of
+   *     `sizes`.
+   * @param[in] strides The strides for each dimensions of the Tensor. If empty
+   *     or not specificed, the function will return a contiguous tensor based
+   *     on data and size. If not, the strides shall follow the rules:
+   *            - a. strides.size() == sizes.size().
+   *            - b. all strides are positive.
+   *            - c. All underlying data be accessed.
+   *            - d. All legal indexes can access an underlying data.
+   *            - e. No two indexes access a same data.
+   *            - f. No out of bounds data can be accessed.
+   *
+   * @return A new Tensor with the specified shape and data.
+   */
+  torch::executor::Tensor make(
+      const std::vector<int32_t>& sizes,
+      const std::vector<ctype>& data,
+      const std::vector<executorch::aten::StridesType> strides = {},
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    std::vector<int32_t> default_strides;
+    // Generate strides from the tensor dimensions, assuming contiguous data if
+    // given strides is empty.
+    if (!sizes.empty() && strides.empty()) {
+      default_strides.resize(sizes.size(), 1);
+      for (size_t i = sizes.size() - 1; i > 0; --i) {
+        // For sizes[i] == 0, treat it as 1 to be consistent with core Pytorch
+        auto sizes_i = sizes[i] ? sizes[i] : 1;
+        default_strides[i - 1] = default_strides[i] * sizes_i;
+      }
+    }
+    auto& actual_strides = default_strides.empty() ? strides : default_strides;
+    validate_strides(sizes, actual_strides);
+    auto dim_order = dim_order_from_stride(actual_strides);
+
+    auto expected_numel = internal::sizes_to_numel(sizes);
+    ET_CHECK_MSG(
+        expected_numel == data.size(),
+        "Number of data elements %zd "
+        "does not match expected number of elements %zd",
+        data.size(),
+        expected_numel);
+
+    bool legal = internal::check_strides(sizes, actual_strides);
+    ET_CHECK_MSG(legal, "The input strides variable is illegal.");
+
+    memory_.emplace_back(std::make_unique<TensorMemory>(
+        sizes, data, dim_order, actual_strides, dynamism));
+    return torch::executor::Tensor(&memory_.back()->impl_);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, data and dim order.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data that the Tensor should be initialized with. The
+   *     size of this vector must be equal to the product of the elements of
+   *     `sizes`.
+   * @param[in] dim_order The dim order describing how tensor memory is laid
+   * out. If empty or not specificed, the function will use a contiguous dim
+   * order of {0, 1, 2, 3, ...}
+   *
+   * @return A new Tensor with the specified shape and data.
+   */
+  torch::executor::Tensor make_with_dimorder(
+      const std::vector<int32_t>& sizes,
+      const std::vector<ctype>& data,
+      const std::vector<uint8_t> dim_order = {},
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    std::vector<uint8_t> default_dim_order;
+    // Generate strides from the tensor dimensions, assuming contiguous data if
+    // given strides is empty.
+    if (!sizes.empty() && dim_order.empty()) {
+      default_dim_order.resize(sizes.size(), 1);
+      for (size_t i = 0; i < sizes.size(); ++i) {
+        default_dim_order[i] = i;
+      }
+    }
+    auto& actual_dim_order =
+        default_dim_order.empty() ? dim_order : default_dim_order;
+
+    auto strides = internal::strides_from_dim_order(sizes, actual_dim_order);
+
+    auto expected_numel = internal::sizes_to_numel(sizes);
+    ET_CHECK_MSG(
+        expected_numel == data.size(),
+        "Number of data elements %zd "
+        "does not match expected number of elements %zd",
+        data.size(),
+        expected_numel);
+
+    memory_.emplace_back(std::make_unique<TensorMemory>(
+        sizes, data, actual_dim_order, strides, dynamism));
+    return torch::executor::Tensor(&memory_.back()->impl_);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape and data in channels last
+   * memory format.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data that the Tensor should be initialized with. The
+   *     size of this vector must be equal to the product of the elements of
+   *     `sizes`.
+   *
+   * @return A new Tensor with the specified shape and data.
+   */
+  torch::executor::Tensor make_channels_last(
+      const std::vector<int32_t>& sizes,
+      const std::vector<ctype>& data,
+      const std::vector<uint8_t> dim_order = {},
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    return make_with_dimorder(
+        sizes, data, internal::channels_last_dim_order(sizes.size()), dynamism);
+  }
+
+  /**
+   * Given data in contiguous memory format, returns a new Tensor with the
+   * specified shape and the same data but in channels last memory format.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] data The data in contiguous memory format that the Tensor should
+   * be initialized with. The size of this vector must be equal to the product
+   * of the elements of `sizes`.
+   *
+   * @return A new Tensor with the specified shape and data in channls last
+   * memory format.
+   */
+  torch::executor::Tensor channels_last_like(
+      const torch::executor::Tensor& input,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    const std::vector<int32_t> sizes(
+        input.sizes().begin(), input.sizes().end());
+
+    ET_CHECK_MSG(sizes.size() == 4, "Only 4D tensors can be channels last");
+    ET_CHECK_MSG(
+        is_contiguous_dim_order(input.dim_order().data(), input.dim()) == true,
+        "Input tensor is not contiguous");
+    int32_t N = sizes[0];
+    int32_t C = sizes[1];
+    int32_t H = sizes[2];
+    int32_t W = sizes[3];
+
+    std::vector<ctype> contiguous_data(
+        input.data_ptr<ctype>(), input.data_ptr<ctype>() + input.numel());
+    std::vector<ctype> channels_last_data(
+        N * C * H * W); // Create a new blob with the same total size to contain
+                        // channels_last data
+    for (int32_t n = 0; n < N; ++n) {
+      for (int32_t c = 0; c < C; ++c) {
+        for (int32_t h = 0; h < H; ++h) {
+          for (int32_t w = 0; w < W; ++w) {
+            // Calculate the index in the original blob
+            int32_t old_index = ((n * C + c) * H + h) * W + w;
+            // Calculate the index in the new blob
+            int32_t new_index = ((n * H + h) * W + w) * C + c;
+            // Copy the data
+            channels_last_data[new_index] = contiguous_data[old_index];
+          }
+        }
+      }
+    }
+
+    return make_with_dimorder(
+        sizes,
+        channels_last_data,
+        internal::channels_last_dim_order(sizes.size()),
+        dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing contiguous data
+   * will all elements set to `value`.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] value The value of all elements of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  torch::executor::Tensor full(
+      const std::vector<int32_t>& sizes,
+      ctype value,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    std::vector<ctype> data(internal::sizes_to_numel(sizes), value);
+    return make(sizes, data, /* empty strides */ {}, dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing channels last
+   * contiguous data will all elements set to `value`.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @param[in] value The value of all elements of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  torch::executor::Tensor full_channels_last(
+      const std::vector<int32_t>& sizes,
+      ctype value,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    std::vector<ctype> data(internal::sizes_to_numel(sizes), value);
+    return make_with_dimorder(
+        sizes, data, internal::channels_last_dim_order(sizes.size()), dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing contiguous data
+   * in channels last memory format with all `0` elements.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  torch::executor::Tensor zeros_channels_last(
+      const std::vector<int32_t>& sizes,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    return full_channels_last(sizes, 0, dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing contiguous data
+   * in contiguous memory format with all `0` elements.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  torch::executor::Tensor zeros(
+      const std::vector<int32_t>& sizes,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    return full(sizes, 0, dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the specified shape, containing contiguous data
+   * with all `1` elements.
+   *
+   * @param[in] sizes The sizes of the dimensions of the Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  torch::executor::Tensor ones(
+      const std::vector<int32_t>& sizes,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    return full(sizes, 1, dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the same shape as the input tensor, containing
+   * contiguous data with all `0` elements.
+   *
+   * @param[in] input The tensor that supplies the shape of the new Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  torch::executor::Tensor zeros_like(
+      const torch::executor::Tensor& input,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    std::vector<int32_t> sizes = {input.sizes().begin(), input.sizes().end()};
+    return full(sizes, 0, dynamism);
+  }
+
+  /**
+   * Returns a new Tensor with the same shape as the input tensor, containing
+   * contiguous data with all `1` elements.
+   *
+   * @param[in] input The tensor that supplies the shape of the new Tensor.
+   * @return A new Tensor with the specified shape.
+   */
+  torch::executor::Tensor ones_like(
+      const torch::executor::Tensor& input,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) {
+    std::vector<int32_t> sizes = {input.sizes().begin(), input.sizes().end()};
+    return full(sizes, 1, dynamism);
+  }
+
+ private:
+  /**
+   * Owns all backing memory for a single Tensor.
+   */
+  struct TensorMemory {
+    TensorMemory(
+        const std::vector<int32_t>& sizes,
+        const std::vector<ctype>& data,
+        const std::vector<uint8_t>& dim_order,
+        const std::vector<int32_t>& strides,
+        TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC)
+        : sizes_(sizes),
+          data_(data),
+          dim_order_(dim_order),
+          strides_(strides),
+          impl_(
+              DTYPE,
+              /*dim=*/sizes_.size(),
+              sizes_.data(),
+              data_.data(),
+              dim_order_.data(),
+              strides_.data(),
+              dynamism) {
+      // The only valid values for bool are 0 and 1; coerce!
+      if constexpr (std::is_same_v<true_ctype, bool>) {
+        for (auto& x : data_) {
+          x = static_cast<true_ctype>(x);
+        }
+      }
+    }
+
+    std::vector<int32_t> sizes_;
+    std::vector<ctype> data_;
+    std::vector<uint8_t> dim_order_;
+    std::vector<executorch::aten::StridesType> strides_;
+    torch::executor::TensorImpl impl_;
+  };
+
+  /**
+   * The memory pointed to by Tensors created by this factory. This is a vector
+   * of pointers so that the TensorMemory objects won't move if the vector needs
+   * to resize/realloc.
+   */
+  std::vector<std::unique_ptr<TensorMemory>> memory_;
+};
+
+#endif // !USE_ATEN_LIB
+
+/**
+ * A helper class for creating TensorLists, simplifying memory management.
+ *
+ * NOTE: A given TensorListFactory owns the memory pointed to by all TensorLists
+ * (and Tensors they contain), and must live longer than those TensorLists and
+ * Tensors.
+ */
+template <executorch::aten::ScalarType DTYPE>
+class TensorListFactory final {
+ public:
+  TensorListFactory() = default;
+  ~TensorListFactory() = default;
+
+  /**
+   * Returns a TensorList containing Tensors with the same shapes as the
+   * provided Tensors, but filled with zero elements. The dtypes of the template
+   * entries are ignored.
+   */
+  executorch::aten::TensorList zeros_like(
+      const std::vector<executorch::aten::Tensor>& templates) {
+    memory_.emplace_back(
+        std::make_unique<std::vector<executorch::aten::Tensor>>());
+    auto& vec = memory_.back();
+    std::for_each(
+        templates.begin(),
+        templates.end(),
+        [&](const executorch::aten::Tensor& t) {
+          vec->push_back(tf_.zeros_like(t));
+        });
+    return executorch::aten::TensorList(vec->data(), vec->size());
+  }
+
+ private:
+  TensorFactory<DTYPE> tf_;
+  /**
+   * The memory pointed to by TensorLists created by this factory. This is a
+   * vector of pointers so that the elements won't move if the vector needs to
+   * resize/realloc.
+   */
+  std::vector<std::unique_ptr<std::vector<executorch::aten::Tensor>>> memory_;
+};
+
+} // namespace testing
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+namespace testing {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::testing::TensorFactory;
+using ::executorch::runtime::testing::TensorListFactory;
+} // namespace testing
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/exec_aten/testing_util/tensor_util.h b/include/executorch/runtime/core/exec_aten/testing_util/tensor_util.h
new file mode 100644
index 00000000000..4284176c2dc
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/testing_util/tensor_util.h
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <gmock/gmock.h> // For MATCHER_P
+
+#include <optional>
+
+namespace executorch {
+namespace runtime {
+namespace testing {
+
+namespace internal {
+constexpr double kDefaultRtol = 1e-5;
+constexpr double kDefaultAtol = 1e-8;
+// Per
+// https://en.wikipedia.org/wiki/Half-precision_floating-point_format,
+// float16 has about 3.3 digits of precision.
+constexpr double kDefaultHalfAtol = 1e-3;
+
+// Following similar reasoning to float16, BFloat16 has
+// math.log10(2**8) = 2.4 digits of precision.
+constexpr double kDefaultBFloat16Atol = 1e-2;
+} // namespace internal
+
+/**
+ *  Returns true if the tensors are of the same shape and dtype, and if all
+ * elements are close to each other.
+ *
+ * TODO(T132992348): This function will currently fail an ET_CHECK if the
+ * strides of the tensors are not identical. Add support for comparing
+ * tensors with different strides.
+ *
+ * Note that gtest users can write `EXPECT_THAT(tensor1, IsCloseTo(tensor2))` or
+ * `EXPECT_THAT(tensor1, Not(IsCloseTo(tensor2)))`, or use the helper macros
+ * `EXPECT_TENSOR_CLOSE()` and `EXPECT_TENSOR_NOT_CLOSE()`.
+ *
+ * For exact equality, use `EXPECT_THAT(tensor1, IsEqualTo(tensor2))` or
+ * `EXPECT_THAT(tensor1, Not(IsEqualTo(tensor2)))`, or the helper macros
+ * `EXPECT_TENSOR_EQ()` and `EXPECT_TENSOR_NE()`.
+ *
+ * An element A is close to B when one is true:
+ *
+ * (1) A is equal to B.
+ * (2) A and B are both NaN, are both -infinity, or are both +infinity.
+ * (3) The error abs(A - B) is finite and less than the max error
+ *     (atol + abs(rtol * B)).
+ *
+ * If both rtol/atol are zero, this function checks for exact equality.
+ *
+ * NOTE: rtol/atol are ignored and treated as zero for non-floating-point
+ * dtypes.
+ *
+ * @param[in] a The first tensor to compare.
+ * @param[in] b The second tensor to compare.
+ * @param[in] rtol Relative tolerance; see note above.
+ * @param[in] atol Absolute tolerance; see note above.
+ * @retval true All corresponding elements of the two tensors are within the
+ *     specified tolerance of each other.
+ * @retval false One or more corresponding elements of the two tensors are
+ *     outside of the specified tolerance of each other.
+ */
+bool tensors_are_close(
+    const executorch::aten::Tensor& a,
+    const executorch::aten::Tensor& b,
+    double rtol = internal::kDefaultRtol,
+    std::optional<double> opt_atol = std::nullopt);
+
+/**
+ * Returns true if the tensors are of the same numel and dtype, and if all
+ * elements are close to each other. The tensor shapes do not need to be same.
+ *
+ * Note that gtest users can write `EXPECT_THAT(tensor1,
+ * IsDataCloseTo(tensor2))` or `EXPECT_THAT(tensor1,
+ * Not(IsDataCloseTo(tensor2)))`, or use the helper macros
+ * `EXPECT_TENSOR_DATA_CLOSE()` and `EXPECT_TENSOR_DATA_NOT_CLOSE()`.
+ *
+ * For exact equality, use `EXPECT_THAT(tensor1, IsDataEqualTo(tensor2))` or
+ * `EXPECT_THAT(tensor1, Not(IsDataEqualTo(tensor2)))`, or the helper macros
+ * `EXPECT_TENSOR_DATA_EQ()` and `EXPECT_TENSOR_DATA_NE()`.
+ *
+ * The defination of an element A is close to B is in the comment of the
+ * function `tensors_are_close`
+ *
+ * @param[in] a The first tensor to compare.
+ * @param[in] b The second tensor to compare.
+ * @param[in] rtol Relative tolerance; see note above.
+ * @param[in] atol Absolute tolerance; see note above.
+ * @retval true All corresponding elements of the two tensors are within the
+ *     specified tolerance of each other.
+ * @retval false One or more corresponding elements of the two tensors are
+ *     outside of the specified tolerance of each other.
+ */
+bool tensor_data_is_close(
+    const executorch::aten::Tensor& a,
+    const executorch::aten::Tensor& b,
+    double rtol = internal::kDefaultRtol,
+    std::optional<double> opt_atol = std::nullopt);
+
+/**
+ * Returns true if the two lists are of the same length, and
+ * tensor_data_is_close(tensors_a[i], tensors_b[i], rtol, atol) is true for all
+ * i.
+ */
+bool tensor_lists_are_close(
+    const executorch::aten::Tensor* tensors_a,
+    size_t num_tensors_a,
+    const executorch::aten::Tensor* tensors_b,
+    size_t num_tensors_b,
+    double rtol = internal::kDefaultRtol,
+    std::optional<double> opt_atol = std::nullopt);
+
+/**
+ * Lets gtest users write `EXPECT_THAT(tensor1, IsCloseTo(tensor2))` or
+ * `EXPECT_THAT(tensor1, Not(IsCloseTo(tensor2)))`.
+ *
+ * See also `EXPECT_TENSOR_CLOSE()` and `EXPECT_TENSOR_NOT_CLOSE()`.
+ */
+MATCHER_P(IsCloseTo, other, "") {
+  return tensors_are_close(arg, other);
+}
+
+/**
+ * Lets gtest users write
+ * `EXPECT_THAT(tensor1, IsCloseToWithTol(tensor2, rtol, atol))`
+ * or `EXPECT_THAT(tensor1, Not(IsCloseToWithTol(tensor2, rtol, atol)))`.
+ *
+ * See also `EXPECT_TENSOR_CLOSE_WITH_TOL()` and
+ * `EXPECT_TENSOR_NOT_CLOSE_WITH_TOL()`.
+ */
+MATCHER_P3(IsCloseToWithTol, other, rtol, atol, "") {
+  return tensors_are_close(arg, other, rtol, atol);
+}
+
+/**
+ * Lets gtest users write `EXPECT_THAT(tensor1, IsEqualTo(tensor2))` or
+ * `EXPECT_THAT(tensor1, Not(IsEqualTo(tensor2)))`.
+ *
+ * See also `EXPECT_TENSOR_EQ()` and `EXPECT_TENSOR_NE()`.
+ */
+MATCHER_P(IsEqualTo, other, "") {
+  return tensors_are_close(arg, other, /*rtol=*/0, /*atol=*/0);
+}
+
+/**
+ * Lets gtest users write `EXPECT_THAT(tensor1, IsDataCloseTo(tensor2))` or
+ * `EXPECT_THAT(tensor1, Not(IsDataCloseTo(tensor2)))`.
+ *
+ * See also `EXPECT_TENSOR_DATA_CLOSE()` and `EXPECT_TENSOR_DATA_NOT_CLOSE()`.
+ */
+MATCHER_P(IsDataCloseTo, other, "") {
+  return tensor_data_is_close(arg, other);
+}
+
+/**
+ * Lets gtest users write
+ * `EXPECT_THAT(tensor1, IsDataCloseToWithTol(tensor2, rtol, atol))`
+ * or `EXPECT_THAT(tensor1, Not(IsDataCloseToWithTol(tensor2, rtol, atol)))`.
+ *
+ * See also `EXPECT_TENSOR_CLOSE_WITH_TOL()` and
+ * `EXPECT_TENSOR_NOT_CLOSE_WITH_TOL()`.
+ */
+MATCHER_P3(IsDataCloseToWithTol, other, rtol, atol, "") {
+  return tensor_data_is_close(arg, other, rtol, atol);
+}
+
+/**
+ * Lets gtest users write `EXPECT_THAT(tensor1, IsDataEqualTo(tensor2))` or
+ * `EXPECT_THAT(tensor1, Not(IsDataEqualTo(tensor2)))`.
+ *
+ * See also `EXPECT_TENSOR_DATA_EQ()` and `EXPECT_TENSOR_DATA_NE()`.
+ */
+MATCHER_P(IsDataEqualTo, other, "") {
+  return tensor_data_is_close(arg, other, /*rtol=*/0, /*atol=*/0);
+}
+
+/**
+ * Lets gtest users write `EXPECT_THAT(tensor_list1,
+ * IsListCloseTo(tensor_list2))` or `EXPECT_THAT(tensor_list1,
+ * Not(IsListCloseTo(tensor_list2)))`.
+ *
+ * The lists can be any container of Tensor that supports ::data() and ::size().
+ *
+ * See also `EXPECT_TENSOR_LISTS_CLOSE()` and `EXPECT_TENSOR_LISTS_NOT_CLOSE()`.
+ */
+MATCHER_P(IsListCloseTo, other, "") {
+  return tensor_lists_are_close(
+      arg.data(), arg.size(), other.data(), other.size());
+}
+
+/**
+ * Lets gtest users write `EXPECT_THAT(tensor_list1,
+ * IsListEqualTo(tensor_list2))` or `EXPECT_THAT(tensor_list1,
+ * Not(IsListEqualTo(tensor_list2)))`.
+ *
+ * The lists can be any container of Tensor that supports ::data() and ::size().
+ *
+ * See also `EXPECT_TENSOR_LISTS_EQ()` and `EXPECT_TENSOR_LISTS_NE()`.
+ */
+MATCHER_P(IsListEqualTo, other, "") {
+  return tensor_lists_are_close(
+      arg.data(),
+      arg.size(),
+      other.data(),
+      other.size(),
+      /*rtol=*/0,
+      /*atol=*/0);
+}
+
+/*
+ * NOTE: Although it would be nice to make `EXPECT_EQ(t1, t2)` and friends work,
+ * that would require implementing `bool operator==(Tensor, Tensor)`.
+ *
+ * at::Tensor implements `Tensor operator==(Tensor, Tensor)`, returning an
+ * element-by-element comparison. This causes an ambiguous conflict with the
+ * `bool`-returning operator.
+ */
+#define EXPECT_TENSOR_EQ(t1, t2) \
+  EXPECT_THAT((t1), ::executorch::runtime::testing::IsEqualTo(t2))
+#define EXPECT_TENSOR_NE(t1, t2) \
+  EXPECT_THAT((t1), ::testing::Not(executorch::runtime::testing::IsEqualTo(t2)))
+#define ASSERT_TENSOR_EQ(t1, t2) \
+  ASSERT_THAT((t1), ::executorch::runtime::testing::IsEqualTo(t2))
+#define ASSERT_TENSOR_NE(t1, t2) \
+  ASSERT_THAT((t1), ::testing::Not(executorch::runtime::testing::IsEqualTo(t2)))
+
+#define EXPECT_TENSOR_CLOSE(t1, t2) \
+  EXPECT_THAT((t1), ::executorch::runtime::testing::IsCloseTo(t2))
+#define EXPECT_TENSOR_NOT_CLOSE(t1, t2) \
+  EXPECT_THAT((t1), ::testing::Not(executorch::runtime::testing::IsCloseTo(t2)))
+#define ASSERT_TENSOR_CLOSE(t1, t2) \
+  ASSERT_THAT((t1), ::executorch::runtime::testing::IsCloseTo(t2))
+#define ASSERT_TENSOR_NOT_CLOSE(t1, t2) \
+  ASSERT_THAT((t1), ::testing::Not(executorch::runtime::testing::IsCloseTo(t2)))
+
+#define EXPECT_TENSOR_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  EXPECT_THAT(                                           \
+      (t1), ::executorch::runtime::testing::IsCloseToWithTol(t2, rtol, atol))
+#define EXPECT_TENSOR_NOT_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  EXPECT_THAT(                                               \
+      (t1),                                                  \
+      ::testing::Not(                                        \
+          executorch::runtime::testing::IsCloseToWithTol(t2, rtol, atol)))
+#define ASSERT_TENSOR_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  ASSERT_THAT(                                           \
+      (t1), ::executorch::runtime::testing::IsCloseToWithTol(t2, rtol, atol))
+#define ASSERT_TENSOR_NOT_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  ASSERT_THAT(                                               \
+      (t1),                                                  \
+      ::testing::Not(                                        \
+          executorch::runtime::testing::IsCloseToWithTol(t2, rtol, atol)))
+
+#define EXPECT_TENSOR_DATA_EQ(t1, t2) \
+  EXPECT_THAT((t1), ::executorch::runtime::testing::IsDataEqualTo(t2))
+#define EXPECT_TENSOR_DATA_NE(t1, t2) \
+  EXPECT_THAT(                        \
+      (t1), ::testing::Not(executorch::runtime::testing::IsDataEqualTo(t2)))
+#define ASSERT_TENSOR_DATA_EQ(t1, t2) \
+  ASSERT_THAT((t1), ::executorch::runtime::testing::IsDataEqualTo(t2))
+#define ASSERT_TENSOR_DATA_NE(t1, t2) \
+  ASSERT_THAT(                        \
+      (t1), ::testing::Not(executorch::runtime::testing::IsDataEqualTo(t2)))
+
+#define EXPECT_TENSOR_DATA_CLOSE(t1, t2) \
+  EXPECT_THAT((t1), ::executorch::runtime::testing::IsDataCloseTo(t2))
+#define EXPECT_TENSOR_DATA_NOT_CLOSE(t1, t2) \
+  EXPECT_THAT(                               \
+      (t1), ::testing::Not(executorch::runtime::testing::IsDataCloseTo(t2)))
+#define ASSERT_TENSOR_DATA_CLOSE(t1, t2) \
+  ASSERT_THAT((t1), ::executorch::runtime::testing::IsDataCloseTo(t2))
+#define ASSERT_TENSOR_DATA_NOT_CLOSE(t1, t2) \
+  ASSERT_THAT(                               \
+      (t1), ::testing::Not(executorch::runtime::testing::IsDataCloseTo(t2)))
+
+#define EXPECT_TENSOR_DATA_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  EXPECT_THAT(                                                \
+      (t1),                                                   \
+      ::executorch::runtime::testing::IsDataCloseToWithTol(t2, rtol, atol))
+#define EXPECT_TENSOR_DATA_NOT_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  EXPECT_THAT(                                                    \
+      (t1),                                                       \
+      ::testing::Not(                                             \
+          executorch::runtime::testing::IsDataCloseToWithTol(t2, rtol, atol)))
+#define ASSERT_TENSOR_DATA_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  ASSERT_THAT(                                                \
+      (t1),                                                   \
+      ::executorch::runtime::testing::IsDataCloseToWithTol(t2, rtol, atol))
+#define ASSERT_TENSOR_DATA_NOT_CLOSE_WITH_TOL(t1, t2, rtol, atol) \
+  ASSERT_THAT(                                                    \
+      (t1),                                                       \
+      ::testing::Not(                                             \
+          executorch::runtime::testing::IsDataCloseToWithTol(t2, rtol, atol)))
+
+/*
+ * Helpers for comparing lists of Tensors.
+ */
+
+#define EXPECT_TENSOR_LISTS_EQ(t1, t2) \
+  EXPECT_THAT((t1), ::executorch::runtime::testing::IsListEqualTo(t2))
+#define EXPECT_TENSOR_LISTS_NE(t1, t2) \
+  EXPECT_THAT(                         \
+      (t1), ::testing::Not(executorch::runtime::testing::IsListEqualTo(t2)))
+#define ASSERT_TENSOR_LISTS_EQ(t1, t2) \
+  ASSERT_THAT((t1), ::executorch::runtime::testing::IsListEqualTo(t2))
+#define ASSERT_TENSOR_LISTS_NE(t1, t2) \
+  ASSERT_THAT(                         \
+      (t1), ::testing::Not(executorch::runtime::testing::IsListEqualTo(t2)))
+
+#define EXPECT_TENSOR_LISTS_CLOSE(t1, t2) \
+  EXPECT_THAT((t1), ::executorch::runtime::testing::IsListCloseTo(t2))
+#define EXPECT_TENSOR_LISTS_NOT_CLOSE(t1, t2) \
+  EXPECT_THAT(                                \
+      (t1), ::testing::Not(executorch::runtime::testing::IsListCloseTo(t2)))
+#define ASSERT_TENSOR_LISTS_CLOSE(t1, t2) \
+  ASSERT_THAT((t1), ::executorch::runtime::testing::IsListCloseTo(t2))
+#define ASSERT_TENSOR_LISTS_NOT_CLOSE(t1, t2) \
+  ASSERT_THAT(                                \
+      (t1), ::testing::Not(executorch::runtime::testing::IsListCloseTo(t2)))
+
+} // namespace testing
+} // namespace runtime
+} // namespace executorch
+
+// ATen already defines operator<<() for Tensor and ScalarType.
+#ifndef USE_ATEN_LIB
+
+/*
+ * These functions must be declared in the original namespaces of their
+ * associated types so that C++ can find them.
+ */
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/**
+ * Prints the ScalarType to the stream as a human-readable string.
+ *
+ * See also executorch::runtime::toString(ScalarType t) in ScalarTypeUtil.h.
+ */
+std::ostream& operator<<(std::ostream& os, const ScalarType& t);
+
+/**
+ * Prints the Tensor to the stream as a human-readable string.
+ */
+std::ostream& operator<<(std::ostream& os, const Tensor& t);
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+#endif // !USE_ATEN_LIB
+
+namespace torch {
+namespace executor {
+namespace testing {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::testing::IsCloseTo;
+using ::executorch::runtime::testing::IsCloseToWithTol;
+using ::executorch::runtime::testing::IsDataCloseTo;
+using ::executorch::runtime::testing::IsDataCloseToWithTol;
+using ::executorch::runtime::testing::IsDataEqualTo;
+using ::executorch::runtime::testing::IsEqualTo;
+using ::executorch::runtime::testing::IsListCloseTo;
+using ::executorch::runtime::testing::IsListEqualTo;
+using ::executorch::runtime::testing::tensor_data_is_close;
+using ::executorch::runtime::testing::tensor_lists_are_close;
+using ::executorch::runtime::testing::tensors_are_close;
+} // namespace testing
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/exec_aten/util/dim_order_util.h b/include/executorch/runtime/core/exec_aten/util/dim_order_util.h
new file mode 100644
index 00000000000..f5eed09300b
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/util/dim_order_util.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <c10/util/irange.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/compiler.h>
+
+namespace executorch {
+namespace runtime {
+
+namespace {
+template <typename DimOrderType>
+bool validate_dim_order(const DimOrderType* dim_order, const size_t dims) {
+  for (size_t i = 0; i < dims; ++i) {
+    if (dim_order[i] >= static_cast<DimOrderType>(dims)) {
+      return false;
+    }
+  }
+  return true;
+}
+} // namespace
+
+/**
+ * Check if a given dim_order array is equivalent to the contiguous dim order of
+ * {0, 1, 2, 3, ...}
+ *
+ * @param[in] dim_order pointer to dim_order array
+ * @param[in] dims length of the dim_order array
+ */
+template <typename DimOrderType>
+inline bool is_contiguous_dim_order(
+    const DimOrderType* dim_order,
+    const size_t dims) {
+  for (size_t i = 0; i < dims; ++i) {
+    if (dim_order[i] != static_cast<DimOrderType>(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/**
+ * Check if a given dim_order array is equivalent to a channels last dim order.
+ * Channels last dim order is only valid for 4-dim and 5-dim tensors.
+ *
+ * @param[in] dim_order pointer to dim_order array
+ * @param[in] dims length of the dim_order array
+ */
+template <typename DimOrderType>
+bool is_channels_last_dim_order(
+    const DimOrderType* dim_order,
+    const size_t dims) {
+  if (dims != 4 && dims != 5) {
+    return false;
+  }
+  // 4-dim tensor is interpreted as NCHW, 5-dim tensor is interpreted as NCHWD
+  DimOrderType channels_dim = 1;
+  // Last value in the dim order should be the channels dim
+  if (dim_order[dims - 1] != channels_dim) {
+    return false;
+  }
+
+  if (dim_order[0] != 0) {
+    return false;
+  }
+  DimOrderType d = 1;
+  while (d < static_cast<DimOrderType>(dims) - 1) {
+    if (dim_order[d] != d + 1) {
+      return false;
+    }
+    d++;
+  }
+  return true;
+}
+
+/*
+ * This utility translated sizes to strides by using dimension order
+ * information. Dimension order specifies how the dimensions are laid out in the
+ * memory. For example for Size = [2, 3, 4, 5] dim_names = [N, C, H, W]
+ * dim_order = [0, 2, 3, 1]
+ * strides = [60, 1, 15, 3]
+ * param[in]: sizes, pointer to sizes array
+ * param[in]: dim_order, pointer to dimension order array
+ * param[in]: dims, number of dims. Sizes and dim_order must be sizes to dims
+ * param[out]: strides, pointer to strides array that is filled in
+ *
+ * NB: Reason for not using ArrayRef is the dependency on kernel_types.h
+ * This header cannot be included, because of circular dep it causes.
+ * kernel_types depends on executorch_kernel_types in lean mode, which compiles
+ * TensorImpl.cpp. executorch_kernel_types needs to depend on dim_order_utils
+ * in order to utilize dim_order_to_stride in its resize impl. If
+ * dim_order_utils depends on kernel_type, we have circular deps. This is also
+ * the reason for templatizing this function. Better ideas welcome!
+ * TODO(T148342910)
+ *
+ * Note that this function does not check that the provided dim order is valid.
+ * This function should only be used when the validity of the dim order has been
+ * checked beforehand. A safer version of this function is provided below as
+ * dim_order_to_stride which will check that the dim order is valid.
+ */
+template <typename SizesType, typename DimOrderType, typename StridesType>
+inline void dim_order_to_stride_nocheck(
+    const SizesType* sizes,
+    const DimOrderType* dim_order,
+    const size_t dims,
+    StridesType* strides) {
+  // For 0 dim tensors, just return ok.
+  if (dims == 0) {
+    return;
+  }
+  // Fastest moving dim has stride of 1.
+  // For example:
+  // Size = [2, 3, 4, 5] dim_names = [N, C, H, W]
+  // dim_order = [0, 2, 3, 1]
+  // strides = [60, 1, 15, 3]
+  strides[dim_order[dims - 1]] = 1;
+  for (int32_t i = dims - 2; i >= 0; --i) {
+    if (sizes[dim_order[i + 1]] == 0) {
+      strides[dim_order[i]] = strides[dim_order[i + 1]];
+    } else {
+      strides[dim_order[i]] =
+          strides[dim_order[i + 1]] * sizes[dim_order[i + 1]];
+    }
+  }
+}
+
+template <typename SizesType, typename DimOrderType, typename StridesType>
+ET_NODISCARD inline Error dim_order_to_stride(
+    const SizesType* sizes,
+    const DimOrderType* dim_order,
+    const size_t dims,
+    StridesType* strides) {
+  // For 0 dim tensors, just return ok.
+  if (dims == 0) {
+    return Error::Ok;
+  }
+  ET_CHECK_OR_RETURN_ERROR(
+      validate_dim_order(dim_order, dims),
+      InvalidArgument,
+      "Invalid dim order. One of the value is larger than the number of dims %zu",
+      dims);
+
+  dim_order_to_stride_nocheck(sizes, dim_order, dims, strides);
+  return Error::Ok;
+}
+
+namespace internal {
+
+template <typename StridesType, typename DimOrderType>
+struct StrideDimOrder {
+  StridesType stride;
+  DimOrderType dim_order;
+
+  StrideDimOrder(StridesType stride_, DimOrderType dim_order_)
+      : stride(stride_), dim_order(dim_order_) {}
+  StrideDimOrder() = default;
+  bool operator>(const StrideDimOrder& other) const {
+    // descending order
+    return stride < other.stride;
+  }
+};
+
+template <typename ValueType>
+struct Sorter {
+ public:
+  void quick_sort(ValueType arr[], int32_t low, int32_t high) {
+    if (low < high) {
+      ValueType pivot = arr[high];
+      int32_t pos = partition(arr, low, high, pivot);
+
+      quick_sort(arr, low, pos - 1);
+      quick_sort(arr, pos + 1, high);
+    }
+  }
+
+ private:
+  void swap(ValueType arr[], int32_t pos1, int32_t pos2) noexcept {
+    ValueType temp = arr[pos1];
+    arr[pos1] = arr[pos2];
+    arr[pos2] = temp;
+  }
+
+  int32_t
+  partition(ValueType arr[], int32_t low, int32_t high, ValueType pivot) {
+    int32_t i = low;
+    int32_t j = low;
+    while (i <= high) {
+      if (arr[i] > pivot) {
+        i++;
+      } else {
+        swap(arr, i++, j++);
+      }
+    }
+    return j - 1;
+  }
+};
+
+} // namespace internal
+
+/*
+ * This utility translated strides to dimension order
+ * information. Dimension order specifies how the dimensions are laid out in the
+ * memory. For example for tensor with sizes [3, 5, 2] and strides [5, 1, 15],
+ * dim order should be [2, 0, 1], which is obtained by sorting strides in
+ * descending order. param[in]: sizes, pointer to sizes array param[in]:
+ * dim_order, pointer to dimension order array param[in]: dims, number of dims.
+ * Sizes and dim_order must be sizes to dims param[out]: strides, pointer to
+ * strides array that is filled in
+ *
+ * NB: Reason for not using ArrayRef is the dependency on kernel_types.h
+ * This header cannot be included, because of circular dep it causes.
+ * kernel_types depends on executorch_kernel_types in lean mode, which compiles
+ * TensorImpl.cpp. executorch_kernel_types needs to depend on dim_order_utils
+ * in order to utilize dim_order_to_stride in its resize impl. If
+ * dim_order_utils depends on kernel_type, we have circular deps. This is also
+ * the reason for templatizing this function. Better ideas welcome!
+ * TODO(T148342910)
+ */
+template <typename DimOrderType, typename StridesType>
+ET_NODISCARD inline Error stride_to_dim_order(
+    const StridesType* strides,
+    const size_t dims,
+    DimOrderType* dim_order) {
+  const size_t kMaxNumOfDimensions = 16;
+  ET_CHECK_OR_RETURN_ERROR(
+      dim_order != nullptr,
+      MemoryAllocationFailed,
+      "Need memory to get dim_order.");
+  ET_CHECK_OR_RETURN_ERROR(
+      dims <= kMaxNumOfDimensions,
+      NotSupported,
+      "dims %zu exceeds maximum allowed %zu",
+      dims,
+      kMaxNumOfDimensions);
+  internal::StrideDimOrder<StridesType, DimOrderType>
+      array[kMaxNumOfDimensions];
+  for (DimOrderType i = 0; i < dims; i++) {
+    array[i].dim_order = i;
+    array[i].stride = strides[i];
+  }
+
+  internal::Sorter<internal::StrideDimOrder<StridesType, DimOrderType>> sorter;
+
+  sorter.quick_sort(array, 0, dims - 1);
+
+  for (const auto i : c10::irange(dims)) {
+    dim_order[i] = array[i].dim_order;
+  }
+  return Error::Ok;
+}
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::dim_order_to_stride;
+using ::executorch::runtime::dim_order_to_stride_nocheck;
+using ::executorch::runtime::is_channels_last_dim_order;
+using ::executorch::runtime::is_contiguous_dim_order;
+using ::executorch::runtime::stride_to_dim_order;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/exec_aten/util/scalar_type_util.h b/include/executorch/runtime/core/exec_aten/util/scalar_type_util.h
new file mode 100644
index 00000000000..6f81146e925
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/util/scalar_type_util.h
@@ -0,0 +1,1398 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ *
+ * Forked from
+ * https://github.com/pytorch/pytorch/blob/master/c10/core/ScalarType.h
+ *
+ * See file comment in ../ScalarType.h.
+ *
+ * This file contains all of the non-critical parts of the original ScalarType.h
+ * that are not required for the core ExecuTorch runtime, but may be helpful for
+ * code that uses ScalarType.
+ */
+
+#pragma once
+
+#include <array>
+#include <cinttypes>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include <executorch/runtime/platform/assert.h>
+
+#ifdef USE_ATEN_LIB
+// Note that a lot of the macros/functions defined in this ScalarTypeUtil.h file
+// are also defined in c10/core/ScalarType.h, which is included via
+// kernel_types.h when building in ATen mode. They tend to use different names
+// and a different namespace, but if there are conflicts they should be resolved
+// here.
+#define ET_FORALL_SCALAR_TYPES AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS
+#include <c10/core/ScalarType.h>
+namespace executorch {
+namespace aten {
+using ScalarType = at::ScalarType;
+} // namespace aten
+} // namespace executorch
+#else // !USE_ATEN_LIB
+#include <executorch/runtime/core/portable_type/scalar_type.h>
+#include <executorch/runtime/core/portable_type/string_view.h>
+namespace executorch {
+namespace aten {
+using ScalarType = torch::executor::ScalarType;
+using string_view = torch::executor::string_view;
+} // namespace aten
+} // namespace executorch
+#endif // USE_ATEN_LIB
+// DEPRECATED: The executorch::aten:: namespace is deprecated. Use
+// executorch::aten:: instead.
+namespace exec_aten = ::executorch::aten;
+
+namespace executorch {
+namespace runtime {
+
+#if !defined(USE_ATEN_LIB)
+// Util to figure out if the scalar type if one of the
+// supported floating point types.
+// In aten mode, aten lib already has these utils as part of
+// its vec_base.h
+template <typename T>
+struct is_floating_point
+    : std::integral_constant<
+          bool,
+          std::is_floating_point<T>::value ||
+              std::is_same_v<T, torch::executor::Half> ||
+              std::is_same_v<T, torch::executor::BFloat16>> {};
+
+// Util to figure out if the scalar type is one of the
+// reduced precision floating point types.
+template <typename T>
+struct is_reduced_floating_point
+    : std::integral_constant<
+          bool,
+          std::is_same_v<T, torch::executor::Half> ||
+              std::is_same_v<T, torch::executor::BFloat16>> {};
+
+template <typename T>
+constexpr bool is_reduced_floating_point_v =
+    is_reduced_floating_point<T>::value;
+#endif
+
+/// Maps ScalarTypes to C++ types.
+template <::executorch::aten::ScalarType N>
+struct ScalarTypeToCppType;
+
+#define SPECIALIZE_ScalarTypeToCppType(cpp_type, scalar_type)               \
+  template <>                                                               \
+  struct ScalarTypeToCppType<::executorch::aten::ScalarType::scalar_type> { \
+    using type = cpp_type;                                                  \
+  };
+
+ET_FORALL_SCALAR_TYPES(SPECIALIZE_ScalarTypeToCppType)
+
+#undef SPECIALIZE_ScalarTypeToCppType
+
+/// Maps C++ types to ScalarTypes.
+template <typename T>
+struct CppTypeToScalarType;
+
+#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \
+  template <>                                                 \
+  struct CppTypeToScalarType<cpp_type>                        \
+      : std::integral_constant<                               \
+            ::executorch::aten::ScalarType,                   \
+            ::executorch::aten::ScalarType::scalar_type> {};
+
+ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType)
+
+#undef SPECIALIZE_CppTypeToScalarType
+
+//
+// Macros that iterate across different subsets of ScalarTypes.
+//
+// See ET_FORALL_SCALAR_TYPES in ScalarType.h to iterate across all ScalarType
+// names and types.
+//
+// For all of these macros, the final `_` parameter is the name of another macro
+// that takes two parameters: the name of a C type, and the name of the
+// corresponding ScalarType enumerator.
+//
+// Note that these macros should use fully-qualified namespaces (starting with
+// `::`) to ensure that they can be called safely in any arbitrary namespace.
+//
+
+// In this context, "INT" means integer C types, which is why the quantized
+// integer types are not included.
+#define ET_FORALL_INT_TYPES(_) \
+  _(uint8_t, Byte)             \
+  _(int8_t, Char)              \
+  _(int16_t, Short)            \
+  _(int32_t, Int)              \
+  _(int64_t, Long)
+
+// Here `ANOTHER_INPUT` should be another variable to be forwarded to a given
+// function.
+#define ET_FORALL_INT_TYPES_WITH(ANOTHER_INPUT, _) \
+  _(ANOTHER_INPUT, uint8_t, Byte)                  \
+  _(ANOTHER_INPUT, int8_t, Char)                   \
+  _(ANOTHER_INPUT, int16_t, Short)                 \
+  _(ANOTHER_INPUT, int32_t, Int)                   \
+  _(ANOTHER_INPUT, int64_t, Long)
+
+#define ET_FORALL_INT_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, uint8_t, Byte)                   \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int8_t, Char)                    \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int16_t, Short)                  \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int32_t, Int)                    \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long)
+
+#define ET_FORALL_INT_TYPES_AND(SCALARTYPE, _)             \
+  _(uint8_t, Byte)                                         \
+  _(int8_t, Char)                                          \
+  _(int16_t, Short)                                        \
+  _(int32_t, Int)                                          \
+  _(int64_t, Long)                                         \
+  _(::executorch::runtime::ScalarTypeToCppType<            \
+        ::executorch::aten::ScalarType::SCALARTYPE>::type, \
+    SCALARTYPE)
+
+// In this context, "FLOAT" means float C types, which is why BFloat16 is not
+// included.
+#define ET_FORALL_FLOAT_TYPES(_) \
+  _(float, Float)                \
+  _(double, Double)
+
+#define ET_FORALL_FLOAT_TYPES_AND(SCALARTYPE, _)           \
+  _(float, Float)                                          \
+  _(double, Double)                                        \
+  _(::executorch::runtime::ScalarTypeToCppType<            \
+        ::executorch::aten::ScalarType::SCALARTYPE>::type, \
+    SCALARTYPE)
+
+#define ET_FORALL_FLOAT_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \
+  _(float, Float)                                               \
+  _(double, Double)                                             \
+  _(::executorch::runtime::ScalarTypeToCppType<                 \
+        ::executorch::aten::ScalarType::SCALARTYPE1>::type,     \
+    SCALARTYPE1)                                                \
+  _(::executorch::runtime::ScalarTypeToCppType<                 \
+        ::executorch::aten::ScalarType::SCALARTYPE2>::type,     \
+    SCALARTYPE2)
+
+#define ET_FORALL_FLOATH_TYPES(_) ET_FORALL_FLOAT_TYPES_AND(Half, _)
+
+#define ET_FORALL_FLOATHBF16_TYPES(_) \
+  ET_FORALL_FLOAT_TYPES_AND2(Half, BFloat16, _)
+
+// Here `ANOTHER_INPUT` should be another variable to be forwarded to a given
+// function. Not to be confused with another scalar type as in
+// `ET_FORALL_FLOAT_TYPES_AND`.
+#define ET_FORALL_FLOAT_TYPES_WITH(ANOTHER_INPUT, _) \
+  _(ANOTHER_INPUT, float, Float)                     \
+  _(ANOTHER_INPUT, double, Double)
+
+#define ET_FORALL_FLOAT_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float)                      \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double)
+
+#define ET_FORALL_FLOATHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float)                           \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double)                         \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::Half, Half)         \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::BFloat16, BFloat16)
+
+// In this context, "REAL" means integer/float C types, which is why BFloat16
+// and Half are not included.
+#define ET_FORALL_REAL_TYPES(_) \
+  _(uint8_t, Byte)              \
+  _(int8_t, Char)               \
+  _(int16_t, Short)             \
+  _(int32_t, Int)               \
+  _(int64_t, Long)              \
+  _(float, Float)               \
+  _(double, Double)
+
+// Here `ANOTHER_INPUT` should be another variable to be forwarded to a given
+// function. Not to be confused with another scalar type as in
+// `ET_FORALL_REAL_TYPES_AND`.
+#define ET_FORALL_REAL_TYPES_WITH(ANOTHER_INPUT, _) \
+  _(ANOTHER_INPUT, uint8_t, Byte)                   \
+  _(ANOTHER_INPUT, int8_t, Char)                    \
+  _(ANOTHER_INPUT, int16_t, Short)                  \
+  _(ANOTHER_INPUT, int32_t, Int)                    \
+  _(ANOTHER_INPUT, int64_t, Long)                   \
+  _(ANOTHER_INPUT, float, Float)                    \
+  _(ANOTHER_INPUT, double, Double)
+
+#define ET_FORALL_REAL_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, uint8_t, Byte)                    \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int8_t, Char)                     \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int16_t, Short)                   \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int32_t, Int)                     \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long)                    \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float)                     \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double)
+
+#define ET_FORALL_REALHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, uint8_t, Byte)                         \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int8_t, Char)                          \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int16_t, Short)                        \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int32_t, Int)                          \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long)                         \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float)                          \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double)                        \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::Half, Half)        \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::BFloat16, BFloat16)
+
+#define ET_FORALL_REALHBBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \
+  ET_FORALL_REALHBF16_TYPES_WITH2(ANOTHER_INPUT2, ANOTHER_INPUT2, _)        \
+  _(ANOTHER_INPUT1, ANOTHER_INPUT2, bool, Bool)
+
+// For macros that take `SCALARTYPEn` parameters, those parameters should be
+// an unquoted/unqualified enumerator name like `Int` or `Float`.
+#define ET_FORALL_REAL_TYPES_AND(SCALARTYPE, _)            \
+  _(uint8_t, Byte)                                         \
+  _(int8_t, Char)                                          \
+  _(int16_t, Short)                                        \
+  _(int32_t, Int)                                          \
+  _(int64_t, Long)                                         \
+  _(float, Float)                                          \
+  _(double, Double)                                        \
+  _(::executorch::runtime::ScalarTypeToCppType<            \
+        ::executorch::aten::ScalarType::SCALARTYPE>::type, \
+    SCALARTYPE)
+
+#define ET_FORALL_REAL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \
+  _(uint8_t, Byte)                                             \
+  _(int8_t, Char)                                              \
+  _(int16_t, Short)                                            \
+  _(int32_t, Int)                                              \
+  _(int64_t, Long)                                             \
+  _(float, Float)                                              \
+  _(double, Double)                                            \
+  _(::executorch::runtime::ScalarTypeToCppType<                \
+        ::executorch::aten::ScalarType::SCALARTYPE1>::type,    \
+    SCALARTYPE1)                                               \
+  _(::executorch::runtime::ScalarTypeToCppType<                \
+        ::executorch::aten::ScalarType::SCALARTYPE2>::type,    \
+    SCALARTYPE2)
+
+#define ET_FORALL_REALH_TYPES(_) ET_FORALL_REAL_TYPES_AND(Half, _)
+
+#define ET_FORALL_REALHBF16_TYPES(_) \
+  ET_FORALL_REAL_TYPES_AND2(Half, BFloat16, _)
+
+#define ET_FORALL_REALHBBF16_TYPES(_) \
+  ET_FORALL_REAL_TYPES_AND3(Bool, Half, BFloat16, _)
+
+#define ET_FORALL_REAL_TYPES_AND_WITH(SCALARTYPE, ANOTHER_INPUT, _) \
+  _(ANOTHER_INPUT, uint8_t, Byte)                                   \
+  _(ANOTHER_INPUT, int8_t, Char)                                    \
+  _(ANOTHER_INPUT, int16_t, Short)                                  \
+  _(ANOTHER_INPUT, int32_t, Int)                                    \
+  _(ANOTHER_INPUT, int64_t, Long)                                   \
+  _(ANOTHER_INPUT, float, Float)                                    \
+  _(ANOTHER_INPUT, double, Double)                                  \
+  _(ANOTHER_INPUT,                                                  \
+    ::executorch::runtime::ScalarTypeToCppType<                     \
+        ::executorch::aten::ScalarType::SCALARTYPE>::type,          \
+    SCALARTYPE)
+
+#define ET_FORALL_REAL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \
+  _(uint8_t, Byte)                                             \
+  _(int8_t, Char)                                              \
+  _(int16_t, Short)                                            \
+  _(int32_t, Int)                                              \
+  _(int64_t, Long)                                             \
+  _(float, Float)                                              \
+  _(double, Double)                                            \
+  _(::executorch::runtime::ScalarTypeToCppType<                \
+        ::executorch::aten::ScalarType::SCALARTYPE1>::type,    \
+    SCALARTYPE1)                                               \
+  _(::executorch::runtime::ScalarTypeToCppType<                \
+        ::executorch::aten::ScalarType::SCALARTYPE2>::type,    \
+    SCALARTYPE2)
+
+#define ET_FORALL_REAL_TYPES_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, _) \
+  _(uint8_t, Byte)                                                          \
+  _(int8_t, Char)                                                           \
+  _(int16_t, Short)                                                         \
+  _(int32_t, Int)                                                           \
+  _(int64_t, Long)                                                          \
+  _(float, Float)                                                           \
+  _(double, Double)                                                         \
+  _(::executorch::runtime::ScalarTypeToCppType<                             \
+        ::executorch::aten::ScalarType::SCALARTYPE1>::type,                 \
+    SCALARTYPE1)                                                            \
+  _(::executorch::runtime::ScalarTypeToCppType<                             \
+        ::executorch::aten::ScalarType::SCALARTYPE2>::type,                 \
+    SCALARTYPE2)                                                            \
+  _(::executorch::runtime::ScalarTypeToCppType<                             \
+        ::executorch::aten::ScalarType::SCALARTYPE3>::type,                 \
+    SCALARTYPE3)
+
+#define ET_FORALL_QINT_TYPES(_)            \
+  _(::torch::executor::qint8, QInt8)       \
+  _(::torch::executor::quint8, QUInt8)     \
+  _(::torch::executor::qint32, QInt32)     \
+  _(::torch::executor::quint4x2, QUInt4x2) \
+  _(::torch::executor::quint2x4, QUInt2x4)
+
+// In this context, "COMPLEX" means complex types based on primitive C types,
+// which is why ComplexHalf is not included.
+#define ET_FORALL_COMPLEX_TYPES(_)                    \
+  _(::executorch::aten::complex<float>, ComplexFloat) \
+  _(::executorch::aten::complex<double>, ComplexDouble)
+
+#define ET_FORALL_COMPLEXH_TYPES(_)                                     \
+  _(::executorch::aten::complex<::executorch::aten::Half>, ComplexHalf) \
+  _(::executorch::aten::complex<float>, ComplexFloat)                   \
+  _(::executorch::aten::complex<double>, ComplexDouble)
+
+//
+// Utility functions to retrieve metadata for a given ScalarType
+//
+
+/**
+ * Returns true if the parameter is one of the values covered by
+ * ET_FORALL_SCALAR_TYPES.
+ */
+inline bool isValid(::executorch::aten::ScalarType type) {
+  return static_cast<int8_t>(type) >= 0 &&
+      type < ::executorch::aten::ScalarType::NumOptions &&
+      type != ::executorch::aten::ScalarType::Undefined;
+}
+
+/**
+ * Returns the name of a ScalarType as a C string.
+ *
+ * @param[in] t The type to get the name of.
+ * @return The name of the type, or "UNKNOWN_SCALAR" if the type is not known.
+ */
+inline const char* toString(::executorch::aten::ScalarType t) {
+#define DEFINE_CASE(_, name)                 \
+  case ::executorch::aten::ScalarType::name: \
+    return #name;
+
+  switch (t) {
+    ET_FORALL_SCALAR_TYPES(DEFINE_CASE)
+    case ::executorch::aten::ScalarType::Undefined:
+      return "Undefined";
+    default:
+      return "UNKNOWN_SCALAR";
+  }
+#undef DEFINE_CASE
+}
+
+/**
+ * Returns the size in bytes of the C type associated with the ScalarType.
+ *
+ * Calls ET_CHECK_MSG() if the type is unknown or is ScalarType::Undefined.
+ *
+ * @param[in] t The type to get the underlying C type size of.
+ * @return The size of the associated C type in bytes.
+ */
+inline size_t elementSize(::executorch::aten::ScalarType t) {
+#define CASE_ELEMENTSIZE_CASE(ctype, name)   \
+  case ::executorch::aten::ScalarType::name: \
+    return sizeof(ctype);
+
+  switch (t) {
+    ET_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE)
+    default:
+      ET_CHECK_MSG(false, "Unknown ScalarType %" PRId8, static_cast<int8_t>(t));
+  }
+#undef CASE_ELEMENTSIZE_CASE
+}
+
+inline constexpr bool isIntegralType(
+    ::executorch::aten::ScalarType t,
+    bool includeBool) {
+  return (includeBool && t == ::executorch::aten::ScalarType::Bool) ||
+      (t == ::executorch::aten::ScalarType::Byte ||
+       t == ::executorch::aten::ScalarType::Char ||
+       t == ::executorch::aten::ScalarType::Int ||
+       t == ::executorch::aten::ScalarType::Long ||
+       t == ::executorch::aten::ScalarType::Short);
+}
+
+template <typename T, bool includeBool>
+struct is_integral_type
+    : public std::integral_constant<
+          bool,
+          isIntegralType(CppTypeToScalarType<T>::value, includeBool)> {};
+
+inline constexpr bool isFloatingType(::executorch::aten::ScalarType t) {
+  return (
+      t == ::executorch::aten::ScalarType::Double ||
+      t == ::executorch::aten::ScalarType::Float ||
+      t == ::executorch::aten::ScalarType::Half ||
+      t == ::executorch::aten::ScalarType::BFloat16);
+}
+
+inline bool isRealType(::executorch::aten::ScalarType t) {
+  return (
+      t == ::executorch::aten::ScalarType::Byte ||
+      t == ::executorch::aten::ScalarType::Char ||
+      t == ::executorch::aten::ScalarType::Short ||
+      t == ::executorch::aten::ScalarType::Int ||
+      t == ::executorch::aten::ScalarType::Long ||
+      t == ::executorch::aten::ScalarType::Float ||
+      t == ::executorch::aten::ScalarType::Double);
+}
+
+inline bool isRealHType(::executorch::aten::ScalarType t) {
+  return (
+      t == ::executorch::aten::ScalarType::Byte ||
+      t == ::executorch::aten::ScalarType::Char ||
+      t == ::executorch::aten::ScalarType::Short ||
+      t == ::executorch::aten::ScalarType::Int ||
+      t == ::executorch::aten::ScalarType::Long ||
+      t == ::executorch::aten::ScalarType::Float ||
+      t == ::executorch::aten::ScalarType::Double ||
+      t == ::executorch::aten::ScalarType::Half);
+}
+
+inline bool isRealHBType(::executorch::aten::ScalarType t) {
+  return (isRealHType(t) || t == ::executorch::aten::ScalarType::Bool);
+}
+
+inline bool isRealHBF16Type(::executorch::aten::ScalarType t) {
+  return (isRealHType(t) || t == ::executorch::aten::ScalarType::BFloat16);
+}
+
+inline bool isRealHBBF16Type(::executorch::aten::ScalarType t) {
+  return (isRealHBType(t) || t == ::executorch::aten::ScalarType::BFloat16);
+}
+
+inline constexpr bool isComplexType(::executorch::aten::ScalarType t) {
+  return (
+      t == ::executorch::aten::ScalarType::ComplexHalf ||
+      t == ::executorch::aten::ScalarType::ComplexFloat ||
+      t == ::executorch::aten::ScalarType::ComplexDouble);
+}
+
+template <typename T>
+struct is_complex_type : std::integral_constant<
+                             bool,
+                             isComplexType(CppTypeToScalarType<T>::value)> {};
+
+constexpr bool isBitsType(::executorch::aten::ScalarType t) {
+  return t == ::executorch::aten::ScalarType::Bits1x8 ||
+      t == ::executorch::aten::ScalarType::Bits2x4 ||
+      t == ::executorch::aten::ScalarType::Bits4x2 ||
+      t == ::executorch::aten::ScalarType::Bits8 ||
+      t == ::executorch::aten::ScalarType::Bits16;
+}
+
+template <typename T>
+struct is_bits_type
+    : std::integral_constant<bool, isBitsType(CppTypeToScalarType<T>::value)> {
+};
+
+constexpr bool isQIntType(::executorch::aten::ScalarType t) {
+  // Don't forget to extend this when adding new QInt types
+  return t == ::executorch::aten::ScalarType::QInt8 ||
+      t == ::executorch::aten::ScalarType::QUInt8 ||
+      t == ::executorch::aten::ScalarType::QInt32 ||
+      t == ::executorch::aten::ScalarType::QUInt4x2 ||
+      t == ::executorch::aten::ScalarType::QUInt2x4;
+}
+
+template <typename T>
+struct is_qint_type
+    : std::integral_constant<bool, isQIntType(CppTypeToScalarType<T>::value)> {
+};
+
+constexpr bool isFloat8Type(::executorch::aten::ScalarType t) {
+  // Don't forget to extend this when adding new QInt types
+  return t == ::executorch::aten::ScalarType::Float8_e5m2 ||
+      t == ::executorch::aten::ScalarType::Float8_e4m3fn ||
+      t == ::executorch::aten::ScalarType::Float8_e5m2fnuz ||
+      t == ::executorch::aten::ScalarType::Float8_e4m3fnuz;
+}
+
+template <typename T>
+struct is_float8_type
+    : std::
+          integral_constant<bool, isFloat8Type(CppTypeToScalarType<T>::value)> {
+};
+
+constexpr bool isBarebonesUnsignedType(::executorch::aten::ScalarType t) {
+  // Don't forget to extend this when adding new QInt types
+  return t == ::executorch::aten::ScalarType::UInt16 ||
+      t == ::executorch::aten::ScalarType::UInt32 ||
+      t == ::executorch::aten::ScalarType::UInt64;
+}
+
+template <typename T>
+struct is_barebones_unsigned_type
+    : std::integral_constant<
+          bool,
+          isBarebonesUnsignedType(CppTypeToScalarType<T>::value)> {};
+
+inline ::executorch::aten::ScalarType toQIntType(
+    ::executorch::aten::ScalarType t) {
+  switch (t) {
+    case ::executorch::aten::ScalarType::Byte:
+      return ::executorch::aten::ScalarType::QUInt8;
+    case ::executorch::aten::ScalarType::Char:
+      return ::executorch::aten::ScalarType::QInt8;
+    case ::executorch::aten::ScalarType::Int:
+      return ::executorch::aten::ScalarType::QInt32;
+    default:
+      return t;
+  }
+}
+
+inline ::executorch::aten::ScalarType toUnderlying(
+    ::executorch::aten::ScalarType t) {
+  switch (t) {
+    case ::executorch::aten::ScalarType::QUInt8:
+      return ::executorch::aten::ScalarType::Byte;
+    case ::executorch::aten::ScalarType::QInt8:
+      return ::executorch::aten::ScalarType::Char;
+    case ::executorch::aten::ScalarType::QInt32:
+      return ::executorch::aten::ScalarType::Int;
+    case ::executorch::aten::ScalarType::QUInt4x2:
+      return ::executorch::aten::ScalarType::Byte;
+    case ::executorch::aten::ScalarType::QUInt2x4:
+      return ::executorch::aten::ScalarType::Byte;
+    default:
+      return t;
+  }
+}
+
+inline bool isSignedType(::executorch::aten::ScalarType t) {
+  ET_CHECK_MSG(
+      !::executorch::runtime::isQIntType(t),
+      "isSignedType not supported for quantized types like %" PRId8,
+      static_cast<int8_t>(t));
+#define CASE_SIGNED(ctype, name)             \
+  case ::executorch::aten::ScalarType::name: \
+    return std::numeric_limits<ctype>::is_signed;
+
+  switch (t) {
+    case ::executorch::aten::ScalarType::ComplexHalf:
+    case ::executorch::aten::ScalarType::ComplexFloat:
+    case ::executorch::aten::ScalarType::ComplexDouble:
+      return true;
+      ET_FORALL_REAL_TYPES_AND3(Half, Bool, BFloat16, CASE_SIGNED)
+    default:
+      ET_CHECK_MSG(false, "Unknown ScalarType %" PRId8, static_cast<int8_t>(t));
+  }
+#undef CASE_SIGNED
+}
+
+inline bool isUnderlying(
+    ::executorch::aten::ScalarType type,
+    ::executorch::aten::ScalarType qtype) {
+  return type == ::executorch::runtime::toUnderlying(qtype);
+}
+
+inline constexpr ::executorch::aten::ScalarType toRealValueType(
+    ::executorch::aten::ScalarType t) {
+  switch (t) {
+    case ::executorch::aten::ScalarType::ComplexHalf:
+      return ::executorch::aten::ScalarType::Half;
+    case ::executorch::aten::ScalarType::ComplexFloat:
+      return ::executorch::aten::ScalarType::Float;
+    case ::executorch::aten::ScalarType::ComplexDouble:
+      return ::executorch::aten::ScalarType::Double;
+    default:
+      return t;
+  }
+}
+
+inline constexpr ::executorch::aten::ScalarType toComplexType(
+    ::executorch::aten::ScalarType t) {
+  switch (t) {
+    case ::executorch::aten::ScalarType::BFloat16:
+      // BFloat16 has range equivalent to Float,
+      // so we map it to ComplexFloat.
+      return ::executorch::aten::ScalarType::ComplexFloat;
+    case ::executorch::aten::ScalarType::Half:
+      return ::executorch::aten::ScalarType::ComplexHalf;
+    case ::executorch::aten::ScalarType::Float:
+      return ::executorch::aten::ScalarType::ComplexFloat;
+    case ::executorch::aten::ScalarType::Double:
+      return ::executorch::aten::ScalarType::ComplexDouble;
+    case ::executorch::aten::ScalarType::ComplexHalf:
+      return ::executorch::aten::ScalarType::ComplexHalf;
+    case ::executorch::aten::ScalarType::ComplexFloat:
+      return ::executorch::aten::ScalarType::ComplexFloat;
+    case ::executorch::aten::ScalarType::ComplexDouble:
+      return ::executorch::aten::ScalarType::ComplexDouble;
+    default:
+      ET_CHECK_MSG(
+          false,
+          "Unknown Complex ScalarType for %" PRId8,
+          static_cast<int8_t>(t));
+  }
+}
+
+/**
+ * Encodes type casting rules that are consistent with ATen behaviour.
+ */
+inline constexpr bool canCast(
+    const ::executorch::aten::ScalarType from,
+    const ::executorch::aten::ScalarType to) {
+  // Disallow complex -> non-complex
+  return !(::executorch::runtime::isComplexType(from) &&
+           !::executorch::runtime::isComplexType(to)) &&
+      // Disallow float -> integral
+      !(::executorch::runtime::isFloatingType(from) &&
+        ::executorch::runtime::isIntegralType(to, /*includeBool=*/false)) &&
+      // Treat bool as a special category. Disallow non-bool -> bool
+      !(from != ::executorch::aten::ScalarType::Bool &&
+        to == ::executorch::aten::ScalarType::Bool);
+}
+
+template <typename T1, typename T2>
+struct can_cast : std::integral_constant<
+                      bool,
+                      canCast(
+                          CppTypeToScalarType<T1>::value,
+                          CppTypeToScalarType<T2>::value)> {};
+
+/**
+ * When casting from floating point to integral type, if the floating value is
+ * outside the integral type range, then an error is thrown if sanitization is
+ * enabled. To circumvent this, we cast the floating point to int64_t first.
+ */
+template <
+    typename To,
+    typename From,
+    std::enable_if_t<
+        (std::is_floating_point<From>::value && std::is_integral<To>::value),
+        int> = 0>
+To convert(From val) {
+  return static_cast<To>(static_cast<int64_t>(val));
+}
+
+template <
+    typename To,
+    typename From,
+    std::enable_if_t<
+        !(std::is_floating_point<From>::value && std::is_integral<To>::value),
+        int> = 0>
+To convert(From val) {
+  return static_cast<To>(val);
+}
+
+namespace internal {
+// This is generated according to NumPy's promote_types
+inline constexpr auto u1 = ::executorch::aten::ScalarType::Byte;
+inline constexpr auto i1 = ::executorch::aten::ScalarType::Char;
+inline constexpr auto i2 = ::executorch::aten::ScalarType::Short;
+inline constexpr auto i4 = ::executorch::aten::ScalarType::Int;
+inline constexpr auto i8 = ::executorch::aten::ScalarType::Long;
+inline constexpr auto f2 = ::executorch::aten::ScalarType::Half;
+inline constexpr auto f4 = ::executorch::aten::ScalarType::Float;
+inline constexpr auto f8 = ::executorch::aten::ScalarType::Double;
+inline constexpr auto c2 = ::executorch::aten::ScalarType::ComplexHalf;
+inline constexpr auto c4 = ::executorch::aten::ScalarType::ComplexFloat;
+inline constexpr auto c8 = ::executorch::aten::ScalarType::ComplexDouble;
+inline constexpr auto b1 = ::executorch::aten::ScalarType::Bool;
+inline constexpr auto bf = ::executorch::aten::ScalarType::BFloat16;
+
+using U1 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Byte>::type;
+using I1 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Char>::type;
+using I2 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Short>::type;
+using I4 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Int>::type;
+using I8 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Long>::type;
+using F2 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Half>::type;
+using F4 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Float>::type;
+using F8 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Double>::type;
+using C2 = typename ScalarTypeToCppType<
+    ::executorch::aten::ScalarType::ComplexHalf>::type;
+using C4 = typename ScalarTypeToCppType<
+    ::executorch::aten::ScalarType::ComplexFloat>::type;
+using C8 = typename ScalarTypeToCppType<
+    ::executorch::aten::ScalarType::ComplexDouble>::type;
+using B1 =
+    typename ScalarTypeToCppType<::executorch::aten::ScalarType::Bool>::type;
+using BF = typename ScalarTypeToCppType<
+    ::executorch::aten::ScalarType::BFloat16>::type;
+
+inline constexpr std::array<::executorch::aten::ScalarType, 13> index2dtype = {
+    {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, bf}};
+
+constexpr std::array<
+    int64_t,
+    static_cast<size_t>(::executorch::aten::ScalarType::NumOptions)>
+calculate_dtype2index() {
+  std::array<
+      int64_t,
+      static_cast<size_t>(::executorch::aten::ScalarType::NumOptions)>
+      inverse = {};
+  for (int64_t i = 0;
+       i < static_cast<int64_t>(::executorch::aten::ScalarType::NumOptions);
+       i++) {
+    inverse[i] = -1;
+  }
+  for (int64_t i = 0; i < static_cast<int64_t>(index2dtype.size()); i++) {
+    inverse[static_cast<int64_t>(index2dtype[i])] = i;
+  }
+  return inverse;
+}
+
+inline constexpr auto dtype2index = calculate_dtype2index();
+inline constexpr int NUM_PROMOTE_TYPES = 13;
+// Should match _promoteTypesLookup in c10/core/ScalarType.cpp so that
+// we match PyTorch core type promotion semantics.
+inline constexpr ::executorch::aten::ScalarType
+    promoteTypesLookup[NUM_PROMOTE_TYPES][NUM_PROMOTE_TYPES] = {
+        /*        u1  i1  i2  i4  i8  f2  f4  f8  c2  c4  c8  b1  bf*/
+        /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, bf},
+        /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, bf},
+        /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, bf},
+        /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, bf},
+        /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, bf},
+        /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, f4},
+        /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, f4},
+        /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, f8},
+        /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, c4},
+        /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, c4},
+        /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8},
+        /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, bf},
+        /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, bf},
+};
+
+} // namespace internal
+
+/**
+ * Implements type promotion rules that are consistent with ATen behaviour,
+ * which in turn is consistent with NumPy's promote_types.
+ * If half_to_float is set to true, then half and bfloat16 will be promoted to
+ * float instead
+ */
+inline constexpr ::executorch::aten::ScalarType promoteTypes(
+    ::executorch::aten::ScalarType a,
+    ::executorch::aten::ScalarType b,
+    bool half_to_float = false) {
+  // For QInt types, only allow exact match
+  if (::executorch::runtime::isQIntType(a) && a == b) {
+    return a;
+  }
+  if (::executorch::runtime::isQIntType(a) ||
+      ::executorch::runtime::isQIntType(b)) {
+    ET_CHECK_MSG(false, "promoteTypes not valid for quantized dtypes");
+  }
+
+  // For Bits types, only allow exact match
+  if (::executorch::runtime::isBitsType(a) && a == b) {
+    return a;
+  }
+  if (::executorch::runtime::isBitsType(a) ||
+      ::executorch::runtime::isBitsType(b)) {
+    ET_CHECK_MSG(false, "promoteTypes not valid for bits dtypes");
+  }
+
+  // For Float8 types, only allow exact match
+  if (::executorch::runtime::isFloat8Type(a) && a == b) {
+    return a;
+  }
+  if (::executorch::runtime::isFloat8Type(a) ||
+      ::executorch::runtime::isFloat8Type(b)) {
+    ET_CHECK_MSG(false, "promoteTypes not valid for float8 dtypes");
+  }
+
+  // For barebones uint types, only allow exact match
+  if (::executorch::runtime::isBarebonesUnsignedType(a) && a == b) {
+    return a;
+  }
+  if (::executorch::runtime::isBarebonesUnsignedType(a) ||
+      ::executorch::runtime::isBarebonesUnsignedType(b)) {
+    ET_CHECK_MSG(false, "promoteTypes not valid for barebone unsigned dtypes");
+  }
+
+  auto ix_a = ::executorch::runtime::internal::dtype2index[(int)a];
+  ET_CHECK(ix_a != -1);
+  auto ix_b = ::executorch::runtime::internal::dtype2index[(int)b];
+  ET_CHECK(ix_b != -1);
+  ::executorch::aten::ScalarType promoted_type =
+      ::executorch::runtime::internal::promoteTypesLookup[ix_a][ix_b];
+
+  if (half_to_float &&
+      (promoted_type == ::executorch::aten::ScalarType::Half ||
+       promoted_type == ::executorch::aten::ScalarType::BFloat16)) {
+    promoted_type = ::executorch::aten::ScalarType::Float;
+  }
+
+  return promoted_type;
+}
+
+template <typename T1, typename T2, bool half_to_float = false>
+struct promote_types {
+ private:
+  static_assert(
+      std::is_same_v<T1, T2> ||
+          (!is_qint_type<T1>::value && !is_qint_type<T2>::value),
+      "promote_types not valid for quantized dtypes");
+  static_assert(
+      std::is_same_v<T1, T2> ||
+          (!is_bits_type<T1>::value && !is_bits_type<T2>::value),
+      "promote_types not valid for bits dtypes");
+  static_assert(
+      std::is_same_v<T1, T2> ||
+          (!is_float8_type<T1>::value && !is_float8_type<T2>::value),
+      "promote_types not valid for float8 dtypes");
+  static_assert(
+      std::is_same_v<T1, T2> ||
+          (!is_barebones_unsigned_type<T1>::value &&
+           !is_barebones_unsigned_type<T2>::value),
+      "promote_types not valid for barebones unsigned dtypes");
+
+  using promoted_type_not_respecting_half_to_float =
+      typename ScalarTypeToCppType<promoteTypes(
+          CppTypeToScalarType<T1>::value,
+          CppTypeToScalarType<T2>::value)>::type;
+
+ public:
+  using type = std::conditional_t<
+      half_to_float &&
+          (std::is_same_v<
+               promoted_type_not_respecting_half_to_float,
+               typename ScalarTypeToCppType<
+                   ::executorch::aten::ScalarType::Half>::type> ||
+           std::is_same_v<
+               promoted_type_not_respecting_half_to_float,
+               typename ScalarTypeToCppType<
+                   ::executorch::aten::ScalarType::BFloat16>::type>),
+      typename ScalarTypeToCppType<::executorch::aten::ScalarType::Float>::type,
+      promoted_type_not_respecting_half_to_float>;
+};
+
+//
+// Helper macros for switch case macros (see below)
+//
+// These macros are not meant to be used directly. They provide an easy way to
+// generate a switch statement that can handle subsets of ScalarTypes supported
+// by ExecuTorch.
+//
+
+#ifdef ET_INTERNAL_CHECK_SELECTIVE_BUILD
+#define ET_INTERNAL_SWITCH_CASE(enum_type, CTYPE_ALIAS, ...)         \
+  case enum_type: {                                                  \
+    ET_INTERNAL_CHECK_SELECTIVE_BUILD(enum_type);                    \
+    using CTYPE_ALIAS =                                              \
+        ::executorch::runtime::ScalarTypeToCppType<enum_type>::type; \
+    return __VA_ARGS__();                                            \
+  }
+#else
+#define ET_INTERNAL_SWITCH_CASE(enum_type, CTYPE_ALIAS, ...)         \
+  case enum_type: {                                                  \
+    using CTYPE_ALIAS =                                              \
+        ::executorch::runtime::ScalarTypeToCppType<enum_type>::type; \
+    return __VA_ARGS__();                                            \
+  }
+#endif
+
+#define ET_INTERNAL_SWITCH(TYPE, CONTEXT, NAME, ...) \
+  [&] {                                              \
+    const auto& _st = TYPE;                          \
+    constexpr const char* et_switch_name = NAME;     \
+    (void)et_switch_name; /* Suppress unused var */  \
+    switch (_st) {                                   \
+      __VA_ARGS__                                    \
+      default:                                       \
+        ET_CHECK_MSG(                                \
+            false,                                   \
+            "Unhandled dtype %s for %s",             \
+            ::executorch::runtime::toString(_st),    \
+            et_switch_name);                         \
+    }                                                \
+  }()
+
+#define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...)            \
+  ET_INTERNAL_SWITCH_CASE(                                             \
+      ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__)  \
+  ET_INTERNAL_SWITCH_CASE(                                             \
+      ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__)  \
+  ET_INTERNAL_SWITCH_CASE(                                             \
+      ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                             \
+      ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__)   \
+  ET_INTERNAL_SWITCH_CASE(                                             \
+      ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_UINT_TYPES(CTYPE_ALIAS, ...)            \
+  ET_INTERNAL_SWITCH_CASE(                                              \
+      ::executorch::aten::ScalarType::UInt16, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                              \
+      ::executorch::aten::ScalarType::UInt32, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                              \
+      ::executorch::aten::ScalarType::UInt64, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...)          \
+  ET_INTERNAL_SWITCH_CASE(                                             \
+      ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                             \
+      ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...)  \
+  ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...)               \
+  ET_INTERNAL_SWITCH_CASE(                                                    \
+      ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                                    \
+      ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, ...)  \
+  ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                        \
+      ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...)              \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__)    \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__)   \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__)   \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_BITS_TYPES(CTYPE_ALIAS, ...)             \
+  ET_INTERNAL_SWITCH_CASE(                                               \
+      ::executorch::aten::ScalarType::Bits1x8, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                               \
+      ::executorch::aten::ScalarType::Bits2x4, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                               \
+      ::executorch::aten::ScalarType::Bits4x2, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                               \
+      ::executorch::aten::ScalarType::Bits8, CTYPE_ALIAS, __VA_ARGS__)   \
+  ET_INTERNAL_SWITCH_CASE(                                               \
+      ::executorch::aten::ScalarType::Bits16, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...)               \
+  ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__)            \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::Half, CTYPE_ALIAS, __VA_ARGS__)     \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::BFloat16, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__)     \
+  ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, __VA_ARGS__)        \
+  ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, __VA_ARGS__)            \
+  ET_INTERNAL_SWITCH_CASE_BITS_TYPES(CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__)               \
+  ET_INTERNAL_SWITCH_CASE(                                                   \
+      ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2(                             \
+    ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...)                              \
+  ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__)               \
+  ET_INTERNAL_SWITCH_CASE(                                                   \
+      ::executorch::aten::ScalarType::ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                                   \
+      ::executorch::aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3(                             \
+    ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...)                 \
+  ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__)               \
+  ET_INTERNAL_SWITCH_CASE(                                                   \
+      ::executorch::aten::ScalarType::ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                                   \
+      ::executorch::aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                                   \
+      ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_INT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__)               \
+  ET_INTERNAL_SWITCH_CASE(                                                  \
+      ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__)               \
+  ET_INTERNAL_SWITCH_CASE(                                                    \
+      ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \
+    ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...)   \
+  ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND(        \
+      ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__)      \
+  ET_INTERNAL_SWITCH_CASE(                        \
+      ::executorch::aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND3(            \
+    ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2(                  \
+      ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)    \
+  ET_INTERNAL_SWITCH_CASE(                                   \
+      ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_TYPES(CTYPE_ALIAS, ...)    \
+  ET_INTERNAL_SWITCH_CASE(                                            \
+      ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                            \
+      ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \
+  ET_INTERNAL_SWITCH_CASE(                                            \
+      ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_REAL_TYPES(CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH_CASE(                                              \
+      ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__)   \
+  ET_INTERNAL_SWITCH_CASE(                                              \
+      ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_INTB_TYPES(CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH_CASE(                                              \
+      ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__)   \
+  ET_INTERNAL_SWITCH_CASE(                                              \
+      ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_FLOATB_TYPES(CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__)     \
+  ET_INTERNAL_SWITCH_CASE(                                                \
+      ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__)
+
+//
+// Switch case macros
+//
+// These macros provide an easy way to generate switch statements that apply a
+// common lambda function to subsets of ScalarTypes supported by ExecuTorch.
+// The lambda function can type specialize to the ctype associated with the
+// ScalarType being handled through an alias passed as the CTYPE_ALIAS argument.
+//
+// Arguments:
+//   - ADDITIONAL: Additional ScalarType case to add
+//   - TYPE: The ScalarType to handle through the switch statement
+//   - CONTEXT: The KernelRuntimeContext instance used for error handling, etc.
+//   - NAME: A name for this operation which will be used in error messages
+//   - CTYPE_ALIAS: A typedef for the ctype associated with the ScalarType.
+//   - [&](){...}: A lambda function to be applied to each ScalarType case
+//
+// An example usage is:
+//
+// ET_SWITCH_REAL_TYPES(input.scalar_type(), "example", CTYPE, [&]() {
+//   output.mutable_data_ptr<CTYPE>[0] = input.const_data_ptr<CTYPE>[0];
+// });
+//
+// Note that these can be nested as well:
+//
+// ET_SWITCH_REAL_TYPES(input.scalar_type(), "example", CTYPE_IN, [&]() {
+//   ET_SWITCH_REAL_TYPES(output.scalar_type(), "example", CTYPE_OUT, [&]() {
+//     output.mutable_data_ptr<CTYPE_OUT>[0] =
+//         input.const_data_ptr<CTYPE_IN>[0];
+//   });
+// });
+//
+// These macros are adapted from Dispatch.h in the ATen library. The primary
+// difference is that the CTYPE_ALIAS argument is exposed to users, which is
+// used to alias the ctype associated with the ScalarType that is being handled.
+//
+
+#define ET_SWITCH_ALL_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                              \
+      TYPE,                                                        \
+      CONTEXT,                                                     \
+      NAME,                                                        \
+      ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_REAL_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                               \
+      TYPE,                                                         \
+      CONTEXT,                                                      \
+      NAME,                                                         \
+      ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_REAL_TYPES_AND(                      \
+    ADDITIONAL, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                  \
+      TYPE,                                            \
+      CONTEXT,                                         \
+      NAME,                                            \
+      ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND(          \
+          ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_REAL_TYPES_AND2(                                   \
+    ADDITIONAL1, ADDITIONAL2, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                \
+      TYPE,                                                          \
+      CONTEXT,                                                       \
+      NAME,                                                          \
+      ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2(                       \
+          ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_REAL_TYPES_AND3(             \
+    ADDITIONAL1,                               \
+    ADDITIONAL2,                               \
+    ADDITIONAL3,                               \
+    TYPE,                                      \
+    CONTEXT,                                   \
+    NAME,                                      \
+    CTYPE_ALIAS,                               \
+    ...)                                       \
+  ET_INTERNAL_SWITCH(                          \
+      TYPE,                                    \
+      CONTEXT,                                 \
+      NAME,                                    \
+      ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \
+          ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_REALH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_REAL_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_SWITCH_REALHBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_REAL_TYPES_AND2(                                             \
+      Half, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_SWITCH_REALB_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_REAL_TYPES_AND(Bool, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_SWITCH_REALHB_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_REAL_TYPES_AND2(                                          \
+      Half, Bool, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_SWITCH_REALHBBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_REAL_TYPES_AND3(                                              \
+      Half, Bool, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_SWITCH_REALHBBF16_AND_UINT_TYPES(              \
+    TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...)                \
+  ET_INTERNAL_SWITCH(                                     \
+      TYPE,                                               \
+      CONTEXT,                                            \
+      NAME,                                               \
+      ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3(            \
+          Half, Bool, BFloat16, CTYPE_ALIAS, __VA_ARGS__) \
+          ET_INTERNAL_SWITCH_CASE_UINT_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_INT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                              \
+      TYPE,                                                        \
+      CONTEXT,                                                     \
+      NAME,                                                        \
+      ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_INT_TYPES_AND(                       \
+    ADDITIONAL, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                  \
+      TYPE,                                            \
+      CONTEXT,                                         \
+      NAME,                                            \
+      ET_INTERNAL_SWITCH_CASE_INT_TYPES_AND(           \
+          ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_FLOAT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                \
+      TYPE,                                                          \
+      CONTEXT,                                                       \
+      NAME,                                                          \
+      ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_FLOAT_TYPES_AND(                     \
+    ADDITIONAL, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                  \
+      TYPE,                                            \
+      CONTEXT,                                         \
+      NAME,                                            \
+      ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND(         \
+          ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_FLOAT_TYPES_AND2(                                  \
+    ADDITIONAL1, ADDITIONAL2, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                \
+      TYPE,                                                          \
+      CONTEXT,                                                       \
+      NAME,                                                          \
+      ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2(                      \
+          ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_FLOAT_TYPES_AND3(             \
+    ADDITIONAL1,                                \
+    ADDITIONAL2,                                \
+    ADDITIONAL3,                                \
+    TYPE,                                       \
+    CONTEXT,                                    \
+    NAME,                                       \
+    CTYPE_ALIAS,                                \
+    ...)                                        \
+  ET_INTERNAL_SWITCH(                           \
+      TYPE,                                     \
+      CONTEXT,                                  \
+      NAME,                                     \
+      ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND3( \
+          ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_FLOATH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_FLOAT_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_SWITCH_FLOATHBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_FLOAT_TYPES_AND2(                                             \
+      Half, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
+
+#define ET_SWITCH_FLOATHBF16_TYPES_AND(                \
+    ADDITIONAL, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_SWITCH_FLOAT_TYPES_AND3(                          \
+      Half,                                            \
+      BFloat16,                                        \
+      ADDITIONAL,                                      \
+      TYPE,                                            \
+      CONTEXT,                                         \
+      NAME,                                            \
+      CTYPE_ALIAS,                                     \
+      __VA_ARGS__)
+
+#define ET_SWITCH_QINT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                               \
+      TYPE,                                                         \
+      CONTEXT,                                                      \
+      NAME,                                                         \
+      ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_COMPLEX_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                  \
+      TYPE,                                                            \
+      CONTEXT,                                                         \
+      NAME,                                                            \
+      ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_COMPLEXH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                   \
+      TYPE,                                                             \
+      CONTEXT,                                                          \
+      NAME,                                                             \
+      ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_SCALAR_OBJ_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                     \
+      TYPE,                                                               \
+      CONTEXT,                                                            \
+      NAME,                                                               \
+      ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_SCALAR_OBJ_REAL_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                          \
+      TYPE,                                                                    \
+      CONTEXT,                                                                 \
+      NAME,                                                                    \
+      ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_SCALAR_OBJ_INTB_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                          \
+      TYPE,                                                                    \
+      CONTEXT,                                                                 \
+      NAME,                                                                    \
+      ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_INTB_TYPES(CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_SCALAR_OBJ_FLOATB_TYPES(             \
+    TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...)             \
+  ET_INTERNAL_SWITCH(                                  \
+      TYPE,                                            \
+      CONTEXT,                                         \
+      NAME,                                            \
+      ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_FLOATB_TYPES( \
+          CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_TWO_TYPES(T1, T2, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \
+  ET_INTERNAL_SWITCH(                                                      \
+      TYPE,                                                                \
+      CONTEXT,                                                             \
+      NAME,                                                                \
+      ET_INTERNAL_SWITCH_CASE(                                             \
+          ::executorch::aten::ScalarType::T1, CTYPE_ALIAS, __VA_ARGS__)    \
+          ET_INTERNAL_SWITCH_CASE(                                         \
+              ::executorch::aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__))
+
+#define ET_SWITCH_THREE_TYPES(                                              \
+    T1, T2, T3, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...)                      \
+  ET_INTERNAL_SWITCH(                                                       \
+      TYPE,                                                                 \
+      CONTEXT,                                                              \
+      NAME,                                                                 \
+      ET_INTERNAL_SWITCH_CASE(                                              \
+          ::executorch::aten::ScalarType::T1, CTYPE_ALIAS, __VA_ARGS__)     \
+          ET_INTERNAL_SWITCH_CASE(                                          \
+              ::executorch::aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__) \
+              ET_INTERNAL_SWITCH_CASE(                                      \
+                  ::executorch::aten::ScalarType::T3,                       \
+                  CTYPE_ALIAS,                                              \
+                  __VA_ARGS__))
+
+} // namespace runtime
+} // namespace executorch
+
+namespace executorch {
+namespace aten {
+#ifdef USE_ATEN_LIB
+using ::at::elementSize;
+#else // USE_ATEN_LIB
+using ::executorch::runtime::elementSize;
+#endif // USE_ATEN_LIB
+} // namespace aten
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::can_cast;
+using ::executorch::runtime::canCast;
+using ::executorch::runtime::convert;
+using ::executorch::runtime::CppTypeToScalarType;
+using ::executorch::runtime::elementSize;
+using ::executorch::runtime::is_barebones_unsigned_type;
+using ::executorch::runtime::is_bits_type;
+using ::executorch::runtime::is_complex_type;
+using ::executorch::runtime::is_float8_type;
+using ::executorch::runtime::is_integral_type;
+using ::executorch::runtime::is_qint_type;
+using ::executorch::runtime::isBitsType;
+using ::executorch::runtime::isComplexType;
+using ::executorch::runtime::isFloatingType;
+using ::executorch::runtime::isIntegralType;
+using ::executorch::runtime::isQIntType;
+using ::executorch::runtime::isRealHBType;
+using ::executorch::runtime::isRealHType;
+using ::executorch::runtime::isRealType;
+using ::executorch::runtime::isValid;
+using ::executorch::runtime::promote_types;
+using ::executorch::runtime::promoteTypes;
+using ::executorch::runtime::ScalarTypeToCppType;
+using ::executorch::runtime::toString;
+#if !defined(USE_ATEN_LIB)
+using ::executorch::runtime::is_floating_point;
+using ::executorch::runtime::is_reduced_floating_point;
+#endif
+namespace internal {
+using ::executorch::runtime::internal::B1;
+using ::executorch::runtime::internal::C2;
+using ::executorch::runtime::internal::C4;
+using ::executorch::runtime::internal::C8;
+using ::executorch::runtime::internal::F2;
+using ::executorch::runtime::internal::F4;
+using ::executorch::runtime::internal::F8;
+using ::executorch::runtime::internal::I1;
+using ::executorch::runtime::internal::I2;
+using ::executorch::runtime::internal::I4;
+using ::executorch::runtime::internal::I8;
+using ::executorch::runtime::internal::U1;
+} // namespace internal
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h b/include/executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h
new file mode 100644
index 00000000000..6e072ab0582
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+namespace executorch::runtime {
+/**
+ * The expected output size may not be the existing size of any inputs and
+ * outputs if the operator supports both broadcast and dynamic shape.
+ * Therefore such operators needs extra space to store the calculated expected
+ * output size. such dynamic allocation is troublesome in executorch so we can
+ * just hard code a static value of a relatively small value because users
+ * don't create high dimensional tensors.
+ */
+constexpr size_t kTensorDimensionLimit = 16;
+} // namespace executorch::runtime
diff --git a/include/executorch/runtime/core/exec_aten/util/tensor_shape_to_c_string.h b/include/executorch/runtime/core/exec_aten/util/tensor_shape_to_c_string.h
new file mode 100644
index 00000000000..efc91f8b5d4
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/util/tensor_shape_to_c_string.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#include <executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch::runtime {
+
+/**
+ * Maximum size of a string returned by tensor_shape_to_c_string, for
+ * stack allocation.
+ */
+constexpr size_t kTensorShapeStringSizeLimit = 1 + /* opening parenthesis */
+    10 * kTensorDimensionLimit + /* maximum digits we will print; update
+                                  * kMaximumPrintableTensorShapeElement
+                                  * if changing */
+    2 * kTensorDimensionLimit + /* comma and space after each item,
+                                 * overwritten with closing paren and
+                                 * NUL terminator for last element */
+    1; /* padding for temporary NUL terminator for simplicity of implementation
+        */
+
+namespace internal {
+constexpr size_t kMaximumPrintableTensorShapeElement =
+    std::numeric_limits<int32_t>::max();
+} // namespace internal
+
+/**
+ * Convert a shape to a NUL-terminated C string with limited size. If
+ * elements of the shape are larger than
+ * kMaximumPrintableTensorShapeElement, those elements will be
+ * rendered as ERR instead.
+ *
+ * NOTE: There are two overloads of this function to support both ATen
+ * tensors and ExecuTorch Tensors, which have different SizesType,
+ * while also avoiding a dependency on exec_aten.h from this header
+ * because that would cause a circular dependency.
+ */
+std::array<char, kTensorShapeStringSizeLimit> tensor_shape_to_c_string(
+    executorch::runtime::Span<const std::int32_t> shape);
+
+/**
+ * Convert a shape to a NUL-terminated C string with limited size. If
+ * elements of the shape are larger than
+ * kMaximumPrintableTensorShapeElement, those elements will be
+ * rendered as ERR instead.
+ *
+ * NOTE: There are two overloads of this function to support both ATen
+ * tensors and ExecuTorch Tensors, which have different SizesType,
+ * while also avoiding a dependency on exec_aten.h from this header
+ * because that would cause a circular dependency.
+ */
+std::array<char, kTensorShapeStringSizeLimit> tensor_shape_to_c_string(
+    executorch::runtime::Span<const std::int64_t> shape);
+
+} // namespace executorch::runtime
diff --git a/include/executorch/runtime/core/exec_aten/util/tensor_util.h b/include/executorch/runtime/core/exec_aten/util/tensor_util.h
new file mode 100644
index 00000000000..b0b79882361
--- /dev/null
+++ b/include/executorch/runtime/core/exec_aten/util/tensor_util.h
@@ -0,0 +1,1408 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <c10/util/irange.h>
+#include <algorithm>
+#include <array> // std::array
+#include <cinttypes> // PRId64
+#include <cmath>
+#include <cstddef> // size_t
+
+#include <limits>
+
+#include <executorch/runtime/core/array_ref.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/compiler.h>
+
+/// All assertion messages should begin with this prefix.
+#define ET_TENSOR_CHECK_PREFIX__ "Tensors do not match"
+#define ET_MIN2(a, b) (std::min(a, b))
+#define ET_MIN3(a, b, c) (std::min(a, std::min(b, c)))
+
+#define ET_NORMALIZE_IX(IX, UPPER_BOUND) IX < 0 ? IX + UPPER_BOUND : IX
+
+#define ET_CHECK_VALID_IX(IX, UPPER_BOUND)                  \
+  ET_CHECK_MSG(                                             \
+      IX >= -static_cast<int64_t>(UPPER_BOUND) &&           \
+          IX < static_cast<int64_t>(UPPER_BOUND),           \
+      "index %" PRId64 " must be within range [-%zd, %zd)", \
+      IX,                                                   \
+      UPPER_BOUND,                                          \
+      UPPER_BOUND)
+
+#define ET_CHECK_VALID_DIM(DIM, UPPER_BOUND)              \
+  ET_CHECK_MSG(                                           \
+      DIM >= -static_cast<int64_t>(UPPER_BOUND) &&        \
+          DIM < static_cast<int64_t>(UPPER_BOUND),        \
+      "dim %" PRId64 " must be within range [-%zd, %zd)", \
+      DIM,                                                \
+      UPPER_BOUND,                                        \
+      UPPER_BOUND)
+
+#define ET_CHECK_NON_ZERO_DIM_SIZE(DIM, T)           \
+  const size_t udim = ET_NORMALIZE_IX(DIM, T.dim()); \
+  ET_CHECK_MSG(                                      \
+      T.size(udim) != 0, "Expected dim %zd to have non-zero size.", udim);
+
+/**
+ * Asserts that all tensors have the same shape.
+ * This also handles a edge case where there is only one element in all the
+ * tensors being compared but the number of dimensions >= 0. In the for loop
+ * iterating over the dimensions we make sure that we pick the smallest
+ * dimension of all the tensors as the upper bound for the for loop.
+ */
+#define ET_CHECK_SAME_SHAPE2(a__, b__)                                    \
+  ({                                                                      \
+    const size_t a_numel__ = (a__).numel();                               \
+    const size_t b_numel__ = (b__).numel();                               \
+    const size_t a_dim__ = (a__).dim();                                   \
+    const size_t b_dim__ = (b__).dim();                                   \
+    ET_CHECK_MSG(                                                         \
+        a_numel__ == b_numel__ &&                                         \
+            ((a_numel__ == 1 && b_numel__ == 1) || (a_dim__ == b_dim__)), \
+        ET_TENSOR_CHECK_PREFIX__ ": numel={%zu, %zu}, dim={%zu, %zu}",    \
+        a_numel__,                                                        \
+        b_numel__,                                                        \
+        a_dim__,                                                          \
+        b_dim__);                                                         \
+    for (size_t dim__ = 0; dim__ < ET_MIN2(a_dim__, b_dim__); ++dim__) {  \
+      size_t a_size__ = (a__).size(dim__);                                \
+      size_t b_size__ = (b__).size(dim__);                                \
+      ET_CHECK_MSG(                                                       \
+          a_size__ == b_size__,                                           \
+          ET_TENSOR_CHECK_PREFIX__ " at size(%zu): {%zu, %zu}",           \
+          dim__,                                                          \
+          a_size__,                                                       \
+          b_size__);                                                      \
+    }                                                                     \
+  })
+
+#define ET_CHECK_SAME_SHAPE3(a__, b__, c__)                            \
+  ({                                                                   \
+    const size_t a_numel__ = (a__).numel();                            \
+    const size_t b_numel__ = (b__).numel();                            \
+    const size_t c_numel__ = (c__).numel();                            \
+    const size_t a_dim__ = (a__).dim();                                \
+    const size_t b_dim__ = (b__).dim();                                \
+    const size_t c_dim__ = (c__).dim();                                \
+    ET_CHECK_MSG(                                                      \
+        a_numel__ == b_numel__ && b_numel__ == c_numel__ &&            \
+            ((a_numel__ == 1 && b_numel__ == 1 && c_numel__ == 1) ||   \
+             a_dim__ == b_dim__ && b_dim__ == c_dim__),                \
+        ET_TENSOR_CHECK_PREFIX__                                       \
+        ": numel={%zu, %zu, %zu}, dim={%zu, %zu, %zu}",                \
+        a_numel__,                                                     \
+        b_numel__,                                                     \
+        c_numel__,                                                     \
+        a_dim__,                                                       \
+        b_dim__,                                                       \
+        c_dim__);                                                      \
+    for (size_t dim__ = 0; dim__ < ET_MIN3(a_dim__, b_dim__, c_dim__); \
+         ++dim__) {                                                    \
+      size_t a_size__ = (a__).size(dim__);                             \
+      size_t b_size__ = (b__).size(dim__);                             \
+      size_t c_size__ = (c__).size(dim__);                             \
+      ET_CHECK_MSG(                                                    \
+          a_size__ == b_size__ && b_size__ == c_size__,                \
+          ET_TENSOR_CHECK_PREFIX__ " at size(%zu): {%zu, %zu, %zu}",   \
+          dim__,                                                       \
+          a_size__,                                                    \
+          b_size__,                                                    \
+          c_size__);                                                   \
+    }                                                                  \
+  })
+
+/// Asserts that all tensors have the same dtype.
+#define ET_CHECK_SAME_DTYPE2(a__, b__)                                   \
+  ({                                                                     \
+    const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \
+    const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \
+    ET_CHECK_MSG(                                                        \
+        a_type__ == b_type__,                                            \
+        ET_TENSOR_CHECK_PREFIX__ ": dtype={%" PRId8 ", %" PRId8 "}",     \
+        static_cast<int8_t>(a_type__),                                   \
+        static_cast<int8_t>(b_type__));                                  \
+  })
+
+#define ET_CHECK_SAME_DTYPE3(a__, b__, c__)                                 \
+  ({                                                                        \
+    const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type();    \
+    const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type();    \
+    const ::executorch::aten::ScalarType c_type__ = (c__).scalar_type();    \
+    ET_CHECK_MSG(                                                           \
+        a_type__ == b_type__ && b_type__ == c_type__,                       \
+        ET_TENSOR_CHECK_PREFIX__ ": dtype={%" PRId8 ", %" PRId8 ", %" PRId8 \
+                                 "}",                                       \
+        static_cast<int8_t>(a_type__),                                      \
+        static_cast<int8_t>(b_type__),                                      \
+        static_cast<int8_t>(c_type__));                                     \
+  })
+
+/**
+ * Asserts that all tensors have the same shape and dtype.
+ *
+ * This macro should produce less code/data than calling the SHAPE and DTYPE
+ * macros independently, because it only calls ET_CHECK_MSG once.
+ */
+#define ET_CHECK_SAME_SHAPE_AND_DTYPE2(a__, b__)                              \
+  ({                                                                          \
+    const size_t a_numel__ = (a__).numel();                                   \
+    const size_t b_numel__ = (b__).numel();                                   \
+    const size_t a_dim__ = (a__).dim();                                       \
+    const size_t b_dim__ = (b__).dim();                                       \
+    const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type();      \
+    const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type();      \
+                                                                              \
+    ET_CHECK_MSG(                                                             \
+        a_numel__ == b_numel__ &&                                             \
+            ((a_numel__ == 1 && b_numel__ == 1) || a_dim__ == b_dim__) &&     \
+            a_type__ == b_type__,                                             \
+        ET_TENSOR_CHECK_PREFIX__                                              \
+        ": numel={%zu, %zu}, dim={%zu, %zu}, dtype={%" PRId8 ", %" PRId8 "}", \
+        a_numel__,                                                            \
+        b_numel__,                                                            \
+        a_dim__,                                                              \
+        b_dim__,                                                              \
+        static_cast<int8_t>(a_type__),                                        \
+        static_cast<int8_t>(b_type__));                                       \
+    for (size_t dim__ = 0; dim__ < ET_MIN2(a_dim__, b_dim__); ++dim__) {      \
+      size_t a_size__ = (a__).size(dim__);                                    \
+      size_t b_size__ = (b__).size(dim__);                                    \
+      ET_CHECK_MSG(                                                           \
+          a_size__ == b_size__,                                               \
+          ET_TENSOR_CHECK_PREFIX__ " at size(%zu): {%zu, %zu}",               \
+          dim__,                                                              \
+          a_size__,                                                           \
+          b_size__);                                                          \
+    }                                                                         \
+  })
+
+#define ET_CHECK_SAME_SHAPE_AND_DTYPE3(a__, b__, c__)                    \
+  ({                                                                     \
+    const size_t a_numel__ = (a__).numel();                              \
+    const size_t b_numel__ = (b__).numel();                              \
+    const size_t c_numel__ = (c__).numel();                              \
+    const size_t a_dim__ = (a__).dim();                                  \
+    const size_t b_dim__ = (b__).dim();                                  \
+    const size_t c_dim__ = (c__).dim();                                  \
+    const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \
+    const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \
+    const ::executorch::aten::ScalarType c_type__ = (c__).scalar_type(); \
+                                                                         \
+    ET_CHECK_MSG(                                                        \
+        a_numel__ == b_numel__ && b_numel__ == c_numel__ &&              \
+            ((a_numel__ == 1 && b_numel__ == 1 && c_numel__ == 1) ||     \
+             (a_dim__ == b_dim__ && b_dim__ == c_dim__)) &&              \
+            a_type__ == b_type__ && b_type__ == c_type__,                \
+        ET_TENSOR_CHECK_PREFIX__                                         \
+        ": numel={%zu, %zu, %zu}, dim={%zu, %zu, %zu}, "                 \
+        "dtype={%" PRId8 ", %" PRId8 ", %" PRId8 "}",                    \
+        a_numel__,                                                       \
+        b_numel__,                                                       \
+        c_numel__,                                                       \
+        a_dim__,                                                         \
+        b_dim__,                                                         \
+        c_dim__,                                                         \
+        static_cast<int8_t>(a_type__),                                   \
+        static_cast<int8_t>(b_type__),                                   \
+        static_cast<int8_t>(c_type__));                                  \
+    for (size_t dim__ = 0; dim__ < ET_MIN3(a_dim__, b_dim__, c_dim__);   \
+         ++dim__) {                                                      \
+      size_t a_size__ = (a__).size(dim__);                               \
+      size_t b_size__ = (b__).size(dim__);                               \
+      size_t c_size__ = (c__).size(dim__);                               \
+      ET_CHECK_MSG(                                                      \
+          a_size__ == b_size__ && b_size__ == c_size__,                  \
+          ET_TENSOR_CHECK_PREFIX__ " at size(%zu): {%zu, %zu, %zu}",     \
+          dim__,                                                         \
+          a_size__,                                                      \
+          b_size__,                                                      \
+          c_size__);                                                     \
+    }                                                                    \
+  })
+
+/**
+ * Assert that the input tensor is contiguous tensor.
+ */
+#define ET_CHECK_CONTIGUOUS(a__)                                              \
+  ({                                                                          \
+    const ::executorch::aten::ArrayRef<executorch::aten::StridesType>         \
+        strides = a__.strides();                                              \
+    const ::executorch::aten::ArrayRef<executorch::aten::StridesType> sizes = \
+        a__.sizes();                                                          \
+    ET_CHECK_MSG(                                                             \
+        strides[strides.size() - 1] == 1,                                     \
+        "The stride of the last dimension shall be 1 for contiguous tensor, " \
+        "not %d",                                                             \
+        strides[strides.size() - 1]);                                         \
+    for (size_t i = strides.size() - 1; i > 0; i--) {                         \
+      ET_CHECK_MSG(                                                           \
+          strides[i - 1] == strides[i] * sizes[i],                            \
+          "The stride of the %zu-th dimension shall equal to "                \
+          "strides[%zu] * sizes[%zu], now is %d and %d",                      \
+          i - 1,                                                              \
+          i,                                                                  \
+          i,                                                                  \
+          strides[i - 1],                                                     \
+          strides[i] * sizes[i]);                                             \
+    }                                                                         \
+  })
+
+/**
+ * Assert the input two tensors share same strides.
+ * Noted that this function does not make any check or promise on the contiguity
+ * of any input tensors.
+ */
+#define ET_CHECK_SAME_STRIDES2(a__, b__)                                       \
+  ({                                                                           \
+    ET_CHECK_MSG(                                                              \
+        a__.dim() == b__.dim(),                                                \
+        "Two tensors shall have same number of strides, but not %zu and %zu.", \
+        a__.dim(),                                                             \
+        b__.dim());                                                            \
+    const ::executorch::aten::ArrayRef<executorch::aten::StridesType>          \
+        a_strides = a__.strides();                                             \
+    const ::executorch::aten::ArrayRef<executorch::aten::StridesType>          \
+        b_strides = b__.strides();                                             \
+    for (size_t i = 0; i < a__.dim(); i++) {                                   \
+      ET_CHECK_MSG(                                                            \
+          a_strides[i] == b_strides[i],                                        \
+          "a.strides()[%zu] shall equal to b.strides()[%zu], "                 \
+          "but now is %d and %d.",                                             \
+          i,                                                                   \
+          i,                                                                   \
+          (int32_t)a_strides[i],                                               \
+          (int32_t)b_strides[i]);                                              \
+    }                                                                          \
+  })
+
+/**
+ * Assert the input three tensors share same strides.
+ * Noted that this function does not make any check or promise on the contiguity
+ * of any input tensors.
+ */
+#define ET_CHECK_SAME_STRIDES3(a__, b__, c__)                           \
+  ({                                                                    \
+    ET_CHECK_MSG(                                                       \
+        a__.dim() == b__.dim() && b__.dim() == c__.dim(),               \
+        "Three tensors shall have same number of strides, "             \
+        "but not %zu, %zu and %zu.",                                    \
+        a__.dim(),                                                      \
+        b__.dim(),                                                      \
+        c__.dim());                                                     \
+    const ::executorch::aten::ArrayRef<executorch::aten::StridesType>   \
+        a_strides = a__.strides();                                      \
+    const ::executorch::aten::ArrayRef<executorch::aten::StridesType>   \
+        b_strides = b__.strides();                                      \
+    const ::executorch::aten::ArrayRef<executorch::aten::StridesType>   \
+        c_strides = c__.strides();                                      \
+    for (size_t i = 0; i < a__.dim(); i++) {                            \
+      ET_CHECK_MSG(                                                     \
+          a_strides[i] == b_strides[i] && b_strides[i] == c_strides[i], \
+          "a_strides[%zu], b_strides[%zu] and c_strides[%zu] "          \
+          "shall share same value, but now is %d, %d and %d",           \
+          i,                                                            \
+          i,                                                            \
+          i,                                                            \
+          (int32_t)a_strides[i],                                        \
+          (int32_t)b_strides[i],                                        \
+          (int32_t)c_strides[i]);                                       \
+    }                                                                   \
+  })
+
+#define ET_CHECK_DEFAULT_OR_CHANNELSLAST_DIMORDER(t__)           \
+  ({                                                             \
+    ET_CHECK_MSG(                                                \
+        is_contiguous_dim_order(                                 \
+            t__.dim_order().data(), t__.dim_order().size()) ||   \
+            is_channels_last_dim_order(                          \
+                t__.dim_order().data(), t__.dim_order().size()), \
+        "Tensor must have default or channels last dim order");  \
+  })
+
+/**
+ * DEPRECATED: Please use ET_CHECK_OR_RETURN_FALSE instead and provide
+ * an informative message. (For example, the values of any variables used in
+ * `cond` would not be reported automatically by this macro.)
+ *
+ * A convenience macro to be used in utility functions that check whether input
+ * tensor(s) are valid, which are expected to return a boolean. Checks whether
+ * `cond` is true; if not, log the failed check and return false.
+ *
+ * @param[in] cond the condition to check
+ */
+#define ET_LOG_AND_RETURN_IF_FALSE(cond) ET_CHECK_OR_RETURN_FALSE(cond, "")
+
+/**
+ * DEPRECATED: Please use ET_CHECK_OR_RETURN_FALSE instead.
+ */
+#define ET_LOG_MSG_AND_RETURN_IF_FALSE ET_CHECK_OR_RETURN_FALSE
+
+/**
+ * If `cond` is false, log `cond` and return from the kernel with a failure
+ * state set.
+ *
+ * @param[in] context the runtime context
+ * @param[in] cond the condition to check
+ * @param[in] error torch::executor::Error enum value (e.g `InvalidArgument`)
+ * @param[in] retval return value of the kernel to allow for early exit
+ */
+#define ET_KERNEL_CHECK(context, cond, error, retval) \
+  do {                                                \
+    if (!(cond)) {                                    \
+      ET_LOG(Error, "Check failed (%s): ", #cond);    \
+      context.fail(torch::executor::Error::error);    \
+      return retval;                                  \
+    }                                                 \
+  } while (false)
+
+/**
+ * If `cond` is false, log `message` and return from the kernel with a failure
+ * state set.
+ *
+ * @param[in] context the runtime context
+ * @param[in] cond the condition to check
+ * @param[in] error torch::executor::Error enum value (e.g `InvalidArgument`)
+ * @param[in] retval return value of the kernel to allow for early exit
+ */
+#define ET_KERNEL_CHECK_MSG(context, cond, error, retval, message, ...)   \
+  do {                                                                    \
+    if (!(cond)) {                                                        \
+      ET_LOG(Error, "Check failed (%s): " message, #cond, ##__VA_ARGS__); \
+      context.fail(torch::executor::Error::error);                        \
+      return retval;                                                      \
+    }                                                                     \
+  } while (false)
+
+/**
+ * Convenience macro to extract a scalar tensor into a value
+ */
+#define ET_EXTRACT_SCALAR_TENSOR(scalar_tensor, out_val) \
+  ET_CHECK_MSG(                                          \
+      extract_scalar_tensor(scalar_tensor, &out_val),    \
+      #scalar_tensor " could not be extracted: wrong type or out of range");
+
+namespace executorch {
+namespace ET_RUNTIME_NAMESPACE {
+//
+// Utility functions for checking tensor attributes
+//
+//
+
+/*
+ * Returns true if the given dimension value is between -upper_bound and
+ * upper_bound - 1, inclusive.
+ */
+inline bool dim_is_valid(int64_t dim, int64_t upper_bound) {
+  ET_CHECK_OR_RETURN_FALSE(
+      dim >= -upper_bound && dim < upper_bound,
+      "Dimension %" PRId64
+      " is out of range. Dimension should be between %" PRId64 " and %" PRId64
+      ", inclusive.",
+      dim,
+      -upper_bound,
+      upper_bound - 1);
+
+  return true;
+}
+
+/*
+ * Returns the tensor's number of dimensions, except when the tensor is zero
+ * dimensional. In this case, it returns 1. This is used to properly handle
+ * the zero dimensional tensors in some kernels, that treat them as 1D tensors
+ * with a single element.
+ */
+inline ssize_t nonzero_dim(const executorch::aten::Tensor& tensor) {
+  return tensor.dim() == 0 ? 1 : tensor.dim();
+}
+
+/*
+ * Returns the size along a dimension dim, except when the tensor is zero
+ * dimensional. In this case, it returns 1. This is used to properly handle
+ * the zero dimensional tensors in some kernels, that treat them as 1D tensors
+ * with a single element.
+ */
+inline ssize_t nonempty_size(
+    const executorch::aten::Tensor& tensor,
+    ssize_t dim) {
+  return tensor.dim() == 0 ? 1 : tensor.size(dim);
+}
+
+inline bool tensor_can_cast_to(
+    executorch::aten::Tensor a,
+    executorch::aten::ScalarType dtype) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::canCast(a.scalar_type(), dtype),
+      "Tensor of dtype %s cannot cast to dtype %s",
+      ::torch::executor::toString(a.scalar_type()),
+      ::torch::executor::toString(dtype));
+
+  return true;
+}
+
+inline bool tensor_is_bool_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      t.scalar_type() == executorch::aten::ScalarType::Bool,
+      "Expected to find bool type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_type(
+    executorch::aten::Tensor t,
+    executorch::aten::ScalarType dtype) {
+  ET_CHECK_OR_RETURN_FALSE(
+      t.scalar_type() == dtype,
+      "Expected to find %s type, but tensor has type %s",
+      ::torch::executor::toString(dtype),
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_type(
+    executorch::aten::Tensor t,
+    executorch::aten::ScalarType dtype,
+    executorch::aten::ScalarType dtype2) {
+  ET_LOG_MSG_AND_RETURN_IF_FALSE(
+      t.scalar_type() == dtype || t.scalar_type() == dtype2,
+      "Expected to find %s or %s type, but tensor has type %s",
+      ::torch::executor::toString(dtype),
+      ::torch::executor::toString(dtype2),
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_type(
+    executorch::aten::Tensor t,
+    executorch::aten::ScalarType dtype,
+    executorch::aten::ScalarType dtype2,
+    executorch::aten::ScalarType dtype3) {
+  ET_LOG_MSG_AND_RETURN_IF_FALSE(
+      t.scalar_type() == dtype || t.scalar_type() == dtype2 ||
+          t.scalar_type() == dtype3,
+      "Expected to find %s, %s, or %s type, but tensor has type %s",
+      ::torch::executor::toString(dtype),
+      ::torch::executor::toString(dtype2),
+      ::torch::executor::toString(dtype3),
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_integral_type(
+    executorch::aten::Tensor t,
+    bool includeBool = false) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::isIntegralType(t.scalar_type(), includeBool),
+      "Expected to find a integral type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_floating_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::isFloatingType(t.scalar_type()),
+      "Expected to find a floating type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_real_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::isRealType(t.scalar_type()),
+      "Expected to find a real type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_realh_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::isRealHType(t.scalar_type()),
+      "Expected to find a real type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_realhbf16_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      executorch::runtime::isRealHBF16Type(t.scalar_type()),
+      "Expected to find a real type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_realhb_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::isRealHBType(t.scalar_type()),
+      "Expected to find a real type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_realhbbf16_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      executorch::runtime::isRealHBBF16Type(t.scalar_type()),
+      "Expected to find a real type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_complex_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::isComplexType(t.scalar_type()),
+      "Expected to find a complex type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensor_is_bits_type(executorch::aten::Tensor t) {
+  ET_CHECK_OR_RETURN_FALSE(
+      ::torch::executor::isBitsType(t.scalar_type()),
+      "Expected to find a bits type, but tensor has type %s",
+      ::torch::executor::toString(t.scalar_type()));
+
+  return true;
+}
+
+inline bool tensors_have_same_dtype(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b) {
+  ET_CHECK_OR_RETURN_FALSE(
+      a.scalar_type() == b.scalar_type(),
+      ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s}",
+      ::torch::executor::toString(a.scalar_type()),
+      ::torch::executor::toString(b.scalar_type()));
+  return true;
+}
+
+inline bool tensors_have_same_dtype(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b,
+    executorch::aten::Tensor c) {
+  ET_CHECK_OR_RETURN_FALSE(
+      a.scalar_type() == b.scalar_type() && b.scalar_type() == c.scalar_type(),
+      ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s, %s}",
+      ::torch::executor::toString(a.scalar_type()),
+      ::torch::executor::toString(b.scalar_type()),
+      ::torch::executor::toString(c.scalar_type()));
+  return true;
+}
+
+inline bool tensor_is_rank(executorch::aten::Tensor t, size_t rank) {
+  ET_CHECK_OR_RETURN_FALSE(
+      static_cast<size_t>(t.dim()) == rank,
+      "Expected tensor.dim() to be %zu, but got %zu",
+      static_cast<size_t>(rank),
+      static_cast<size_t>(t.dim()));
+
+  return true;
+}
+
+inline bool tensor_has_rank_greater_or_equal_to(
+    executorch::aten::Tensor t,
+    size_t rank) {
+  ET_CHECK_OR_RETURN_FALSE(
+      static_cast<size_t>(t.dim()) >= rank,
+      "Expected tensor.dim() to be >= %zu, but got %zu",
+      static_cast<size_t>(rank),
+      static_cast<size_t>(t.dim()));
+
+  return true;
+}
+
+inline bool tensor_has_rank_smaller_or_equal_to(
+    executorch::aten::Tensor t,
+    size_t rank) {
+  ET_CHECK_OR_RETURN_FALSE(
+      static_cast<size_t>(t.dim()) <= rank,
+      "Expected tensor.dim() to be <= %zu, but got %zu",
+      static_cast<size_t>(rank),
+      static_cast<size_t>(t.dim()));
+
+  return true;
+}
+
+inline bool tensor_has_dim(executorch::aten::Tensor t, int64_t d) {
+  if (t.dim() == 0) {
+    ET_CHECK_OR_RETURN_FALSE(
+        d == 0 || d == -1,
+        "dim must be 0 or -1 for 0-dim tensor, got %" PRId64,
+        d);
+  } else {
+    ET_CHECK_OR_RETURN_FALSE(
+        d > 0 ? d < t.dim() : t.dim() + d >= 0,
+        "%zu-dim tensor does not have dim at index %zu",
+        static_cast<size_t>(t.dim()),
+        static_cast<size_t>(d));
+  }
+  return true;
+}
+
+inline bool tensor_has_non_empty_dim(executorch::aten::Tensor t, int64_t d) {
+  const size_t udim = ET_NORMALIZE_IX(d, t.dim());
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(t, d));
+  ET_LOG_AND_RETURN_IF_FALSE(t.size(udim) != 0);
+  return true;
+}
+
+inline bool
+tensor_dim_has_index(executorch::aten::Tensor t, int64_t d, int64_t ix) {
+  // Indexing ops don't support zero-dim tensors
+  ET_CHECK(t.dim() != 0);
+  if (d < 0) {
+    d += t.dim();
+  }
+  // Dimension must have been already checked by tensor_has_dim
+  ET_CHECK(d >= 0 && d < t.dim());
+
+  ET_CHECK_OR_RETURN_FALSE(
+      ix >= -t.size(d) && ix < t.size(d),
+      "index %" PRId64 " out of range [-%zu,%zu) at dimension %" PRId64 ")",
+      ix,
+      static_cast<size_t>(t.size(d)),
+      static_cast<size_t>(t.size(d)),
+      d);
+  return true;
+}
+
+inline bool tensors_have_same_size_at_dims(
+    executorch::aten::Tensor a,
+    size_t dim_a,
+    executorch::aten::Tensor b,
+    size_t dim_b) {
+  ET_CHECK_OR_RETURN_FALSE(
+      dim_a < static_cast<size_t>(a.dim()),
+      "Cannot retrieve dim %zu from tensor with dim %zu",
+      static_cast<size_t>(dim_a),
+      static_cast<size_t>(a.dim()));
+  ET_CHECK_OR_RETURN_FALSE(
+      dim_b < static_cast<size_t>(b.dim()),
+      "Cannot retrieve dim %zu from tensor with dim %zu",
+      static_cast<size_t>(dim_b),
+      static_cast<size_t>(b.dim()));
+  ET_CHECK_OR_RETURN_FALSE(
+      a.size(dim_a) == b.size(dim_b),
+      ET_TENSOR_CHECK_PREFIX__
+      ": a.size(%zu) = %zu does not match b.size(%zu) = %zu",
+      static_cast<size_t>(dim_a),
+      static_cast<size_t>(a.size(dim_a)),
+      static_cast<size_t>(dim_b),
+      static_cast<size_t>(b.size(dim_b)));
+
+  return true;
+}
+
+inline bool tensors_have_same_shape(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b) {
+  if (a.numel() == 1 && b.numel() == 1) {
+    // PyTorch operators treat all scalar tensors as the same shape even if
+    // they have different dims.
+    return true;
+  }
+  if (!(a.sizes() == b.sizes() && a.numel() == b.numel())) {
+    ET_LOG(
+        Error,
+        ET_TENSOR_CHECK_PREFIX__ ": numel=(%zu,  %zu), dim=(%zu, %zu)",
+        static_cast<size_t>(a.numel()),
+        static_cast<size_t>(b.numel()),
+        static_cast<size_t>(a.dim()),
+        static_cast<size_t>(b.dim()));
+    // Using [[maybe_unused]] as ET_LOG may not trigger based on verbosity
+    for ([[maybe_unused]] const auto d :
+         c10::irange(ET_MIN2(a.dim(), b.dim()))) {
+      ET_LOG(
+          Error,
+          "    size(%zu): (%zu, %zu)",
+          static_cast<size_t>(d),
+          static_cast<size_t>(a.size(d)),
+          static_cast<size_t>(b.size(d)));
+    }
+
+    return false;
+  }
+
+  return true;
+}
+
+inline bool tensors_have_same_shape(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b,
+    executorch::aten::Tensor c) {
+  if (a.numel() == 1 && b.numel() == 1 && c.numel() == 1) {
+    // PyTorch operators treat all scalar tensors as the same shape even if
+    // they have different dims.
+    return true;
+  }
+  bool cond1 = (a.sizes() == b.sizes()) && (a.numel() == b.numel());
+  bool cond2 = (b.sizes() == c.sizes()) && (b.numel() == c.numel());
+
+  if (!(cond1 && cond2)) {
+    ET_LOG(
+        Error,
+        ET_TENSOR_CHECK_PREFIX__ ": numel=(%zu, %zu, %zu), dim=(%zu, %zu, %zu)",
+        static_cast<size_t>(a.numel()),
+        static_cast<size_t>(b.numel()),
+        static_cast<size_t>(c.numel()),
+        static_cast<size_t>(a.dim()),
+        static_cast<size_t>(b.dim()),
+        static_cast<size_t>(c.dim()));
+    for ([[maybe_unused]] const auto d :
+         c10::irange(ET_MIN3(a.dim(), b.dim(), c.dim()))) {
+      ET_LOG(
+          Error,
+          "    size(%zu): (%zu, %zu, %zu)",
+          static_cast<size_t>(d),
+          static_cast<size_t>(a.size(d)),
+          static_cast<size_t>(b.size(d)),
+          static_cast<size_t>(c.size(d)));
+    }
+
+    return false;
+  }
+
+  return true;
+}
+
+inline bool tensors_have_same_shape_and_dtype(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b) {
+  return tensors_have_same_shape(a, b) && tensors_have_same_dtype(a, b);
+}
+
+inline bool tensors_have_same_shape_and_dtype(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b,
+    executorch::aten::Tensor c) {
+  return tensors_have_same_shape(a, b, c) && tensors_have_same_dtype(a, b, c);
+}
+
+inline bool tensor_has_expected_size(
+    executorch::aten::Tensor a,
+    executorch::aten::ArrayRef<executorch::aten::SizesType> expected_sizes) {
+  if (!(a.sizes() == expected_sizes)) {
+    ET_LOG(
+        Error,
+        ET_TENSOR_CHECK_PREFIX__ ": dim=(%zu, %zu)",
+        static_cast<size_t>(a.dim()),
+        static_cast<size_t>(expected_sizes.size()));
+    size_t a_dim = static_cast<size_t>(a.dim());
+    size_t expected_dim = static_cast<size_t>(expected_sizes.size());
+    for ([[maybe_unused]] const auto d :
+         c10::irange(ET_MIN2(a_dim, expected_dim))) {
+      ET_LOG(
+          Error,
+          "    size(%zu): (%zu, %zu)",
+          static_cast<size_t>(d),
+          static_cast<size_t>(a.size(d)),
+          static_cast<size_t>(expected_sizes[d]));
+    }
+
+    return false;
+  }
+  return true;
+}
+
+inline bool tensors_have_same_strides(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b) {
+  if (a.strides() != b.strides()) {
+    ET_LOG(
+        Error,
+        ET_TENSOR_CHECK_PREFIX__ ": dim=(%zu, %zu)",
+        static_cast<size_t>(a.dim()),
+        static_cast<size_t>(b.dim()));
+    for ([[maybe_unused]] const auto d :
+         c10::irange(ET_MIN2(a.dim(), b.dim()))) {
+      ET_LOG(
+          Error,
+          "    stride(%zu): (%zu, %zu)",
+          static_cast<size_t>(d),
+          static_cast<size_t>(a.strides()[d]),
+          static_cast<size_t>(b.strides()[d]));
+    }
+
+    return false;
+  }
+  return true;
+}
+
+inline bool tensors_have_same_strides(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b,
+    executorch::aten::Tensor c) {
+  if (!(a.strides() == b.strides() && b.strides() == c.strides())) {
+    ET_LOG(
+        Error,
+        ET_TENSOR_CHECK_PREFIX__ ": dim=(%zu, %zu, %zu)",
+        static_cast<size_t>(a.dim()),
+        static_cast<size_t>(b.dim()),
+        static_cast<size_t>(c.dim()));
+    for ([[maybe_unused]] const auto d :
+         c10::irange(ET_MIN3(a.dim(), b.dim(), c.dim()))) {
+      ET_LOG(
+          Error,
+          "    stride(%zu): (%zu, %zu, %zu)",
+          static_cast<size_t>(d),
+          static_cast<size_t>(a.strides()[d]),
+          static_cast<size_t>(b.strides()[d]),
+          static_cast<size_t>(c.strides()[d]));
+    }
+
+    return false;
+  }
+  return true;
+}
+
+inline bool tensor_is_contiguous(executorch::aten::Tensor t) {
+  const auto strides = t.strides();
+  const auto sizes = t.sizes();
+  // If tensor is 0-dim (i.e. a scalar tensor) it is contiguous
+  if (strides.size() == 0) {
+    return true;
+  }
+  ET_CHECK_OR_RETURN_FALSE(
+      strides[strides.size() - 1] == 1,
+      "Tensor is not contiguous; the stride of the last dimension must be 1, "
+      "but got %zu",
+      static_cast<size_t>(strides[strides.size() - 1]));
+  for (int i = strides.size() - 1; i > 0; --i) {
+    ET_CHECK_OR_RETURN_FALSE(
+        strides[i - 1] == strides[i] * sizes[i],
+        "Tensor is not contiguous; the stride of dim %zu should be equal to "
+        "strides[%zu] * sizes[%zu] = %zu, but found %zu",
+        static_cast<size_t>(i - 1),
+        static_cast<size_t>(i),
+        static_cast<size_t>(i),
+        static_cast<size_t>(strides[i] * sizes[i]),
+        static_cast<size_t>(strides[i - 1]));
+  }
+  return true;
+}
+
+inline bool tensors_have_same_rank(
+    executorch::aten::Tensor a,
+    executorch::aten::Tensor b) {
+  ET_CHECK_OR_RETURN_FALSE(
+      a.dim() == b.dim(),
+      ET_TENSOR_CHECK_PREFIX__ ": rank={%zd, %zd}",
+      ssize_t(a.dim()),
+      ssize_t(b.dim()));
+  return true;
+}
+
+inline bool tensor_is_scalar(executorch::aten::Tensor t) {
+  return t.dim() == 0 && t.numel() == 1;
+}
+
+/// Returns the product of dim[0:dim), not including dim.
+inline size_t getLeadingDims(
+    const executorch::aten::Tensor& tensor,
+    int64_t dim) {
+  ET_CHECK_MSG(
+      dim >= 0 && dim <= tensor.dim(),
+      "Ending dimension %" PRId64
+      " should be in the range [0, tensor.dim() %zd].",
+      dim,
+      ssize_t(tensor.dim()));
+  size_t dims = 1;
+  for (const auto i : c10::irange(dim)) {
+    dims *= static_cast<size_t>(tensor.size(i));
+  }
+  return dims;
+}
+
+/// Returns the product of dim[dim+1:].
+inline size_t getTrailingDims(
+    const executorch::aten::Tensor& tensor,
+    int64_t dim) {
+  ET_CHECK_MSG(
+      dim >= -1 && dim < tensor.dim(),
+      "Starting dimension %" PRId64
+      " should be in the range [-1, tensor.dim() -1 %zd).",
+      dim,
+      ssize_t(tensor.dim()));
+  size_t dims = 1;
+  for (size_t i = dim + 1; i < static_cast<size_t>(tensor.dim()); ++i) {
+    dims *= static_cast<size_t>(tensor.size(i));
+  }
+  return dims;
+}
+
+/**
+ * Given a N-dimensional tensor coordinate, return a linear index that can be
+ * used to access the corresponding element in the tensor's data buffer.
+ *
+ * @param[in] tensor The tensor that will be indexed
+ * @param[in] coordinate A n-dimensional array representing the coordinate to
+ * index. It is assumed that the array has kTensorDimensionLimit elements.
+ * @param[out] index The linear index to element at the specified coordinate
+ * in the tensor.
+ */
+inline size_t coordinateToIndex(
+    const executorch::aten::Tensor& tensor,
+    const size_t* const coordinate) {
+  size_t index = 0;
+  for (int d = 0; d < tensor.dim(); ++d) {
+    index += coordinate[d] * getTrailingDims(tensor, d);
+  }
+  return index;
+}
+
+/**
+ * Produce a memoized array for use with repeated calls to
+ * coordinateToIndexWithTrailingDimsMemo, which will be faster than
+ * repeated calls to coordinateToIndex.
+ */
+inline void memoizeTrailingDims(
+    const executorch::aten::Tensor& tensor,
+    size_t trailing_dims_memo[kTensorDimensionLimit]) {
+  const auto tensorDim = tensor.dim();
+  size_t dims = 1;
+  for (int ii = tensorDim - 1; ii >= 0; --ii) {
+    trailing_dims_memo[ii] = dims;
+    dims *= static_cast<size_t>(tensor.size(ii));
+  }
+}
+
+/**
+ * Like coordinateToIndex, but faster for repeated calls with the same
+ * tensor. trailing_dims_memo must be produced by a call to
+ * memoizeTrailingDims.
+ */
+inline size_t coordinateToIndexWithTrailingDimsMemo(
+    const executorch::aten::Tensor& tensor,
+    const size_t* const coordinate,
+    const size_t trailing_dims_memo[kTensorDimensionLimit]) {
+  size_t index = 0;
+  for (int d = 0; d < tensor.dim(); ++d) {
+    index += coordinate[d] * trailing_dims_memo[d];
+  }
+  return index;
+}
+
+/**
+ * Given the linear index return the N-dimensional tensor coordinate. This is
+ * the inverse operation of coordinateToIndex.
+ *
+ * @param[in] tensor The tensor that will be indexed
+ * @param[in] index The linear index to element at the specified coordinate in
+ * the tensor.
+ * @param[out] coordinate A n-dimensional array representing the coordinate to
+ * index. It is assumed that the array has kTensorDimensionLimit elements.
+ * @returns void
+ */
+inline void indexToCoordinate(
+    const executorch::aten::Tensor& tensor,
+    size_t index,
+    size_t* coordinate) {
+  ET_CHECK(index < static_cast<size_t>(tensor.numel()));
+  for (auto i = 0; i < tensor.dim(); ++i) {
+    auto dim = tensor.dim() - 1 - i;
+    size_t dim_size = tensor.size(dim);
+    coordinate[dim] = index % dim_size;
+    index /= dim_size;
+  }
+}
+
+/**
+ * Extracts an integer value from a scalar Tensor.
+ *
+ * @param[in] tensor The source of the value to extract.
+ * @param[out] out_val The extracted value, on success.
+ * @returns `true` if a value was extracted, and sets `*out_val` to that
+ * value. `false` if a value could not be extracted: either it was not an
+ * integer Scalar Tensor, or the value of that Scalar Tensor could not be
+ * represented by INT_T.
+ */
+template <
+    typename INT_T,
+    typename std::enable_if<
+        std::is_integral<INT_T>::value && !std::is_same<INT_T, bool>::value,
+        bool>::type = true>
+bool extract_scalar_tensor(executorch::aten::Tensor tensor, INT_T* out_val) {
+  if (tensor.numel() != 1) {
+    return false;
+  }
+#define CASE_INT_DTYPE(TENSOR_CTYPE, TENSOR_DTYPE)                     \
+  case executorch::aten::ScalarType::TENSOR_DTYPE: {                   \
+    const TENSOR_CTYPE val = tensor.const_data_ptr<TENSOR_CTYPE>()[0]; \
+    if (val < std::numeric_limits<INT_T>::lowest() ||                  \
+        val > std::numeric_limits<INT_T>::max()) {                     \
+      return false;                                                    \
+    }                                                                  \
+    *out_val = static_cast<INT_T>(val);                                \
+    return true;                                                       \
+  }
+
+  switch (tensor.scalar_type()) {
+    ET_FORALL_INT_TYPES(CASE_INT_DTYPE);
+    default:
+      return false;
+  }
+#undef CASE_INT_DTYPE
+}
+
+/**
+ * Extracts a floating point value from a scalar Tensor.
+ *
+ * @param[in] tensor The source of the value to extract.
+ * @param[out] out_val The extracted value, on success.
+ * @returns `true` if a value was extracted, and sets `*out_val` to that
+ * value. `false` if a value could not be extracted: either it was not a
+ * floating point Scalar Tensor, or the value of that Scalar Tensor could not
+ * be represented by FLOAT_T.
+ */
+template <
+    typename FLOAT_T,
+    typename std::enable_if<
+        std::is_floating_point_v<FLOAT_T> ||
+            std::is_same_v<FLOAT_T, executorch::aten::BFloat16> ||
+            std::is_same_v<FLOAT_T, executorch::aten::Half>,
+        bool>::type = true>
+bool extract_scalar_tensor(executorch::aten::Tensor tensor, FLOAT_T* out_val) {
+  if (tensor.numel() != 1) {
+    return false;
+  }
+#define CASE_REAL_DTYPE(TENSOR_CTYPE, TENSOR_DTYPE)                    \
+  case executorch::aten::ScalarType::TENSOR_DTYPE: {                   \
+    /* ET_FORALL_REAL_TYPES guarantees TENSOR_CTYPE is a real type. */ \
+    double val =                                                       \
+        static_cast<double>(tensor.const_data_ptr<TENSOR_CTYPE>()[0]); \
+    if (std::isfinite(val) &&                                          \
+        (val < std::numeric_limits<FLOAT_T>::lowest() ||               \
+         val > std::numeric_limits<FLOAT_T>::max())) {                 \
+      return false;                                                    \
+    }                                                                  \
+    *out_val = static_cast<FLOAT_T>(val);                              \
+    return true;                                                       \
+  }
+
+  switch (tensor.scalar_type()) {
+    ET_FORALL_REALHBF16_TYPES(CASE_REAL_DTYPE);
+    default:
+      return false;
+  }
+#undef CASE_REAL_DTYPE
+}
+
+/**
+ * Extracts a boolean value from a Scalar.
+ *
+ * @param[in] scalar The source of the value to extract.
+ * @param[out] out_val The extracted value, on success.
+ * @returns `true` if a value was extracted, and sets `*out_val` to that
+ * value. `false` if a value could not be extracted, i.e. not a boolean
+ */
+template <
+    typename BOOL_T,
+    typename std::enable_if<std::is_same<BOOL_T, bool>::value, bool>::type =
+        true>
+bool extract_scalar_tensor(executorch::aten::Tensor tensor, BOOL_T* out_val) {
+  if (tensor.scalar_type() != executorch::aten::ScalarType::Bool) {
+    return false;
+  }
+  if (tensor.numel() != 1) {
+    return false;
+  }
+
+  bool val = tensor.const_data_ptr<bool>()[0];
+
+  *out_val = static_cast<BOOL_T>(val);
+
+  return true;
+}
+
+/// These APIs should not be used outside of Executor.cpp.
+namespace internal {
+/**
+ * Share t_src's data_ptr with t_dst.
+ */
+ET_NODISCARD Error share_tensor_data(
+    const executorch::aten::Tensor& t_dst,
+    const executorch::aten::Tensor& t_src);
+
+/**
+ * Copy t_src's data_ptr to t_dst.
+ */
+ET_NODISCARD Error copy_tensor_data(
+    const executorch::aten::Tensor& t_dst,
+    const executorch::aten::Tensor& t_src);
+
+/**
+ * Set the data_ptr of t to buffer.
+ */
+ET_NODISCARD Error set_tensor_data(
+    const executorch::aten::Tensor& t,
+    void* buffer,
+    size_t buffer_size);
+
+/**
+ * Reset tensor's data_ptr, clear all the storage for at::Tensor.
+ */
+void reset_data_ptr(const executorch::aten::Tensor& tensor);
+
+/**
+ * Resize tensor impl
+ */
+ET_NODISCARD Error resize_tensor_impl(
+    executorch::aten::TensorImpl* impl,
+    executorch::aten::ArrayRef<executorch::aten::SizesType> new_sizes);
+
+} // namespace internal
+
+/**
+ * Resize a tensor to new_sizes, rank must stay the same. Currently does not
+ * expand the tensor if new size exceeds the current capacity. Currently
+ * fails an ET_CHECK if the tensor cannot be resized.
+ *
+ * WARNING: Placeholder API until discussion around runtime context is
+ * settled, will likely move to be a class method on a TensorResizer object
+ * passed in through runtimeContext.
+ */
+ET_NODISCARD inline Error resize_tensor(
+    executorch::aten::Tensor t,
+    executorch::aten::ArrayRef<executorch::aten::SizesType> new_sizes) {
+  return internal::resize_tensor_impl(t.unsafeGetTensorImpl(), new_sizes);
+}
+
+/**
+ * Resize a tensor to new_sizes, rank must stay the same. Currently does not
+ * expand the tensor if new size exceeds the current capacity. Currently
+ * fails an ET_CHECK if the tensor cannot be resized.
+ *
+ * WARNING: Placeholder API until discussion around runtime context is
+ * settled, will likely move to be a class method on a TensorResizer object
+ * passed in through runtimeContext.
+ */
+template <
+    typename T,
+    typename std::enable_if<
+        !std::is_same<executorch::aten::SizesType, T>::value,
+        int>::type = 0>
+ET_NODISCARD inline Error resize_tensor(
+    executorch::aten::Tensor t,
+    executorch::aten::ArrayRef<T> new_sizes) {
+  // Need to cast the input array to an array of Tensor::SizesType
+  std::array<executorch::aten::SizesType, kTensorDimensionLimit>
+      new_sizes_casted{};
+  size_t new_sizes_ndim = new_sizes.size();
+  for (size_t i = 0; i < new_sizes_ndim; ++i) {
+    new_sizes_casted[i] =
+        static_cast<executorch::aten::SizesType>(new_sizes[i]);
+  }
+
+  return internal::resize_tensor_impl(
+      t.unsafeGetTensorImpl(), {new_sizes_casted.data(), new_sizes_ndim});
+}
+
+/// DEPRECATED: Use `resize_tensor()` instead, which can fail non-fatally.
+ET_DEPRECATED inline void resize(
+    executorch::aten::Tensor t,
+    executorch::aten::ArrayRef<executorch::aten::SizesType> new_sizes) {
+  Error err = resize_tensor(t, new_sizes);
+  ET_CHECK_MSG(
+      err == Error::Ok, "Could not resize Tensor; see logs for details");
+}
+/**
+ * Get dim_order of a Tensor and write it to out_dim_order.
+ * @param tensor The tensor where we want to get dim order from.
+ * @param out_dim_order Pointing to an array of DimOrderType where we write
+ * dim order into it.
+ * @param out_dim_order_size Size of the DimOrderType array.
+ */
+ET_NODISCARD Error get_dim_order(
+    const executorch::aten::Tensor& tensor,
+    executorch::aten::DimOrderType* out_dim_order,
+    size_t out_dim_order_size);
+
+/**
+ * Checks whether a tensor has a valid dim order. If the dim order could not
+ * be determined, then this function returns false by default.
+ */
+bool tensor_has_valid_dim_order(executorch::aten::Tensor t);
+
+/**
+ * Checks whether a tensor has either the default of channels last dim order.
+ * If the dim order could not be determined, then this function returns false
+ * by default.
+ */
+bool tensor_is_default_or_channels_last_dim_order(executorch::aten::Tensor t);
+
+/**
+ * Checks whether a tensor has the default dimension order.
+ * Logs an error message if the tensor does not meet the expected criteria.
+ *
+ * @param t The tensor to check the dimension order of.
+ * @return True if the tensor has the default dimension order, false otherwise.
+ */
+bool tensor_is_default_dim_order(executorch::aten::Tensor t);
+
+/**
+ * Checks whether a tensor has the channels last dimension order.
+ * Logs an error message if the tensor does not meet the expected criteria.
+ *
+ * @param t The tensor to check the dimension order of.
+ * @return True if the tensor has the channels last dimension order, false
+ * otherwise.
+ */
+bool tensor_is_channels_last_dim_order(executorch::aten::Tensor t);
+
+/**
+ * Asserts that four tensors have the same dim_order
+ *
+ * Note that this macro only tests dim order, but not others like actual data,
+ * sizes, etc.
+ *
+ */
+bool tensors_have_same_dim_order(
+    const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list);
+
+/**
+ * Asserts that two tensors have the same dim_order
+ *
+ * Note that this macro only tests dim order, but not others like actual data,
+ * sizes, etc.
+ */
+
+inline bool tensors_have_same_dim_order(
+    const executorch::aten::Tensor& a,
+    const executorch::aten::Tensor& b) {
+  executorch::aten::Tensor tensor_list[2] = {a, b};
+  return tensors_have_same_dim_order(tensor_list);
+}
+
+/**
+ * Asserts that three tensors have the same dim_order
+ *
+ * Note that this macro only tests dim order, but not others like actual data,
+ * sizes, etc.
+ *
+ */
+
+inline bool tensors_have_same_dim_order(
+    const executorch::aten::Tensor& a,
+    const executorch::aten::Tensor& b,
+    const executorch::aten::Tensor& c) {
+  executorch::aten::Tensor tensor_list[3] = {a, b, c};
+  return tensors_have_same_dim_order(tensor_list);
+}
+
+/**
+ * Asserts that four tensors have the same dim_order
+ *
+ * Note that this macro only tests dim order, but not others like actual data,
+ * sizes, etc.
+ *
+ */
+
+inline bool tensors_have_same_dim_order(
+    const executorch::aten::Tensor& a,
+    const executorch::aten::Tensor& b,
+    const executorch::aten::Tensor& c,
+    const executorch::aten::Tensor& d) {
+  executorch::aten::Tensor tensor_list[4] = {a, b, c, d};
+  return tensors_have_same_dim_order(tensor_list);
+}
+
+/**
+ * Given an n-dimensional coordinate array and an array of tensor strides,
+ * calculates the linear index that can be used to retrieve the value at the
+ * given coordinates.
+ * @param coordinate Pointer to the array of coordinates.
+ * @param strides Pointer to the array of strides.
+ * @param ndim Number of dimensions in the tensor.
+ */
+inline size_t calculate_linear_index(
+    const executorch::aten::SizesType* coordinate,
+    const executorch::aten::StridesType* strides,
+    const size_t ndim) {
+  size_t index = 0;
+  for (size_t i = 0; i < ndim; i++) {
+    index += coordinate[i] * strides[i];
+  }
+  return index;
+}
+
+} // namespace ET_RUNTIME_NAMESPACE
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::ET_RUNTIME_NAMESPACE::calculate_linear_index;
+using ::executorch::ET_RUNTIME_NAMESPACE::coordinateToIndex;
+using ::executorch::ET_RUNTIME_NAMESPACE::dim_is_valid;
+using ::executorch::ET_RUNTIME_NAMESPACE::extract_scalar_tensor;
+using ::executorch::ET_RUNTIME_NAMESPACE::get_dim_order;
+using ::executorch::ET_RUNTIME_NAMESPACE::getLeadingDims;
+using ::executorch::ET_RUNTIME_NAMESPACE::getTrailingDims;
+using ::executorch::ET_RUNTIME_NAMESPACE::indexToCoordinate;
+using ::executorch::ET_RUNTIME_NAMESPACE::nonempty_size;
+using ::executorch::ET_RUNTIME_NAMESPACE::nonzero_dim;
+using ::executorch::ET_RUNTIME_NAMESPACE::resize;
+using ::executorch::ET_RUNTIME_NAMESPACE::resize_tensor;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_can_cast_to;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_dim_has_index;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_dim;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_expected_size;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_non_empty_dim;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_rank_greater_or_equal_to;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_rank_smaller_or_equal_to;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_valid_dim_order;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_bits_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_bool_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_complex_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_contiguous;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_default_dim_order;
+using ::executorch::ET_RUNTIME_NAMESPACE::
+    tensor_is_default_or_channels_last_dim_order;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_floating_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_integral_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_rank;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_real_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_realh_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_realhb_type;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_scalar;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dim_order;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dtype;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_rank;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape_and_dtype;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_size_at_dims;
+using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_strides;
+using ::executorch::runtime::kTensorDimensionLimit;
+namespace internal {
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::copy_tensor_data;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::reset_data_ptr;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::resize_tensor_impl;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::set_tensor_data;
+using ::executorch::ET_RUNTIME_NAMESPACE::internal::share_tensor_data;
+} // namespace internal
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/freeable_buffer.h b/include/executorch/runtime/core/freeable_buffer.h
new file mode 100644
index 00000000000..a90c899103d
--- /dev/null
+++ b/include/executorch/runtime/core/freeable_buffer.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * A read-only buffer than can be freed.
+ */
+class FreeableBuffer final {
+ public:
+  // Callback signature for the function that does the freeing.
+  using FreeFn = void (*)(void* context, void* data, size_t size);
+
+  /**
+   * Creates an empty FreeableBuffer with size zero and a null data pointer.
+   */
+  FreeableBuffer()
+      : free_fn_(nullptr),
+        free_fn_context_(nullptr),
+        data_(nullptr),
+        size_(0) {}
+
+  /**
+   * Creates a FreeableBuffer with an optional free function.
+   *
+   * @param[in] data The data of the segment.
+   * @param[in] size The size of the segment data, in bytes.
+   * @param[in] free_fn Optional function to free the data. Guaranteed to be
+   *     called exactly once before the FreeableBuffer is destroyed. May be
+   *     nullptr. NOTE: This function must be thread-safe. If it modifies common
+   *     state, the function must do its own locking.
+   * @param[in] free_fn_context Opaque pointer to pass as the `context`
+   *     parameter of `free_fn`. May be nullptr.
+   */
+  FreeableBuffer(
+      const void* data,
+      size_t size,
+      FreeFn free_fn,
+      void* free_fn_context = nullptr)
+      : free_fn_(free_fn),
+        free_fn_context_(free_fn_context),
+        data_(data),
+        size_(size) {}
+
+  /**
+   * Move ctor. Takes the ownership of the data previously owned by `rhs`,
+   * leaving `rhs` pointing to nullptr.
+   */
+  FreeableBuffer(FreeableBuffer&& rhs) noexcept
+      : free_fn_(rhs.free_fn_),
+        free_fn_context_(rhs.free_fn_context_),
+        data_(rhs.data_),
+        size_(rhs.size_) {
+    rhs.free_fn_ = nullptr;
+    rhs.free_fn_context_ = nullptr;
+    rhs.data_ = nullptr;
+    rhs.size_ = 0;
+  }
+
+  ~FreeableBuffer() {
+    Free();
+  }
+
+  /**
+   * Frees the data if not already free. Safe to call multiple times.
+   */
+  void Free() {
+    if (data_ != nullptr) {
+      if (free_fn_ != nullptr) {
+        free_fn_(free_fn_context_, const_cast<void*>(data_), size_);
+      }
+      data_ = nullptr;
+      size_ = 0;
+    }
+  }
+
+  /**
+   * Size of the data in bytes. Returns 0 if the data has been freed.
+   */
+  size_t size() const {
+    return size_;
+  }
+
+  /**
+   * Pointer to the data. Returns nullptr if the data has been freed.
+   */
+  const void* data() const {
+    return data_;
+  }
+
+ private:
+  // Delete other rule-of-five methods.
+  FreeableBuffer(const FreeableBuffer& rhs) = delete;
+  FreeableBuffer& operator=(FreeableBuffer&& rhs) noexcept = delete;
+  FreeableBuffer& operator=(const FreeableBuffer& rhs) = delete;
+
+  FreeFn free_fn_;
+  void* free_fn_context_;
+  const void* data_;
+  size_t size_;
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::FreeableBuffer;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/function_ref.h b/include/executorch/runtime/core/function_ref.h
new file mode 100644
index 00000000000..07d3d582b0c
--- /dev/null
+++ b/include/executorch/runtime/core/function_ref.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+//===- llvm/ADT/STLFunctionalExtras.h - Extras for <functional> -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some extension to <functional>.
+//
+// No library is required when using these functions.
+//
+//===----------------------------------------------------------------------===//
+//     Extra additions to <functional>
+//===----------------------------------------------------------------------===//
+
+/// An efficient, type-erasing, non-owning reference to a callable. This is
+/// intended for use as the type of a function parameter that is not used
+/// after the function in question returns.
+///
+/// This class does not own the callable, so it is not in general safe to store
+/// a FunctionRef.
+
+// torch::executor: modified from llvm::function_ref
+// - renamed to FunctionRef
+// - removed LLVM_GSL_POINTER and LLVM_LIFETIME_BOUND macro uses
+// - use namespaced internal::remove_cvref_t
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+
+namespace executorch::runtime {
+
+//===----------------------------------------------------------------------===//
+//     Features from C++20
+//===----------------------------------------------------------------------===//
+
+namespace internal {
+
+template <typename T>
+struct remove_cvref {
+  using type =
+      typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+};
+
+template <typename T>
+using remove_cvref_t = typename remove_cvref<T>::type;
+
+} // namespace internal
+
+template <typename Fn>
+class FunctionRef;
+
+template <typename Ret, typename... Params>
+class FunctionRef<Ret(Params...)> {
+  Ret (*callback)(intptr_t callable, Params... params) = nullptr;
+  intptr_t callable;
+
+  template <typename Callable>
+  static Ret callback_fn(intptr_t callable, Params... params) {
+    return (*reinterpret_cast<Callable*>(callable))(
+        std::forward<Params>(params)...);
+  }
+
+ public:
+  FunctionRef() = default;
+  FunctionRef(std::nullptr_t) {}
+
+  template <typename Callable>
+  FunctionRef(
+      Callable&& callable,
+      // This is not the copy-constructor.
+      std::enable_if_t<!std::is_same<
+          internal::remove_cvref_t<Callable>,
+          FunctionRef>::value>* = nullptr,
+      // Functor must be callable and return a suitable type.
+      std::enable_if_t<
+          std::is_void<Ret>::value ||
+          std::is_convertible<
+              decltype(std::declval<Callable>()(std::declval<Params>()...)),
+              Ret>::value>* = nullptr)
+      : callback(callback_fn<std::remove_reference_t<Callable>>),
+        callable(reinterpret_cast<intptr_t>(&callable)) {}
+
+  Ret operator()(Params... params) const {
+    return callback(callable, std::forward<Params>(params)...);
+  }
+
+  explicit operator bool() const {
+    return callback;
+  }
+
+  bool operator==(const FunctionRef<Ret(Params...)>& Other) const {
+    return callable == Other.callable;
+  }
+};
+} // namespace executorch::runtime
diff --git a/include/executorch/runtime/core/hierarchical_allocator.h b/include/executorch/runtime/core/hierarchical_allocator.h
new file mode 100644
index 00000000000..b5031fa38e5
--- /dev/null
+++ b/include/executorch/runtime/core/hierarchical_allocator.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <c10/util/irange.h>
+#include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/log.h>
+#include <cstdint>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * A group of buffers that can be used to represent a device's memory hierarchy.
+ */
+class HierarchicalAllocator final {
+ public:
+  /**
+   * Constructs a new hierarchical allocator with the given array of buffers.
+   *
+   * - Memory IDs are based on the index into `buffers`: `buffers[N]` will have
+   *   a memory ID of `N`.
+   * - `buffers.size()` must be >= `MethodMeta::num_non_const_buffers()`.
+   * - `buffers[N].size()` must be >= `MethodMeta::non_const_buffer_size(N)`.
+   */
+  explicit HierarchicalAllocator(Span<Span<uint8_t>> buffers)
+      : buffers_(buffers) {}
+
+  /**
+   * DEPRECATED: Use spans instead.
+   */
+  ET_DEPRECATED HierarchicalAllocator(
+      uint32_t n_allocators,
+      MemoryAllocator* allocators)
+      : buffers_(to_spans(n_allocators, allocators)) {}
+
+  /**
+   * Returns the address at the byte offset `offset_bytes` from the given
+   * buffer's base address, which points to at least `size_bytes` of memory.
+   *
+   * @param[in] memory_id The ID of the buffer in the hierarchy.
+   * @param[in] offset_bytes The offset in bytes into the specified buffer.
+   * @param[in] size_bytes The amount of memory that should be available at
+   *     the offset.
+   *
+   * @returns On success, the address of the requested byte offset into the
+   *     specified buffer. On failure, a non-Ok Error.
+   */
+  ET_NODISCARD Result<void*> get_offset_address(
+      uint32_t memory_id,
+      size_t offset_bytes,
+      size_t size_bytes) {
+    ET_CHECK_OR_RETURN_ERROR(
+        memory_id < buffers_.size(),
+        InvalidArgument,
+        "id %" PRIu32 " >= %" ET_PRIsize_t,
+        memory_id,
+        buffers_.size());
+    Span<uint8_t> buffer = buffers_[memory_id];
+    ET_CHECK_OR_RETURN_ERROR(
+        offset_bytes + size_bytes <= buffer.size(),
+        MemoryAllocationFailed,
+        "offset_bytes (%" ET_PRIsize_t ") + size_bytes (%" ET_PRIsize_t
+        ") >= allocator size (%" ET_PRIsize_t
+        ") "
+        "for memory_id %" PRIu32,
+        offset_bytes,
+        size_bytes,
+        buffer.size(),
+        memory_id);
+    return buffer.data() + offset_bytes;
+  }
+
+ private:
+  // TODO(T162089316): Remove the span array and to_spans once all users move to
+  // spans. This array is necessary to hold the pointers and sizes that were
+  // originally provided as MemoryAllocator instances.
+  static constexpr size_t kSpanArraySize = 16;
+  // NOTE: span_array_ must be declared before buffers_ so that it isn't
+  // re-initialized to zeros after initializing buffers_.
+  Span<uint8_t> span_array_[kSpanArraySize];
+  Span<Span<uint8_t>> to_spans(
+      uint32_t n_allocators,
+      MemoryAllocator* allocators) {
+    ET_CHECK_MSG(
+        n_allocators <= kSpanArraySize,
+        "n_allocators %" PRIu32 " > %zu",
+        n_allocators,
+        kSpanArraySize);
+    for (const auto i : c10::irange(n_allocators)) {
+      span_array_[i] =
+          Span<uint8_t>(allocators[i].base_address(), allocators[i].size());
+    }
+    return {span_array_, n_allocators};
+  }
+
+  /// The underlying buffers.
+  Span<Span<uint8_t>> buffers_;
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::HierarchicalAllocator;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/memory_allocator.h b/include/executorch/runtime/core/memory_allocator.h
new file mode 100644
index 00000000000..6f4496513a7
--- /dev/null
+++ b/include/executorch/runtime/core/memory_allocator.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <cinttypes>
+#include <cstdint>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/log.h>
+#include <executorch/runtime/platform/profiler.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * A class that does simple allocation based on a size and returns the pointer
+ * to the memory address. It bookmarks a buffer with certain size. The
+ * allocation is simply checking space and growing the cur_ pointer with each
+ * allocation request.
+ *
+ * Simple example:
+ *
+ *   // User allocates a 100 byte long memory in the heap.
+ *   uint8_t* memory_pool = malloc(100 * sizeof(uint8_t));
+ *   MemoryAllocator allocator(100, memory_pool)
+ *   // Pass allocator object in the Executor
+ *
+ *   Underneath the hood, ExecuTorch will call
+ *   allocator.allocate() to keep iterating cur_ pointer
+ */
+class MemoryAllocator {
+ public:
+  /**
+   * Default alignment of memory returned by this class. Ensures that pointer
+   * fields of structs will be aligned. Larger types like `long double` may not
+   * be, however, depending on the toolchain and architecture.
+   */
+  static constexpr size_t kDefaultAlignment = alignof(void*);
+
+  /**
+   * Constructs a new memory allocator of a given `size`, starting at the
+   * provided `base_address`.
+   *
+   * @param[in] size The size in bytes of the buffer at `base_address`.
+   * @param[in] base_address The buffer to allocate from. Does not take
+   *     ownership of this buffer, so it must be valid for the lifetime of of
+   *     the MemoryAllocator.
+   */
+  MemoryAllocator(uint32_t size, uint8_t* base_address)
+      : begin_(base_address),
+        end_(base_address + size),
+        cur_(base_address),
+        size_(size) {}
+
+  /**
+   * Allocates `size` bytes of memory.
+   *
+   * @param[in] size Number of bytes to allocate.
+   * @param[in] alignment Minimum alignment for the returned pointer. Must be a
+   *     power of 2.
+   *
+   * @returns Aligned pointer to the allocated memory on success.
+   * @retval nullptr Not enough memory, or `alignment` was not a power of 2.
+   */
+  virtual void* allocate(size_t size, size_t alignment = kDefaultAlignment) {
+    if (!isPowerOf2(alignment)) {
+      ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
+      return nullptr;
+    }
+
+    // The allocation will occupy [start, end), where the start is the next
+    // position that's a multiple of alignment.
+    uint8_t* start = alignPointer(cur_, alignment);
+    uint8_t* end = start + size;
+
+    // If the end of this allocation exceeds the end of this allocator, print
+    // error messages and return nullptr
+    if (end > end_) {
+      ET_LOG(
+          Error,
+          "Memory allocation failed: %zuB requested (adjusted for alignment), %zuB available",
+          static_cast<size_t>(end - cur_),
+          static_cast<size_t>(end_ - cur_));
+      return nullptr;
+    }
+
+    // Otherwise, record how many bytes were used, advance cur_ to the new end,
+    // and then return start. Note that the number of bytes used is (end - cur_)
+    // instead of (end - start) because start > cur_ if there is a misalignment
+    EXECUTORCH_TRACK_ALLOCATION(prof_id_, end - cur_);
+    cur_ = end;
+    return static_cast<void*>(start);
+  }
+
+  /**
+   * Allocates a buffer large enough for an instance of type T. Note that the
+   * memory will not be initialized.
+   *
+   * Example:
+   * @code
+   *   auto p = memory_allocator->allocateInstance<MyType>();
+   * @endcode
+   *
+   * @param[in] alignment Minimum alignment for the returned pointer. Must be a
+   *     power of 2. Defaults to the natural alignment of T.
+   *
+   * @returns Aligned pointer to the allocated memory on success.
+   * @retval nullptr Not enough memory, or `alignment` was not a power of 2.
+   */
+  template <typename T>
+  T* allocateInstance(size_t alignment = alignof(T)) {
+    return static_cast<T*>(this->allocate(sizeof(T), alignment));
+  }
+
+  /**
+   * Allocates `size` number of chunks of type T, where each chunk is of size
+   * equal to sizeof(T) bytes.
+   *
+   * @param[in] size Number of memory chunks to allocate.
+   * @param[in] alignment Minimum alignment for the returned pointer. Must be a
+   *     power of 2. Defaults to the natural alignment of T.
+   *
+   * @returns Aligned pointer to the allocated memory on success.
+   * @retval nullptr Not enough memory, or `alignment` was not a power of 2.
+   */
+  template <typename T>
+  T* allocateList(size_t size, size_t alignment = alignof(T)) {
+    // Some users of this method allocate lists of pointers, causing the next
+    // line to expand to `sizeof(type *)`, which triggers a clang-tidy warning.
+    // NOLINTNEXTLINE(bugprone-sizeof-expression)
+    return static_cast<T*>(this->allocate(size * sizeof(T), alignment));
+  }
+
+  // Returns the allocator memory's base address.
+  virtual uint8_t* base_address() const {
+    return begin_;
+  }
+
+  // Returns the total size of the allocator's memory buffer.
+  virtual uint32_t size() const {
+    return size_;
+  }
+
+  // Resets the current pointer to the base address. It does nothing to
+  // the contents.
+  virtual void reset() {
+    cur_ = begin_;
+  }
+
+  void enable_profiling(ET_UNUSED const char* name) {
+    prof_id_ = EXECUTORCH_TRACK_ALLOCATOR(name);
+  }
+
+  virtual ~MemoryAllocator() {}
+
+ protected:
+  /**
+   * Returns the profiler ID for this allocator.
+   */
+  int32_t prof_id() const {
+    return prof_id_;
+  }
+
+  /**
+   * Returns true if the value is an integer power of 2.
+   */
+  static bool isPowerOf2(size_t value) {
+    return value > 0 && (value & ~(value - 1)) == value;
+  }
+
+  /**
+   * Returns the next alignment for a given pointer.
+   */
+  static uint8_t* alignPointer(void* ptr, size_t alignment) {
+    intptr_t addr = reinterpret_cast<intptr_t>(ptr);
+    if ((addr & (alignment - 1)) == 0) {
+      // Already aligned.
+      return reinterpret_cast<uint8_t*>(ptr);
+    }
+    addr = (addr | (alignment - 1)) + 1;
+    return reinterpret_cast<uint8_t*>(addr);
+  }
+
+ private:
+  uint8_t* const begin_;
+  uint8_t* const end_;
+  uint8_t* cur_;
+  uint32_t const size_;
+  int32_t prof_id_ = -1;
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::MemoryAllocator;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/named_data_map.h b/include/executorch/runtime/core/named_data_map.h
new file mode 100644
index 00000000000..14179d22795
--- /dev/null
+++ b/include/executorch/runtime/core/named_data_map.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#ifdef __GNUC__
+// Disable -Wdeprecated-declarations, as some builds use 'Werror'.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/freeable_buffer.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/core/tensor_layout.h>
+#include <executorch/runtime/platform/compiler.h>
+
+namespace executorch {
+namespace ET_RUNTIME_NAMESPACE {
+/**
+ * Interface to access and retrieve data via name.
+ * See executorch/extension/flat_tensor/ for an example.
+ */
+class ET_EXPERIMENTAL NamedDataMap {
+ public:
+  virtual ~NamedDataMap() = default;
+  /**
+   * Get metadata by key.
+   *
+   * @param key The name of the tensor.
+   * @return Result containing TensorLayout with tensor metadata.
+   */
+  ET_NODISCARD virtual Result<const TensorLayout> get_metadata(
+      const char* key) const = 0;
+  /**
+   * Get data by key.
+   *
+   * @param key Name of the data.
+   * @return Result containing a FreeableBuffer with the tensor data.
+   */
+  ET_NODISCARD virtual Result<FreeableBuffer> get_data(
+      const char* key) const = 0;
+
+  /**
+   * Loads data corresponding to the key into the provided buffer.
+   *
+   * @param key The name of the data.
+   * @param size The number of bytes to load. Use `get_metadata` to retrieve the
+   * size of the data for a given key.
+   * @param buffer The buffer to load the data into. Must point to at least
+   * `size` bytes of memory.
+   * @returns an Error indicating if the load was successful.
+   */
+  ET_NODISCARD virtual Error
+  load_data_into(const char* key, void* buffer, size_t size) const = 0;
+
+  /**
+   * Get the number of keys in the NamedDataMap.
+   *
+   * @return Result containing the number of keys.
+   */
+  ET_NODISCARD virtual Result<size_t> get_num_keys() const = 0;
+
+  /**
+   * Get the key at the given index.
+   *
+   * @param index The index of the key to retrieve.
+   * @return Result containing the key at the given index. Note: the returned
+   * pointer is only valid for the lifetime of the DataMap.
+   */
+  ET_NODISCARD virtual Result<const char*> get_key(size_t index) const = 0;
+};
+
+} // namespace ET_RUNTIME_NAMESPACE
+} // namespace executorch
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
diff --git a/include/executorch/runtime/core/portable_type/bfloat16.h b/include/executorch/runtime/core/portable_type/bfloat16.h
new file mode 100644
index 00000000000..233d571478e
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/bfloat16.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <c10/util/BFloat16.h>
+
+namespace executorch::runtime::etensor {
+using c10::BFloat16;
+namespace internal {
+using c10::detail::f32_from_bits;
+using c10::detail::round_to_nearest_even;
+} // namespace internal
+} // namespace executorch::runtime::etensor
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::BFloat16;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/bfloat16_math.h b/include/executorch/runtime/core/portable_type/bfloat16_math.h
new file mode 100644
index 00000000000..3f6bf14a464
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/bfloat16_math.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/portable_type/bfloat16.h>
+#include <executorch/runtime/core/portable_type/half.h>
+
+#include <c10/util/BFloat16-math.h>
diff --git a/include/executorch/runtime/core/portable_type/bits_types.h b/include/executorch/runtime/core/portable_type/bits_types.h
new file mode 100644
index 00000000000..cddffc485ec
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/bits_types.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include <cstdint>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/**
+ * bits1x8 is an uninterpreted dtype of a tensor with 1 bit (packed to byte
+ * boundary), without any semantics defined.
+ */
+struct alignas(1) bits1x8 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  bits1x8() = default;
+  explicit bits1x8(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits2x4 is an uninterpreted dtype of a tensor with 2 bits (packed to byte
+ * boundary), without any semantics defined.
+ */
+struct alignas(1) bits2x4 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  bits2x4() = default;
+  explicit bits2x4(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits4x2 is an uninterpreted dtype of a tensor with 4 bits (packed to byte
+ * boundary), without any semantics defined.
+ */
+struct alignas(1) bits4x2 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  bits4x2() = default;
+  explicit bits4x2(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits8 is an uninterpreted dtype of a tensor with 8 bits, without any
+ * semantics defined.
+ */
+struct alignas(1) bits8 {
+  uint8_t val_;
+  bits8() = default;
+  explicit bits8(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits16 is an uninterpreted dtype of a tensor with 16 bits, without any
+ * semantics defined.
+ */
+struct alignas(2) bits16 {
+  uint16_t val_;
+  bits16() = default;
+  explicit bits16(uint16_t val) : val_(val) {}
+};
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::bits16;
+using ::executorch::runtime::etensor::bits1x8;
+using ::executorch::runtime::etensor::bits2x4;
+using ::executorch::runtime::etensor::bits4x2;
+using ::executorch::runtime::etensor::bits8;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/macros/Export.h b/include/executorch/runtime/core/portable_type/c10/c10/macros/Export.h
new file mode 100644
index 00000000000..21808de77a3
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/macros/Export.h
@@ -0,0 +1,162 @@
+#ifndef C10_MACROS_EXPORT_H_
+#define C10_MACROS_EXPORT_H_
+
+/* Header file to define the common scaffolding for exported symbols.
+ *
+ * Export is by itself a quite tricky situation to deal with, and if you are
+ * hitting this file, make sure you start with the background here:
+ * - Linux: https://gcc.gnu.org/wiki/Visibility
+ * - Windows:
+ * https://docs.microsoft.com/en-us/cpp/cpp/dllexport-dllimport?view=vs-2017
+ *
+ * Do NOT include this file directly. Instead, use c10/macros/Macros.h
+ */
+
+// You do not need to edit this part of file unless you are changing the core
+// pytorch export abstractions.
+//
+// This part defines the C10 core export and import macros. This is controlled
+// by whether we are building shared libraries or not, which is determined
+// during build time and codified in c10/core/cmake_macros.h.
+// When the library is built as a shared lib, EXPORT and IMPORT will contain
+// visibility attributes. If it is being built as a static lib, then EXPORT
+// and IMPORT basically have no effect.
+
+// As a rule of thumb, you should almost NEVER mix static and shared builds for
+// libraries that depend on c10. AKA, if c10 is built as a static library, we
+// recommend everything dependent on c10 to be built statically. If c10 is built
+// as a shared library, everything dependent on it should be built as shared. In
+// the PyTorch project, all native libraries shall use the macro
+// C10_BUILD_SHARED_LIB to check whether pytorch is building shared or static
+// libraries.
+
+// For build systems that do not directly depend on CMake and directly build
+// from the source directory (such as Buck), one may not have a cmake_macros.h
+// file at all. In this case, the build system is responsible for providing
+// correct macro definitions corresponding to the cmake_macros.h.in file.
+//
+// In such scenarios, one should define the macro
+//     C10_USING_CUSTOM_GENERATED_MACROS
+// to inform this header that it does not need to include the cmake_macros.h
+// file.
+
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/macros/cmake_macros.h>
+#endif // C10_USING_CUSTOM_GENERATED_MACROS
+
+#ifdef _WIN32
+#define C10_HIDDEN
+#if defined(C10_BUILD_SHARED_LIBS)
+#define C10_EXPORT __declspec(dllexport)
+#define C10_IMPORT __declspec(dllimport)
+#else
+#define C10_EXPORT
+#define C10_IMPORT
+#endif
+#else // _WIN32
+#if defined(__GNUC__)
+#define C10_EXPORT __attribute__((__visibility__("default")))
+#define C10_HIDDEN __attribute__((__visibility__("hidden")))
+#else // defined(__GNUC__)
+#define C10_EXPORT
+#define C10_HIDDEN
+#endif // defined(__GNUC__)
+#define C10_IMPORT C10_EXPORT
+#endif // _WIN32
+
+#ifdef NO_EXPORT
+#undef C10_EXPORT
+#define C10_EXPORT
+#endif
+
+// Definition of an adaptive XX_API macro, that depends on whether you are
+// building the library itself or not, routes to XX_EXPORT and XX_IMPORT.
+// Basically, you will need to do this for each shared library that you are
+// building, and the instruction is as follows: assuming that you are building
+// a library called libawesome.so. You should:
+// (1) for your cmake target (usually done by "add_library(awesome, ...)"),
+//     define a macro called AWESOME_BUILD_MAIN_LIB using
+//     target_compile_options.
+// (2) define the AWESOME_API macro similar to the one below.
+// And in the source file of your awesome library, use AWESOME_API to
+// annotate public symbols.
+
+// Here, for the C10 library, we will define the macro C10_API for both import
+// and export.
+
+// This one is being used by libc10.so
+#ifdef C10_BUILD_MAIN_LIB
+#define C10_API C10_EXPORT
+#else
+#define C10_API C10_IMPORT
+#endif
+
+// This one is being used by libtorch.so
+#ifdef CAFFE2_BUILD_MAIN_LIB
+#define TORCH_API C10_EXPORT
+#else
+#define TORCH_API C10_IMPORT
+#endif
+
+// You may be wondering: Whose brilliant idea was it to split torch_cuda into
+// two pieces with confusing names?
+// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
+// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker
+// issues when linking big binaries.
+// (https://github.com/pytorch/pytorch/issues/39968) We had two choices:
+//    (1) Stop supporting so many GPU architectures
+//    (2) Do something else
+// We chose #2 and decided to split the behemoth that was torch_cuda into two
+// smaller libraries, one with most of the core kernel functions (torch_cuda_cu)
+// and the other that had..well..everything else (torch_cuda_cpp). The idea was
+// this: instead of linking our static libraries (like the hefty
+// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky
+// relocation marker issues, we could link our static libraries to a smaller
+// part of torch_cuda (torch_cuda_cpp) and avoid the issues.
+
+// libtorch_cuda_cu.so
+#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB
+#define TORCH_CUDA_CU_API C10_EXPORT
+#elif defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif
+
+// libtorch_cuda_cpp.so
+#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#elif defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CPP_API C10_IMPORT
+#endif
+
+// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the
+// same api)
+#ifdef TORCH_CUDA_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#define TORCH_CUDA_CU_API C10_EXPORT
+#elif !defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CPP_API C10_IMPORT
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif
+
+#if defined(TORCH_HIP_BUILD_MAIN_LIB)
+#define TORCH_HIP_CPP_API C10_EXPORT
+#define TORCH_HIP_API C10_EXPORT
+#else
+#define TORCH_HIP_CPP_API C10_IMPORT
+#define TORCH_HIP_API C10_IMPORT
+#endif
+
+#if defined(TORCH_XPU_BUILD_MAIN_LIB)
+#define TORCH_XPU_API C10_EXPORT
+#else
+#define TORCH_XPU_API C10_IMPORT
+#endif
+
+// Enums only need to be exported on windows for non-CUDA files
+#if defined(_WIN32) && defined(__CUDACC__)
+#define C10_API_ENUM C10_API
+#else
+#define C10_API_ENUM
+#endif
+
+#endif // C10_MACROS_MACROS_H_
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/macros/Macros.h b/include/executorch/runtime/core/portable_type/c10/c10/macros/Macros.h
new file mode 100644
index 00000000000..919eb6c8567
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/macros/Macros.h
@@ -0,0 +1,511 @@
+#ifndef C10_MACROS_MACROS_H_
+#define C10_MACROS_MACROS_H_
+#include <cassert>
+
+/* Main entry for c10/macros.
+ *
+ * In your code, include c10/macros/Macros.h directly, instead of individual
+ * files in this folder.
+ */
+
+// For build systems that do not directly depend on CMake and directly build
+// from the source directory (such as Buck), one may not have a cmake_macros.h
+// file at all. In this case, the build system is responsible for providing
+// correct macro definitions corresponding to the cmake_macros.h.in file.
+//
+// In such scenarios, one should define the macro
+//     C10_USING_CUSTOM_GENERATED_MACROS
+// to inform this header that it does not need to include the cmake_macros.h
+// file.
+
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/macros/cmake_macros.h>
+#endif // C10_USING_CUSTOM_GENERATED_MACROS
+
+#include <c10/macros/Export.h>
+
+#if defined(__clang__)
+#define __ubsan_ignore_float_divide_by_zero__ \
+  __attribute__((no_sanitize("float-divide-by-zero")))
+#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined")))
+#define __ubsan_ignore_signed_int_overflow__ \
+  __attribute__((no_sanitize("signed-integer-overflow")))
+#define __ubsan_ignore_pointer_overflow__ \
+  __attribute__((no_sanitize("pointer-overflow")))
+#define __ubsan_ignore_function__ __attribute__((no_sanitize("function")))
+#define __ubsan_ignore_float_cast_overflow__ \
+  __attribute__((no_sanitize("float-cast-overflow")))
+#else
+#define __ubsan_ignore_float_divide_by_zero__
+#define __ubsan_ignore_undefined__
+#define __ubsan_ignore_signed_int_overflow__
+#define __ubsan_ignore_pointer_overflow__
+#define __ubsan_ignore_function__
+#define __ubsan_ignore_float_cast_overflow__
+#endif
+
+// Detect address sanitizer as some stuff doesn't work with it
+#undef C10_ASAN_ENABLED
+
+// for clang
+#if defined(__has_feature)
+#if ((__has_feature(address_sanitizer)))
+#define C10_ASAN_ENABLED 1
+#endif
+#endif
+
+// for gcc
+#if defined(__SANITIZE_ADDRESS__)
+#if __SANITIZE_ADDRESS__
+#if !defined(C10_ASAN_ENABLED)
+#define C10_ASAN_ENABLED 1
+#endif
+#endif
+#endif
+
+#if !defined(C10_ASAN_ENABLED)
+#define C10_ASAN_ENABLED 0
+#endif
+
+// Detect undefined-behavior sanitizer (UBSAN)
+#undef C10_UBSAN_ENABLED
+
+// for clang or gcc >= 14
+// NB: gcc 14 adds support for Clang's __has_feature
+//   https://gcc.gnu.org/gcc-14/changes.html
+//   gcc < 14 doesn't have a macro for UBSAN
+//   (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc)
+//   https://github.com/google/sanitizers/issues/765
+#if defined(__has_feature)
+#if ((__has_feature(undefined_behavior_sanitizer)))
+#define C10_UBSAN_ENABLED 1
+#endif
+#endif
+
+#if !defined(C10_UBSAN_ENABLED)
+#define C10_UBSAN_ENABLED 0
+#endif
+
+// Disable the copy and assignment operator for a class. Note that this will
+// disable the usage of the class in std containers.
+#define C10_DISABLE_COPY_AND_ASSIGN(classname) \
+  classname(const classname&) = delete;        \
+  classname& operator=(const classname&) = delete
+
+#define C10_CONCATENATE_IMPL(s1, s2) s1##s2
+#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2)
+
+#define C10_MACRO_EXPAND(args) args
+
+#define C10_STRINGIZE_IMPL(x) #x
+#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x)
+
+/**
+ * C10_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts with
+ * str and ends with a unique number.
+ */
+#ifdef __COUNTER__
+#define C10_UID __COUNTER__
+#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__)
+#else
+#define C10_UID __LINE__
+#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__)
+#endif
+
+#ifdef __has_cpp_attribute
+#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+#define C10_HAS_CPP_ATTRIBUTE(x) (0)
+#endif
+
+#ifndef FBCODE_CAFFE2
+/// DEPRECATED: Warn if a type or return value is discarded.
+#define C10_NODISCARD [[nodiscard]]
+
+/// DEPRECATED: Suppress an unused variable.
+#define C10_UNUSED [[maybe_unused]]
+#endif
+
+#if !defined(__has_attribute)
+#define __has_attribute(x) 0
+#endif
+
+// Direct port of LLVM_ATTRIBUTE_USED.
+#if __has_attribute(used)
+#define C10_USED __attribute__((__used__))
+#else
+#define C10_USED
+#endif
+
+#define C10_RESTRICT __restrict
+
+// Simply define the namespace, in case a dependent library want to refer to
+// the c10 namespace but not any nontrivial files.
+namespace c10 {}
+namespace c10::cuda {}
+namespace c10::hip {}
+namespace c10::xpu {}
+
+// Since C10 is the core library for caffe2 (and aten), we will simply reroute
+// all abstractions defined in c10 to be available in caffe2 as well.
+// This is only for backwards compatibility. Please use the symbols from the
+// c10 namespace where possible.
+namespace caffe2 {
+using namespace c10;
+}
+namespace at {
+using namespace c10;
+}
+namespace at::cuda {
+using namespace c10::cuda;
+} // namespace at::cuda
+
+// WARNING!!! THIS IS A GIANT HACK!!!
+// This line means you cannot simultaneously include c10/hip
+// and c10/cuda and then use them from the at::cuda namespace.
+// This is true in practice, because HIPIFY works inplace on
+// files in ATen/cuda, so it assumes that c10::hip is available
+// from at::cuda.  This namespace makes that happen.  When
+// HIPIFY is no longer out-of-place, we can switch the cuda
+// here to hip and everyone is happy.
+namespace at::cuda {
+using namespace c10::hip;
+} // namespace at::cuda
+
+namespace at::xpu {
+using namespace c10::xpu;
+} // namespace at::xpu
+
+// C10_LIKELY/C10_UNLIKELY
+//
+// These macros provide parentheses, so you can use these macros as:
+//
+//    if C10_LIKELY(some_expr) {
+//      ...
+//    }
+//
+// NB: static_cast to boolean is mandatory in C++, because __builtin_expect
+// takes a long argument, which means you may trigger the wrong conversion
+// without it.
+//
+#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
+#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
+#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
+#else
+#define C10_LIKELY(expr) (expr)
+#define C10_UNLIKELY(expr) (expr)
+#endif
+
+/// C10_NOINLINE - Functions whose declaration is annotated with this will not
+/// be inlined.
+#ifdef __GNUC__
+#define C10_NOINLINE __attribute__((noinline))
+#elif _MSC_VER
+#define C10_NOINLINE __declspec(noinline)
+#else
+#define C10_NOINLINE
+#endif
+
+#if defined(_MSC_VER)
+#define C10_ALWAYS_INLINE __forceinline
+#elif __has_attribute(always_inline) || defined(__GNUC__)
+#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline
+#else
+#define C10_ALWAYS_INLINE inline
+#endif
+
+// Unlike C10_ALWAYS_INLINE, C10_ALWAYS_INLINE_ATTRIBUTE can be used
+// on a lambda.
+#if defined(_MSC_VER)
+// MSVC 14.39 is reasonably recent and doesn't like
+// [[msvc::forceinline]] on a lambda, so don't try to use it.
+#define C10_ALWAYS_INLINE_ATTRIBUTE
+#elif __has_attribute(always_inline) || defined(__GNUC__)
+#define C10_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__))
+#else
+#define C10_ALWAYS_INLINE_ATTRIBUTE
+#endif
+
+#if defined(_MSC_VER)
+#define C10_ATTR_VISIBILITY_HIDDEN
+#elif defined(__GNUC__)
+#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden")))
+#else
+#define C10_ATTR_VISIBILITY_HIDDEN
+#endif
+
+#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN
+
+#include <cstdint>
+
+#ifdef __HIPCC__
+// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work.
+// We do this #include here so that C10_HOST_DEVICE and friends will Just Work.
+// See https://github.com/ROCm-Developer-Tools/HIP/issues/441
+#include <hip/hip_runtime.h>
+#endif
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define C10_HOST_DEVICE __host__ __device__
+#define C10_DEVICE __device__
+#define C10_HOST __host__
+// constants from
+// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
+// The maximum number of threads per multiprocessor is 1024 for Turing
+// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and
+// 2048 for all other architectures. You'll get warnings if you exceed these
+// constants. Hence, the following macros adjust the input values from the user
+// to resolve potential warnings.
+#if __CUDA_ARCH__ == 750
+constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
+#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890
+constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536;
+#else
+constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
+#endif
+// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently
+constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
+// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block
+// size. 256 is a good number for this fallback and should give good occupancy
+// and versatility across all architectures.
+constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
+// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it
+//       turns out that although __launch_bounds__ can take constexpr, it
+//       can't take a constexpr that has anything to do with templates.
+//       Currently we use launch_bounds that depend on template arguments in
+//       Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK
+//       and C10_MIN_BLOCKS_PER_SM are kept as macros.
+// Suppose you were planning to write __launch_bounds__(a, b), based on your
+// performance tuning on a modern GPU. Instead, you should write
+// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
+// which will also properly respect limits on old architectures.
+#define C10_MAX_THREADS_PER_BLOCK(val)           \
+  (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \
+                                         : CUDA_THREADS_PER_BLOCK_FALLBACK)
+#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm)        \
+  ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \
+        ? (blocks_per_sm)                                              \
+        : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) /         \
+           (threads_per_block))))
+// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__
+#define C10_LAUNCH_BOUNDS_0 \
+  __launch_bounds__(        \
+      256, 4) // default launch bounds that should give good occupancy and
+              // versatility across all architectures.
+#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \
+  __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))))
+#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \
+  __launch_bounds__(                                                  \
+      (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))),           \
+      (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm))))
+#else
+#define C10_HOST_DEVICE
+#define C10_HOST
+#define C10_DEVICE
+#endif
+
+#if defined(USE_ROCM)
+#define C10_HIP_HOST_DEVICE __host__ __device__
+#else
+#define C10_HIP_HOST_DEVICE
+#endif
+
+#if defined(USE_ROCM)
+#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h)
+#else
+#define C10_WARP_SIZE 32
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+#define __func__ __FUNCTION__
+#endif
+
+// CUDA_KERNEL_ASSERT checks the assertion
+// even when NDEBUG is defined. This is useful for important assertions in CUDA
+// code that would otherwise be suppressed when building Release.
+#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__)
+// Those platforms do not support assert()
+#define CUDA_KERNEL_ASSERT(cond)
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)
+#define SYCL_KERNEL_ASSERT(cond)
+#elif defined(_MSC_VER)
+#if defined(NDEBUG)
+extern "C" {
+C10_IMPORT
+#if defined(__SYCL_DEVICE_ONLY__)
+extern SYCL_EXTERNAL void _wassert(
+    const wchar_t* wexpr,
+    const wchar_t* wfile,
+    unsigned line);
+#else
+#if defined(__CUDA_ARCH__)
+__host__ __device__
+#endif // __CUDA_ARCH__
+    void
+    _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line);
+#endif // __SYCL_DEVICE_ONLY__
+}
+#endif // NDEBUG
+#define CUDA_KERNEL_ASSERT(cond)                 \
+  if (C10_UNLIKELY(!(cond))) {                   \
+    (void)(_wassert(                             \
+               _CRT_WIDE(#cond),                 \
+               _CRT_WIDE(__FILE__),              \
+               static_cast<unsigned>(__LINE__)), \
+           0);                                   \
+  }
+// TODO: This doesn't assert the message because I (chilli) couldn't figure out
+// a nice way to convert a char* to a wchar_t*
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)        \
+  if (C10_UNLIKELY(!(cond))) {                   \
+    (void)(_wassert(                             \
+               _CRT_WIDE(#cond),                 \
+               _CRT_WIDE(__FILE__),              \
+               static_cast<unsigned>(__LINE__)), \
+           0);                                   \
+  }
+#define SYCL_KERNEL_ASSERT(cond)                 \
+  if (C10_UNLIKELY(!(cond))) {                   \
+    (void)(_wassert(                             \
+               _CRT_WIDE(#cond),                 \
+               _CRT_WIDE(__FILE__),              \
+               static_cast<unsigned>(__LINE__)), \
+           0);                                   \
+  }
+#else // __APPLE__, _MSC_VER
+#if defined(NDEBUG)
+extern "C" {
+#if defined(__SYCL_DEVICE_ONLY__)
+extern SYCL_EXTERNAL void __assert_fail(
+    const char* expr,
+    const char* file,
+    unsigned int line,
+    const char* func);
+#else // __SYCL_DEVICE_ONLY__
+#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
+// CUDA supports __assert_fail function which are common for both device
+// and host side code.
+__host__ __device__
+#endif
+
+    // This forward declaration matching the declaration of __assert_fail
+    // exactly how it is in glibc in case parts of the program are compiled with
+    // different NDEBUG settings. Otherwise we might get 'ambiguous declaration'
+    // error. Note: On ROCm - this declaration serves for host side compilation.
+    void
+    __assert_fail(
+        const char* assertion,
+        const char* file,
+        unsigned int line,
+        const char* function) noexcept __attribute__((__noreturn__));
+
+#endif // __SYCL_DEVICE_ONLY__
+}
+#endif // NDEBUG
+// ROCm disable kernel assert by default
+#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM)
+#define CUDA_KERNEL_ASSERT(cond)
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)
+#define SYCL_KERNEL_ASSERT(cond)
+#else
+#define CUDA_KERNEL_ASSERT(cond)                                         \
+  if (C10_UNLIKELY(!(cond))) {                                           \
+    __assert_fail(                                                       \
+        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)                              \
+  if (C10_UNLIKELY(!(cond))) {                                         \
+    __assert_fail(                                                     \
+        msg, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
+#define SYCL_KERNEL_ASSERT(cond)                                         \
+  if (C10_UNLIKELY(!(cond))) {                                           \
+    __assert_fail(                                                       \
+        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
+#endif //  C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM
+#endif // __APPLE__
+
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
+
+#if defined(__ANDROID__)
+#define C10_ANDROID 1
+#define C10_MOBILE 1
+#elif (                   \
+    defined(__APPLE__) && \
+    (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
+#define C10_IOS 1
+#define C10_MOBILE 1
+#endif // ANDROID / IOS
+
+#if defined(C10_MOBILE) && C10_MOBILE
+#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline
+#else
+#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE
+#endif
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr
+#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr
+
+#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \
+  static constexpr const char field[] = val;
+#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val)
+#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+
+#ifndef HAS_DEMANGLE
+#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__)
+#define HAS_DEMANGLE 0
+#elif defined(__APPLE__) && \
+    (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)
+#define HAS_DEMANGLE 0
+#else
+#define HAS_DEMANGLE 1
+#endif
+#endif // HAS_DEMANGLE
+
+#define _C10_PRAGMA__(string) _Pragma(#string)
+#define _C10_PRAGMA_(string) _C10_PRAGMA__(string)
+
+#ifdef __clang__
+#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push")
+#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop")
+#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \
+  _C10_PRAGMA_(clang diagnostic ignored flag)
+#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag)
+#else
+#define C10_CLANG_DIAGNOSTIC_PUSH()
+#define C10_CLANG_DIAGNOSTIC_POP()
+#define C10_CLANG_DIAGNOSTIC_IGNORE(flag)
+#define C10_CLANG_HAS_WARNING(flag) 0
+#endif
+
+#ifdef __clang__
+
+#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning)         \
+  _C10_PRAGMA_(clang diagnostic push)                               \
+  _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \
+  _C10_PRAGMA_(clang diagnostic ignored warning)
+
+#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop)
+
+#elif __GNUC__
+
+#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \
+  _C10_PRAGMA_(GCC diagnostic push)                         \
+  _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas")          \
+  _C10_PRAGMA_(GCC diagnostic ignored warning)
+
+#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC diagnostic pop)
+
+#else
+
+#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning)
+#define C10_DIAGNOSTIC_POP()
+
+#endif
+
+#endif // C10_MACROS_MACROS_H_
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h b/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h
new file mode 100644
index 00000000000..10ab0c828d7
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h
@@ -0,0 +1,343 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
+
+#include <limits>
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
+#endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#else
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+#include <ext/oneapi/bfloat16.hpp>
+#endif
+
+namespace c10 {
+
+/// Constructors
+inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
+    :
+#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
+    __CUDA_ARCH__ >= 800
+      x(__bfloat16_as_ushort(__float2bfloat16(value)))
+#elif defined(__SYCL_DEVICE_ONLY__) && \
+    defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+      x(c10::bit_cast<uint16_t>(sycl::ext::oneapi::bfloat16(value)))
+#else
+      // RNE by default
+      x(detail::round_to_nearest_even(value))
+#endif
+{
+}
+
+/// Implicit conversions
+inline C10_HOST_DEVICE BFloat16::operator float() const {
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+  return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
+#elif defined(__SYCL_DEVICE_ONLY__) && \
+    defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+  return float(*reinterpret_cast<const sycl::ext::oneapi::bfloat16*>(&x));
+#else
+  return detail::f32_from_bits(x);
+#endif
+}
+
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) {
+  x = *reinterpret_cast<const unsigned short*>(&value);
+}
+inline C10_HOST_DEVICE BFloat16::operator __nv_bfloat16() const {
+  return *reinterpret_cast<const __nv_bfloat16*>(&x);
+}
+#endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+inline C10_HOST_DEVICE BFloat16::BFloat16(
+    const sycl::ext::oneapi::bfloat16& value) {
+  x = *reinterpret_cast<const unsigned short*>(&value);
+}
+inline C10_HOST_DEVICE BFloat16::operator sycl::ext::oneapi::bfloat16() const {
+  return *reinterpret_cast<const sycl::ext::oneapi::bfloat16*>(&x);
+}
+#endif
+
+// CUDA intrinsics
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+inline C10_DEVICE BFloat16 __ldg(const BFloat16* ptr) {
+#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  return __ldg(reinterpret_cast<const __nv_bfloat16*>(ptr));
+#else
+  return *ptr;
+#endif
+}
+#endif
+
+/// Arithmetic
+
+inline C10_HOST_DEVICE BFloat16
+operator+(const BFloat16& a, const BFloat16& b) {
+  return static_cast<float>(a) + static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16
+operator-(const BFloat16& a, const BFloat16& b) {
+  return static_cast<float>(a) - static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16
+operator*(const BFloat16& a, const BFloat16& b) {
+  return static_cast<float>(a) * static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator/(const BFloat16& a, const BFloat16& b)
+    __ubsan_ignore_float_divide_by_zero__ {
+  return static_cast<float>(a) / static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator-(const BFloat16& a) {
+  return -static_cast<float>(a);
+}
+
+inline C10_HOST_DEVICE BFloat16& operator+=(BFloat16& a, const BFloat16& b) {
+  a = a + b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator-=(BFloat16& a, const BFloat16& b) {
+  a = a - b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator*=(BFloat16& a, const BFloat16& b) {
+  a = a * b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator/=(BFloat16& a, const BFloat16& b) {
+  a = a / b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator|(BFloat16& a, const BFloat16& b) {
+  a.x = a.x | b.x;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator^(BFloat16& a, const BFloat16& b) {
+  a.x = a.x ^ b.x;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator&(BFloat16& a, const BFloat16& b) {
+  a.x = a.x & b.x;
+  return a;
+}
+
+/// Arithmetic with floats
+
+inline C10_HOST_DEVICE float operator+(BFloat16 a, float b) {
+  return static_cast<float>(a) + b;
+}
+inline C10_HOST_DEVICE float operator-(BFloat16 a, float b) {
+  return static_cast<float>(a) - b;
+}
+inline C10_HOST_DEVICE float operator*(BFloat16 a, float b) {
+  return static_cast<float>(a) * b;
+}
+inline C10_HOST_DEVICE float operator/(BFloat16 a, float b) {
+  return static_cast<float>(a) / b;
+}
+
+inline C10_HOST_DEVICE float operator+(float a, BFloat16 b) {
+  return a + static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator-(float a, BFloat16 b) {
+  return a - static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator*(float a, BFloat16 b) {
+  return a * static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator/(float a, BFloat16 b) {
+  return a / static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE float& operator+=(float& a, const BFloat16& b) {
+  return a += static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator-=(float& a, const BFloat16& b) {
+  return a -= static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator*=(float& a, const BFloat16& b) {
+  return a *= static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator/=(float& a, const BFloat16& b) {
+  return a /= static_cast<float>(b);
+}
+
+/// Arithmetic with doubles
+
+inline C10_HOST_DEVICE double operator+(BFloat16 a, double b) {
+  return static_cast<double>(a) + b;
+}
+inline C10_HOST_DEVICE double operator-(BFloat16 a, double b) {
+  return static_cast<double>(a) - b;
+}
+inline C10_HOST_DEVICE double operator*(BFloat16 a, double b) {
+  return static_cast<double>(a) * b;
+}
+inline C10_HOST_DEVICE double operator/(BFloat16 a, double b) {
+  return static_cast<double>(a) / b;
+}
+
+inline C10_HOST_DEVICE double operator+(double a, BFloat16 b) {
+  return a + static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator-(double a, BFloat16 b) {
+  return a - static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator*(double a, BFloat16 b) {
+  return a * static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator/(double a, BFloat16 b) {
+  return a / static_cast<double>(b);
+}
+
+/// Arithmetic with ints
+
+inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int b) {
+  return a + static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int b) {
+  return a - static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int b) {
+  return a * static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int b) {
+  return a / static_cast<BFloat16>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator+(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) + b;
+}
+inline C10_HOST_DEVICE BFloat16 operator-(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) - b;
+}
+inline C10_HOST_DEVICE BFloat16 operator*(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) * b;
+}
+inline C10_HOST_DEVICE BFloat16 operator/(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) / b;
+}
+
+//// Arithmetic with int64_t
+
+inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int64_t b) {
+  return a + static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int64_t b) {
+  return a - static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int64_t b) {
+  return a * static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int64_t b) {
+  return a / static_cast<BFloat16>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator+(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) + b;
+}
+inline C10_HOST_DEVICE BFloat16 operator-(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) - b;
+}
+inline C10_HOST_DEVICE BFloat16 operator*(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) * b;
+}
+inline C10_HOST_DEVICE BFloat16 operator/(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) / b;
+}
+
+// Overloading < and > operators, because std::max and std::min use them.
+
+inline C10_HOST_DEVICE bool operator>(BFloat16& lhs, BFloat16& rhs) {
+  return float(lhs) > float(rhs);
+}
+
+inline C10_HOST_DEVICE bool operator<(BFloat16& lhs, BFloat16& rhs) {
+  return float(lhs) < float(rhs);
+}
+
+} // namespace c10
+
+namespace std {
+
+template <>
+class numeric_limits<c10::BFloat16> {
+ public:
+  static constexpr bool is_signed = true;
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_exact = false;
+  static constexpr bool has_infinity = true;
+  static constexpr bool has_quiet_NaN = true;
+  static constexpr bool has_signaling_NaN = true;
+  static constexpr auto has_denorm = numeric_limits<float>::has_denorm;
+  static constexpr auto has_denorm_loss =
+      numeric_limits<float>::has_denorm_loss;
+  static constexpr auto round_style = numeric_limits<float>::round_style;
+  static constexpr bool is_iec559 = false;
+  static constexpr bool is_bounded = true;
+  static constexpr bool is_modulo = false;
+  static constexpr int digits = 8;
+  static constexpr int digits10 = 2;
+  static constexpr int max_digits10 = 4;
+  static constexpr int radix = 2;
+  static constexpr int min_exponent = -125;
+  static constexpr int min_exponent10 = -37;
+  static constexpr int max_exponent = 128;
+  static constexpr int max_exponent10 = 38;
+  static constexpr auto traps = numeric_limits<float>::traps;
+  static constexpr auto tinyness_before =
+      numeric_limits<float>::tinyness_before;
+
+  static constexpr c10::BFloat16 min() {
+    return c10::BFloat16(0x0080, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 lowest() {
+    return c10::BFloat16(0xFF7F, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 max() {
+    return c10::BFloat16(0x7F7F, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 epsilon() {
+    return c10::BFloat16(0x3C00, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 round_error() {
+    return c10::BFloat16(0x3F00, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 infinity() {
+    return c10::BFloat16(0x7F80, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 quiet_NaN() {
+    return c10::BFloat16(0x7FC0, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 signaling_NaN() {
+    return c10::BFloat16(0x7F80, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 denorm_min() {
+    return c10::BFloat16(0x0001, c10::BFloat16::from_bits());
+  }
+};
+
+} // namespace std
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16-math.h b/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16-math.h
new file mode 100644
index 00000000000..8291cd74481
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16-math.h
@@ -0,0 +1,299 @@
+#pragma once
+
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
+#endif
+
+namespace c10 {
+template <typename T>
+struct is_reduced_floating_point
+    : std::integral_constant<
+          bool,
+          std::is_same_v<T, c10::Half> || std::is_same_v<T, c10::BFloat16>> {};
+
+template <typename T>
+constexpr bool is_reduced_floating_point_v =
+    is_reduced_floating_point<T>::value;
+} // namespace c10
+
+namespace std {
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+using c10::is_reduced_floating_point;
+using c10::is_reduced_floating_point_v;
+#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T acos(T a) {
+  return std::acos(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T asin(T a) {
+  return std::asin(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T atan(T a) {
+  return std::atan(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T atanh(T a) {
+  return std::atanh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T erf(T a) {
+  return std::erf(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T erfc(T a) {
+  return std::erfc(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T exp(T a) {
+  return std::exp(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T expm1(T a) {
+  return std::expm1(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline bool isfinite(T a) {
+  return std::isfinite(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log(T a) {
+  return std::log(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log10(T a) {
+  return std::log10(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log1p(T a) {
+  return std::log1p(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log2(T a) {
+  return std::log2(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T ceil(T a) {
+  return std::ceil(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T cos(T a) {
+  return std::cos(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T floor(T a) {
+  return std::floor(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T nearbyint(T a) {
+  return std::nearbyint(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T sin(T a) {
+  return std::sin(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T tan(T a) {
+  return std::tan(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T sinh(T a) {
+  return std::sinh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T cosh(T a) {
+  return std::cosh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T tanh(T a) {
+  return std::tanh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T trunc(T a) {
+  return std::trunc(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T lgamma(T a) {
+  return std::lgamma(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T sqrt(T a) {
+  return std::sqrt(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T rsqrt(T a) {
+  return 1.0 / std::sqrt(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T abs(T a) {
+  return std::abs(float(a));
+}
+#if defined(_MSC_VER) && defined(__CUDACC__)
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T pow(T a, double b) {
+  return std::pow(float(a), float(b));
+}
+#else
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T pow(T a, double b) {
+  return std::pow(float(a), b);
+}
+#endif
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T pow(T a, T b) {
+  return std::pow(float(a), float(b));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T fmod(T a, T b) {
+  return std::fmod(float(a), float(b));
+}
+
+/*
+  The following function is inspired from the implementation in `musl`
+  Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+  ----------------------------------------------------------------------
+  Copyright © 2005-2020 Rich Felker, et al.
+
+  Permission is hereby granted, free of charge, to any person obtaining
+  a copy of this software and associated documentation files (the
+  "Software"), to deal in the Software without restriction, including
+  without limitation the rights to use, copy, modify, merge, publish,
+  distribute, sublicense, and/or sell copies of the Software, and to
+  permit persons to whom the Software is furnished to do so, subject to
+  the following conditions:
+
+  The above copyright notice and this permission notice shall be
+  included in all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  ----------------------------------------------------------------------
+ */
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+C10_HOST_DEVICE inline T nextafter(T from, T to) {
+  // Reference:
+  // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c
+  using int_repr_t = uint16_t;
+  constexpr uint8_t bits = 16;
+  union {
+    T f;
+    int_repr_t i;
+  } ufrom = {from}, uto = {to};
+
+  // get a mask to get the sign bit i.e. MSB
+  int_repr_t sign_mask = int_repr_t{1} << (bits - 1);
+
+  // short-circuit: if either is NaN, return NaN
+  if (from != from || to != to) {
+    return from + to;
+  }
+
+  // short-circuit: if they are exactly the same.
+  if (ufrom.i == uto.i) {
+    return from;
+  }
+
+  // mask the sign-bit to zero i.e. positive
+  // equivalent to abs(x)
+  int_repr_t abs_from = ufrom.i & ~sign_mask;
+  int_repr_t abs_to = uto.i & ~sign_mask;
+  if (abs_from == 0) {
+    // if both are zero but with different sign,
+    // preserve the sign of `to`.
+    if (abs_to == 0) {
+      return to;
+    }
+    // smallest subnormal with sign of `to`.
+    ufrom.i = (uto.i & sign_mask) | int_repr_t{1};
+    return ufrom.f;
+  }
+
+  // if abs(from) > abs(to) or sign(from) != sign(to)
+  if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) {
+    ufrom.i--;
+  } else {
+    ufrom.i++;
+  }
+
+  return ufrom.f;
+}
+
+} // namespace std
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16.h b/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16.h
new file mode 100644
index 00000000000..09d3051ab71
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16.h
@@ -0,0 +1,126 @@
+#pragma once
+
+// Defines the bloat16 type (brain floating-point). This representation uses
+// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
+
+#include <c10/macros/Macros.h>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <ostream>
+
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+#include <cuda_bf16.h>
+#endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#else
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+#include <ext/oneapi/bfloat16.hpp>
+#endif
+
+namespace c10 {
+
+namespace detail {
+inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) {
+  float res = 0;
+  uint32_t tmp = src;
+  tmp <<= 16;
+
+#if defined(USE_ROCM)
+  float* tempRes;
+
+  // We should be using memcpy in order to respect the strict aliasing rule
+  // but it fails in the HIP environment.
+  tempRes = reinterpret_cast<float*>(&tmp);
+  res = *tempRes;
+#else
+  std::memcpy(&res, &tmp, sizeof(tmp));
+#endif
+
+  return res;
+}
+
+inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) {
+  uint32_t res = 0;
+
+#if defined(USE_ROCM)
+  // We should be using memcpy in order to respect the strict aliasing rule
+  // but it fails in the HIP environment.
+  uint32_t* tempRes = reinterpret_cast<uint32_t*>(&src);
+  res = *tempRes;
+#else
+  std::memcpy(&res, &src, sizeof(res));
+#endif
+
+  return res >> 16;
+}
+
+inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
+#if defined(USE_ROCM)
+  if (src != src) {
+#elif defined(_MSC_VER)
+  if (isnan(src)) {
+#else
+  if (std::isnan(src)) {
+#endif
+    return UINT16_C(0x7FC0);
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+    union {
+      uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
+      float F32; // NOLINT(facebook-hte-BadMemberName)
+    };
+
+    F32 = src;
+    uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
+    return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
+  }
+}
+} // namespace detail
+
+struct alignas(2) BFloat16 {
+  uint16_t x;
+
+  // HIP wants __host__ __device__ tag, CUDA does not
+#if defined(USE_ROCM)
+  C10_HOST_DEVICE BFloat16() = default;
+#else
+  BFloat16() = default;
+#endif
+
+  struct from_bits_t {};
+  static constexpr C10_HOST_DEVICE from_bits_t from_bits() {
+    return from_bits_t();
+  }
+
+  constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t)
+      : x(bits) {}
+  /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
+  inline C10_HOST_DEVICE operator float() const;
+
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+  inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
+  explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
+#endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+  inline C10_HOST_DEVICE BFloat16(const sycl::ext::oneapi::bfloat16& value);
+  explicit inline C10_HOST_DEVICE operator sycl::ext::oneapi::bfloat16() const;
+#endif
+};
+
+C10_API inline std::ostream& operator<<(
+    std::ostream& out,
+    const BFloat16& value) {
+  out << (float)value;
+  return out;
+}
+
+} // namespace c10
+
+#include <c10/util/BFloat16-inl.h> // IWYU pragma: keep
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/Half-inl.h b/include/executorch/runtime/core/portable_type/c10/c10/util/Half-inl.h
new file mode 100644
index 00000000000..ae4469e5636
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/Half-inl.h
@@ -0,0 +1,350 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
+
+#include <cstring>
+#include <limits>
+
+#ifdef __CUDACC__
+#include <cuda_fp16.h>
+#endif
+
+#ifdef __HIPCC__
+#include <hip/hip_fp16.h>
+#endif
+
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#elif defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+
+#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
+    !defined(__APPLE__)
+#include <ATen/cpu/vec/vec_half.h>
+#endif
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
+#endif
+
+namespace c10 {
+
+#if defined(__aarch64__) && !defined(__CUDACC__)
+/// Constructors
+inline Half::Half(float16_t value) : x(detail::fp16_to_bits(value)) {}
+inline Half::operator float16_t() const {
+  return detail::fp16_from_bits(x);
+}
+#else
+
+inline C10_HOST_DEVICE Half::Half(float value)
+    :
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+      x(__half_as_short(__float2half(value)))
+#elif defined(__SYCL_DEVICE_ONLY__)
+      x(c10::bit_cast<uint16_t>(sycl::half(value)))
+#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
+    !defined(__APPLE__)
+      x(at::vec::float2half_scalar(value))
+#else
+      x(detail::fp16_ieee_from_fp32_value(value))
+#endif
+{
+}
+
+/// Implicit conversions
+
+inline C10_HOST_DEVICE Half::operator float() const {
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+  return __half2float(*reinterpret_cast<const __half*>(&x));
+#elif defined(__SYCL_DEVICE_ONLY__)
+  return float(c10::bit_cast<sycl::half>(x));
+#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
+    !defined(__APPLE__)
+  return at::vec::half2float_scalar(x);
+#elif defined(__aarch64__) && !defined(__CUDACC__)
+  return detail::native_fp16_to_fp32_value(x);
+#else
+  return detail::fp16_ieee_to_fp32_value(x);
+#endif
+}
+
+#endif /* !defined(__aarch64__) || defined(__CUDACC__) \
+        */
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+inline C10_HOST_DEVICE Half::Half(const __half& value) {
+  x = *reinterpret_cast<const unsigned short*>(&value);
+}
+inline C10_HOST_DEVICE Half::operator __half() const {
+  return *reinterpret_cast<const __half*>(&x);
+}
+#endif
+
+#ifdef SYCL_LANGUAGE_VERSION
+inline C10_HOST_DEVICE Half::Half(const sycl::half& value) {
+  x = *reinterpret_cast<const unsigned short*>(&value);
+}
+inline C10_HOST_DEVICE Half::operator sycl::half() const {
+  return *reinterpret_cast<const sycl::half*>(&x);
+}
+#endif
+
+// CUDA intrinsics
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)) || \
+    (defined(__clang__) && defined(__CUDA__))
+inline __device__ Half __ldg(const Half* ptr) {
+  return __ldg(reinterpret_cast<const __half*>(ptr));
+}
+#endif
+
+/// Arithmetic
+
+inline C10_HOST_DEVICE Half operator+(const Half& a, const Half& b) {
+  return static_cast<float>(a) + static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE Half operator-(const Half& a, const Half& b) {
+  return static_cast<float>(a) - static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE Half operator*(const Half& a, const Half& b) {
+  return static_cast<float>(a) * static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE Half operator/(const Half& a, const Half& b)
+    __ubsan_ignore_float_divide_by_zero__ {
+  return static_cast<float>(a) / static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE Half operator-(const Half& a) {
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
+    defined(__HIP_DEVICE_COMPILE__)
+  return __hneg(a);
+#elif defined(__SYCL_DEVICE_ONLY__)
+  return -c10::bit_cast<sycl::half>(a);
+#else
+  return -static_cast<float>(a);
+#endif
+}
+
+inline C10_HOST_DEVICE Half& operator+=(Half& a, const Half& b) {
+  a = a + b;
+  return a;
+}
+
+inline C10_HOST_DEVICE Half& operator-=(Half& a, const Half& b) {
+  a = a - b;
+  return a;
+}
+
+inline C10_HOST_DEVICE Half& operator*=(Half& a, const Half& b) {
+  a = a * b;
+  return a;
+}
+
+inline C10_HOST_DEVICE Half& operator/=(Half& a, const Half& b) {
+  a = a / b;
+  return a;
+}
+
+/// Arithmetic with floats
+
+inline C10_HOST_DEVICE float operator+(Half a, float b) {
+  return static_cast<float>(a) + b;
+}
+inline C10_HOST_DEVICE float operator-(Half a, float b) {
+  return static_cast<float>(a) - b;
+}
+inline C10_HOST_DEVICE float operator*(Half a, float b) {
+  return static_cast<float>(a) * b;
+}
+inline C10_HOST_DEVICE float operator/(Half a, float b)
+    __ubsan_ignore_float_divide_by_zero__ {
+  return static_cast<float>(a) / b;
+}
+
+inline C10_HOST_DEVICE float operator+(float a, Half b) {
+  return a + static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator-(float a, Half b) {
+  return a - static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator*(float a, Half b) {
+  return a * static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator/(float a, Half b)
+    __ubsan_ignore_float_divide_by_zero__ {
+  return a / static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE float& operator+=(float& a, const Half& b) {
+  return a += static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator-=(float& a, const Half& b) {
+  return a -= static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator*=(float& a, const Half& b) {
+  return a *= static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator/=(float& a, const Half& b) {
+  return a /= static_cast<float>(b);
+}
+
+/// Arithmetic with doubles
+
+inline C10_HOST_DEVICE double operator+(Half a, double b) {
+  return static_cast<double>(a) + b;
+}
+inline C10_HOST_DEVICE double operator-(Half a, double b) {
+  return static_cast<double>(a) - b;
+}
+inline C10_HOST_DEVICE double operator*(Half a, double b) {
+  return static_cast<double>(a) * b;
+}
+inline C10_HOST_DEVICE double operator/(Half a, double b)
+    __ubsan_ignore_float_divide_by_zero__ {
+  return static_cast<double>(a) / b;
+}
+
+inline C10_HOST_DEVICE double operator+(double a, Half b) {
+  return a + static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator-(double a, Half b) {
+  return a - static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator*(double a, Half b) {
+  return a * static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator/(double a, Half b)
+    __ubsan_ignore_float_divide_by_zero__ {
+  return a / static_cast<double>(b);
+}
+
+/// Arithmetic with ints
+
+inline C10_HOST_DEVICE Half operator+(Half a, int b) {
+  return a + static_cast<Half>(b);
+}
+inline C10_HOST_DEVICE Half operator-(Half a, int b) {
+  return a - static_cast<Half>(b);
+}
+inline C10_HOST_DEVICE Half operator*(Half a, int b) {
+  return a * static_cast<Half>(b);
+}
+inline C10_HOST_DEVICE Half operator/(Half a, int b) {
+  return a / static_cast<Half>(b);
+}
+
+inline C10_HOST_DEVICE Half operator+(int a, Half b) {
+  return static_cast<Half>(a) + b;
+}
+inline C10_HOST_DEVICE Half operator-(int a, Half b) {
+  return static_cast<Half>(a) - b;
+}
+inline C10_HOST_DEVICE Half operator*(int a, Half b) {
+  return static_cast<Half>(a) * b;
+}
+inline C10_HOST_DEVICE Half operator/(int a, Half b) {
+  return static_cast<Half>(a) / b;
+}
+
+//// Arithmetic with int64_t
+
+inline C10_HOST_DEVICE Half operator+(Half a, int64_t b) {
+  return a + static_cast<Half>(b);
+}
+inline C10_HOST_DEVICE Half operator-(Half a, int64_t b) {
+  return a - static_cast<Half>(b);
+}
+inline C10_HOST_DEVICE Half operator*(Half a, int64_t b) {
+  return a * static_cast<Half>(b);
+}
+inline C10_HOST_DEVICE Half operator/(Half a, int64_t b) {
+  return a / static_cast<Half>(b);
+}
+
+inline C10_HOST_DEVICE Half operator+(int64_t a, Half b) {
+  return static_cast<Half>(a) + b;
+}
+inline C10_HOST_DEVICE Half operator-(int64_t a, Half b) {
+  return static_cast<Half>(a) - b;
+}
+inline C10_HOST_DEVICE Half operator*(int64_t a, Half b) {
+  return static_cast<Half>(a) * b;
+}
+inline C10_HOST_DEVICE Half operator/(int64_t a, Half b) {
+  return static_cast<Half>(a) / b;
+}
+
+/// NOTE: we do not define comparisons directly and instead rely on the implicit
+/// conversion from c10::Half to float.
+
+} // namespace c10
+
+namespace std {
+
+template <>
+class numeric_limits<c10::Half> {
+ public:
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed = true;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_exact = false;
+  static constexpr bool has_infinity = true;
+  static constexpr bool has_quiet_NaN = true;
+  static constexpr bool has_signaling_NaN = true;
+  static constexpr auto has_denorm = numeric_limits<float>::has_denorm;
+  static constexpr auto has_denorm_loss =
+      numeric_limits<float>::has_denorm_loss;
+  static constexpr auto round_style = numeric_limits<float>::round_style;
+  static constexpr bool is_iec559 = true;
+  static constexpr bool is_bounded = true;
+  static constexpr bool is_modulo = false;
+  static constexpr int digits = 11;
+  static constexpr int digits10 = 3;
+  static constexpr int max_digits10 = 5;
+  static constexpr int radix = 2;
+  static constexpr int min_exponent = -13;
+  static constexpr int min_exponent10 = -4;
+  static constexpr int max_exponent = 16;
+  static constexpr int max_exponent10 = 4;
+  static constexpr auto traps = numeric_limits<float>::traps;
+  static constexpr auto tinyness_before =
+      numeric_limits<float>::tinyness_before;
+  static constexpr c10::Half min() {
+    return c10::Half(0x0400, c10::Half::from_bits());
+  }
+  static constexpr c10::Half lowest() {
+    return c10::Half(0xFBFF, c10::Half::from_bits());
+  }
+  static constexpr c10::Half max() {
+    return c10::Half(0x7BFF, c10::Half::from_bits());
+  }
+  static constexpr c10::Half epsilon() {
+    return c10::Half(0x1400, c10::Half::from_bits());
+  }
+  static constexpr c10::Half round_error() {
+    return c10::Half(0x3800, c10::Half::from_bits());
+  }
+  static constexpr c10::Half infinity() {
+    return c10::Half(0x7C00, c10::Half::from_bits());
+  }
+  static constexpr c10::Half quiet_NaN() {
+    return c10::Half(0x7E00, c10::Half::from_bits());
+  }
+  static constexpr c10::Half signaling_NaN() {
+    return c10::Half(0x7D00, c10::Half::from_bits());
+  }
+  static constexpr c10::Half denorm_min() {
+    return c10::Half(0x0001, c10::Half::from_bits());
+  }
+};
+
+} // namespace std
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/Half.h b/include/executorch/runtime/core/portable_type/c10/c10/util/Half.h
new file mode 100644
index 00000000000..373881f21e5
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/Half.h
@@ -0,0 +1,424 @@
+#pragma once
+
+/// Defines the Half type (half-precision floating-point) including conversions
+/// to standard C types and basic arithmetic operations. Note that arithmetic
+/// operations are implemented by converting to floating point and
+/// performing the operation in float32, instead of using CUDA half intrinsics.
+/// Most uses of this type within ATen are memory bound, including the
+/// element-wise kernels, and the half intrinsics aren't efficient on all GPUs.
+/// If you are writing a compute bound kernel, you can use the CUDA half
+/// intrinsics directly on the Half type from device code.
+
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
+#include <c10/util/floating_point_utils.h>
+#include <type_traits>
+
+#if defined(__cplusplus)
+#include <cmath>
+#elif !defined(__OPENCL_VERSION__)
+#include <math.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <ostream>
+
+#ifdef __CUDACC__
+#include <cuda_fp16.h>
+#endif
+
+#ifdef __HIPCC__
+#include <hip/hip_fp16.h>
+#endif
+
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#elif defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+
+#if defined(__aarch64__) && !defined(__CUDACC__)
+#include <arm_neon.h>
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
+    defined(_M_IX86)
+#if defined(__F16C__) &&                               \
+    !(defined(__CUDA_ARCH__) || defined(__CUDACC__) || \
+      defined(__HIP_DEVICE_COMPILE__))
+#define C10_X86_F16 1
+#include <immintrin.h> // import conversion ops from f16cintrin.h
+#endif // defined(__F16C__) && !(defined(__CUDA_ARCH__) || defined(__CUDACC__)
+       // || defined(__HIP_DEVICE_COMPILE__))
+#endif // __x86_64__ || _M_X64 || __i386 || _M_IX86
+#endif // __GNUC__ || __clang__
+
+namespace c10 {
+
+namespace detail {
+
+/*
+ * Convert a 16-bit floating-point number in IEEE half-precision format, in bit
+ * representation, to a 32-bit floating-point number in IEEE single-precision
+ * format, in bit representation.
+ *
+ * @note The implementation doesn't use any floating-point operations.
+ */
+inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
+  /*
+   * Extend the half-precision floating-point number to 32 bits and shift to the
+   * upper part of the 32-bit word:
+   *      +---+-----+------------+-------------------+
+   *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
+   *      +---+-----+------------+-------------------+
+   * Bits  31  26-30    16-25            0-15
+   *
+   * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0
+   * - zero bits.
+   */
+  const uint32_t w = (uint32_t)h << 16;
+  /*
+   * Extract the sign of the input number into the high bit of the 32-bit word:
+   *
+   *      +---+----------------------------------+
+   *      | S |0000000 00000000 00000000 00000000|
+   *      +---+----------------------------------+
+   * Bits  31                 0-31
+   */
+  const uint32_t sign = w & UINT32_C(0x80000000);
+  /*
+   * Extract mantissa and biased exponent of the input number into the bits 0-30
+   * of the 32-bit word:
+   *
+   *      +---+-----+------------+-------------------+
+   *      | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
+   *      +---+-----+------------+-------------------+
+   * Bits  30  27-31     17-26            0-16
+   */
+  const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
+  /*
+   * Renorm shift is the number of bits to shift mantissa left to make the
+   * half-precision number normalized. If the initial number is normalized, some
+   * of its high 6 bits (sign == 0 and 5-bit exponent) equals one. In this case
+   * renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note
+   * that if we shift denormalized nonsign by renorm_shift, the unit bit of
+   * mantissa will shift into exponent, turning the biased exponent into 1, and
+   * making mantissa normalized (i.e. without leading 1).
+   */
+#ifdef _MSC_VER
+  unsigned long nonsign_bsr;
+  _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign);
+  uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31;
+#else
+  uint32_t renorm_shift = __builtin_clz(nonsign);
+#endif
+  renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
+  /*
+   * Iff half-precision number has exponent of 15, the addition overflows
+   * it into bit 31, and the subsequent shift turns the high 9 bits
+   * into 1. Thus inf_nan_mask == 0x7F800000 if the half-precision number
+   * had exponent of 15 (i.e. was NaN or infinity) 0x00000000 otherwise
+   */
+  const int32_t inf_nan_mask =
+      ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
+  /*
+   * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31
+   * into 1. Otherwise, bit 31 remains 0. The signed shift right by 31
+   * broadcasts bit 31 into all bits of the zero_mask. Thus zero_mask ==
+   * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h)
+   * 0x00000000 otherwise
+   */
+  const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
+  /*
+   * 1. Shift nonsign left by renorm_shift to normalize it (if the input
+   * was denormal)
+   * 2. Shift nonsign right by 3 so the exponent (5 bits originally)
+   * becomes an 8-bit field and 10-bit mantissa shifts into the 10 high
+   * bits of the 23-bit mantissa of IEEE single-precision number.
+   * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the
+   * different in exponent bias (0x7F for single-precision number less 0xF
+   * for half-precision number).
+   * 4. Subtract renorm_shift from the exponent (starting at bit 23) to
+   * account for renormalization. As renorm_shift is less than 0x70, this
+   * can be combined with step 3.
+   * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the
+   * input was NaN or infinity.
+   * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent
+   * into zero if the input was zero.
+   * 7. Combine with the sign of the input number.
+   */
+  return sign |
+      ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) |
+        inf_nan_mask) &
+       ~zero_mask);
+}
+
+/*
+ * Convert a 16-bit floating-point number in IEEE half-precision format, in bit
+ * representation, to a 32-bit floating-point number in IEEE single-precision
+ * format.
+ *
+ * @note The implementation relies on IEEE-like (no assumption about rounding
+ * mode and no operations on denormals) floating-point operations and bitcasts
+ * between integer and floating-point variables.
+ */
+C10_HOST_DEVICE inline float fp16_ieee_to_fp32_value(uint16_t h) {
+#ifdef C10_X86_F16
+  return _cvtsh_ss(h);
+#else
+  /*
+   * Extend the half-precision floating-point number to 32 bits and shift to the
+   * upper part of the 32-bit word:
+   *      +---+-----+------------+-------------------+
+   *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
+   *      +---+-----+------------+-------------------+
+   * Bits  31  26-30    16-25            0-15
+   *
+   * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0
+   * - zero bits.
+   */
+  const uint32_t w = (uint32_t)h << 16;
+  /*
+   * Extract the sign of the input number into the high bit of the 32-bit word:
+   *
+   *      +---+----------------------------------+
+   *      | S |0000000 00000000 00000000 00000000|
+   *      +---+----------------------------------+
+   * Bits  31                 0-31
+   */
+  const uint32_t sign = w & UINT32_C(0x80000000);
+  /*
+   * Extract mantissa and biased exponent of the input number into the high bits
+   * of the 32-bit word:
+   *
+   *      +-----+------------+---------------------+
+   *      |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
+   *      +-----+------------+---------------------+
+   * Bits  27-31    17-26            0-16
+   */
+  const uint32_t two_w = w + w;
+
+  /*
+   * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become
+   * mantissa and exponent of a single-precision floating-point number:
+   *
+   *       S|Exponent |          Mantissa
+   *      +-+---+-----+------------+----------------+
+   *      |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
+   *      +-+---+-----+------------+----------------+
+   * Bits   | 23-31   |           0-22
+   *
+   * Next, there are some adjustments to the exponent:
+   * - The exponent needs to be corrected by the difference in exponent bias
+   * between single-precision and half-precision formats (0x7F - 0xF = 0x70)
+   * - Inf and NaN values in the inputs should become Inf and NaN values after
+   * conversion to the single-precision number. Therefore, if the biased
+   * exponent of the half-precision input was 0x1F (max possible value), the
+   * biased exponent of the single-precision output must be 0xFF (max possible
+   * value). We do this correction in two steps:
+   *   - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset
+   * below) rather than by 0x70 suggested by the difference in the exponent bias
+   * (see above).
+   *   - Then we multiply the single-precision result of exponent adjustment by
+   * 2**(-112) to reverse the effect of exponent adjustment by 0xE0 less the
+   * necessary exponent adjustment by 0x70 due to difference in exponent bias.
+   *     The floating-point multiplication hardware would ensure than Inf and
+   * NaN would retain their value on at least partially IEEE754-compliant
+   * implementations.
+   *
+   * Note that the above operations do not handle denormal inputs (where biased
+   * exponent == 0). However, they also do not operate on denormal inputs, and
+   * do not produce denormal results.
+   */
+  constexpr uint32_t exp_offset = UINT32_C(0xE0) << 23;
+  // const float exp_scale = 0x1.0p-112f;
+  constexpr uint32_t scale_bits = (uint32_t)15 << 23;
+  float exp_scale_val = 0;
+#if defined(_MSC_VER) && defined(__clang__)
+  __builtin_memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val));
+#else
+  std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val));
+#endif
+
+  const float exp_scale = exp_scale_val;
+  const float normalized_value =
+      fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+  /*
+   * Convert denormalized half-precision inputs into single-precision results
+   * (always normalized). Zero inputs are also handled here.
+   *
+   * In a denormalized number the biased exponent is zero, and mantissa has
+   * on-zero bits. First, we shift mantissa into bits 0-9 of the 32-bit word.
+   *
+   *                  zeros           |  mantissa
+   *      +---------------------------+------------+
+   *      |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
+   *      +---------------------------+------------+
+   * Bits             10-31                0-9
+   *
+   * Now, remember that denormalized half-precision numbers are represented as:
+   *    FP16 = mantissa * 2**(-24).
+   * The trick is to construct a normalized single-precision number with the
+   * same mantissa and thehalf-precision input and with an exponent which would
+   * scale the corresponding mantissa bits to 2**(-24). A normalized
+   * single-precision floating-point number is represented as: FP32 = (1 +
+   * mantissa * 2**(-23)) * 2**(exponent - 127) Therefore, when the biased
+   * exponent is 126, a unit change in the mantissa of the input denormalized
+   * half-precision number causes a change of the constructed single-precision
+   * number by 2**(-24), i.e. the same amount.
+   *
+   * The last step is to adjust the bias of the constructed single-precision
+   * number. When the input half-precision number is zero, the constructed
+   * single-precision number has the value of FP32 = 1 * 2**(126 - 127) =
+   * 2**(-1) = 0.5 Therefore, we need to subtract 0.5 from the constructed
+   * single-precision number to get the numerical equivalent of the input
+   * half-precision number.
+   */
+  constexpr uint32_t magic_mask = UINT32_C(126) << 23;
+  constexpr float magic_bias = 0.5f;
+  const float denormalized_value =
+      fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+  /*
+   * - Choose either results of conversion of input as a normalized number, or
+   * as a denormalized number, depending on the input exponent. The variable
+   * two_w contains input exponent in bits 27-31, therefore if its smaller than
+   * 2**27, the input is either a denormal number, or zero.
+   * - Combine the result of conversion of exponent and mantissa with the sign
+   * of the input number.
+   */
+  constexpr uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+  const uint32_t result = sign |
+      (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value)
+                                   : fp32_to_bits(normalized_value));
+  return fp32_from_bits(result);
+#endif // C10_X86_F16
+}
+
+/*
+ * Convert a 32-bit floating-point number in IEEE single-precision format to a
+ * 16-bit floating-point number in IEEE half-precision format, in bit
+ * representation.
+ *
+ * @note The implementation relies on IEEE-like (no assumption about rounding
+ * mode and no operations on denormals) floating-point operations and bitcasts
+ * between integer and floating-point variables.
+ */
+inline uint16_t fp16_ieee_from_fp32_value(float f) {
+#ifdef C10_X86_F16
+  return _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT);
+#else
+  // const float scale_to_inf = 0x1.0p+112f;
+  // const float scale_to_zero = 0x1.0p-110f;
+  constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23;
+  constexpr uint32_t scale_to_zero_bits = (uint32_t)17 << 23;
+  float scale_to_inf_val = 0, scale_to_zero_val = 0;
+  std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val));
+  std::memcpy(
+      &scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val));
+  const float scale_to_inf = scale_to_inf_val;
+  const float scale_to_zero = scale_to_zero_val;
+
+#if defined(_MSC_VER) && _MSC_VER == 1916
+  float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero;
+#else
+  float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+#endif
+
+  const uint32_t w = fp32_to_bits(f);
+  const uint32_t shl1_w = w + w;
+  const uint32_t sign = w & UINT32_C(0x80000000);
+  uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+  if (bias < UINT32_C(0x71000000)) {
+    bias = UINT32_C(0x71000000);
+  }
+
+  base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+  const uint32_t bits = fp32_to_bits(base);
+  const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+  const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+  const uint32_t nonsign = exp_bits + mantissa_bits;
+  return static_cast<uint16_t>(
+      (sign >> 16) |
+      (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign));
+#endif // C10_X86_F16
+}
+
+#ifdef C10_X86_F16
+#undef C10_X86_F16
+#endif // C10_X86_F16
+
+#if defined(__aarch64__) && !defined(__CUDACC__)
+inline float16_t fp16_from_bits(uint16_t h) {
+  return c10::bit_cast<float16_t>(h);
+}
+
+inline uint16_t fp16_to_bits(float16_t f) {
+  return c10::bit_cast<uint16_t>(f);
+}
+
+// According to https://godbolt.org/z/frExdbsWG it would translate to single
+// fcvt s0, h0
+inline float native_fp16_to_fp32_value(uint16_t h) {
+  return static_cast<float>(fp16_from_bits(h));
+}
+
+inline uint16_t native_fp16_from_fp32_value(float f) {
+  return fp16_to_bits(static_cast<float16_t>(f));
+}
+#endif
+
+} // namespace detail
+
+struct alignas(2) Half {
+  unsigned short x;
+
+  struct from_bits_t {};
+  C10_HOST_DEVICE static constexpr from_bits_t from_bits() {
+    return from_bits_t();
+  }
+
+  // HIP wants __host__ __device__ tag, CUDA does not
+#if defined(USE_ROCM)
+  C10_HOST_DEVICE Half() = default;
+#else
+  Half() = default;
+#endif
+
+  constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}
+#if defined(__aarch64__) && !defined(__CUDACC__)
+  inline Half(float16_t value);
+  inline operator float16_t() const;
+#else
+  inline C10_HOST_DEVICE Half(float value);
+  inline C10_HOST_DEVICE operator float() const;
+#endif
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_HOST_DEVICE Half(const __half& value);
+  inline C10_HOST_DEVICE operator __half() const;
+#endif
+#ifdef SYCL_LANGUAGE_VERSION
+  inline C10_HOST_DEVICE Half(const sycl::half& value);
+  inline C10_HOST_DEVICE operator sycl::half() const;
+#endif
+};
+
+C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) {
+  out << (float)value;
+  return out;
+}
+
+} // namespace c10
+
+#include <c10/util/Half-inl.h> // IWYU pragma: keep
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/TypeSafeSignMath.h b/include/executorch/runtime/core/portable_type/c10/c10/util/TypeSafeSignMath.h
new file mode 100644
index 00000000000..2853ff48d18
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/TypeSafeSignMath.h
@@ -0,0 +1,140 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <limits>
+#include <type_traits>
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wstring-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wstring-conversion")
+#endif
+#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
+#endif
+
+namespace c10 {
+
+/// Returns false since we cannot have x < 0 if x is unsigned.
+template <typename T>
+inline constexpr bool is_negative(
+    const T& /*x*/,
+    std::true_type /*is_unsigned*/) {
+  return false;
+}
+
+/// Returns true if a signed variable x < 0
+template <typename T>
+inline constexpr bool is_negative(const T& x, std::false_type /*is_unsigned*/) {
+  return x < T(0);
+}
+
+/// Returns true if x < 0
+/// NOTE: Will fail on an unsigned custom type
+///       For the most part it's possible to fix this if
+///       the custom type has a constexpr constructor.
+///       However, notably, c10::Half does not :-(
+template <typename T>
+inline constexpr bool is_negative(const T& x) {
+  return is_negative(x, std::is_unsigned<T>());
+}
+
+/// Returns the sign of an unsigned variable x as 0, 1
+template <typename T>
+inline constexpr int signum(const T& x, std::true_type /*is_unsigned*/) {
+  return T(0) < x;
+}
+
+/// Returns the sign of a signed variable x as -1, 0, 1
+template <typename T>
+inline constexpr int signum(const T& x, std::false_type /*is_unsigned*/) {
+  return (T(0) < x) - (x < T(0));
+}
+
+/// Returns the sign of x as -1, 0, 1
+/// NOTE: Will fail on an unsigned custom type
+///       For the most part it's possible to fix this if
+///       the custom type has a constexpr constructor.
+///       However, notably, c10::Half does not :-(
+template <typename T>
+inline constexpr int signum(const T& x) {
+  return signum(x, std::is_unsigned<T>());
+}
+
+/// Returns true if a and b are not both negative
+template <typename T, typename U>
+inline constexpr bool signs_differ(const T& a, const U& b) {
+  return is_negative(a) != is_negative(b);
+}
+
+// Suppress sign compare warning when compiling with GCC
+// as later does not account for short-circuit rule before
+// raising the warning, see https://godbolt.org/z/Tr3Msnz99
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#endif
+
+/// Returns true if x is greater than the greatest value of the type Limit
+template <typename Limit, typename T>
+inline constexpr bool greater_than_max(const T& x) {
+  constexpr bool can_overflow =
+      std::numeric_limits<T>::digits > std::numeric_limits<Limit>::digits;
+  return can_overflow && x > std::numeric_limits<Limit>::max();
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+/// Returns true if x < lowest(Limit). Standard comparison
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& x,
+    std::false_type /*limit_is_unsigned*/,
+    std::false_type /*x_is_unsigned*/) {
+  return x < std::numeric_limits<Limit>::lowest();
+}
+
+/// Returns false since all the limit is signed and therefore includes
+/// negative values but x cannot be negative because it is unsigned
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& /*x*/,
+    std::false_type /*limit_is_unsigned*/,
+    std::true_type /*x_is_unsigned*/) {
+  return false;
+}
+
+/// Returns true if x < 0, where 0 is constructed from T.
+/// Limit is not signed, so its lower value is zero
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& x,
+    std::true_type /*limit_is_unsigned*/,
+    std::false_type /*x_is_unsigned*/) {
+  return x < T(0);
+}
+
+/// Returns false sign both types are unsigned
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& /*x*/,
+    std::true_type /*limit_is_unsigned*/,
+    std::true_type /*x_is_unsigned*/) {
+  return false;
+}
+
+/// Returns true if x is less than the lowest value of type T
+/// NOTE: Will fail on an unsigned custom type
+///       For the most part it's possible to fix this if
+///       the custom type has a constexpr constructor.
+///       However, notably, c10::Half does not :
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(const T& x) {
+  return less_than_lowest<Limit>(
+      x, std::is_unsigned<Limit>(), std::is_unsigned<T>());
+}
+
+} // namespace c10
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/bit_cast.h b/include/executorch/runtime/core/portable_type/c10/c10/util/bit_cast.h
new file mode 100644
index 00000000000..380cfa7db1c
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/bit_cast.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <cstring>
+#include <type_traits>
+
+#if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
+#include <bit>
+#define C10_HAVE_STD_BIT_CAST 1
+#else
+#define C10_HAVE_STD_BIT_CAST 0
+#endif // __has_include(<bit>) && (__cplusplus >= 202002L ||
+       // (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L))
+
+namespace c10 {
+
+#if C10_HAVE_STD_BIT_CAST
+using std::bit_cast;
+#else
+// Implementations of std::bit_cast() from C++ 20.
+//
+// This is a less sketchy version of reinterpret_cast.
+//
+// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
+// information as well as the source of our implementations.
+template <class To, class From>
+std::enable_if_t<
+    sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
+        std::is_trivially_copyable_v<To>,
+    To>
+// constexpr support needs compiler magic
+bit_cast(const From& src) noexcept {
+  static_assert(
+      std::is_trivially_constructible_v<To>,
+      "This implementation additionally requires "
+      "destination type to be trivially constructible");
+
+  To dst;
+  std::memcpy(&dst, &src, sizeof(To));
+  return dst;
+}
+#endif // C10_HAVE_STD_BIT_CAST
+#undef C10_HAVE_STD_BIT_CAST
+
+} // namespace c10
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/complex.h b/include/executorch/runtime/core/portable_type/c10/c10/util/complex.h
new file mode 100644
index 00000000000..b63710d9458
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/complex.h
@@ -0,0 +1,668 @@
+#pragma once
+
+#include <complex>
+
+#include <c10/macros/Macros.h>
+#include <c10/util/Half.h>
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#include <thrust/complex.h>
+#endif
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
+#endif
+#if C10_CLANG_HAS_WARNING("-Wfloat-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wfloat-conversion")
+#endif
+
+namespace c10 {
+
+// c10::complex is an implementation of complex numbers that aims
+// to work on all devices supported by PyTorch
+//
+// Most of the APIs duplicates std::complex
+// Reference: https://en.cppreference.com/w/cpp/numeric/complex
+//
+// [NOTE: Complex Operator Unification]
+// Operators currently use a mix of std::complex, thrust::complex, and
+// c10::complex internally. The end state is that all operators will use
+// c10::complex internally.  Until then, there may be some hacks to support all
+// variants.
+//
+//
+// [Note on Constructors]
+//
+// The APIs of constructors are mostly copied from C++ standard:
+//   https://en.cppreference.com/w/cpp/numeric/complex/complex
+//
+// Since C++14, all constructors are constexpr in std::complex
+//
+// There are three types of constructors:
+// - initializing from real and imag:
+//     `constexpr complex( const T& re = T(), const T& im = T() );`
+// - implicitly-declared copy constructor
+// - converting constructors
+//
+// Converting constructors:
+// - std::complex defines converting constructor between float/double/long
+// double,
+//   while we define converting constructor between float/double.
+// - For these converting constructors, upcasting is implicit, downcasting is
+//   explicit.
+// - We also define explicit casting from std::complex/thrust::complex
+//   - Note that the conversion from thrust is not constexpr, because
+//     thrust does not define them as constexpr ????
+//
+//
+// [Operator =]
+//
+// The APIs of operator = are mostly copied from C++ standard:
+//   https://en.cppreference.com/w/cpp/numeric/complex/operator%3D
+//
+// Since C++20, all operator= are constexpr. Although we are not building with
+// C++20, we also obey this behavior.
+//
+// There are three types of assign operator:
+// - Assign a real value from the same scalar type
+//   - In std, this is templated as complex& operator=(const T& x)
+//     with specialization `complex& operator=(T x)` for float/double/long
+//     double Since we only support float and double, on will use `complex&
+//     operator=(T x)`
+// - Copy assignment operator and converting assignment operator
+//   - There is no specialization of converting assignment operators, which type
+//   is
+//     convertible is solely dependent on whether the scalar type is convertible
+//
+// In addition to the standard assignment, we also provide assignment operators
+// with std and thrust
+//
+//
+// [Casting operators]
+//
+// std::complex does not have casting operators. We define casting operators
+// casting to std::complex and thrust::complex
+//
+//
+// [Operator ""]
+//
+// std::complex has custom literals `i`, `if` and `il` defined in namespace
+// `std::literals::complex_literals`. We define our own custom literals in the
+// namespace `c10::complex_literals`. Our custom literals does not follow the
+// same behavior as in std::complex, instead, we define _if, _id to construct
+// float/double complex literals.
+//
+//
+// [real() and imag()]
+//
+// In C++20, there are two overload of these functions, one it to return the
+// real/imag, another is to set real/imag, they are both constexpr. We follow
+// this design.
+//
+//
+// [Operator +=,-=,*=,/=]
+//
+// Since C++20, these operators become constexpr. In our implementation, they
+// are also constexpr.
+//
+// There are two types of such operators: operating with a real number, or
+// operating with another complex number. For the operating with a real number,
+// the generic template form has argument type `const T &`, while the overload
+// for float/double/long double has `T`. We will follow the same type as
+// float/double/long double in std.
+//
+// [Unary operator +-]
+//
+// Since C++20, they are constexpr. We also make them expr
+//
+// [Binary operators +-*/]
+//
+// Each operator has three versions (taking + as example):
+// - complex + complex
+// - complex + real
+// - real + complex
+//
+// [Operator ==, !=]
+//
+// Each operator has three versions (taking == as example):
+// - complex == complex
+// - complex == real
+// - real == complex
+//
+// Some of them are removed on C++20, but we decide to keep them
+//
+// [Operator <<, >>]
+//
+// These are implemented by casting to std::complex
+//
+//
+//
+// TODO(@zasdfgbnm): c10::complex<c10::Half> is not currently supported,
+// because:
+//  - lots of members and functions of c10::Half are not constexpr
+//  - thrust::complex only support float and double
+
+template <typename T>
+struct alignas(sizeof(T) * 2) complex {
+  using value_type = T;
+
+  T real_ = T(0);
+  T imag_ = T(0);
+
+  constexpr complex() = default;
+  C10_HOST_DEVICE constexpr complex(const T& re, const T& im = T())
+      : real_(re), imag_(im) {}
+  template <typename U>
+  explicit constexpr complex(const std::complex<U>& other)
+      : complex(other.real(), other.imag()) {}
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  template <typename U>
+  explicit C10_HOST_DEVICE complex(const thrust::complex<U>& other)
+      : real_(other.real()), imag_(other.imag()) {}
+// NOTE can not be implemented as follow due to ROCm bug:
+//   explicit C10_HOST_DEVICE complex(const thrust::complex<U> &other):
+//   complex(other.real(), other.imag()) {}
+#endif
+
+  // Use SFINAE to specialize casting constructor for c10::complex<float> and
+  // c10::complex<double>
+  template <typename U = T>
+  C10_HOST_DEVICE explicit constexpr complex(
+      const std::enable_if_t<std::is_same_v<U, float>, complex<double>>& other)
+      : real_(other.real_), imag_(other.imag_) {}
+  template <typename U = T>
+  C10_HOST_DEVICE constexpr complex(
+      const std::enable_if_t<std::is_same_v<U, double>, complex<float>>& other)
+      : real_(other.real_), imag_(other.imag_) {}
+
+  constexpr complex<T>& operator=(T re) {
+    real_ = re;
+    imag_ = 0;
+    return *this;
+  }
+
+  constexpr complex<T>& operator+=(T re) {
+    real_ += re;
+    return *this;
+  }
+
+  constexpr complex<T>& operator-=(T re) {
+    real_ -= re;
+    return *this;
+  }
+
+  constexpr complex<T>& operator*=(T re) {
+    real_ *= re;
+    imag_ *= re;
+    return *this;
+  }
+
+  constexpr complex<T>& operator/=(T re) {
+    real_ /= re;
+    imag_ /= re;
+    return *this;
+  }
+
+  template <typename U>
+  constexpr complex<T>& operator=(const complex<U>& rhs) {
+    real_ = rhs.real();
+    imag_ = rhs.imag();
+    return *this;
+  }
+
+  template <typename U>
+  constexpr complex<T>& operator+=(const complex<U>& rhs) {
+    real_ += rhs.real();
+    imag_ += rhs.imag();
+    return *this;
+  }
+
+  template <typename U>
+  constexpr complex<T>& operator-=(const complex<U>& rhs) {
+    real_ -= rhs.real();
+    imag_ -= rhs.imag();
+    return *this;
+  }
+
+  template <typename U>
+  constexpr complex<T>& operator*=(const complex<U>& rhs) {
+    // (a + bi) * (c + di) = (a*c - b*d) + (a * d + b * c) i
+    T a = real_;
+    T b = imag_;
+    U c = rhs.real();
+    U d = rhs.imag();
+    real_ = a * c - b * d;
+    imag_ = a * d + b * c;
+    return *this;
+  }
+
+#ifdef __APPLE__
+#define FORCE_INLINE_APPLE __attribute__((always_inline))
+#else
+#define FORCE_INLINE_APPLE
+#endif
+  template <typename U>
+  constexpr FORCE_INLINE_APPLE complex<T>& operator/=(const complex<U>& rhs)
+      __ubsan_ignore_float_divide_by_zero__ {
+    // (a + bi) / (c + di) = (ac + bd)/(c^2 + d^2) + (bc - ad)/(c^2 + d^2) i
+    // the calculation below follows numpy's complex division
+    T a = real_;
+    T b = imag_;
+    U c = rhs.real();
+    U d = rhs.imag();
+
+#if defined(__GNUC__) && !defined(__clang__)
+    // std::abs is already constexpr by gcc
+    auto abs_c = std::abs(c);
+    auto abs_d = std::abs(d);
+#else
+    auto abs_c = c < 0 ? -c : c;
+    auto abs_d = d < 0 ? -d : d;
+#endif
+
+    if (abs_c >= abs_d) {
+      if (abs_c == U(0) && abs_d == U(0)) {
+        /* divide by zeros should yield a complex inf or nan */
+        real_ = a / abs_c;
+        imag_ = b / abs_d;
+      } else {
+        auto rat = d / c;
+        auto scl = U(1.0) / (c + d * rat);
+        real_ = (a + b * rat) * scl;
+        imag_ = (b - a * rat) * scl;
+      }
+    } else {
+      auto rat = c / d;
+      auto scl = U(1.0) / (d + c * rat);
+      real_ = (a * rat + b) * scl;
+      imag_ = (b * rat - a) * scl;
+    }
+    return *this;
+  }
+#undef FORCE_INLINE_APPLE
+
+  template <typename U>
+  constexpr complex<T>& operator=(const std::complex<U>& rhs) {
+    real_ = rhs.real();
+    imag_ = rhs.imag();
+    return *this;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  template <typename U>
+  C10_HOST_DEVICE complex<T>& operator=(const thrust::complex<U>& rhs) {
+    real_ = rhs.real();
+    imag_ = rhs.imag();
+    return *this;
+  }
+#endif
+
+  template <typename U>
+  explicit constexpr operator std::complex<U>() const {
+    return std::complex<U>(std::complex<T>(real(), imag()));
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  template <typename U>
+  C10_HOST_DEVICE explicit operator thrust::complex<U>() const {
+    return static_cast<thrust::complex<U>>(thrust::complex<T>(real(), imag()));
+  }
+#endif
+
+  // consistent with NumPy behavior
+  explicit constexpr operator bool() const {
+    return real() || imag();
+  }
+
+  C10_HOST_DEVICE constexpr T real() const {
+    return real_;
+  }
+  constexpr void real(T value) {
+    real_ = value;
+  }
+  C10_HOST_DEVICE constexpr T imag() const {
+    return imag_;
+  }
+  constexpr void imag(T value) {
+    imag_ = value;
+  }
+};
+
+namespace complex_literals {
+
+constexpr complex<float> operator""_if(long double imag) {
+  return complex<float>(0.0f, static_cast<float>(imag));
+}
+
+constexpr complex<double> operator""_id(long double imag) {
+  return complex<double>(0.0, static_cast<double>(imag));
+}
+
+constexpr complex<float> operator""_if(unsigned long long imag) {
+  return complex<float>(0.0f, static_cast<float>(imag));
+}
+
+constexpr complex<double> operator""_id(unsigned long long imag) {
+  return complex<double>(0.0, static_cast<double>(imag));
+}
+
+} // namespace complex_literals
+
+template <typename T>
+constexpr complex<T> operator+(const complex<T>& val) {
+  return val;
+}
+
+template <typename T>
+constexpr complex<T> operator-(const complex<T>& val) {
+  return complex<T>(-val.real(), -val.imag());
+}
+
+template <typename T>
+constexpr complex<T> operator+(const complex<T>& lhs, const complex<T>& rhs) {
+  complex<T> result = lhs;
+  return result += rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator+(const complex<T>& lhs, const T& rhs) {
+  complex<T> result = lhs;
+  return result += rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator+(const T& lhs, const complex<T>& rhs) {
+  return complex<T>(lhs + rhs.real(), rhs.imag());
+}
+
+template <typename T>
+constexpr complex<T> operator-(const complex<T>& lhs, const complex<T>& rhs) {
+  complex<T> result = lhs;
+  return result -= rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator-(const complex<T>& lhs, const T& rhs) {
+  complex<T> result = lhs;
+  return result -= rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator-(const T& lhs, const complex<T>& rhs) {
+  complex<T> result = -rhs;
+  return result += lhs;
+}
+
+template <typename T>
+constexpr complex<T> operator*(const complex<T>& lhs, const complex<T>& rhs) {
+  complex<T> result = lhs;
+  return result *= rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator*(const complex<T>& lhs, const T& rhs) {
+  complex<T> result = lhs;
+  return result *= rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator*(const T& lhs, const complex<T>& rhs) {
+  complex<T> result = rhs;
+  return result *= lhs;
+}
+
+template <typename T>
+constexpr complex<T> operator/(const complex<T>& lhs, const complex<T>& rhs) {
+  complex<T> result = lhs;
+  return result /= rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator/(const complex<T>& lhs, const T& rhs) {
+  complex<T> result = lhs;
+  return result /= rhs;
+}
+
+template <typename T>
+constexpr complex<T> operator/(const T& lhs, const complex<T>& rhs) {
+  complex<T> result(lhs, T());
+  return result /= rhs;
+}
+
+// Define operators between integral scalars and c10::complex. std::complex does
+// not support this when T is a floating-point number. This is useful because it
+// saves a lot of "static_cast" when operate a complex and an integer. This
+// makes the code both less verbose and potentially more efficient.
+#define COMPLEX_INTEGER_OP_TEMPLATE_CONDITION                 \
+  typename std::enable_if_t<                                  \
+      std::is_floating_point_v<fT> && std::is_integral_v<iT>, \
+      int> = 0
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator+(const c10::complex<fT>& a, const iT& b) {
+  return a + static_cast<fT>(b);
+}
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator+(const iT& a, const c10::complex<fT>& b) {
+  return static_cast<fT>(a) + b;
+}
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator-(const c10::complex<fT>& a, const iT& b) {
+  return a - static_cast<fT>(b);
+}
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator-(const iT& a, const c10::complex<fT>& b) {
+  return static_cast<fT>(a) - b;
+}
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator*(const c10::complex<fT>& a, const iT& b) {
+  return a * static_cast<fT>(b);
+}
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator*(const iT& a, const c10::complex<fT>& b) {
+  return static_cast<fT>(a) * b;
+}
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator/(const c10::complex<fT>& a, const iT& b) {
+  return a / static_cast<fT>(b);
+}
+
+template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
+constexpr c10::complex<fT> operator/(const iT& a, const c10::complex<fT>& b) {
+  return static_cast<fT>(a) / b;
+}
+
+#undef COMPLEX_INTEGER_OP_TEMPLATE_CONDITION
+
+template <typename T>
+constexpr bool operator==(const complex<T>& lhs, const complex<T>& rhs) {
+  return (lhs.real() == rhs.real()) && (lhs.imag() == rhs.imag());
+}
+
+template <typename T>
+constexpr bool operator==(const complex<T>& lhs, const T& rhs) {
+  return (lhs.real() == rhs) && (lhs.imag() == T());
+}
+
+template <typename T>
+constexpr bool operator==(const T& lhs, const complex<T>& rhs) {
+  return (lhs == rhs.real()) && (T() == rhs.imag());
+}
+
+template <typename T>
+constexpr bool operator!=(const complex<T>& lhs, const complex<T>& rhs) {
+  return !(lhs == rhs);
+}
+
+template <typename T>
+constexpr bool operator!=(const complex<T>& lhs, const T& rhs) {
+  return !(lhs == rhs);
+}
+
+template <typename T>
+constexpr bool operator!=(const T& lhs, const complex<T>& rhs) {
+  return !(lhs == rhs);
+}
+
+template <typename T, typename CharT, typename Traits>
+std::basic_ostream<CharT, Traits>& operator<<(
+    std::basic_ostream<CharT, Traits>& os,
+    const complex<T>& x) {
+  return (os << static_cast<std::complex<T>>(x));
+}
+
+template <typename T, typename CharT, typename Traits>
+std::basic_istream<CharT, Traits>& operator>>(
+    std::basic_istream<CharT, Traits>& is,
+    complex<T>& x) {
+  std::complex<T> tmp;
+  is >> tmp;
+  x = tmp;
+  return is;
+}
+
+} // namespace c10
+
+// std functions
+//
+// The implementation of these functions also follow the design of C++20
+
+namespace std {
+
+template <typename T>
+constexpr T real(const c10::complex<T>& z) {
+  return z.real();
+}
+
+template <typename T>
+constexpr T imag(const c10::complex<T>& z) {
+  return z.imag();
+}
+
+template <typename T>
+C10_HOST_DEVICE T abs(const c10::complex<T>& z) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return thrust::abs(static_cast<thrust::complex<T>>(z));
+#else
+  return std::abs(static_cast<std::complex<T>>(z));
+#endif
+}
+
+#if defined(USE_ROCM)
+#define ROCm_Bug(x)
+#else
+#define ROCm_Bug(x) x
+#endif
+
+template <typename T>
+C10_HOST_DEVICE T arg(const c10::complex<T>& z) {
+  return ROCm_Bug(std)::atan2(std::imag(z), std::real(z));
+}
+
+#undef ROCm_Bug
+
+template <typename T>
+constexpr T norm(const c10::complex<T>& z) {
+  return z.real() * z.real() + z.imag() * z.imag();
+}
+
+// For std::conj, there are other versions of it:
+//   constexpr std::complex<float> conj( float z );
+//   template< class DoubleOrInteger >
+//   constexpr std::complex<double> conj( DoubleOrInteger z );
+//   constexpr std::complex<long double> conj( long double z );
+// These are not implemented
+// TODO(@zasdfgbnm): implement them as c10::conj
+template <typename T>
+constexpr c10::complex<T> conj(const c10::complex<T>& z) {
+  return c10::complex<T>(z.real(), -z.imag());
+}
+
+// Thrust does not have complex --> complex version of thrust::proj,
+// so this function is not implemented at c10 right now.
+// TODO(@zasdfgbnm): implement it by ourselves
+
+// There is no c10 version of std::polar, because std::polar always
+// returns std::complex. Use c10::polar instead;
+
+} // namespace std
+
+namespace c10 {
+
+template <typename T>
+C10_HOST_DEVICE complex<T> polar(const T& r, const T& theta = T()) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<complex<T>>(thrust::polar(r, theta));
+#else
+  // std::polar() requires r >= 0, so spell out the explicit implementation to
+  // avoid a branch.
+  return complex<T>(r * std::cos(theta), r * std::sin(theta));
+#endif
+}
+
+template <>
+struct alignas(4) complex<Half> {
+  Half real_;
+  Half imag_;
+
+  // Constructors
+  complex() = default;
+  // Half constructor is not constexpr so the following constructor can't
+  // be constexpr
+  C10_HOST_DEVICE explicit inline complex(const Half& real, const Half& imag)
+      : real_(real), imag_(imag) {}
+  C10_HOST_DEVICE inline complex(const c10::complex<float>& value)
+      : real_(value.real()), imag_(value.imag()) {}
+
+  // Conversion operator
+  inline C10_HOST_DEVICE operator c10::complex<float>() const {
+    return {real_, imag_};
+  }
+
+  constexpr C10_HOST_DEVICE Half real() const {
+    return real_;
+  }
+  constexpr C10_HOST_DEVICE Half imag() const {
+    return imag_;
+  }
+
+  C10_HOST_DEVICE complex<Half>& operator+=(const complex<Half>& other) {
+    real_ = static_cast<float>(real_) + static_cast<float>(other.real_);
+    imag_ = static_cast<float>(imag_) + static_cast<float>(other.imag_);
+    return *this;
+  }
+
+  C10_HOST_DEVICE complex<Half>& operator-=(const complex<Half>& other) {
+    real_ = static_cast<float>(real_) - static_cast<float>(other.real_);
+    imag_ = static_cast<float>(imag_) - static_cast<float>(other.imag_);
+    return *this;
+  }
+
+  C10_HOST_DEVICE complex<Half>& operator*=(const complex<Half>& other) {
+    auto a = static_cast<float>(real_);
+    auto b = static_cast<float>(imag_);
+    auto c = static_cast<float>(other.real());
+    auto d = static_cast<float>(other.imag());
+    real_ = a * c - b * d;
+    imag_ = a * d + b * c;
+    return *this;
+  }
+};
+
+} // namespace c10
+
+C10_CLANG_DIAGNOSTIC_POP()
+
+#define C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H
+// math functions are included in a separate file
+#include <c10/util/complex_math.h> // IWYU pragma: keep
+// utilities for complex types
+#include <c10/util/complex_utils.h> // IWYU pragma: keep
+#undef C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/complex_math.h b/include/executorch/runtime/core/portable_type/c10/c10/util/complex_math.h
new file mode 100644
index 00000000000..2b591026c94
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/complex_math.h
@@ -0,0 +1,406 @@
+#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H)
+#error \
+    "c10/util/complex_math.h is not meant to be individually included. Include c10/util/complex.h instead."
+#endif
+
+namespace c10_complex_math {
+
+// Exponential functions
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> exp(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::exp(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::exp(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> log(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::log(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::log(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> log10(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::log10(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::log10(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> log2(const c10::complex<T>& x) {
+  const c10::complex<T> log2 = c10::complex<T>(::log(2.0), 0.0);
+  return c10_complex_math::log(x) / log2;
+}
+
+// Power functions
+//
+#if defined(_LIBCPP_VERSION) || \
+    (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX))
+namespace _detail {
+C10_API c10::complex<float> sqrt(const c10::complex<float>& in);
+C10_API c10::complex<double> sqrt(const c10::complex<double>& in);
+C10_API c10::complex<float> acos(const c10::complex<float>& in);
+C10_API c10::complex<double> acos(const c10::complex<double>& in);
+} // namespace _detail
+#endif
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> sqrt(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::sqrt(static_cast<thrust::complex<T>>(x)));
+#elif !(                        \
+    defined(_LIBCPP_VERSION) || \
+    (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)))
+  return static_cast<c10::complex<T>>(
+      std::sqrt(static_cast<std::complex<T>>(x)));
+#else
+  return _detail::sqrt(x);
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> pow(
+    const c10::complex<T>& x,
+    const c10::complex<T>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(thrust::pow(
+      static_cast<thrust::complex<T>>(x), static_cast<thrust::complex<T>>(y)));
+#else
+  return static_cast<c10::complex<T>>(std::pow(
+      static_cast<std::complex<T>>(x), static_cast<std::complex<T>>(y)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> pow(
+    const c10::complex<T>& x,
+    const T& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::pow(static_cast<thrust::complex<T>>(x), y));
+#else
+  return static_cast<c10::complex<T>>(
+      std::pow(static_cast<std::complex<T>>(x), y));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> pow(
+    const T& x,
+    const c10::complex<T>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::pow(x, static_cast<thrust::complex<T>>(y)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::pow(x, static_cast<std::complex<T>>(y)));
+#endif
+}
+
+template <typename T, typename U>
+C10_HOST_DEVICE inline c10::complex<decltype(T() * U())> pow(
+    const c10::complex<T>& x,
+    const c10::complex<U>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(thrust::pow(
+      static_cast<thrust::complex<T>>(x), static_cast<thrust::complex<T>>(y)));
+#else
+  return static_cast<c10::complex<T>>(std::pow(
+      static_cast<std::complex<T>>(x), static_cast<std::complex<T>>(y)));
+#endif
+}
+
+template <typename T, typename U>
+C10_HOST_DEVICE inline c10::complex<decltype(T() * U())> pow(
+    const c10::complex<T>& x,
+    const U& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::pow(static_cast<thrust::complex<T>>(x), y));
+#else
+  return static_cast<c10::complex<T>>(
+      std::pow(static_cast<std::complex<T>>(x), y));
+#endif
+}
+
+template <typename T, typename U>
+C10_HOST_DEVICE inline c10::complex<decltype(T() * U())> pow(
+    const T& x,
+    const c10::complex<U>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::pow(x, static_cast<thrust::complex<T>>(y)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::pow(x, static_cast<std::complex<T>>(y)));
+#endif
+}
+
+// Trigonometric functions
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> sin(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::sin(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::sin(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> cos(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::cos(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::cos(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> tan(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::tan(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::tan(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> asin(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::asin(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::asin(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> acos(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::acos(static_cast<thrust::complex<T>>(x)));
+#elif !defined(_LIBCPP_VERSION)
+  return static_cast<c10::complex<T>>(
+      std::acos(static_cast<std::complex<T>>(x)));
+#else
+  return _detail::acos(x);
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> atan(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::atan(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::atan(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+// Hyperbolic functions
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> sinh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::sinh(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::sinh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> cosh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::cosh(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::cosh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> tanh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::tanh(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::tanh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> asinh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::asinh(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::asinh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> acosh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::acosh(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::acosh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> atanh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return static_cast<c10::complex<T>>(
+      thrust::atanh(static_cast<thrust::complex<T>>(x)));
+#else
+  return static_cast<c10::complex<T>>(
+      std::atanh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> log1p(const c10::complex<T>& z) {
+#if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \
+    defined(__HIPCC__)
+  // For Mac, the new implementation yielded a high relative error. Falling back
+  // to the old version for now.
+  // See https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354
+  // For CUDA we also use this one, as thrust::log(thrust::complex) takes
+  // *forever* to compile
+
+  // log1p(z) = log(1 + z)
+  // Let's define 1 + z = r * e ^ (i * a), then we have
+  // log(r * e ^ (i * a)) = log(r) + i * a
+  // With z = x + iy, the term r can be written as
+  // r = ((1 + x) ^ 2 + y ^ 2) ^ 0.5
+  //   = (1 + x ^ 2 + 2 * x + y ^ 2) ^ 0.5
+  // So, log(r) is
+  // log(r) = 0.5 * log(1 + x ^ 2 + 2 * x + y ^ 2)
+  //        = 0.5 * log1p(x * (x + 2) + y ^ 2)
+  // we need to use the expression only on certain condition to avoid overflow
+  // and underflow from `(x * (x + 2) + y ^ 2)`
+  T x = z.real();
+  T y = z.imag();
+  T zabs = std::abs(z);
+  T theta = std::atan2(y, x + T(1));
+  if (zabs < 0.5) {
+    T r = x * (T(2) + x) + y * y;
+    if (r == 0) { // handle underflow
+      return {x, theta};
+    }
+    return {T(0.5) * std::log1p(r), theta};
+  } else {
+    T z0 = std::hypot(x + 1, y);
+    return {std::log(z0), theta};
+  }
+#else
+  // CPU path
+  // Based on https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354
+  c10::complex<T> u = z + T(1);
+  if (u == T(1)) {
+    return z;
+  } else {
+    auto log_u = log(u);
+    if (u - T(1) == z) {
+      return log_u;
+    }
+    return log_u * (z / (u - T(1)));
+  }
+#endif
+}
+
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> expm1(const c10::complex<T>& z) {
+  // expm1(z) = exp(z) - 1
+  // Define z = x + i * y
+  // f = e ^ (x + i * y) - 1
+  //   = e ^ x * e ^ (i * y) - 1
+  //   = (e ^ x * cos(y) - 1) + i * (e ^ x * sin(y))
+  //   = (e ^ x - 1) * cos(y) - (1 - cos(y)) + i * e ^ x * sin(y)
+  //   = expm1(x) * cos(y) - 2 * sin(y / 2) ^ 2 + i * e ^ x * sin(y)
+  T x = z.real();
+  T y = z.imag();
+  T a = std::sin(y / 2);
+  T er = std::expm1(x) * std::cos(y) - T(2) * a * a;
+  T ei = std::exp(x) * std::sin(y);
+  return {er, ei};
+}
+
+} // namespace c10_complex_math
+
+using c10_complex_math::acos;
+using c10_complex_math::acosh;
+using c10_complex_math::asin;
+using c10_complex_math::asinh;
+using c10_complex_math::atan;
+using c10_complex_math::atanh;
+using c10_complex_math::cos;
+using c10_complex_math::cosh;
+using c10_complex_math::exp;
+using c10_complex_math::expm1;
+using c10_complex_math::log;
+using c10_complex_math::log10;
+using c10_complex_math::log1p;
+using c10_complex_math::log2;
+using c10_complex_math::pow;
+using c10_complex_math::sin;
+using c10_complex_math::sinh;
+using c10_complex_math::sqrt;
+using c10_complex_math::tan;
+using c10_complex_math::tanh;
+
+namespace std {
+
+using c10_complex_math::acos;
+using c10_complex_math::acosh;
+using c10_complex_math::asin;
+using c10_complex_math::asinh;
+using c10_complex_math::atan;
+using c10_complex_math::atanh;
+using c10_complex_math::cos;
+using c10_complex_math::cosh;
+using c10_complex_math::exp;
+using c10_complex_math::expm1;
+using c10_complex_math::log;
+using c10_complex_math::log10;
+using c10_complex_math::log1p;
+using c10_complex_math::log2;
+using c10_complex_math::pow;
+using c10_complex_math::sin;
+using c10_complex_math::sinh;
+using c10_complex_math::sqrt;
+using c10_complex_math::tan;
+using c10_complex_math::tanh;
+
+} // namespace std
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/complex_utils.h b/include/executorch/runtime/core/portable_type/c10/c10/util/complex_utils.h
new file mode 100644
index 00000000000..1ca105f1d0a
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/complex_utils.h
@@ -0,0 +1,46 @@
+#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H)
+#error \
+    "c10/util/complex_utils.h is not meant to be individually included. Include c10/util/complex.h instead."
+#endif
+
+#include <limits>
+
+namespace c10 {
+
+template <typename T>
+struct is_complex : public std::false_type {};
+
+template <typename T>
+struct is_complex<std::complex<T>> : public std::true_type {};
+
+template <typename T>
+struct is_complex<c10::complex<T>> : public std::true_type {};
+
+// Extract double from std::complex<double>; is identity otherwise
+// TODO: Write in more idiomatic C++17
+template <typename T>
+struct scalar_value_type {
+  using type = T;
+};
+template <typename T>
+struct scalar_value_type<std::complex<T>> {
+  using type = T;
+};
+template <typename T>
+struct scalar_value_type<c10::complex<T>> {
+  using type = T;
+};
+
+} // namespace c10
+
+namespace std {
+
+template <typename T>
+class numeric_limits<c10::complex<T>> : public numeric_limits<T> {};
+
+template <typename T>
+bool isnan(const c10::complex<T>& v) {
+  return std::isnan(v.real()) || std::isnan(v.imag());
+}
+
+} // namespace std
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/floating_point_utils.h b/include/executorch/runtime/core/portable_type/c10/c10/util/floating_point_utils.h
new file mode 100644
index 00000000000..b240c4ea232
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/floating_point_utils.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
+#include <cstdint>
+
+namespace c10::detail {
+
+C10_HOST_DEVICE inline float fp32_from_bits(uint32_t w) {
+#if defined(__OPENCL_VERSION__)
+  return as_float(w);
+#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+  return __uint_as_float((unsigned int)w);
+#elif defined(__INTEL_COMPILER)
+  return _castu32_f32(w);
+#else
+  return c10::bit_cast<float>(w);
+#endif
+}
+
+C10_HOST_DEVICE inline uint32_t fp32_to_bits(float f) {
+#if defined(__OPENCL_VERSION__)
+  return as_uint(f);
+#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+  return (uint32_t)__float_as_uint(f);
+#elif defined(__INTEL_COMPILER)
+  return _castf32_u32(f);
+#else
+  return c10::bit_cast<uint32_t>(f);
+#endif
+}
+
+} // namespace c10::detail
diff --git a/include/executorch/runtime/core/portable_type/c10/c10/util/irange.h b/include/executorch/runtime/core/portable_type/c10/c10/util/irange.h
new file mode 100644
index 00000000000..f5310510099
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/c10/c10/util/irange.h
@@ -0,0 +1,123 @@
+// Copyright 2004-present Facebook. All Rights Reserved.
+
+#pragma once
+
+#include <c10/util/TypeSafeSignMath.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
+
+namespace c10 {
+
+namespace detail {
+
+template <
+    typename I,
+    bool one_sided = false,
+    std::enable_if_t<std::is_integral_v<I>, int> = 0>
+struct integer_iterator {
+  using iterator_category = std::input_iterator_tag;
+  using value_type = I;
+  using difference_type = std::ptrdiff_t;
+  using pointer = I*;
+  using reference = I&;
+
+  explicit constexpr integer_iterator(I value) : value(value) {}
+
+  constexpr I operator*() const {
+    return value;
+  }
+
+  constexpr I const* operator->() const {
+    return &value;
+  }
+
+  constexpr integer_iterator& operator++() {
+    ++value;
+    return *this;
+  }
+
+  constexpr integer_iterator operator++(int) {
+    const auto copy = *this;
+    ++*this;
+    return copy;
+  }
+
+  constexpr bool operator==(const integer_iterator& other) const {
+    if constexpr (one_sided) {
+      // Range-for loops' end test is `begin != end`, not `begin <
+      // end`. To handle `c10::irange(n)` where n < 0 (which should be
+      // empty), we just make `begin != end` fail whenever `end` is
+      // negative.
+      return is_negative(other.value) || value == other.value;
+    } else {
+      return value == other.value;
+    }
+    // Suppress "warning: missing return statement at end of non-void function"
+    // which Nvidia's Robert Crovella confirms is an NVCC compiler error
+    // here https://stackoverflow.com/a/64561686/752843 on 2020-10-27
+    // `__builtin_unreachable();` would be best here, but it's not
+    // available with all compilers. So we instead return an arbitrary
+    // value trusting that this line will, in fact, never be reached.
+    return false; // Horrible hack
+  }
+
+  constexpr bool operator!=(const integer_iterator& other) const {
+    return !(*this == other);
+  }
+
+ protected:
+  I value;
+};
+
+} // namespace detail
+
+template <
+    typename I,
+    bool one_sided = false,
+    std::enable_if_t<std::is_integral_v<I>, bool> = true>
+struct integer_range {
+ public:
+  constexpr integer_range(I begin, I end) : begin_(begin), end_(end) {}
+  using iterator = detail::integer_iterator<I, one_sided>;
+  constexpr iterator begin() const {
+    return begin_;
+  }
+  constexpr iterator end() const {
+    return end_;
+  }
+
+ private:
+  iterator begin_;
+  iterator end_;
+};
+
+/// Creates an integer range for the half-open interval [begin, end)
+/// If end<=begin, then the range is empty.
+/// The range has the type of the `end` integer; `begin` integer is
+/// cast to this type.
+template <
+    typename Integer1,
+    typename Integer2,
+    std::enable_if_t<std::is_integral_v<Integer1>, bool> = true,
+    std::enable_if_t<std::is_integral_v<Integer2>, bool> = true>
+constexpr integer_range<Integer2> irange(Integer1 begin, Integer2 end) {
+  // If end<=begin then the range is empty; we can achieve this effect by
+  // choosing the larger of {begin, end} as the loop terminator
+  return {
+      static_cast<Integer2>(begin),
+      std::max(static_cast<Integer2>(begin), end)};
+}
+
+/// Creates an integer range for the half-open interval [0, end)
+/// If end<=begin, then the range is empty
+template <
+    typename Integer,
+    std::enable_if_t<std::is_integral_v<Integer>, bool> = true>
+constexpr integer_range<Integer, true> irange(Integer end) {
+  return {Integer(), end};
+}
+
+} // namespace c10
diff --git a/include/executorch/runtime/core/portable_type/complex.h b/include/executorch/runtime/core/portable_type/complex.h
new file mode 100644
index 00000000000..faf13a0432f
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/complex.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <c10/util/complex.h>
+
+namespace executorch::runtime::etensor {
+using c10::complex;
+} // namespace executorch::runtime::etensor
+
+namespace torch::executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::complex;
+} // namespace torch::executor
diff --git a/include/executorch/runtime/core/portable_type/device.h b/include/executorch/runtime/core/portable_type/device.h
new file mode 100644
index 00000000000..d789df8a84d
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/device.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/// Denotes the specific genre of compute device.
+/// Subset of https://github.com/pytorch/pytorch/blob/main/c10/core/Device.h
+enum class DeviceType : int8_t {
+  CPU = 0,
+};
+
+/// An index representing a specific device; For cpu it should always be -1 or 0
+using DeviceIndex = int8_t;
+
+/**
+ * An abstraction for the compute device on which a tensor is located.
+ * ExecuTorch doesn't allow dynamic dispatching based on device, so this type is
+ * just a skeleton to allow certain kernels that expect device as an
+ * argument to still be run.
+ *
+ * In ExecuTorch this is always expected to be CPU.
+ */
+struct Device final {
+  using Type = DeviceType;
+
+  /// Constructs a new `Device` from a `DeviceType` and an optional device
+  /// index.
+  /* implicit */ Device(DeviceType type, DeviceIndex index = -1)
+      : type_(type), index_(index) {}
+
+  /// Returns the type of device this is. Only CPU is supported.
+  DeviceType type() const noexcept {
+    return type_;
+  }
+
+  /// Returns true if the device is of CPU type.
+  bool is_cpu() const noexcept {
+    return type_ == DeviceType::CPU;
+  }
+
+  /// Returns the device index. Always 0 if specified or -1 if not provided.
+  DeviceIndex index() const noexcept {
+    ET_CHECK(index_ == 0 || index_ == -1);
+    return index_;
+  }
+
+ private:
+  DeviceType type_;
+  DeviceIndex index_ = -1;
+};
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::Device;
+using ::executorch::runtime::etensor::DeviceType;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/half.h b/include/executorch/runtime/core/portable_type/half.h
new file mode 100644
index 00000000000..bf4c676ce82
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/half.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <c10/util/Half.h>
+
+namespace executorch::runtime::etensor {
+using c10::Half;
+namespace internal {
+using c10::detail::fp16_ieee_from_fp32_value;
+using c10::detail::fp16_ieee_to_fp32_bits;
+using c10::detail::fp16_ieee_to_fp32_value;
+using c10::detail::fp32_from_bits;
+using c10::detail::fp32_to_bits;
+} // namespace internal
+} // namespace executorch::runtime::etensor
+namespace torch::executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::Half;
+} // namespace torch::executor
diff --git a/include/executorch/runtime/core/portable_type/optional.h b/include/executorch/runtime/core/portable_type/optional.h
new file mode 100644
index 00000000000..31ad06fd093
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/optional.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <optional>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::nullopt;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::nullopt_t;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::optional;
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::nullopt;
+using ::executorch::runtime::etensor::nullopt_t;
+using ::executorch::runtime::etensor::optional;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/qint_types.h b/include/executorch/runtime/core/portable_type/qint_types.h
new file mode 100644
index 00000000000..183675e1829
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/qint_types.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/**
+ * qint8 is for signed 8 bit quantized Tensors
+ */
+struct alignas(1) qint8 {
+  using underlying = int8_t;
+  int8_t val_;
+  qint8() = default;
+  explicit qint8(int8_t val) : val_(val) {}
+};
+
+/**
+ * quint8 is for unsigned 8 bit quantized Tensors
+ */
+struct alignas(1) quint8 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  quint8() = default;
+  explicit quint8(uint8_t val) : val_(val) {}
+};
+
+/**
+ * qint32 is for signed 32 bit quantized Tensors
+ */
+struct alignas(4) qint32 {
+  using underlying = int32_t;
+  int32_t val_;
+  qint32() = default;
+  explicit qint32(int32_t val) : val_(val) {}
+};
+
+/**
+ * quint4x2 is for un-signed 4 bit quantized Tensors that are packed to byte
+ * boundary.
+ */
+struct alignas(1) quint4x2 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  quint4x2() = default;
+  explicit quint4x2(uint8_t val) : val_(val) {}
+};
+
+/**
+ * quint2x4 is for un-signed 2 bit quantized Tensors that are packed to byte
+ * boundary.
+ */
+struct alignas(1) quint2x4 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  quint2x4() = default;
+  explicit quint2x4(uint8_t val) : val_(val) {}
+};
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::qint32;
+using ::executorch::runtime::etensor::qint8;
+using ::executorch::runtime::etensor::quint2x4;
+using ::executorch::runtime::etensor::quint4x2;
+using ::executorch::runtime::etensor::quint8;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/scalar.h b/include/executorch/runtime/core/portable_type/scalar.h
new file mode 100644
index 00000000000..0922cec6b95
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/scalar.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/portable_type/bfloat16.h>
+#include <executorch/runtime/core/portable_type/half.h>
+#include <executorch/runtime/core/tag.h>
+#include <executorch/runtime/platform/assert.h>
+
+#include <cstdint>
+#include <type_traits>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/**
+ * Represents a scalar value.
+ *
+ * The API is a source-compatible subset of c10::Scalar, and the
+ * semantics/behavior should also match the c10 version.
+ */
+class Scalar {
+ public:
+  Scalar() : Scalar(int64_t(0)) {}
+
+  template <
+      typename T,
+      typename std::enable_if<std::is_integral<T>::value, bool>::type = true>
+  /*implicit*/ Scalar(T val) : tag(Tag::Int) {
+    v.as_int = static_cast<int64_t>(val);
+  }
+  /*implicit*/ Scalar(bool val) : tag(Tag::Bool) {
+    v.as_bool = val;
+  }
+  /*implicit*/ Scalar(double val) : tag(Tag::Double) {
+    v.as_double = val;
+  }
+  /*implicit*/ Scalar(BFloat16 val) : Scalar((double)(float)val) {}
+  /*implicit*/ Scalar(Half val) : Scalar((double)(float)val) {}
+
+  /// Returns the concrete scalar value stored within.
+  template <typename T>
+  T to() const;
+
+  /// Returns true if the scalar is integral, false otherwise.
+  bool isIntegral(bool includeBool) const {
+    return Tag::Int == tag || (includeBool && isBoolean());
+  }
+
+  /// Returns true if the scalar is a floating point, false otherwise.
+  bool isFloatingPoint() const {
+    return tag == Tag::Double;
+  }
+
+  /// Returns true if the scalar is a boolean, false otherwise.
+  bool isBoolean() const {
+    return tag == Tag::Bool;
+  }
+
+ private:
+  int64_t toInt() const {
+    if (isIntegral(/*includeBool=*/false)) {
+      return v.as_int;
+    } else if (isBoolean()) {
+      return static_cast<int64_t>(v.as_bool);
+    } else {
+      ET_CHECK_MSG(false, "Scalar is not an int nor a Boolean.");
+    }
+  }
+
+  double toFloatingPoint() const {
+    ET_CHECK_MSG(isFloatingPoint(), "Scalar is not a Double.");
+    return v.as_double;
+  }
+
+  double toDouble() const {
+    ET_CHECK_MSG(isFloatingPoint(), "Scalar is not a Double.");
+    return v.as_double;
+  }
+
+  bool toBool() const {
+    ET_CHECK_MSG(isBoolean(), "Scalar is not a Boolean.");
+    return v.as_bool;
+  }
+
+  Tag tag;
+  union v_t {
+    double as_double;
+    int64_t as_int;
+    bool as_bool;
+    v_t() {} // default constructor
+  } v;
+};
+
+#define ET_DEFINE_SCALAR_TO_METHOD(T, name) \
+  template <>                               \
+  inline T Scalar::to<T>() const {          \
+    return to##name();                      \
+  }
+
+ET_DEFINE_SCALAR_TO_METHOD(double, Double)
+ET_DEFINE_SCALAR_TO_METHOD(int64_t, Int)
+ET_DEFINE_SCALAR_TO_METHOD(bool, Bool)
+#undef ET_DEFINE_SCALAR_TO_METHOD
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::Scalar;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/scalar_type.h b/include/executorch/runtime/core/portable_type/scalar_type.h
new file mode 100644
index 00000000000..dc8142862f8
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/scalar_type.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ *
+ * Forked from
+ * https://github.com/pytorch/pytorch/blob/master/c10/core/ScalarType.h
+ *
+ * Everything but the ScalarType definition is in util/ScalarTypeUtil.h
+ *
+ * Note that these files do not need to be strictly identical to the pytorch
+ * core file, as far as names go. The only critical piece is that the types and
+ * indices of the main ScalarType enum line up, so that serialization is
+ * compatible between the two.
+ *
+ * Modifications for ExecuTorch:
+ * - Namespace torch::executor instead of c10
+ * - Macro prefix ET_ instead of AT_
+ * - Use ET_CHECK_MSG() instead of TORCH_CHECK()
+ * - Don't define standalone constants like `kByte`, `kInt` to keep the
+ *   namespace clean
+ * - Remove operator<< to avoid a dependency on ostream and stdlib
+ * - Make `static inline` functions `inline` to avoid creating multiple
+ *   copies of them. See
+ *   https://gist.github.com/htfy96/50308afc11678d2e3766a36aa60d5f75#conclusion.
+ * - Remove deprecated definitions
+ * - Minor cleanup for internal consistency
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <executorch/runtime/core/portable_type/bfloat16.h>
+#include <executorch/runtime/core/portable_type/bits_types.h>
+#include <executorch/runtime/core/portable_type/complex.h>
+#include <executorch/runtime/core/portable_type/half.h>
+#include <executorch/runtime/core/portable_type/qint_types.h>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+// Placing a bunch of unused dtypes here as our macros don't make it easy
+// to skip scalar types defined in aten that we dont have.
+namespace unused_dtype {
+struct alignas(1) Float8_e5m2 {
+  uint8_t x;
+  using underlying = uint8_t;
+  Float8_e5m2() = default;
+  explicit Float8_e5m2(uint8_t val) : x(val) {}
+};
+struct alignas(1) Float8_e4m3fn {
+  uint8_t x;
+  using underlying = uint8_t;
+  Float8_e4m3fn() = default;
+  explicit Float8_e4m3fn(uint8_t val) : x(val) {}
+};
+struct alignas(1) Float8_e5m2fnuz {
+  uint8_t x;
+  using underlying = uint8_t;
+  Float8_e5m2fnuz() = default;
+  explicit Float8_e5m2fnuz(uint8_t val) : x(val) {}
+};
+struct alignas(1) Float8_e4m3fnuz {
+  uint8_t x;
+  using underlying = uint8_t;
+  Float8_e4m3fnuz() = default;
+  explicit Float8_e4m3fnuz(uint8_t val) : x(val) {}
+};
+
+} // namespace unused_dtype
+
+/**
+ * Calls the provided macro on every ScalarType, providing the C type and the
+ * ScalarType name to each call.
+ *
+ * The indices and C types must be consistent with
+ * AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS in the core pytorch file
+ * c10/core/ScalarType.h. This ensures that ExecuTorch serialization is
+ * compatible with ATen serialization.
+ *
+ * @param _ A macro that takes two parameters: the name of a C type, and the
+ *     name of the corresponding ScalarType enumerator.
+ */
+#define ET_FORALL_SCALAR_TYPES(_)                                            \
+  _(uint8_t, Byte) /* 0 */                                                   \
+  _(int8_t, Char) /* 1 */                                                    \
+  _(int16_t, Short) /* 2 */                                                  \
+  _(int32_t, Int) /* 3 */                                                    \
+  _(int64_t, Long) /* 4 */                                                   \
+  _(::executorch::runtime::etensor::Half, Half) /* 5 */                      \
+  _(float, Float) /* 6 */                                                    \
+  _(double, Double) /* 7 */                                                  \
+  _(::executorch::runtime::etensor::complex<::torch::executor::Half>,        \
+    ComplexHalf) /* 8 */                                                     \
+  _(::executorch::runtime::etensor::complex<float>, ComplexFloat) /* 9 */    \
+  _(::executorch::runtime::etensor::complex<double>, ComplexDouble) /* 10 */ \
+  _(bool, Bool) /* 11 */                                                     \
+  _(::executorch::runtime::etensor::qint8, QInt8) /* 12 */                   \
+  _(::executorch::runtime::etensor::quint8, QUInt8) /* 13 */                 \
+  _(::executorch::runtime::etensor::qint32, QInt32) /* 14 */                 \
+  _(::executorch::runtime::etensor::BFloat16, BFloat16) /* 15 */             \
+  _(::executorch::runtime::etensor::quint4x2, QUInt4x2) /* 16 */             \
+  _(::executorch::runtime::etensor::quint2x4, QUInt2x4) /* 17 */             \
+  _(::executorch::runtime::etensor::bits1x8, Bits1x8) /* 18 */               \
+  _(::executorch::runtime::etensor::bits2x4, Bits2x4) /* 19 */               \
+  _(::executorch::runtime::etensor::bits4x2, Bits4x2) /* 20 */               \
+  _(::executorch::runtime::etensor::bits8, Bits8) /* 21 */                   \
+  _(::executorch::runtime::etensor::bits16, Bits16) /* 22 */                 \
+  _(::executorch::runtime::etensor::unused_dtype::Float8_e5m2,               \
+    Float8_e5m2) /* 23 */                                                    \
+  _(::executorch::runtime::etensor::unused_dtype::Float8_e4m3fn,             \
+    Float8_e4m3fn) /* 24 */                                                  \
+  _(::executorch::runtime::etensor::unused_dtype::Float8_e5m2fnuz,           \
+    Float8_e5m2fnuz) /* 25 */                                                \
+  _(::executorch::runtime::etensor::unused_dtype::Float8_e4m3fnuz,           \
+    Float8_e4m3fnuz) /* 26 */                                                \
+  _(uint16_t, UInt16) /* 27 */                                               \
+  _(uint32_t, UInt32) /* 28 */                                               \
+  _(uint64_t, UInt64) /* 29 */
+
+/**
+ * Data types (dtypes) that can be used as element types in ETensors.
+ */
+enum class ScalarType : int8_t {
+/// Define an enumerator for each ScalarType.
+#define DEFINE_ENUM(unused, name) name,
+  ET_FORALL_SCALAR_TYPES(DEFINE_ENUM)
+#undef DEFINE_ENUM
+
+  /// An explicitly undefined ScalarType. Does not map to any C type.
+  Undefined,
+  /// The number of ScalarType enumerators.
+  NumOptions,
+};
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::ScalarType;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/string_view.h b/include/executorch/runtime/core/portable_type/string_view.h
new file mode 100644
index 00000000000..8e28fa022cc
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/string_view.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string_view>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+using std::string_view;
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::string_view;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/tensor.h b/include/executorch/runtime/core/portable_type/tensor.h
new file mode 100644
index 00000000000..775bccc1b52
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/tensor.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/platform/compiler.h>
+
+#include <executorch/runtime/core/portable_type/tensor_impl.h>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/**
+ * A minimal Tensor type whose API is a source compatible subset of at::Tensor.
+ *
+ * NOTE: Instances of this class do not own the TensorImpl given to it,
+ * which means that the caller must guarantee that the TensorImpl lives longer
+ * than any Tensor instances that point to it.
+ *
+ * See the documention on TensorImpl for details about the return/parameter
+ * types used here and how they relate to at::Tensor.
+ */
+class Tensor {
+ public:
+  /// The type used for elements of `sizes()`.
+  using SizesType = TensorImpl::SizesType;
+  /// The type used for elements of `dim_order()`.
+  using DimOrderType = TensorImpl::DimOrderType;
+  /// The type used for elements of `strides()`.
+  using StridesType = TensorImpl::StridesType;
+
+  Tensor() = delete;
+  explicit constexpr Tensor(TensorImpl* impl) : impl_(impl) {}
+
+  /**
+   * Returns a pointer to the underlying TensorImpl.
+   *
+   * NOTE: Clients should be wary of operating on the TensorImpl
+   * directly instead of the Tensor. It is easy to break things.
+   */
+  TensorImpl* unsafeGetTensorImpl() const {
+    // TODO(T154114015): See if we can make this api private with friends.
+    return impl_;
+  }
+
+  /**
+   * Returns the size of the tensor in bytes.
+   *
+   * NOTE: Only the alive space is returned not the total capacity of the
+   * underlying data blob.
+   */
+  size_t nbytes() const {
+    return impl_->nbytes();
+  }
+
+  /**
+   * Returns the size of the tensor at the given dimension.
+   *
+   * NOTE: that size() intentionally does not return SizeType even though it
+   * returns an element of an array of SizeType. This is to help make calls of
+   * this method more compatible with at::Tensor, and more consistent with the
+   * rest of the methods on this class and in ETensor.
+   */
+  ssize_t size(ssize_t dim) const {
+    return impl_->size(dim);
+  }
+
+  /// Returns the tensor's number of dimensions.
+  ssize_t dim() const {
+    return impl_->dim();
+  }
+
+  /// Returns the number of elements in the tensor.
+  ssize_t numel() const {
+    return impl_->numel();
+  }
+
+  /// Returns the type of the elements in the tensor (int32, float, bool, etc).
+  ScalarType scalar_type() const {
+    return impl_->scalar_type();
+  }
+
+  inline ScalarType dtype() const {
+    return scalar_type();
+  }
+
+  /// Returns the size in bytes of one element of the tensor.
+  ssize_t element_size() const {
+    return impl_->element_size();
+  }
+
+  /// Returns the sizes of the tensor at each dimension.
+  const ArrayRef<SizesType> sizes() const {
+    return impl_->sizes();
+  }
+
+  /// Returns the order the dimensions are laid out in memory.
+  const ArrayRef<DimOrderType> dim_order() const {
+    return impl_->dim_order();
+  }
+
+  /// Returns the strides of the tensor at each dimension.
+  const ArrayRef<StridesType> strides() const {
+    return impl_->strides();
+  }
+
+  /// Returns the mutability of the shape of the tensor.
+  TensorShapeDynamism shape_dynamism() const {
+    return impl_->shape_dynamism();
+  }
+
+  /// Returns a pointer of type T to the constant underlying data blob.
+  template <typename T>
+  inline const T* const_data_ptr() const {
+    return impl_->data<T>();
+  }
+
+  /// Returns a pointer to the constant underlying data blob.
+  inline const void* const_data_ptr() const {
+    return impl_->data();
+  }
+
+  /// Returns a pointer of type T to the mutable underlying data blob.
+  template <typename T>
+  inline T* mutable_data_ptr() const {
+    return impl_->mutable_data<T>();
+  }
+
+  /// Returns a pointer to the mutable underlying data blob.
+  inline void* mutable_data_ptr() const {
+    return impl_->mutable_data();
+  }
+
+  /// DEPRECATED: Use const_data_ptr or mutable_data_ptr instead.
+  template <typename T>
+  ET_DEPRECATED inline T* data_ptr() const {
+    return impl_->mutable_data<T>();
+  }
+
+  /// DEPRECATED: Use const_data_ptr or mutable_data_ptr instead.
+  ET_DEPRECATED inline void* data_ptr() const {
+    return impl_->mutable_data();
+  }
+
+  /**
+   * DEPRECATED: Changes the data_ptr the tensor aliases. Does not free the
+   * previously pointed to data, does not assume ownership semantics of the new
+   * ptr. This api does not exist in at::Tensor so kernel developers should
+   * avoid it.
+   */
+  ET_DEPRECATED void set_data(void* ptr) const {
+    impl_->set_data(ptr);
+  }
+
+ private:
+  TensorImpl* impl_ = nullptr;
+};
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::Tensor;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/tensor_impl.h b/include/executorch/runtime/core/portable_type/tensor_impl.h
new file mode 100644
index 00000000000..1e2b3620ca2
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/tensor_impl.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/array_ref.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/portable_type/scalar_type.h>
+#include <executorch/runtime/core/tensor_shape_dynamism.h>
+
+// Forward declaration of a helper that provides access to internal resizing
+// methods of TensorImpl. Real definition is in
+// executorch/runtime/core/exec_aten/tensor_util.h.
+namespace executorch {
+namespace runtime {
+namespace internal {
+class TensorResizerFriend;
+} // namespace internal
+} // namespace runtime
+} // namespace executorch
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/**
+ * Manages the storage behind an ETensor (torch::executor::Tensor).
+ *
+ * Note that instances of this class do not own the arrays given to it
+ * (sizes/strides/data), which means that the caller must guarantee that they
+ * live longer than a given instance of this class.
+ *
+ * Note on types:
+ *
+ * Code that uses ETensor should also be able to build against at::Tensor. So,
+ * although the overlapping APIs don't need to be exactly the same, their types
+ * should be semantically similar.
+ *
+ * Many of the methods in at::Tensor use int64_t for parameter and return types.
+ * This can be a waste when building for 32-bit environments. So, TensorImpl and
+ * ETensor use ssize_t instead: like int64_t it is signed, but it will match the
+ * native word size of the target architecture. This will avoid unnecessarily
+ * expensive uses of 64-bit integers on 32-bit machines.
+ *
+ * But, since the types are not identical, code that uses ETensor needs to be
+ * generic about the local types it uses when working with these methods. In
+ * most cases, `auto` will do the trick. In the worst case, code can be guarded
+ * with `#ifdef USE_ATEN_LIB`.
+ */
+class TensorImpl {
+ public:
+  /**
+   * The type used for elements of `sizes()`.
+   *
+   * This must match the size/signedness of the type used for `Tensor.sizes` in
+   * //executorch/schema/program.fbs.
+   *
+   * Note that at::TensorImpl uses `int64_t` for this type. ExecuTorch uses
+   * `int32_t` to save memory, since no single size value will ever be larger
+   * than 2 billion.
+   */
+  using SizesType = int32_t;
+
+  /**
+   * The type used for elements of `dim_order()`.
+   *
+   * This must match the size/signedness of the type used for `Tensor.dim_order`
+   * in //executorch/schema/program.fbs.
+   */
+  using DimOrderType = uint8_t;
+
+  /**
+   * The type used for elements of `strides()`.
+   *
+   * This must match the size/signedness of the type used for `Tensor.strides`
+   * in //executorch/schema/program.fbs.
+   *
+   * Note that at::TensorImpl uses `int64_t` for this type. ExecuTorch uses
+   * `int32_t` to save memory, since no single stride value will ever be larger
+   * than 2 billion.
+   */
+  using StridesType = int32_t;
+
+  TensorImpl() = delete;
+
+  /**
+   * @param type The type of the data (int, float, bool).
+   * @param dim Number of dimensions, and the length of the `sizes` array.
+   * @param sizes Sizes of the tensor at each dimension. Must contain `dim`
+   *     entries.
+   * @param data Pointer to the data, whose size is determined by `type`,
+   *     `dim`, and `sizes`. The tensor will not own this memory.
+   * @param dim_order Order in which dimensions are laid out in memory.
+   * @param strides Strides of the tensor at each dimension. Must contain `dim`
+   *     entries.
+   * @param dynamism The mutability of the shape of the tensor.
+   */
+  TensorImpl(
+      ScalarType type,
+      ssize_t dim,
+      SizesType* sizes,
+      void* data = nullptr,
+      DimOrderType* dim_order = nullptr,
+      StridesType* strides = nullptr,
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC);
+
+  /**
+   * Returns the size of the tensor in bytes.
+   *
+   * NOTE: This returns the size of the data used by the tensor's current shape,
+   * not the capacity of the underlying buffer.
+   */
+  size_t nbytes() const;
+
+  /**
+   * Returns the size of the tensor at the given dimension.
+   *
+   * NOTE: size() intentionally does not return SizeType even though it
+   * returns an element of an array of SizeType. This is to help make calls of
+   * this method more compatible with at::Tensor, and more consistent with the
+   * rest of the methods on this class and in ETensor.
+   */
+  ssize_t size(ssize_t dim) const {
+    ET_CHECK_MSG(
+        dim < dim_ && dim >= 0,
+        "Dimension out of range (expected to be in range of [0, %zd], but got %zd",
+        dim_ - 1,
+        dim);
+    return sizes_[dim];
+  }
+
+  /// Returns the tensor's number of dimensions.
+  ssize_t dim() const {
+    return dim_;
+  }
+
+  /// Returns the number of elements in the tensor.
+  ssize_t numel() const {
+    return numel_;
+  }
+
+  /// Returns the type of the elements in the tensor (int32, float, bool, etc).
+  ScalarType scalar_type() const {
+    return type_;
+  }
+
+  inline ScalarType dtype() const {
+    return scalar_type();
+  }
+
+  /// Returns the size in bytes of one element of the tensor.
+  ssize_t element_size() const;
+
+  /// Returns the sizes of the tensor at each dimension.
+  const ArrayRef<SizesType> sizes() const {
+    return ArrayRef<SizesType>{sizes_, static_cast<size_t>(dim_)};
+  }
+
+  /// Returns the order the dimensions are laid out in memory.
+  const ArrayRef<DimOrderType> dim_order() const {
+    return ArrayRef<DimOrderType>{dim_order_, static_cast<size_t>(dim_)};
+  }
+
+  /// Returns the strides of the tensor at each dimension.
+  const ArrayRef<StridesType> strides() const {
+    return ArrayRef<StridesType>{strides_, static_cast<size_t>(dim_)};
+  }
+
+  /// Returns the mutability of the shape of the tensor.
+  TensorShapeDynamism shape_dynamism() const {
+    return shape_dynamism_;
+  }
+
+  /// Returns a pointer of type T to the constant underlying data blob.
+  template <typename T>
+  inline const T* data() const {
+    return static_cast<const T*>(data());
+  }
+
+  /// Returns a pointer to the constant underlying data blob.
+  const void* data() const {
+    return data_;
+  }
+
+  /// Returns a pointer of type T to the mutable underlying data blob.
+  template <typename T>
+  inline T* mutable_data() const {
+    return static_cast<T*>(mutable_data());
+  }
+
+  /// Returns a pointer to the mutable underlying data blob.
+  void* mutable_data() const {
+    return data_;
+  }
+
+  /// Sets the underlying data blob to the passed in pointer.
+  void set_data(void* ptr) {
+    data_ = ptr;
+  }
+
+  /*
+   * DEPRECATED: Use torch::executor::resize_tensor() or
+   * torch::executor::resize_tensor_impl().
+   */
+  ET_DEPRECATED
+  void set_sizes_contiguous(ArrayRef<SizesType> new_sizes) {
+    Error err = internal_resize_contiguous(new_sizes);
+    ET_CHECK_MSG(
+        err == Error::Ok, "Could not resize Tensor; see logs for details");
+  }
+
+ private:
+  // For access to internal_resize_contiguous().
+  friend class ::executorch::runtime::internal::TensorResizerFriend;
+
+  /**
+   * Set the sizes and strides of a tensor assuming contiguous strides.
+   * Requires that `new_sizes.size() == this.dim()`.
+   *
+   * Callers must use torch::executor::resize_tensor() or
+   * torch::executor::resize_tensor_impl() instead, defined in TensorUtil.h.
+   *
+   * Same semantics as at::TensorImpl::set_sizes_contiguous(), but returns an
+   * error instead of panicking on failure. This is not part of the at::Tensor
+   * API, and can only be used in lean mode.
+   */
+  ET_NODISCARD Error internal_resize_contiguous(ArrayRef<SizesType> new_sizes);
+
+ private:
+  // Keep fields arranged to avoid unnecessary alignment holes.
+
+  /// List of sizes of each dimension in the tensor.
+  SizesType* sizes_;
+
+  /// List of the order that dimensions are laid out in memory.
+  DimOrderType* dim_order_;
+
+  // TODO(T148356881): Get rid of strides from ETensor
+  StridesType* strides_;
+
+  /// Pointer to underlying data blob. NOTE: Can be null.
+  void* data_;
+
+  /// Tensor's number of dimensions.
+  const ssize_t dim_;
+
+  /// Number of elements in the tensor.
+  ssize_t numel_;
+
+  /// Maximum number of elements in the bounded tensor. Used when resizing up
+  /// and down.
+  size_t numel_bound_;
+
+  /// Scalar type (int, float, bool, etc) of the tensor data.
+  const ScalarType type_;
+
+  /// Specifies the mutability of the shape of the tensor.
+  const TensorShapeDynamism shape_dynamism_;
+};
+
+/**
+ * Compute the number of elements based on the sizes of a tensor.
+ */
+ssize_t compute_numel(
+    const ::executorch::runtime::etensor::TensorImpl::SizesType* sizes,
+    ssize_t dim);
+
+/// Appropriate format specifier for the result of calling
+/// size(). Must be used instead of using zd directly to support ATen
+/// mode.
+#define ET_PRI_TENSOR_SIZE "zd"
+
+/// Appropriate format specifier for the result of calling
+/// dim(). Must be used instead of using zd directly to support ATen
+/// mode.
+#define ET_PRI_TENSOR_DIM "zd"
+
+/// Appropriate format specifier for the result of calling
+/// numel(). Must be used instead of using zd directly to support ATen
+/// mode.
+#define ET_PRI_TENSOR_NUMEL "zd"
+
+// Appropriate format specifier for elements of sizes() and
+// strides(). Must be used instead of using d directly to support ATen
+// mode.
+#define ET_PRI_SIZES_AND_STRIDES "d"
+
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::compute_numel;
+using ::executorch::runtime::etensor::TensorImpl;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/portable_type/tensor_options.h b/include/executorch/runtime/core/portable_type/tensor_options.h
new file mode 100644
index 00000000000..8b8f9848648
--- /dev/null
+++ b/include/executorch/runtime/core/portable_type/tensor_options.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch {
+namespace runtime {
+namespace etensor {
+
+/**
+ * Tensor data memory formats supported by ExecuTorch. This concept only exists
+ * for compatibility with ATen; use dim_order to describe non-contiguous
+ * layouts.
+ */
+enum class MemoryFormat : int8_t {
+  /**
+   * Row-major contiguous data.
+   */
+  Contiguous = 0,
+  /**
+   * Output tensor format should remain the same as the input tensor format.
+   * E.g. if the input tensor is in channels_last format, operator output
+   * should be in channels_last format.
+   */
+  Preserve = 1,
+};
+
+/**
+ * Tensor data memory layout. This concept only exists for compatibility
+ * with ATen.
+ */
+enum class Layout : int8_t {
+  /**
+   * The tensor occupies memory densely and indexing is managed through strides.
+   * Contrasted with a sparse tensor layout where the memory structure of the
+   * data blob will be more complicated and indexing requires larger structures.
+   *
+   * This is the only layout supported by ExecuTorch.
+   */
+  Strided = 0,
+};
+} // namespace etensor
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::etensor::Layout;
+using ::executorch::runtime::etensor::MemoryFormat;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/result.h b/include/executorch/runtime/core/result.h
new file mode 100644
index 00000000000..377573e6dfa
--- /dev/null
+++ b/include/executorch/runtime/core/result.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Result type to be used in conjunction with ExecuTorch Error type.
+ */
+
+#pragma once
+
+#include <new>
+#include <utility>
+
+#include "executorch/runtime/core/error.h"
+#include "executorch/runtime/platform/assert.h"
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Result type wrapping either a value of type T or an error.
+ *
+ * Example use case:
+ * @code
+ *   Result<OpFn> getOp(int opcode) {
+ *     if (isValidOpCode(opcode)) {
+ *       return opFns[opcode];
+ *     }
+ *     return Error::NotFound;
+ *   }
+ *
+ *   Error useOp(int opcode) {
+ *     Result<OpFn> op = getOp(opcode);
+ *     if (!op.ok()) {
+ *       return op.error();
+ *     }
+ *     print(op->toString());
+ *     execute(*op);
+ *     return Error::Ok;
+ *   }
+ * @endcode
+ */
+template <typename T>
+class Result final {
+ public:
+  /// `value_type` member for generic programming.
+  typedef T value_type;
+
+  /**
+   * Creates a Result object from an Error.
+   *
+   * To preserve the invariant that `(result.error() == Error::Ok) ==
+   * result.ok()`, an `error` parameter value of `Error:Ok` will be converted to
+   * a non-Ok value.
+   */
+  /* implicit */ Result(Error error)
+      : error_(error == Error::Ok ? Error::Internal : error), hasValue_(false) {
+    if ET_UNLIKELY (error == Error::Ok) {
+      ET_LOG(
+          Debug,
+          "Attempted to create Result from Error::Ok, this has been converted to Error::Internal.");
+    }
+  }
+
+  /// Value copy constructor.
+  /* implicit */ Result(const T& val) : value_(val), hasValue_(true) {}
+
+  /// Value move constructor.
+  /* implicit */ Result(T&& val) : value_(std::move(val)), hasValue_(true) {}
+
+  /// Result move constructor.
+  /* implicit */ Result(Result&& rhs) noexcept : hasValue_(rhs.hasValue_) {
+    if (hasValue_) {
+      // Use the value type's move constructor.
+      new (&value_) T(std::move(rhs.value_));
+    } else {
+      error_ = rhs.error_;
+    }
+  }
+
+  ~Result() {
+    if (hasValue_) {
+      // Manual value destruction.
+      // Result "owns" the memory, so `delete` would segfault.
+      value_.~T();
+    }
+  }
+
+  /**
+   * Returns true if this Result has a value.
+   *
+   * If true, it is guaranteed that `error()` will return `Error::Ok`.
+   * If false, it is guaranteed that `error()` will not return `Error::Ok`.
+   */
+  ET_NODISCARD bool ok() const {
+    return hasValue_;
+  }
+
+  /**
+   * Returns the error code of this Result.
+   *
+   * If this returns `Error::Ok`, it is guaranteed that `ok()` will return true.
+   * If this does not return `Error:Ok`, it is guaranteed that `ok()` will
+   * return false.
+   */
+  ET_NODISCARD Error error() const {
+    if (hasValue_) {
+      return Error::Ok;
+    } else {
+      return error_;
+    }
+  }
+
+  /**
+   * Returns a reference to the Result's value; longhand for operator*().
+   *
+   * Only legal to call if `ok()` returns true.
+   */
+  T& get() {
+    CheckOk();
+    return value_;
+  }
+
+  /**
+   * Returns a reference to the Result's value; longhand for operator*().
+   *
+   * Only legal to call if `ok()` returns true.
+   */
+  const T& get() const {
+    CheckOk();
+    return value_;
+  }
+
+  /*
+   * Returns a reference to the Result's value; shorthand for get().
+   *
+   * Only legal to call if `ok()` returns true.
+   */
+  const T& operator*() const&;
+  T& operator*() &;
+
+  /*
+   * Returns a pointer to the Result's value.
+   *
+   * Only legal to call if `ok()` returns true.
+   */
+  const T* operator->() const;
+  T* operator->();
+
+ private:
+  /**
+   * Delete default constructor since all Results should contain a value or
+   * error.
+   */
+  Result() = delete;
+  /// Delete copy constructor since T may not be copyable.
+  Result(const Result&) = delete;
+  /// Delete copy assignment since T may not be copyable.
+  Result& operator=(const Result&) = delete;
+  /// Delete move assignment since it's not a supported pattern to reuse Result.
+  Result& operator=(Result&& rhs) = delete;
+
+  // Panics if ok() would return false;
+  void CheckOk() const {
+    ET_CHECK(hasValue_);
+  }
+
+  union {
+    T value_; // Used if hasValue_ is true.
+    Error error_; // Used if hasValue_ is false.
+  };
+
+  /// True if the Result contains a value.
+  const bool hasValue_;
+};
+
+template <typename T>
+const T& Result<T>::operator*() const& {
+  CheckOk();
+  return value_;
+}
+
+template <typename T>
+T& Result<T>::operator*() & {
+  CheckOk();
+  return value_;
+}
+
+template <typename T>
+const T* Result<T>::operator->() const {
+  CheckOk();
+  return &value_;
+}
+
+template <typename T>
+T* Result<T>::operator->() {
+  CheckOk();
+  return &value_;
+}
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::Result;
+} // namespace executor
+} // namespace torch
+
+/**
+ * Unwrap a Result to obtain its value. If the Result contains an error,
+ * propogate the error via trivial function return.
+ *
+ * Note: A function using ET_UNWRAP should itself return a Result or Error.
+ *
+ * @param[in] result__ Expression yielding the result to unwrap.
+ * @param[in] ... Optional format string for the log error message and its
+ * arguments.
+ */
+#define ET_UNWRAP(result__, ...) ET_INTERNAL_UNWRAP(result__, ##__VA_ARGS__)
+
+// Internal only: Use ET_UNWRAP() instead.
+#define ET_INTERNAL_UNWRAP(...)                                         \
+  ET_INTERNAL_UNWRAP_SELECT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) \
+  (__VA_ARGS__)
+
+// Internal only: Use ET_UNWRAP() instead.
+#define ET_INTERNAL_UNWRAP_SELECT(                   \
+    _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) \
+  ET_INTERNAL_UNWRAP_##N
+
+// Internal only: Use ET_UNWRAP() instead.
+#define ET_INTERNAL_UNWRAP_1(result__) \
+  ({                                   \
+    auto et_result__ = (result__);     \
+    if (!et_result__.ok()) {           \
+      return et_result__.error();      \
+    }                                  \
+    std::move(*et_result__);           \
+  })
+
+// Internal only: Use ET_UNWRAP() instead.
+#define ET_INTERNAL_UNWRAP_2(result__, message__, ...) \
+  ({                                                   \
+    auto et_result__ = (result__);                     \
+    if (!et_result__.ok()) {                           \
+      ET_LOG(Error, message__, ##__VA_ARGS__);         \
+      return et_result__.error();                      \
+    }                                                  \
+    std::move(*et_result__);                           \
+  })
+
+// Internal only: Use ET_UNWRAP() instead.
+#define ET_INTERNAL_UNWRAP_3 ET_INTERNAL_UNWRAP_2
+#define ET_INTERNAL_UNWRAP_4 ET_INTERNAL_UNWRAP_2
+#define ET_INTERNAL_UNWRAP_5 ET_INTERNAL_UNWRAP_2
+#define ET_INTERNAL_UNWRAP_6 ET_INTERNAL_UNWRAP_2
+#define ET_INTERNAL_UNWRAP_7 ET_INTERNAL_UNWRAP_2
+#define ET_INTERNAL_UNWRAP_8 ET_INTERNAL_UNWRAP_2
+#define ET_INTERNAL_UNWRAP_9 ET_INTERNAL_UNWRAP_2
+#define ET_INTERNAL_UNWRAP_10 ET_INTERNAL_UNWRAP_2
diff --git a/include/executorch/runtime/core/span.h b/include/executorch/runtime/core/span.h
new file mode 100644
index 00000000000..1bcde396ccd
--- /dev/null
+++ b/include/executorch/runtime/core/span.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Represent a reference to an array (0 or more elements
+ * consecutively in memory), i.e. a start pointer and a length.  It allows
+ * various APIs to take consecutive elements easily and conveniently.
+ *
+ * This class does not own the underlying data, it is expected to be used in
+ * situations where the data resides in some other buffer, whose lifetime
+ * extends past that of the Span.
+ *
+ * Span and ArrayRef are extrememly similar with the difference being ArrayRef
+ * views a list of constant elements and Span views a list of mutable elements.
+ * Clients should decide between the two based on if the list elements for their
+ * use case should be mutable.
+ *
+ * This is intended to be trivially copyable, so it should be passed by
+ * value.
+ */
+template <typename T>
+class Span final {
+ public:
+  using value_type = T;
+  using iterator = T*;
+  using size_type = size_t;
+
+ public:
+  /// Construct an empty Span.
+  /* implicit */ constexpr Span() noexcept : data_(nullptr), length_(0) {}
+
+  /// Construct a Span from a pointer and length.
+  Span(T* data, size_t length) : data_(data), length_(length) {
+    ET_DCHECK(data_ != nullptr || length_ == 0);
+  }
+
+  /// Construct a Span from a range.
+  Span(T* begin, T* end) : data_(begin), length_(end - begin) {}
+
+  /// Construct a Span from a C array.
+  template <size_t N>
+  /* implicit */ constexpr Span(T (&Arr)[N]) : data_(Arr), length_(N) {}
+
+  /// @returns a pointer to the start of the underlying element buffer.
+  iterator begin() const noexcept {
+    return data_;
+  }
+
+  /// @returns a pointer to the end of the underlying element buffer.
+  iterator end() const noexcept {
+    return data_ + length_;
+  }
+
+  /// @retval a boolean indicating if the Span is empty.
+  constexpr bool empty() const noexcept {
+    return length_ == 0;
+  }
+
+  /// @returns a pointer to the start of the underlying element buffer.
+  constexpr T* data() const noexcept {
+    return data_;
+  }
+
+  /// @returns the number of elements in the Span.
+  constexpr size_t size() const noexcept {
+    return length_;
+  }
+
+  /// Unchecked index into the array according to the argument index.
+  /// @returns a reference to the element at the specified index.
+  T& operator[](size_t index) const {
+    return data_[index];
+  }
+
+ private:
+  /// The start of the array, in an external buffer.
+  T* data_;
+
+  /// The number of elements.
+  size_type length_;
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::Span;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/tag.h b/include/executorch/runtime/core/tag.h
new file mode 100644
index 00000000000..8ff260cacc1
--- /dev/null
+++ b/include/executorch/runtime/core/tag.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/defines.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <cstdint>
+
+namespace executorch {
+namespace runtime {
+
+#define EXECUTORCH_FORALL_TAGS(_) \
+  _(None)                         \
+  _(Tensor)                       \
+  _(String)                       \
+  _(Double)                       \
+  _(Int)                          \
+  _(Bool)                         \
+  _(ListBool)                     \
+  _(ListDouble)                   \
+  _(ListInt)                      \
+  _(ListTensor)                   \
+  _(ListScalar)                   \
+  _(ListOptionalTensor)
+
+/**
+ * The dynamic type of an EValue.
+ */
+enum class Tag : uint32_t {
+#define DEFINE_TAG(x) x,
+  EXECUTORCH_FORALL_TAGS(DEFINE_TAG)
+#undef DEFINE_TAG
+};
+
+#if ET_ENABLE_ENUM_STRINGS
+inline const char* tag_to_string(Tag tag) {
+  switch (tag) {
+#define CASE_TAG(x) \
+  case Tag::x:      \
+    return #x;
+    EXECUTORCH_FORALL_TAGS(CASE_TAG)
+#undef CASE_TAG
+    default:
+      return "Unknown";
+  }
+}
+#endif // ET_ENABLE_ENUM_STRINGS
+
+/**
+ * Convert a tag value to a string representation. If ET_ENABLE_ENUM_STRINGS is
+ * set (it is on by default), this will return a string name (for example,
+ * "Tensor"). Otherwise, it will return a string representation of the index
+ * value ("1").
+ *
+ * If the user buffer is not large enough to hold the string representation, the
+ * string will be truncated.
+ *
+ * The return value is the number of characters written, or in the case of
+ * truncation, the number of characters that would be written if the buffer was
+ * large enough.
+ */
+size_t tag_to_string(Tag tag, char* buffer, size_t buffer_size);
+
+/* The size of the buffer needed to hold the longest tag string, including the
+ * null terminator. This value is expected to be updated manually, but it
+ * checked in test_tag.cpp.
+ */
+constexpr size_t kTagNameBufferSize = 19;
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::Tag;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/core/tensor_layout.h b/include/executorch/runtime/core/tensor_layout.h
new file mode 100644
index 00000000000..42131e6506e
--- /dev/null
+++ b/include/executorch/runtime/core/tensor_layout.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch {
+namespace ET_RUNTIME_NAMESPACE {
+
+/**
+ * Describes the layout of a tensor.
+ */
+class ET_EXPERIMENTAL TensorLayout final {
+ public:
+  TensorLayout() = delete;
+
+  /**
+   * Creates a TensorLayout from the given parameters.
+   *
+   * @param[in] sizes The sizes of the tensor. Note: the span passed here must
+   * outlive the TensorLayout and all copies of it.
+   * @param[in] dim_order The dim order of the tensor. Note: the span passed
+   * here must outlive the TensorLayout and all copies of it.
+   * @param[in] scalar_type The scalar type of the tensor.
+   * @return A Result containing the TensorLayout on success, or an error.
+   */
+  static executorch::runtime::Result<const TensorLayout> create(
+      Span<const int32_t> sizes,
+      Span<const uint8_t> dim_order,
+      executorch::aten::ScalarType scalar_type);
+
+  /**
+   * Returns the sizes of the tensor.
+   *
+   * NOTE: The TensorLayout must outlive the spans returned here.
+   */
+  Span<const int32_t> sizes() const {
+    return sizes_;
+  }
+
+  /**
+   * Returns the dim order of the tensor.
+   *
+   * NOTE: The TensorLayout must outlive the spans returned here.
+   */
+  Span<const uint8_t> dim_order() const {
+    return dim_order_;
+  }
+
+  /// Returns the scalar type of the tensor.
+  executorch::aten::ScalarType scalar_type() const {
+    return scalar_type_;
+  }
+
+  /// Returns the size of the tensor in bytes.
+  size_t nbytes() const {
+    return nbytes_;
+  }
+
+ private:
+  TensorLayout(
+      Span<const int32_t> sizes,
+      Span<const uint8_t> dim_order,
+      executorch::aten::ScalarType scalar_type,
+      size_t nbytes)
+      : sizes_(sizes),
+        dim_order_(dim_order),
+        scalar_type_(scalar_type),
+        nbytes_(nbytes) {}
+  /// The sizes of the tensor.
+  const Span<const int32_t> sizes_;
+
+  /// The dim order of the tensor.
+  const Span<const uint8_t> dim_order_;
+
+  /// The scalar type of the tensor.
+  const executorch::aten::ScalarType scalar_type_;
+
+  /// The size in bytes of the tensor.
+  const size_t nbytes_;
+};
+
+} // namespace ET_RUNTIME_NAMESPACE
+} // namespace executorch
diff --git a/include/executorch/runtime/core/tensor_shape_dynamism.h b/include/executorch/runtime/core/tensor_shape_dynamism.h
new file mode 100644
index 00000000000..ee956288d74
--- /dev/null
+++ b/include/executorch/runtime/core/tensor_shape_dynamism.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * The resizing capabilities of a Tensor.
+ *
+ * The rank of an ExecuTorch Tensors can never change, but shape sometimes can.
+ */
+enum class TensorShapeDynamism : uint8_t {
+  /// Cannot change shape.
+  STATIC = 0,
+  /// Shape cannot exceed initial capacity.
+  DYNAMIC_BOUND = 1,
+  /// No restriction on shape and capacity.
+  DYNAMIC_UNBOUND = 2,
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::TensorShapeDynamism;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/kernel/kernel_includes.h b/include/executorch/runtime/kernel/kernel_includes.h
new file mode 100644
index 00000000000..9d6029a9a0a
--- /dev/null
+++ b/include/executorch/runtime/kernel/kernel_includes.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ *
+ * Common includes used by all kernel implementations.
+ */
+
+#pragma once
+
+// This list should be very conservative since most kernel .cpp files will
+// include these and depend on their transitive deps. Only add a header if 99%
+// of kernels would have included it anyway.
+#include <executorch/runtime/core/exec_aten/exec_aten.h> // IWYU pragma: export
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h> // IWYU pragma: export
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
+#include <executorch/runtime/kernel/kernel_runtime_context.h> // IWYU pragma: export
diff --git a/include/executorch/runtime/kernel/kernel_runtime_context.h b/include/executorch/runtime/kernel/kernel_runtime_context.h
new file mode 100644
index 00000000000..6facecc7632
--- /dev/null
+++ b/include/executorch/runtime/kernel/kernel_runtime_context.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/event_tracer_hooks.h>
+#include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/compiler.h>
+
+namespace executorch {
+namespace ET_RUNTIME_NAMESPACE {
+
+/**
+ * Runtime state and functionality for kernel implementations.
+ *
+ * NOTE: Will not be passed to operators if running in ATen mode as those
+ * operators do not expect to receive a KernelRuntimeContext argument.
+ */
+class KernelRuntimeContext {
+ public:
+  /**
+   * Construct a new kernel runtime context.
+   *
+   * KernelRuntimeContext does not take ownership
+   * of these pointers, so they must outlive the context instance.
+   *
+   * @param[in] event_tracer The optional EventTracer to use for
+   *     profiling/debugging
+   * @param[in] temp_allocator The optional MemoryAllocator used to allocate
+   *     temporary memory for the kernel. If not provided, an error will be
+   *     returned when calling allocate_temp.
+   */
+  KernelRuntimeContext(
+      EventTracer* event_tracer = nullptr,
+      MemoryAllocator* temp_allocator = nullptr)
+      : event_tracer_(event_tracer), temp_allocator_(temp_allocator) {}
+  /**
+   * Tells the runtime that the kernel call has failed. Prefer this over
+   * ET_CHECK_*(), which fatally panics the process/system.
+   *
+   * If this is not called, the runtime will treat the kernel call as
+   * successful.
+   *
+   * This unusual error-propagation path is required because kernel signatures
+   * do not have a natural way to return errors directly. They are generally
+   * compatible with core PyTorch ATen kernel signatures, which use exceptions
+   * to report errors. But, ExecuTorch does not use exceptions.
+   */
+  void fail(Error error) {
+    failure_state_ = error;
+  }
+
+  /// Returns the current failure state.
+  ET_NODISCARD Error failure_state() const {
+    return failure_state_;
+  }
+
+  /**
+   * INTERNAL ONLY
+   *
+   * Returns a pointer to an instance of EventTracer to do profiling/debugging
+   * logging inside the codegen layer. This is only for internal usage inside
+   * the codegen layer and users should not be accessing this.
+   */
+  EventTracer* internal_event_tracer() {
+    return event_tracer_;
+  }
+
+  /**
+   * Allocates temporary memory that will be freed when the kernel returns. This
+   * returns a pointer to the allocated memory or an error if the allocation
+   * fails.
+   *
+   * @param[in] size Number of bytes to allocate.
+   * @param[in] alignment Minimum alignment for the returned pointer. Must be a
+   *     power of 2.
+   *
+   * @returns A result object containing either a pointer to the allocated
+   *     memory or an error to indicate failure
+   */
+  Result<void*> allocate_temp(
+      size_t size,
+      size_t alignment = MemoryAllocator::kDefaultAlignment) {
+    ET_CHECK_OR_RETURN_ERROR(
+        temp_allocator_ != nullptr, NotFound, "No temp allocator provided");
+    void* temp_memory = temp_allocator_->allocate(size, alignment);
+    ET_CHECK_OR_RETURN_ERROR(
+        temp_memory != nullptr,
+        MemoryAllocationFailed,
+        "Failed to allocate temp memory. Bytes requested: %zu",
+        size);
+    return temp_memory;
+  }
+
+  // TODO(T147221312): Add a way to resize a tensor.
+
+ private:
+  EventTracer* event_tracer_ = nullptr;
+  MemoryAllocator* temp_allocator_ = nullptr;
+  Error failure_state_ = Error::Ok;
+};
+
+} // namespace ET_RUNTIME_NAMESPACE
+} // namespace executorch
+
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+namespace torch {
+namespace executor {
+/// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead.
+using ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext;
+/// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead.
+using RuntimeContext = ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext;
+} // namespace executor
+} // namespace torch
+namespace executorch {
+namespace aten {
+/// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead.
+using RuntimeContext = ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext;
+} // namespace aten
+} // namespace executorch
+// DEPRECATED: The executorch::aten:: namespace is deprecated. Use
+// executorch::aten:: instead.
+namespace exec_aten = ::executorch::aten;
diff --git a/include/executorch/runtime/kernel/operator_registry.h b/include/executorch/runtime/kernel/operator_registry.h
new file mode 100644
index 00000000000..f7a62208dd8
--- /dev/null
+++ b/include/executorch/runtime/kernel/operator_registry.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstring>
+
+#include <executorch/runtime/core/array_ref.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/platform.h>
+
+// Debug switch for operator registry
+#if defined(ET_OP_REGISTRY_DEBUG)
+#include <ostream>
+#endif
+
+#define ET_LOG_KERNEL_KEY(k)      \
+  ET_LOG(                         \
+      Info,                       \
+      "key: %s, is_fallback: %s", \
+      k.data(),                   \
+      k.is_fallback() ? "true" : "false");
+#define ET_LOG_TENSOR_META(meta_list)                                \
+  for (const auto& meta : meta_list) {                               \
+    ET_LOG(Info, "dtype: %d | dim order: [", int(meta.dtype_));      \
+    for (size_t i = 0; i < meta.dim_order_.size(); i++) {            \
+      ET_LOG(Info, "%d,", static_cast<int32_t>(meta.dim_order_[i])); \
+    }                                                                \
+    ET_LOG(Info, "]");                                               \
+  }
+
+namespace executorch {
+namespace ET_RUNTIME_NAMESPACE {
+
+class KernelRuntimeContext; // Forward declaration
+using OpFunction = void (*)(KernelRuntimeContext&, EValue**);
+
+/**
+ * Dtype and dim order metadata for a Tensor argument to an operator.
+ * Used by the Executor to hold the tensor metadata info and retrieve kernel.
+ */
+struct TensorMeta {
+  executorch::aten::ScalarType dtype_;
+  Span<executorch::aten::DimOrderType> dim_order_;
+
+  TensorMeta() = default;
+  TensorMeta(
+      executorch::aten::ScalarType dtype,
+      Span<executorch::aten::DimOrderType> order)
+      : dtype_(dtype), dim_order_(order) {}
+
+  bool operator==(const TensorMeta& other) const {
+    return this->equals(other);
+  }
+
+  bool operator!=(const TensorMeta& other) const {
+    return !this->equals(other);
+  }
+
+  bool equals(const TensorMeta& other) const {
+    if (dtype_ != other.dtype_) {
+      return false;
+    }
+    if (dim_order_.size() != other.dim_order_.size()) {
+      return false;
+    }
+    for (size_t i = 0; i < dim_order_.size(); i++) {
+      if (dim_order_[i] != other.dim_order_[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+#if defined(ET_OP_REGISTRY_DEBUG)
+  friend std::ostream& operator<<(std::ostream& os, const TensorMeta& meta) {
+    os << "dtype: " << int(meta.dtype_) << " | dim order: [";
+    for (int i = 0; i < meta.dim_order_.size(); i++) {
+      os << static_cast<int32_t>(meta.dim_order_[i]) << ", ";
+    }
+    os << "]";
+    return os;
+  }
+#endif
+};
+
+/**
+ * Describes which dtype & dim order specialized kernel to be bound to an
+ * operator.
+ *
+ * Kernel key data is a string with the format:
+ *
+ *     "v<version>/<tensor_meta>|<tensor_meta>..."
+ *
+ * The version is v1 for now. If the kernel key format changes, update the
+ * version to avoid breaking pre-existing kernel keys.
+ *
+ * Each tensor_meta has the following format: "<dtype>;<dim_order,...>"
+ *
+ * Example kernel key data: "v1/7;0,1,2,3|1;0,1,2,3,4,5,6,7"
+ *
+ * This has two tensors: the first with dtype=7 and dim order 0,1,2,3, and the
+ * second with dtype=1 and dim order 0,1,2,3,4,5,6,7.
+ *
+ * IMPORTANT:
+ * Users should not construct a kernel key manually. Instead, it should be
+ * generated from kernel yaml.
+ */
+struct KernelKey {
+ public:
+  /**
+   * Creates a fallback (non-specialized) kernel key: this kernel can be used
+   * for all input tensor dtypes and dim orders if the specialized kernel is not
+   * registered.
+   */
+  KernelKey() : is_fallback_(true) {}
+
+  /**
+   * Creates a specialized (non-fallback) kernel key that matches a specific
+   * set of input tensor dtypes and dim orders. See the class comment for the
+   * expected format of `kernel_key_data`.
+   */
+  /* implicit */ KernelKey(const char* kernel_key_data)
+      : kernel_key_data_(kernel_key_data), is_fallback_(false) {}
+
+  bool operator==(const KernelKey& other) const {
+    return this->equals(other);
+  }
+
+  bool operator!=(const KernelKey& other) const {
+    return !this->equals(other);
+  }
+
+  bool equals(const KernelKey& other) const {
+    if (is_fallback_ != other.is_fallback_) {
+      return false;
+    }
+    if (is_fallback_) {
+      return true;
+    }
+    return strcmp(kernel_key_data_, other.kernel_key_data_) == 0;
+  }
+
+  bool is_fallback() const {
+    return is_fallback_;
+  }
+
+  const char* data() const {
+    return kernel_key_data_;
+  }
+
+#if defined(ET_OP_REGISTRY_DEBUG)
+  friend std::ostream& operator<<(std::ostream& os, const KernelKey& key) {
+    os << key.kernel_key_data_ << std::endl;
+    return os;
+  }
+#endif
+
+ private:
+  const char* kernel_key_data_ = nullptr;
+  bool is_fallback_;
+};
+
+/**
+ * Struct that bundles a kernel key, a function and an op name together. An
+ * `Operator` may have more than one `Kernel` (maximum kMaxNumOfKernelPerOp) and
+ * they should have the same op name and different kernel key. A "fallback"
+ * kernel may or may not live in an `Operator`.
+ */
+struct Kernel {
+  const char* name_;
+  // String representation of kernel key, with the same format as
+  // KernelKey.to_string_representation()
+  // Data is not owned by the Kernel struct.
+  KernelKey kernel_key_;
+  OpFunction op_;
+  /**
+   * We are doing a copy of the string pointer instead of duplicating the string
+   * itself, we require the lifetime of the operator name to be at least as long
+   * as the operator registry.
+   */
+  explicit Kernel(const char* name, OpFunction func) : name_(name), op_(func) {}
+
+  explicit Kernel(const char* name, KernelKey key, OpFunction func)
+      : name_(name), kernel_key_(key), op_(func) {}
+
+  Kernel() {}
+};
+
+namespace internal {
+
+/**
+ * A make_kernel_key_string buffer size that is large enough to hold a kernel
+ * key string with 16 tensors of 16 dimensions, plus the trailing NUL byte.
+ */
+constexpr size_t kKernelKeyBufSize = 659;
+
+/**
+ * Given the list of input tensor dtypes + dim orders, writes the kernel key
+ * string into the buffer. Returns an error if the buffer is too small or if the
+ * tensors cannot be represented as a valid key string.
+ */
+Error make_kernel_key_string(
+    Span<const TensorMeta> key,
+    char* buf,
+    size_t buf_size);
+
+} // namespace internal
+
+/**
+ * Checks whether an operator exists with a given name and TensorMeta list. When
+ * TensorMeta is empty, it means this op does not have specialized kernels, so
+ * it checks whether it has any fallback kernels.
+ */
+bool registry_has_op_function(
+    const char* name,
+    Span<const TensorMeta> meta_list = {});
+
+/**
+ * Returns the operator with a given name and TensorMeta list, if present.
+ */
+::executorch::runtime::Result<OpFunction> get_op_function_from_registry(
+    const char* name,
+    Span<const TensorMeta> meta_list = {});
+
+/**
+ * Returns all registered kernels.
+ */
+Span<const Kernel> get_registered_kernels();
+
+/**
+ * Registers the provided kernels.
+ *
+ * @param[in] kernels Kernel objects to register.
+ * @retval Error::Ok always. Panics on error. This function needs to return a
+ *     non-void type to run at static initialization time.
+ */
+ET_NODISCARD Error register_kernels(const Span<const Kernel>);
+
+/**
+ * Registers a single kernel.
+ *
+ * @param[in] kernel Kernel object to register.
+ * @retval Error::Ok always. Panics on error. This function needs to return a
+ *     non-void type to run at static initialization time.
+ */
+ET_NODISCARD inline Error register_kernel(const Kernel& kernel) {
+  return register_kernels({&kernel, 1});
+};
+
+} // namespace ET_RUNTIME_NAMESPACE
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::ET_RUNTIME_NAMESPACE::Kernel;
+using ::executorch::ET_RUNTIME_NAMESPACE::KernelKey;
+using ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext;
+using ::executorch::ET_RUNTIME_NAMESPACE::OpFunction;
+using ::executorch::ET_RUNTIME_NAMESPACE::TensorMeta;
+using KernelRuntimeContext =
+    ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext;
+
+inline ::executorch::runtime::Error register_kernels(ArrayRef<Kernel> kernels) {
+  return ::executorch::ET_RUNTIME_NAMESPACE::register_kernels(
+      {kernels.data(), kernels.size()});
+}
+inline OpFunction getOpsFn(
+    const char* name,
+    ArrayRef<TensorMeta> meta_list = {}) {
+  auto result =
+      ::executorch::ET_RUNTIME_NAMESPACE::get_op_function_from_registry(
+          name, {meta_list.data(), meta_list.size()});
+  ET_CHECK(result.ok()); // get_op_function_from_registry() logs details.
+  return *result;
+}
+inline bool hasOpsFn(const char* name, ArrayRef<TensorMeta> meta_list = {}) {
+  return ::executorch::ET_RUNTIME_NAMESPACE::registry_has_op_function(
+      name, {meta_list.data(), meta_list.size()});
+}
+inline ArrayRef<Kernel> get_kernels() {
+  Span<const Kernel> kernels =
+      ::executorch::ET_RUNTIME_NAMESPACE::get_registered_kernels();
+  return ArrayRef<Kernel>(kernels.data(), kernels.size());
+}
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/kernel/test/test_util.h b/include/executorch/runtime/kernel/test/test_util.h
new file mode 100644
index 00000000000..be77df1fd0c
--- /dev/null
+++ b/include/executorch/runtime/kernel/test/test_util.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/operator_registry.h>
+
+namespace executorch {
+namespace runtime {
+
+namespace testing {
+
+inline Error make_kernel_key(
+    const std::vector<std::pair<
+        executorch::aten::ScalarType,
+        std::vector<executorch::aten::DimOrderType>>>& tensors,
+    char* buf,
+    size_t buf_size) {
+  std::vector<TensorMeta> meta;
+  for (auto& t : tensors) {
+    Span<executorch::aten::DimOrderType> dim_order(
+        const_cast<unsigned char*>(t.second.data()), t.second.size());
+    meta.emplace_back(t.first, dim_order);
+  }
+  Span<const TensorMeta> metadata(meta.data(), meta.size());
+  return internal::make_kernel_key_string(metadata, buf, buf_size);
+}
+
+} // namespace testing
+
+} // namespace runtime
+} // namespace executorch
diff --git a/include/executorch/runtime/kernel/thread_parallel_interface.h b/include/executorch/runtime/kernel/thread_parallel_interface.h
new file mode 100644
index 00000000000..8cce610dcb4
--- /dev/null
+++ b/include/executorch/runtime/kernel/thread_parallel_interface.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+
+#include <c10/util/irange.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/function_ref.h>
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace extension {
+namespace internal {
+template <typename Func>
+inline bool parallel_for_no_threadpool(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const Func& f) {
+  ET_CHECK_OR_RETURN_FALSE(
+      begin >= 0 && end >= 0 && end >= begin,
+      "begin = %" PRId64 ", end = %" PRId64,
+      begin,
+      end);
+  ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size);
+#ifndef NDEBUG
+  // Go backwards through the range elementwise to catch code that
+  // assumes parallel_for is in order like a regular for loop.
+  for (const auto i : c10::irange(begin, end)) {
+    const auto offset = i - begin;
+    const auto idx = end - offset - 1;
+    f(idx, idx + 1);
+  }
+#else // NDEBUG
+  f(begin, end);
+#endif
+  return true;
+}
+
+// Match GRAIN_SIZE from PyTorch core.
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78
+constexpr int64_t GRAIN_SIZE = 32768;
+} // namespace internal
+
+#ifdef ET_USE_THREADPOOL
+/**
+ * A helper to run a function in parallel.
+ *
+ * begin, end: describe the extent of the workitems via first and last workitem
+ * to be processed
+ * grain_size: number of workitems processed by user callback which is
+ * described below
+ * f: user function applied in parallel to the chunks, signature:
+ *   void f(int64_t begin, int64_t end)
+ * Returns true if all work items are processed successfully, false otherwise
+ *
+ * Warning: parallel_for does NOT copy thread local states from the current
+ * thread to the worker threads. Users need to protect the access to captured
+ * data if they mutate them in f.
+ */
+bool parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    runtime::FunctionRef<void(int64_t, int64_t)> f);
+
+int64_t get_thread_num();
+
+void set_thread_num(int64_t thread_num);
+#else // ET_USE_THREADPOOL
+template <typename Func>
+bool parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const Func& func) {
+  return internal::parallel_for_no_threadpool(begin, end, grain_size, func);
+}
+
+inline int64_t get_thread_num() {
+  return 0;
+}
+
+inline void set_thread_num(int64_t thread_num) {
+  ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!");
+}
+#endif // ET_USE_THREADPOOL
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::get_thread_num;
+using ::executorch::extension::parallel_for;
+using ::executorch::extension::set_thread_num;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/platform/abort.h b/include/executorch/runtime/platform/abort.h
new file mode 100644
index 00000000000..ae1a761aa78
--- /dev/null
+++ b/include/executorch/runtime/platform/abort.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * ExecuTorch global abort wrapper function.
+ */
+
+#pragma once
+
+#include <executorch/runtime/platform/compiler.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Trigger the ExecuTorch global runtime to immediately exit without cleaning
+ * up, and set an abnormal exit status (platform-defined).
+ */
+ET_NORETURN void runtime_abort();
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::runtime_abort;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/platform/assert.h b/include/executorch/runtime/platform/assert.h
new file mode 100644
index 00000000000..c8c189dd174
--- /dev/null
+++ b/include/executorch/runtime/platform/assert.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/platform/abort.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/log.h>
+
+/**
+ * Assertion failure message emit method.
+ *
+ * @param[in] _format Printf-style message format string.
+ * @param[in] ... Format string arguments.
+ */
+#define ET_ASSERT_MESSAGE_EMIT(_format, ...)     \
+  ET_LOG(                                        \
+      Fatal,                                     \
+      "In function %s(), assert failed" _format, \
+      ET_FUNCTION,                               \
+      ##__VA_ARGS__)
+
+/**
+ * Abort the runtime if the condition is not true.
+ * This check will be performed even in release builds.
+ *
+ * @param[in] _cond Condition asserted as true.
+ * @param[in] _format Printf-style message format string.
+ * @param[in] ... Format string arguments.
+ */
+#define ET_CHECK_MSG(_cond, _format, ...)                               \
+  do {                                                                  \
+    if ET_UNLIKELY (!(_cond)) {                                         \
+      ET_ASSERT_MESSAGE_EMIT(" (%s): " _format, #_cond, ##__VA_ARGS__); \
+      ::executorch::runtime::runtime_abort();                           \
+    }                                                                   \
+  } while (0)
+
+/**
+ * Abort the runtime if the condition is not true.
+ * This check will be performed even in release builds.
+ *
+ * @param[in] _cond Condition asserted as true.
+ */
+#define ET_CHECK(_cond)                       \
+  do {                                        \
+    if ET_UNLIKELY (!(_cond)) {               \
+      ET_ASSERT_MESSAGE_EMIT(": %s", #_cond); \
+      ::executorch::runtime::runtime_abort(); \
+    }                                         \
+  } while (0)
+
+#ifdef NDEBUG
+
+/**
+ * Abort the runtime if the condition is not true.
+ * This check will be performed in debug builds, but not release builds.
+ *
+ * @param[in] _cond Condition asserted as true.
+ * @param[in] _format Printf-style message format string.
+ * @param[in] ... Format string arguments.
+ */
+#define ET_DCHECK_MSG(_cond, _format, ...) ((void)0)
+
+/**
+ * Abort the runtime if the condition is not true.
+ * This check will be performed in debug builds, but not release builds.
+ *
+ * @param[in] _cond Condition asserted as true.
+ */
+#define ET_DCHECK(_cond) ((void)0)
+#define ET_DEBUG_ONLY [[maybe_unused]]
+
+#else // NDEBUG
+
+/**
+ * Abort the runtime if the condition is not true.
+ * This check will be performed in debug builds, but not release builds.
+ *
+ * @param[in] _cond Condition asserted as true.
+ * @param[in] _format Printf-style message format string.
+ * @param[in] ... Format string arguments.
+ */
+#define ET_DCHECK_MSG(_cond, _format, ...) \
+  ET_CHECK_MSG(_cond, _format, ##__VA_ARGS__)
+
+/**
+ * Abort the runtime if the condition is not true.
+ * This check will be performed in debug builds, but not release builds.
+ *
+ * @param[in] _cond Condition asserted as true.
+ */
+#define ET_DCHECK(_cond) ET_CHECK(_cond)
+#define ET_DEBUG_ONLY
+
+#endif // NDEBUG
+
+/**
+ * Assert that this code location is unreachable during execution.
+ */
+#define ET_ASSERT_UNREACHABLE()                                   \
+  do {                                                            \
+    ET_CHECK_MSG(false, "Execution should not reach this point"); \
+    ET_UNREACHABLE();                                             \
+  } while (0)
+
+/**
+ * Assert that this code location is unreachable during execution.
+ *
+ * @param[in] _message Message on how to avoid this assertion error.
+ */
+#define ET_ASSERT_UNREACHABLE_MSG(_format, ...)            \
+  do {                                                     \
+    ET_CHECK_MSG(                                          \
+        false,                                             \
+        "Execution should not reach this point. " _format, \
+        ##__VA_ARGS__);                                    \
+    ET_UNREACHABLE();                                      \
+  } while (0)
diff --git a/include/executorch/runtime/platform/clock.h b/include/executorch/runtime/platform/clock.h
new file mode 100644
index 00000000000..36c25d90c47
--- /dev/null
+++ b/include/executorch/runtime/platform/clock.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Clock and timing related methods.
+ */
+
+#pragma once
+
+#include <executorch/runtime/platform/platform.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Convert an interval from units of system ticks to nanoseconds.
+ * The conversion ratio is platform-dependent, and thus depends on
+ * the platform implementation of et_pal_ticks_to_ns_multiplier().
+ *
+ * @param[in] ticks The interval length in system ticks.
+ * @retval The interval length in nanoseconds.
+ */
+inline uint64_t ticks_to_ns(et_timestamp_t ticks) {
+  et_tick_ratio_t ratio = et_pal_ticks_to_ns_multiplier();
+  return static_cast<uint64_t>(ticks) * ratio.numerator / ratio.denominator;
+}
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::ticks_to_ns;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/platform/compat_unistd.h b/include/executorch/runtime/platform/compat_unistd.h
new file mode 100644
index 00000000000..c8bc4866702
--- /dev/null
+++ b/include/executorch/runtime/platform/compat_unistd.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * unistd.h related macros for POSIX/Windows compatibility.
+ */
+#pragma once
+
+#if defined(_WIN32) && !defined(_WIN64)
+#error \
+    "You're trying to build ExecuTorch with a too old version of Windows. We need Windows 64-bit."
+#endif
+
+#if !defined(_WIN64)
+#include <unistd.h>
+#else
+#include <io.h>
+#define O_RDONLY _O_RDONLY
+#define open _open
+#define close _close
+#define read _read
+#define write _write
+#define stat _stat64
+#define fstat _fstat64
+#define off_t _off_t
+#define lseek _lseeki64
+
+#include <executorch/runtime/platform/compiler.h> // For ssize_t.
+#include <windows.h>
+// To avoid conflicts with std::numeric_limits<int32_t>::max() in
+// file_data_loader.cpp.
+#undef max
+
+inline ssize_t pread(int fd, void* buf, size_t nbytes, size_t offset) {
+  OVERLAPPED overlapped; /* The offset for ReadFile. */
+  memset(&overlapped, 0, sizeof(overlapped));
+  overlapped.Offset = offset;
+  overlapped.OffsetHigh = offset >> 32;
+
+  BOOL result; /* The result of ReadFile. */
+  DWORD bytes_read; /* The number of bytes read. */
+  HANDLE file = (HANDLE)_get_osfhandle(fd);
+
+  result = ReadFile(file, buf, nbytes, &bytes_read, &overlapped);
+  DWORD error = GetLastError();
+  if (!result) {
+    if (error == ERROR_IO_PENDING) {
+      result = GetOverlappedResult(file, &overlapped, &bytes_read, TRUE);
+      if (!result) {
+        error = GetLastError();
+      }
+    }
+  }
+  if (!result) {
+    // Translate error into errno.
+    switch (error) {
+      case ERROR_HANDLE_EOF:
+        errno = 0;
+        break;
+      default:
+        errno = EIO;
+        break;
+    }
+    return -1;
+  }
+  return bytes_read;
+}
+
+#endif // !defined(_WIN64)
\ No newline at end of file
diff --git a/include/executorch/runtime/platform/compiler.h b/include/executorch/runtime/platform/compiler.h
new file mode 100644
index 00000000000..da7e0988a62
--- /dev/null
+++ b/include/executorch/runtime/platform/compiler.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Compiler utility macros.
+ */
+
+#pragma once
+
+/*
+ * Compiler support checks. Follows the logic used by pytorch/c10/util/C++17.h
+ * but may support older versions.
+ */
+
+// https://gcc.gnu.org/projects/cxx-status.html#cxx17
+#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \
+    __GNUC__ < 7
+#error \
+    "You're trying to build ExecuTorch with a too old version of GCC. We need GCC 7 or later."
+#endif
+
+// https://clang.llvm.org/cxx_status.html#cxx17
+#if defined(__clang__) && __clang_major__ < 5
+#error \
+    "You're trying to build ExecuTorch with a too old version of Clang. We need Clang 5 or later."
+#endif
+
+#if (defined(_MSC_VER) && (!defined(_MSVC_LANG) || _MSVC_LANG < 201703L)) || \
+    (!defined(_MSC_VER) && __cplusplus < 201703L)
+#error "You need C++17 to compile ExecuTorch"
+#endif
+
+#if defined(_MSC_VER) && (defined(min) || defined(max))
+#error \
+    "Macro clash with min and max -- define NOMINMAX when compiling your program on Windows"
+#endif
+
+/*
+ * Define annotations aliasing C++ declaration attributes.
+ * See all C++ declaration attributes here:
+ *   https://en.cppreference.com/w/cpp/language/attributes
+ *
+ * Note that ExecuTorch supports a lower C++ standard version than all standard
+ * attributes. Therefore, some annotations are defined using their Clang/GNU
+ * counterparts.
+ *
+ * GNU attribute definitions:
+ *   https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
+ */
+
+#define ET_NORETURN [[noreturn]]
+#define ET_NOINLINE __attribute__((noinline))
+#define ET_INLINE __attribute__((always_inline)) inline
+#define ET_INLINE_ATTRIBUTE __attribute__((always_inline))
+
+#if defined(__GNUC__)
+
+#define ET_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+
+#define ET_UNREACHABLE() __assume(0)
+
+#else // defined(__GNUC__)
+
+#define ET_UNREACHABLE() \
+  while (1)              \
+    ;
+
+#endif // defined(__GNUC__)
+
+#define ET_DEPRECATED [[deprecated]]
+#define ET_EXPERIMENTAL \
+  [[deprecated("This API is experimental and may change without notice.")]]
+#define ET_FALLTHROUGH [[fallthrough]]
+#define ET_NODISCARD [[nodiscard]]
+#define ET_UNUSED [[maybe_unused]]
+
+// UNLIKELY Macro
+// example
+// if ET_UNLIKELY(a > 10 && b < 5) {
+//   do something
+// }
+#if (__cplusplus) >= 202002L
+
+#define ET_LIKELY(expr) (expr) [[likely]]
+#define ET_UNLIKELY(expr) (expr) [[unlikely]]
+
+#else
+
+#define ET_LIKELY(expr) (expr)
+#define ET_UNLIKELY(expr) (expr)
+
+#endif // (__cplusplus) >= 202002L
+
+/// Define a C symbol with weak linkage.
+#ifdef _MSC_VER
+// There currently doesn't seem to be a great way to do this in Windows and
+// given that weak linkage is not really critical on Windows, we'll just leave
+// it as a stub.
+#define ET_WEAK
+#else
+#define ET_WEAK __attribute__((weak))
+#endif
+
+/**
+ * Annotation marking a function as printf-like, providing compiler support
+ * for format string argument checking.
+ */
+#ifdef _MSC_VER
+#include <sal.h>
+#define ET_PRINTFLIKE(_string_index, _va_index) _Printf_format_string_
+#else
+#define ET_PRINTFLIKE(_string_index, _va_index) \
+  __attribute__((format(printf, _string_index, _va_index)))
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) (0)
+#endif
+
+#if __has_builtin(__builtin_strrchr)
+/// Name of the source file without a directory string.
+#define ET_SHORT_FILENAME (__builtin_strrchr("/" __FILE__, '/') + 1)
+#else
+#define ET_SHORT_FILENAME __FILE__
+#endif
+
+#if __has_builtin(__builtin_LINE)
+/// Current line as an integer.
+#define ET_LINE __builtin_LINE()
+#else
+#define ET_LINE __LINE__
+#endif // __has_builtin(__builtin_LINE)
+
+#if __has_builtin(__builtin_FUNCTION)
+/// Name of the current function as a const char[].
+#define ET_FUNCTION __builtin_FUNCTION()
+#else
+#define ET_FUNCTION __FUNCTION__
+#endif // __has_builtin(__builtin_FUNCTION)
+
+// As of G3 RJ-2024.3 toolchain, zu format specifier is not supported for Xtensa
+#if defined(__XTENSA__)
+#define ET_PRIsize_t "lu"
+#else
+#define ET_PRIsize_t "zu"
+#endif
+
+// Whether the compiler supports GNU statement expressions.
+// https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html
+#ifndef ET_HAVE_GNU_STATEMENT_EXPRESSIONS
+#if (defined(__GNUC__) && __GNUC__ >= 3) || defined(__clang__)
+#define ET_HAVE_GNU_STATEMENT_EXPRESSIONS 1
+#else
+#define ET_HAVE_GNU_STATEMENT_EXPRESSIONS 0
+#endif
+#endif // ifndef
+
+// Define size_t and ssize_t.
+#ifndef _MSC_VER
+#include <sys/types.h>
+#else
+#include <stddef.h>
+using ssize_t = ptrdiff_t;
+#endif
+
+/**
+ * Platform-specific aligned memory allocation and deallocation.
+ *
+ * Usage:
+ *   void* ptr = ET_ALIGNED_ALLOC(alignment, size);
+ *   // use ptr...
+ *   ET_ALIGNED_FREE(ptr);
+ *
+ * Note: alignment must be a power of 2 and size must be an integral multiple of
+ * alignment.
+ */
+#if defined(_MSC_VER)
+#include <malloc.h>
+#define ET_ALIGNED_ALLOC(alignment, size) \
+  _aligned_malloc(((size + alignment - 1) & ~(alignment - 1)), (alignment))
+#define ET_ALIGNED_FREE(ptr) _aligned_free(ptr)
+#elif defined(__APPLE__)
+#include <stdlib.h> // For posix_memalign and free
+inline void* et_apple_aligned_alloc(size_t alignment, size_t size) {
+  void* ptr = nullptr;
+  // The address of the allocated memory must be a multiple of sizeof(void*).
+  if (alignment < sizeof(void*)) {
+    alignment = sizeof(void*);
+  }
+  if (posix_memalign(
+          &ptr, alignment, (size + alignment - 1) & ~(alignment - 1)) != 0) {
+    return nullptr;
+  }
+  return ptr;
+}
+#define ET_ALIGNED_ALLOC(alignment, size) \
+  et_apple_aligned_alloc((alignment), (size))
+#define ET_ALIGNED_FREE(ptr) free(ptr)
+#elif __has_builtin(__builtin_aligned_alloc) || defined(_ISOC11_SOURCE)
+// Linux and posix systems that support aligned_alloc and are >= C++17.
+#include <cstdlib>
+#define ET_ALIGNED_ALLOC(alignment, size) \
+  ::aligned_alloc(alignment, (size + alignment - 1) & ~(alignment - 1))
+#define ET_ALIGNED_FREE(ptr) free(ptr)
+#else
+// If the platform doesn't support aligned_alloc, fallback to malloc.
+#include <stdint.h>
+#include <cstdlib>
+inline void* et_aligned_malloc(size_t alignment, size_t size) {
+  // Place to store the offset to the original pointer.
+  size_t offset_size = sizeof(uint16_t);
+
+  // Malloc extra space for offset + alignment.
+  size_t alloc_size = size + offset_size + alignment - 1;
+  void* ptr = std::malloc(alloc_size);
+
+  if (ptr == nullptr) {
+    // Malloc failed.
+    return nullptr;
+  }
+
+  uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+  // Align the address past addr + offset_size bytes.
+  // This provides space to store the offset before the aligned pointer.
+  addr = addr + offset_size;
+  uintptr_t aligned_ptr = (addr + alignment - 1) & ~(alignment - 1);
+
+  // Check that alignment didn't overflow the buffer.
+  if (reinterpret_cast<uintptr_t>(aligned_ptr) + size >
+      reinterpret_cast<uintptr_t>(ptr) + alloc_size) {
+    std::free(ptr);
+    return nullptr;
+  }
+
+  // Store the offset to the original pointer.
+  // Used to free the original allocated buffer.
+  *(reinterpret_cast<uint16_t*>(aligned_ptr) - 1) =
+      (uint16_t)(reinterpret_cast<uintptr_t>(aligned_ptr) -
+                 reinterpret_cast<uintptr_t>(ptr));
+
+  return reinterpret_cast<uint16_t*>(aligned_ptr);
+}
+
+inline void et_aligned_free(void* ptr) {
+  if (ptr == nullptr) {
+    return;
+  }
+
+  // Get the original pointer using the offset.
+  uint16_t* original_ptr = reinterpret_cast<uint16_t*>(
+      reinterpret_cast<uintptr_t>(ptr) -
+      *(reinterpret_cast<uint16_t*>(ptr) - 1));
+  std::free(original_ptr);
+}
+
+#define ET_ALIGNED_ALLOC(alignment, size) et_aligned_malloc((alignment), (size))
+#define ET_ALIGNED_FREE(ptr) et_aligned_free(ptr)
+
+#endif
+
+// DEPRECATED: Use the non-underscore-prefixed versions instead.
+// TODO(T199005537): Remove these once all users have stopped using them.
+#define __ET_DEPRECATED ET_DEPRECATED
+#define __ET_FALLTHROUGH ET_FALLTHROUGH
+#define __ET_FUNCTION ET_FUNCTION
+#define __ET_HAVE_GNU_STATEMENT_EXPRESSIONS ET_HAVE_GNU_STATEMENT_EXPRESSIONS
+#define __ET_INLINE ET_INLINE
+#define __ET_LIKELY ET_LIKELY
+#define __ET_LINE ET_LINE
+#define __ET_NODISCARD ET_NODISCARD
+#define __ET_NOINLINE ET_NOINLINE
+#define __ET_NORETURN ET_NORETURN
+#define __ET_PRINTFLIKE ET_PRINTFLIKE
+#define __ET_SHORT_FILENAME ET_SHORT_FILENAME
+#define __ET_UNLIKELY ET_UNLIKELY
+#define __ET_UNREACHABLE ET_UNREACHABLE
+#define __ET_UNUSED ET_UNUSED
+#define __ET_WEAK ET_WEAK
diff --git a/include/executorch/runtime/platform/log.h b/include/executorch/runtime/platform/log.h
new file mode 100644
index 00000000000..72ea8528442
--- /dev/null
+++ b/include/executorch/runtime/platform/log.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * ExecuTorch logging API.
+ */
+
+#pragma once
+
+#include <cstdarg>
+#include <cstddef>
+#include <cstdlib>
+
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/types.h>
+
+// Set minimum log severity if compiler option is not provided.
+#ifndef ET_MIN_LOG_LEVEL
+#define ET_MIN_LOG_LEVEL Info
+#endif // !defined(ET_MIN_LOG_LEVEL)
+
+/*
+ * Enable logging by default if compiler option is not provided.
+ * This should facilitate less confusion for those developing ExecuTorch.
+ */
+#ifndef ET_LOG_ENABLED
+#define ET_LOG_ENABLED 1
+#endif // !defined(ET_LOG_ENABLED)
+
+// Even though it is supposed to be "portable" some toolchains
+// do not define, so providing a definition here
+#ifndef PRIu64
+#define PRIu64 "llu"
+#endif
+#ifndef PRId64
+#define PRId64 "lld"
+#endif
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Severity level of a log message. Must be ordered from lowest to highest
+ * severity.
+ */
+enum class LogLevel : uint8_t {
+  /**
+   * Log messages provided for highly granular debuggability.
+   *
+   * Log messages using this severity are unlikely to be compiled by default
+   * into most debug builds.
+   */
+  Debug,
+
+  /**
+   * Log messages providing information about the state of the system
+   * for debuggability.
+   */
+  Info,
+
+  /**
+   * Log messages about errors within ExecuTorch during runtime.
+   */
+  Error,
+
+  /**
+   * Log messages that precede a fatal error. However, logging at this level
+   * does not perform the actual abort, something else needs to.
+   */
+  Fatal,
+
+  /**
+   * Number of supported log levels, with values in [0, NumLevels).
+   */
+  NumLevels,
+};
+
+namespace internal {
+
+/**
+ * Get the current timestamp to construct a log event.
+ *
+ * @retval Monotonically non-decreasing timestamp in system ticks.
+ */
+et_timestamp_t get_log_timestamp();
+
+/**
+ * Log a string message.
+ *
+ * Note: This is an internal function. Use the `ET_LOG` macro instead.
+ *
+ * @param[in] level Log severity level.
+ * @param[in] timestamp Timestamp (in system ticks) of the log event.
+ * @param[in] filename Name of the source file creating the log event.
+ * @param[in] function Name of the function creating the log event.
+ * @param[in] line Source file line of the caller.
+ * @param[in] format Format string.
+ * @param[in] args Variable argument list.
+ */
+ET_PRINTFLIKE(6, 0)
+void vlogf(
+    LogLevel level,
+    et_timestamp_t timestamp,
+    const char* filename,
+    const char* function,
+    size_t line,
+    const char* format,
+    va_list args);
+
+/**
+ * Log a string message.
+ *
+ * Note: This is an internal function. Use the `ET_LOG` macro instead.
+ *
+ * @param[in] level Log severity level.
+ * @param[in] timestamp Timestamp (in system ticks) of the log event.
+ * @param[in] filename Name of the source file creating the log event.
+ * @param[in] function Name of the function creating the log event.
+ * @param[in] line Source file line of the caller.
+ * @param[in] format Format string.
+ */
+ET_PRINTFLIKE(6, 7)
+inline void logf(
+    LogLevel level,
+    et_timestamp_t timestamp,
+    const char* filename,
+    const char* function,
+    size_t line,
+    const char* format,
+    ...) {
+#if ET_LOG_ENABLED
+  va_list args;
+  va_start(args, format);
+  internal::vlogf(level, timestamp, filename, function, line, format, args);
+  va_end(args);
+#endif // ET_LOG_ENABLED
+}
+
+} // namespace internal
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::LogLevel;
+} // namespace executor
+} // namespace torch
+
+#if ET_LOG_ENABLED
+
+/**
+ * Log a message at the given log severity level.
+ *
+ * @param[in] _level Log severity level.
+ * @param[in] _format Log message format string.
+ */
+#define ET_LOG(_level, _format, ...)                                 \
+  do {                                                               \
+    const auto _log_level = ::executorch::runtime::LogLevel::_level; \
+    if (static_cast<uint32_t>(_log_level) >=                         \
+        static_cast<uint32_t>(                                       \
+            ::executorch::runtime::LogLevel::ET_MIN_LOG_LEVEL)) {    \
+      const auto _timestamp =                                        \
+          ::executorch::runtime::internal::get_log_timestamp();      \
+      ::executorch::runtime::internal::logf(                         \
+          _log_level,                                                \
+          _timestamp,                                                \
+          ET_SHORT_FILENAME,                                         \
+          ET_FUNCTION,                                               \
+          ET_LINE,                                                   \
+          _format,                                                   \
+          ##__VA_ARGS__);                                            \
+    }                                                                \
+  } while (0)
+#else // ET_LOG_ENABLED
+
+/**
+ * Log a message at the given log severity level.
+ *
+ * @param[in] _level Log severity level.
+ * @param[in] _format Log message format string.
+ */
+#define ET_LOG(_level, _format, ...) ((void)0)
+
+#endif // ET_LOG_ENABLED
diff --git a/include/executorch/runtime/platform/platform.h b/include/executorch/runtime/platform/platform.h
new file mode 100644
index 00000000000..03cdef8eb2f
--- /dev/null
+++ b/include/executorch/runtime/platform/platform.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Platform abstraction layer to allow individual platform libraries to override
+ * symbols in ExecuTorch. PAL functions are defined as C functions so a platform
+ * library implementer can use C in lieu of C++.
+ */
+
+#pragma once
+
+// Use C-style includes so that C code can include this header.
+#include <stddef.h>
+#include <stdint.h>
+
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/types.h>
+
+/**
+ * Clients should neither define nor use this macro. Used to optionally declare
+ * the et_pal_*() functions as weak symbols.
+ *
+ * This provides a way to both:
+ * - Include the header and define weak symbols (used by the internal default
+ *   implementations)
+ * - Include the header and define strong symbols (used by client overrides)
+ */
+#ifndef ET_INTERNAL_PLATFORM_WEAKNESS
+#define ET_INTERNAL_PLATFORM_WEAKNESS
+#endif
+
+extern "C" {
+
+/**
+ * Represents the conversion ratio from system ticks to nanoseconds.
+ * To convert, use nanoseconds = ticks * numerator / denominator.
+ */
+typedef struct {
+  uint64_t numerator;
+  uint64_t denominator;
+} et_tick_ratio_t;
+
+/**
+ * Initialize the platform abstraction layer.
+ *
+ * This function should be called before any other function provided by the PAL
+ * to initialize any global state. Typically overridden by PAL implementer.
+ */
+void et_pal_init(void) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Immediately abort execution, setting the device into an error state, if
+ * available.
+ */
+ET_NORETURN void et_pal_abort(void) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Return a monotonically non-decreasing timestamp in system ticks.
+ *
+ * @retval Timestamp value in system ticks.
+ */
+et_timestamp_t et_pal_current_ticks(void) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Return the conversion rate from system ticks to nanoseconds as a fraction.
+ * To convert a system ticks to nanoseconds, multiply the tick count by the
+ * numerator and then divide by the denominator:
+ *   nanoseconds = ticks * numerator / denominator
+ *
+ * The utility method executorch::runtime::ticks_to_ns(et_timestamp_t) can also
+ * be used to perform the conversion for a given tick count. It is defined in
+ * torch/executor/runtime/platform/clock.h.
+ *
+ * @retval The ratio of nanoseconds to system ticks.
+ */
+et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void)
+    ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Severity level of a log message. Values must map to printable 7-bit ASCII
+ * uppercase letters.
+ */
+typedef enum {
+  kDebug = 'D',
+  kInfo = 'I',
+  kError = 'E',
+  kFatal = 'F',
+  kUnknown = '?', // Exception to the "uppercase letter" rule.
+} et_pal_log_level_t;
+
+/**
+ * Emit a log message via platform output (serial port, console, etc).
+ *
+ * @param[in] timestamp Timestamp of the log event in system ticks since boot.
+ * @param[in] level Severity level of the message. Must be a printable 7-bit
+ *     ASCII uppercase letter.
+ * @param[in] filename Name of the file that created the log event.
+ * @param[in] function Name of the function that created the log event.
+ * @param[in] line Line in the source file where the log event was created.
+ * @param[in] message Message string to log.
+ * @param[in] length Message string length.
+ */
+void et_pal_emit_log_message(
+    et_timestamp_t timestamp,
+    et_pal_log_level_t level,
+    const char* filename,
+    const char* function,
+    size_t line,
+    const char* message,
+    size_t length) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * NOTE: Core runtime code must not call this directly. It may only be called by
+ * a MemoryAllocator wrapper.
+ *
+ * Allocates size bytes of memory.
+ *
+ * @param[in] size Number of bytes to allocate.
+ * @returns the allocated memory, or nullptr on failure. Must be freed using
+ *     et_pal_free().
+ */
+void* et_pal_allocate(size_t size) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Frees memory allocated by et_pal_allocate().
+ *
+ * @param[in] ptr Pointer to memory to free. May be nullptr.
+ */
+void et_pal_free(void* ptr) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+} // extern "C"
diff --git a/include/executorch/runtime/platform/profiler.h b/include/executorch/runtime/platform/profiler.h
new file mode 100644
index 00000000000..d6362781394
--- /dev/null
+++ b/include/executorch/runtime/platform/profiler.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace executorch {
+namespace runtime {
+
+// Version string used to check for compatibility with post-processing
+// tool
+#define ET_PROF_VER 0x00000001
+
+// By default we support profiling upto 1024 perf events. Build
+// targets can override this to increase the profiling buffer size
+// during compilation.
+#ifndef MAX_PROFILE_EVENTS
+#define MAX_PROFILE_EVENTS 1024
+#endif
+// By default we support profiling upto 1024 memory allocation events.
+// Build targets can choose to override this, which will consequently have
+// the effect of increasing/decreasing the profiling buffer size.
+#ifndef MAX_MEM_PROFILE_EVENTS
+#define MAX_MEM_PROFILE_EVENTS 1024
+#endif
+// By default we support profiling only upto 16 allocators. If users
+// have more allocators than these then they can override this during
+// compilation time. There will be an increase/decrease in the profiling
+// buffer size based on the way this value is changed.
+#ifndef MEM_PROFILE_MAX_ALLOCATORS
+#define MEM_PROFILE_MAX_ALLOCATORS 32
+#endif
+// By default we support only one profiling block. If users want to profile
+// something that will be iterated on multiple times then they will have to
+// increment this to support their use case. In post-processing the stats for
+// all these iterations will be consolidated.
+#ifndef MAX_PROFILE_BLOCKS
+#define MAX_PROFILE_BLOCKS 2
+#endif
+
+#define PROF_NAME_MAX_LEN 32
+
+typedef struct alignas(8) {
+  union {
+    const char* name_str;
+    char name[PROF_NAME_MAX_LEN];
+  };
+  // chain_idx == -1 is a null value, when profile event happens out of chain
+  // execution
+  int32_t chain_idx;
+  uint32_t instruction_idx;
+  uint64_t start_time;
+  uint64_t end_time;
+} prof_event_t;
+
+typedef struct alignas(8) {
+  uint32_t allocator_id;
+  uint32_t allocation_size;
+} mem_prof_event_t;
+
+typedef struct alignas(8) {
+  char name[PROF_NAME_MAX_LEN];
+  uint64_t allocator_id;
+} prof_allocator_t;
+
+typedef struct alignas(8) {
+  uint8_t* prof_data;
+  uint32_t num_bytes;
+  uint32_t num_blocks;
+} prof_result_t;
+
+typedef struct alignas(8) {
+  char name[32];
+  uint32_t prof_ver;
+  uint32_t max_prof_entries;
+  uint32_t prof_entries;
+  uint32_t max_allocator_entries;
+  uint32_t allocator_entries;
+  uint32_t max_mem_prof_entries;
+  uint32_t mem_prof_entries;
+} prof_header_t;
+
+/*
+This is what the layout of the profiling buffer looks like.
+---------------------------------------
+| Profiling header                    |
+---------------------------------------
+| Profile events (Perf events)        |
+---------------------------------------
+| Memory allocators info              |
+---------------------------------------
+| Profile events (Memory allocations) |
+---------------------------------------
+*/
+
+// offsets of the various sections in the profiling buffer
+// Total size required for profiling buffer
+constexpr uint32_t prof_buf_size = sizeof(prof_header_t) +
+    sizeof(prof_event_t) * MAX_PROFILE_EVENTS +
+    sizeof(mem_prof_event_t) * MAX_MEM_PROFILE_EVENTS +
+    sizeof(prof_allocator_t) * MEM_PROFILE_MAX_ALLOCATORS;
+
+constexpr size_t prof_header_offset = 0;
+constexpr size_t prof_events_offset = sizeof(prof_header_t);
+constexpr size_t prof_mem_alloc_info_offset =
+    prof_events_offset + sizeof(prof_event_t) * MAX_PROFILE_EVENTS;
+constexpr size_t prof_mem_alloc_events_offset = prof_mem_alloc_info_offset +
+    sizeof(prof_allocator_t) * MEM_PROFILE_MAX_ALLOCATORS;
+
+// Set the initial state for the profiler assuming we're using the
+// statically allocated buffer declared in the profiler module.
+void profiler_init(void);
+
+// This starts the profiling of this event and returns a token
+// by which this event can be referred to in the future.
+uint32_t begin_profiling(const char* name);
+
+// End profiling event represented by token_id
+void end_profiling(uint32_t token_id);
+
+// Dump profiler results, return pointer to prof event array and number of
+// events in it.
+void dump_profile_stats(prof_result_t* prof_result);
+
+void reset_profile_stats();
+
+void track_allocation(int32_t id, uint32_t size);
+
+uint32_t track_allocator(const char* name);
+
+void profiling_create_block(const char* name);
+
+// This class enables scope based profiling where needed. Profiling
+// will be started when the object is created and will end when the
+// object goes out of scope.
+class ExecutorchProfiler {
+ public:
+  explicit ExecutorchProfiler(const char* name);
+
+  ~ExecutorchProfiler();
+
+ private:
+  uint32_t prof_tok;
+};
+
+typedef struct {
+  int32_t chain_idx;
+  uint32_t instruction_idx;
+} prof_state_t;
+
+const prof_state_t& get_profile_tls_state();
+
+void set_profile_tls_state(const prof_state_t& state);
+
+class ExecutorchProfilerInstructionScope {
+ public:
+  explicit ExecutorchProfilerInstructionScope(const prof_state_t& state);
+  ~ExecutorchProfilerInstructionScope();
+
+  // ScopeGuard: non-copyable, non-movable
+  ExecutorchProfilerInstructionScope(
+      const ExecutorchProfilerInstructionScope&) = delete;
+  ExecutorchProfilerInstructionScope& operator=(
+      const ExecutorchProfilerInstructionScope&) = delete;
+
+  ExecutorchProfilerInstructionScope(ExecutorchProfilerInstructionScope&&) =
+      delete;
+  ExecutorchProfilerInstructionScope& operator=(
+      ExecutorchProfilerInstructionScope&&) = delete;
+
+ private:
+  prof_state_t old_state_;
+};
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::begin_profiling;
+using ::executorch::runtime::dump_profile_stats;
+using ::executorch::runtime::end_profiling;
+using ::executorch::runtime::ExecutorchProfiler;
+using ::executorch::runtime::ExecutorchProfilerInstructionScope;
+using ::executorch::runtime::get_profile_tls_state;
+using ::executorch::runtime::mem_prof_event_t;
+using ::executorch::runtime::prof_allocator_t;
+using ::executorch::runtime::prof_buf_size;
+using ::executorch::runtime::prof_event_t;
+using ::executorch::runtime::prof_events_offset;
+using ::executorch::runtime::prof_header_offset;
+using ::executorch::runtime::prof_header_t;
+using ::executorch::runtime::prof_mem_alloc_events_offset;
+using ::executorch::runtime::prof_mem_alloc_info_offset;
+using ::executorch::runtime::prof_result_t;
+using ::executorch::runtime::prof_state_t;
+using ::executorch::runtime::profiler_init;
+using ::executorch::runtime::profiling_create_block;
+using ::executorch::runtime::reset_profile_stats;
+using ::executorch::runtime::set_profile_tls_state;
+using ::executorch::runtime::track_allocation;
+using ::executorch::runtime::track_allocator;
+} // namespace executor
+} // namespace torch
+
+#ifdef PROFILING_ENABLED
+
+#define EXECUTORCH_PROFILE_CREATE_BLOCK(name) \
+  ::executorch::runtime::profiling_create_block(name);
+
+// Convenience macros to begin and end profiling. These can be inserted
+// anywhere as it'll be ensured that for the prod builds these will
+// essentially be noops.
+#define EXECUTORCH_BEGIN_PROF(name) \
+  ::executorch::runtime::begin_profiling(name);
+
+#define EXECUTORCH_END_PROF(token_id) \
+  ::executorch::runtime::end_profiling(token_id);
+
+#define EXECUTORCH_SCOPE_PROF(name) \
+  ::executorch::runtime::ExecutorchProfiler profiler(name);
+
+#define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \
+  ::executorch::runtime::ExecutorchProfilerInstructionScope              \
+      __profiler_instruction_scope({chain_idx, instruction_idx});
+
+#define EXECUTORCH_DUMP_PROFILE_RESULTS(prof_result) \
+  ::executorch::runtime::dump_profile_stats(prof_result);
+
+#define EXECUTORCH_RESET_PROFILE_RESULTS() \
+  ::executorch::runtime::reset_profile_stats();
+
+#define EXECUTORCH_TRACK_ALLOCATOR(name) \
+  ::executorch::runtime::track_allocator(name);
+
+#define EXECUTORCH_TRACK_ALLOCATION(id, size) \
+  ::executorch::runtime::track_allocation(id, size);
+
+#else
+
+#define EXECUTORCH_PROFILE_CREATE_BLOCK(name) \
+  do {                                        \
+    (void)(name);                             \
+  } while (0)
+
+#define EXECUTORCH_BEGIN_PROF(name) \
+  {}
+
+#define EXECUTORCH_END_PROF(token_id) \
+  do {                                \
+    (void)(token_id);                 \
+  } while (0)
+
+#define EXECUTORCH_SCOPE_PROF(name) \
+  do {                              \
+    (void)(name);                   \
+  } while (0)
+
+#define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \
+  do {                                                                   \
+    (void)(chain_idx);                                                   \
+    (void)(instruction_idx);                                             \
+  } while (0)
+
+#define EXECUTORCH_DUMP_PROFILE_RESULTS(prof_result_test) \
+  memset(prof_result_test, 0, sizeof(::executorch::runtime::prof_result_t));
+
+#define EXECUTORCH_RESET_PROFILE_RESULTS() \
+  {}
+
+#define EXECUTORCH_TRACK_ALLOCATOR(name) ((void)(name), -1)
+
+#define EXECUTORCH_TRACK_ALLOCATION(id, size) \
+  do {                                        \
+    (void)(id);                               \
+    (void)(size);                             \
+  } while (0)
+
+#endif
diff --git a/include/executorch/runtime/platform/runtime.h b/include/executorch/runtime/platform/runtime.h
new file mode 100644
index 00000000000..375ae7959a8
--- /dev/null
+++ b/include/executorch/runtime/platform/runtime.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * ExecuTorch global runtime wrapper functions.
+ */
+
+#pragma once
+
+#include <executorch/runtime/platform/compiler.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Initialize the ExecuTorch global runtime.
+ */
+void runtime_init();
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::runtime_init;
+} // namespace executor
+} // namespace torch
diff --git a/include/executorch/runtime/platform/system.h b/include/executorch/runtime/platform/system.h
new file mode 100644
index 00000000000..18c6ee3094b
--- /dev/null
+++ b/include/executorch/runtime/platform/system.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Platform abstraction layer to allow individual host OS to override
+ * symbols in ExecuTorch. PAL functions are defined as C functions so an
+ * implementer can use C in lieu of C++.
+ */
+#pragma once
+
+/**
+ * To enable dynamic linking debugging capability on UNIX-like OS. If enabled
+ * and see an error like: `undefined symbol: dladdr`, install `libdl` to fix.
+ */
+#if defined(ET_USE_LIBDL)
+#include <dlfcn.h>
+#endif
+
+static constexpr const char* DYNAMIC_LIBRARY_NOT_SUPPORTED = "NOT_SUPPORTED";
+static constexpr const char* DYNAMIC_LIBRARY_NOT_FOUND = "NOT_FOUND";
+
+extern "C" {
+
+/**
+ * Return shared library .
+ *
+ * @param[in] addr Address to the symbol we are looking for in shared libraries.
+ * @retval The path to the shared library containing the symbol.
+ */
+inline const char* et_pal_get_shared_library_name(const void* addr) {
+#if defined(ET_USE_LIBDL)
+  Dl_info info;
+  if (dladdr(addr, &info) && info.dli_fname) {
+    return info.dli_fname;
+  } else {
+    return DYNAMIC_LIBRARY_NOT_FOUND;
+  }
+#endif
+  (void)addr;
+  return DYNAMIC_LIBRARY_NOT_SUPPORTED;
+}
+
+} // extern "C"
diff --git a/include/executorch/runtime/platform/test/stub_platform.h b/include/executorch/runtime/platform/test/stub_platform.h
new file mode 100644
index 00000000000..de5599b53b0
--- /dev/null
+++ b/include/executorch/runtime/platform/test/stub_platform.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/platform.h>
+#include <executorch/runtime/platform/types.h>
+
+/**
+ * An interface for intercepting calls to the PAL layer.
+ */
+class PlatformIntercept {
+ public:
+  PlatformIntercept() = default;
+
+  /// Called when et_pal_init() is called.
+  virtual void init() {}
+
+  // We can't intercept et_pal_abort() since it's marked NORETURN.
+
+  /// Called when et_pal_current_ticks() is called.
+  virtual et_timestamp_t current_ticks() {
+    return 0;
+  }
+
+  virtual et_tick_ratio_t ticks_to_ns_multiplier() {
+    return {1, 1};
+  }
+
+  /// Called when et_pal_emit_log_message() is called.
+  virtual void emit_log_message(
+      ET_UNUSED et_timestamp_t timestamp,
+      ET_UNUSED et_pal_log_level_t level,
+      ET_UNUSED const char* filename,
+      ET_UNUSED const char* function,
+      ET_UNUSED size_t line,
+      ET_UNUSED const char* message,
+      ET_UNUSED size_t length) {}
+
+  virtual void* allocate(ET_UNUSED size_t size) {
+    return nullptr;
+  }
+
+  virtual void free(ET_UNUSED void* ptr) {}
+
+  virtual ~PlatformIntercept() = default;
+};
+
+/**
+ * RAII type to install a PlatformIntercept for the duration of a test case.
+ */
+class InterceptWith {
+ public:
+  explicit InterceptWith(PlatformIntercept& pi) {
+    InterceptWith::install(&pi);
+  }
+
+  ~InterceptWith() {
+    InterceptWith::install(nullptr);
+  }
+
+ private:
+  /**
+   * Installs the PlatformIntercept to forward to when et_pal_* functions are
+   * called. To uninstall, pass in `nullptr`.
+   */
+  static void install(PlatformIntercept* pi);
+};
diff --git a/include/executorch/runtime/platform/types.h b/include/executorch/runtime/platform/types.h
new file mode 100644
index 00000000000..a91f357fe5a
--- /dev/null
+++ b/include/executorch/runtime/platform/types.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Public types used by the ExecuTorch Platform Abstraction Layer.
+ */
+
+#pragma once
+
+// Use C-style includes so that C code can include this header.
+#include <stdint.h>
+
+extern "C" {
+
+/// Platform timestamp in system ticks.
+typedef uint64_t et_timestamp_t;
+
+} // extern "C"
diff --git a/kernels/.watchman-cookie-madragna-mac-84335-443 b/kernels/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/kernels/quantized/libquantized_ops_aot_lib.dylib b/kernels/quantized/libquantized_ops_aot_lib.dylib
new file mode 100755
index 00000000000..4c7e831d2c1
Binary files /dev/null and b/kernels/quantized/libquantized_ops_aot_lib.dylib differ
diff --git a/preprocess.pt2 b/preprocess.pt2
new file mode 100644
index 00000000000..25dc7217e73
Binary files /dev/null and b/preprocess.pt2 differ
diff --git a/profiler/.watchman-cookie-madragna-mac-84335-443 b/profiler/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/runtime/.watchman-cookie-madragna-mac-84335-443 b/runtime/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/runtime/core/exec_aten/util/dim_order_util.h b/runtime/core/exec_aten/util/dim_order_util.h
index 07b3d5c2a97..f5eed09300b 100644
--- a/runtime/core/exec_aten/util/dim_order_util.h
+++ b/runtime/core/exec_aten/util/dim_order_util.h
@@ -12,7 +12,6 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
-
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/assert.h>
 #include <executorch/runtime/platform/compiler.h>
diff --git a/schema/.watchman-cookie-madragna-mac-84335-443 b/schema/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/scripts/.watchman-cookie-madragna-mac-84335-443 b/scripts/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/share/cmake/executorch-config.cmake b/share/cmake/executorch-config.cmake
new file mode 100644
index 00000000000..14abd4333c0
--- /dev/null
+++ b/share/cmake/executorch-config.cmake
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Config defining how CMake should find ExecuTorch package. CMake will search
+# for this file and find ExecuTorch package if it is installed. Typical usage
+# is:
+#
+# find_package(executorch REQUIRED)
+# -------
+#
+# Finds the ExecuTorch library
+#
+# This will define the following variables:
+#
+#   EXECUTORCH_FOUND        -- True if the system has the ExecuTorch library
+#   EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch
+#   EXECUTORCH_LIBRARIES    -- Libraries to link against
+#
+cmake_minimum_required(VERSION 3.19)
+
+# Find prebuilt _portable_lib.<EXT_SUFFIX>.so. This file should be installed under
+# <site-packages>/executorch/share/cmake
+
+# Find python
+if(DEFINED ENV{CONDA_DEFAULT_ENV} AND NOT $ENV{CONDA_DEFAULT_ENV} STREQUAL "base")
+  set(PYTHON_EXECUTABLE
+      python
+  )
+else()
+  set(PYTHON_EXECUTABLE
+      python3
+  )
+endif()
+
+# Get the Python version and platform information
+execute_process(
+    COMMAND ${PYTHON_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))"
+    OUTPUT_VARIABLE EXT_SUFFIX
+    RESULT_VARIABLE SYSCONFIG_RESULT
+    ERROR_VARIABLE SYSCONFIG_ERROR
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+if(SYSCONFIG_RESULT EQUAL 0)
+  message(STATUS "Sysconfig extension suffix: ${EXT_SUFFIX}")
+else()
+  message(FATAL_ERROR "Failed to retrieve sysconfig config var EXT_SUFFIX: ${SYSCONFIG_ERROR}")
+endif()
+
+find_library(
+  _portable_lib_LIBRARY
+  NAMES _portable_lib${EXT_SUFFIX}
+  PATHS "${CMAKE_CURRENT_LIST_DIR}/../../extension/pybindings/"
+)
+
+set(EXECUTORCH_LIBRARIES)
+set(EXECUTORCH_FOUND OFF)
+if(_portable_lib_LIBRARY)
+  set(EXECUTORCH_FOUND ON)
+  message(STATUS "ExecuTorch portable library is found at ${_portable_lib_LIBRARY}")
+  list(APPEND EXECUTORCH_LIBRARIES _portable_lib)
+  add_library(_portable_lib STATIC IMPORTED)
+  set(EXECUTORCH_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/../../include)
+  set_target_properties(_portable_lib PROPERTIES
+    IMPORTED_LOCATION "${_portable_lib_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${EXECUTORCH_INCLUDE_DIRS}"
+    CXX_STANDARD 17
+  )
+endif()
diff --git a/shim_et/.watchman-cookie-madragna-mac-84335-443 b/shim_et/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/src/.watchman-cookie-madragna-mac-84335-443 b/src/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/test/.watchman-cookie-madragna-mac-84335-443 b/test/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/third-party/.watchman-cookie-madragna-mac-84335-443 b/third-party/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/third-party/ao b/third-party/ao
index 5e5db7176cb..c9b9adc77c3 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit 5e5db7176cb6a966b1f2a56eac86c2b83c8b189f
+Subproject commit c9b9adc77c395730e1bd7235076103db9c99b744
diff --git a/tools/.watchman-cookie-madragna-mac-84335-443 b/tools/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/util/.watchman-cookie-madragna-mac-84335-443 b/util/.watchman-cookie-madragna-mac-84335-443
new file mode 100755
index 00000000000..e69de29bb2d
diff --git a/version.py b/version.py
new file mode 100644
index 00000000000..d2546c711b7
--- /dev/null
+++ b/version.py
@@ -0,0 +1,4 @@
+from typing import Optional
+__all__ = ["__version__", "git_version"]
+__version__ = "0.7.0a0+9d726e8"
+git_version: Optional[str] = '9d726e8df2397b4d35bfa845ddbc9ac7c0a1457c'