From 0f76731e217ebfa96127474690156e9af64d7219 Mon Sep 17 00:00:00 2001
From: Github Executorch
Date: Thu, 6 Feb 2025 19:14:56 -0800
Subject: [PATCH 1/9] Use c10 version of half/bfloat16 in executorch

Pull Request resolved: https://github.com/pytorch/executorch/pull/7040
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144111

Accomplished by importing relevant files from c10 into
executorch/runtime/core/portable_type/c10, and then using `using` in
the top-level ExecuTorch headers. This approach should keep the
ExecuTorch build hermetic for embedded use cases. In the future, we
should add a CI job to ensure the c10 files stay identical to the
PyTorch ones.

ghstack-source-id: 265190624
@exported-using-ghexport

Differential Revision: [D66106969](https://our.internmc.facebook.com/intern/diff/D66106969/)
---
 .lintrunner.toml                              |   4 +
 CMakeLists.txt                                |  13 +-
 backends/apple/coreml/CMakeLists.txt          |   2 +
 .../project.pbxproj                           |   4 +
 backends/arm/CMakeLists.txt                   |   3 +-
 backends/qualcomm/CMakeLists.txt              |   2 +
 backends/xnnpack/CMakeLists.txt               |   2 +-
 build/build_apple_frameworks.sh               |  14 +
 build/executorch-config.cmake                 |  13 +-
 .../project.pbxproj                           |  15 +-
 examples/arm/executor_runner/CMakeLists.txt   |   3 +-
 runtime/core/portable_type/bfloat16.h         | 328 +-------
 runtime/core/portable_type/bfloat16_math.h    | 278 +------
 runtime/core/portable_type/c10/TARGETS        |   8 +
 .../core/portable_type/c10/macros/Export.h    | 160 ++++
 .../core/portable_type/c10/macros/Macros.h    | 511 ++++++++++++
 runtime/core/portable_type/c10/targets.bzl    | 100 +++
 .../portable_type/c10/util/BFloat16-inl.h     | 343 ++++++++
 .../portable_type/c10/util/BFloat16-math.h    | 299 +++++++
 .../core/portable_type/c10/util/BFloat16.h    | 130 +++
 .../core/portable_type/c10/util/Half-inl.h    | 350 ++++++++
 runtime/core/portable_type/c10/util/Half.h    | 423 ++++++++++
 .../portable_type/c10/util/TypeSafeSignMath.h | 140 ++++
 .../core/portable_type/c10/util/bit_cast.h    |  44 +
 .../c10/util/floating_point_utils.h           |  33 +
 runtime/core/portable_type/half.h             | 759 +-----------------
 runtime/core/portable_type/targets.bzl        |   3 +
 runtime/kernel/test/CMakeLists.txt            |   5 +-
 shim/xplat/executorch/build/env_interface.bzl |   5 +-
 29 files changed, 2629 insertions(+), 1365 deletions(-)
 create mode 100644 runtime/core/portable_type/c10/TARGETS
 create mode 100644 runtime/core/portable_type/c10/macros/Export.h
 create mode 100644 runtime/core/portable_type/c10/macros/Macros.h
 create mode 100644 runtime/core/portable_type/c10/targets.bzl
 create mode 100644 runtime/core/portable_type/c10/util/BFloat16-inl.h
 create mode 100644 runtime/core/portable_type/c10/util/BFloat16-math.h
 create mode 100644 runtime/core/portable_type/c10/util/BFloat16.h
 create mode 100644 runtime/core/portable_type/c10/util/Half-inl.h
 create mode 100644 runtime/core/portable_type/c10/util/Half.h
 create mode 100644 runtime/core/portable_type/c10/util/TypeSafeSignMath.h
 create mode 100644 runtime/core/portable_type/c10/util/bit_cast.h
 create mode 100644 runtime/core/portable_type/c10/util/floating_point_utils.h
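The mechanics of the `using` approach described above show up in the
runtime/core/portable_type/bfloat16.h hunk below: each top-level ExecuTorch
header shrinks to a thin alias over the vendored c10 type. A minimal sketch of
that re-export pattern, using only names this patch itself adds:

    // Sketch: top-level ExecuTorch header re-exporting the vendored c10 type.
    #include <c10/util/BFloat16.h>

    namespace executorch::runtime::etensor {
    // Alias the c10 implementation rather than maintaining a parallel copy;
    // existing code keeps spelling the type
    // executorch::runtime::etensor::BFloat16.
    using c10::BFloat16;
    } // namespace executorch::runtime::etensor

Because this is a `using` declaration rather than a wrapper type, the c10 and
ExecuTorch spellings name the same type, so no conversions appear at the
boundary.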
diff --git a/.lintrunner.toml b/.lintrunner.toml
index 093f9cdbcbb..7667ac430d1 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -78,6 +78,8 @@ exclude_patterns = [
     # File contains @generated
     'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
     'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
+    # Want to be able to keep c10 in sync with PyTorch core.
+    'runtime/core/portable_type/c10/**',
 ]
 command = [
     'python',
@@ -261,6 +263,8 @@ exclude_patterns = [
     'extension/**',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
+    # Want to be able to keep c10 in sync with PyTorch core.
+    'runtime/core/portable_type/c10/**',
     'runtime/executor/tensor_parser_aten.cpp',
     'scripts/**',
     'test/**',
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca8d1bbbcf2..2b3acf5d27a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -366,7 +366,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
     "fix for this restriction."
   )
 endif()
-set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/..)
+set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
 
 #
 # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -549,6 +549,7 @@ endif()
 target_include_directories(
   executorch_core PUBLIC ${_common_include_directories}
 )
+target_compile_definitions(executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
 target_compile_options(executorch_core PUBLIC ${_common_compile_options})
 if(MAX_KERNEL_NUM)
   target_compile_definitions(
@@ -569,6 +570,7 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE)
   target_include_directories(
     executorch_core_shared PUBLIC ${_common_include_directories}
   )
+  target_compile_definitions(executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
   target_compile_options(
     executorch_core_shared PUBLIC ${_common_compile_options}
   )
@@ -589,6 +591,7 @@ endif()
 add_library(executorch ${_executorch__srcs})
 target_link_libraries(executorch PRIVATE executorch_core)
 target_include_directories(executorch PUBLIC ${_common_include_directories})
+target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
 target_compile_options(executorch PUBLIC ${_common_compile_options})
 
 target_link_options_shared_lib(executorch)
@@ -622,6 +625,12 @@ endif()
 
 # Install `executorch` library as well as `executorch-config.cmake` under
 # ${CMAKE_INSTALL_PREFIX}/
+install(DIRECTORY runtime/core/ DESTINATION include/executorch/runtime/core FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY runtime/kernel/ DESTINATION include/executorch/runtime/kernel FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY runtime/platform/ DESTINATION include/executorch/runtime/platform FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY extension/kernel_util/ DESTINATION include/executorch/extension/kernel_util FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY extension/tensor/ DESTINATION include/executorch/extension/tensor FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY extension/threadpool/ DESTINATION include/executorch/extension/threadpool FILES_MATCHING PATTERN "*.h")
 install(
   TARGETS executorch executorch_core
   DESTINATION lib
@@ -780,6 +789,8 @@ if(EXECUTORCH_BUILD_PYBIND)
   target_include_directories(
     util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
   )
+  target_compile_definitions(util PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
+
   target_compile_options(util PUBLIC ${_pybind_compile_options})
   target_link_libraries(util PRIVATE torch c10 executorch extension_tensor)
diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt
index 59f7f473ffe..70daed7d065 100644
--- a/backends/apple/coreml/CMakeLists.txt
+++ b/backends/apple/coreml/CMakeLists.txt
@@ -134,6 +134,8 @@ target_include_directories(
   coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
 )
 target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
+target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)
+target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
 target_link_libraries(coremldelegate PRIVATE executorch_core)
 
 if(EXECUTORCH_BUILD_DEVTOOLS)
diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj
index 6ff30636a3b..2115f5a4806 100644
--- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj
+++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj
@@ -830,6 +830,7 @@
 				GCC_OPTIMIZATION_LEVEL = 0;
 				GCC_PREPROCESSOR_DEFINITIONS = (
 					"DEBUG=1",
+					"C10_USING_CUSTOM_GENERATED_MACROS",
 					"$(inherited)",
 				);
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
@@ -911,6 +912,7 @@
 				DEVELOPMENT_TEAM = "";
 				GCC_PREPROCESSOR_DEFINITIONS = (
 					"DEBUG=1",
+					"C10_USING_CUSTOM_GENERATED_MACROS",
 					"ET_EVENT_TRACER_ENABLED=1",
 					"$(inherited)",
 				);
@@ -920,6 +922,7 @@
 					"$(SRCROOT)/../kvstore",
 					"$(SRCROOT)/../inmemoryfs",
 					"$(SRCROOT)/../include",
+					"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
 					"$(SRCROOT)/../sdk",
 					"$(SRCROOT)/../util",
 					"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
@@ -951,6 +954,7 @@
 					"$(SRCROOT)/../kvstore",
 					"$(SRCROOT)/../inmemoryfs",
 					"$(SRCROOT)/../include",
+					"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
 					"$(SRCROOT)/../sdk",
 					"$(SRCROOT)/../util",
 					"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index 2b7be537ceb..3c1dcf88a30 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -14,7 +14,8 @@ endif()
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type)
+add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
 
 # Third-party folder and Ethos-U driver inclued
 set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index bc0f51a236b..81d61fb0659 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -54,6 +54,7 @@ add_custom_command(
 )
 
 add_compile_options("-Wall" "-Werror" "-Wno-sign-compare")
+add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
 
 # GNU emit wanring for ignored attributes Unfortunately, we use [[maybe_unused]]
 # which can be ignored by GNU. So we make it a warning, not an error in GNU.
@@ -73,6 +74,7 @@ endif()
 include_directories(
   BEFORE ${_common_include_directories} ${QNN_SDK_ROOT}/include/QNN
   ${EXECUTORCH_SOURCE_DIR}/third-party/flatbuffers/include
+  ${EXECUTORCH_SOURCE_DIR}/runtime/core/portable_type
 )
 
 set(_qnn_schema__srcs
diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index a21ef4f6686..a703d67c1b2 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -119,7 +119,7 @@ target_include_directories(
 target_compile_options(xnnpack_backend PUBLIC ${_common_compile_options})
 target_link_options_shared_lib(xnnpack_backend)
 
-list(APPEND xnn_executor_runner_libs xnnpack_backend)
+list(APPEND xnn_executor_runner_libs xnnpack_backend executorch)
 
 # ios can only build library but not binary
 if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh
index a6494fb233f..7878cd86a3f 100755
--- a/build/build_apple_frameworks.sh
+++ b/build/build_apple_frameworks.sh
@@ -200,6 +200,20 @@ check_command "$BUCK2"
 //extension/tensor: \
 | rsync -av --files-from=- "$SOURCE_ROOT_DIR" "$HEADERS_PATH/executorch"
 
+# HACK: XCFrameworks don't appear to support exporting any build
+# options, but we need the following:
+# - runtime/core/portable/type/c10 reachable with `#include <c10/...>`
+# - exported -DC10_USING_CUSTOM_GENERATED_MACROS compiler flag
+# So, just patch our generated framework to do that.
+sed -i '' '1i\
+#define C10_USING_CUSTOM_GENERATED_MACROS
+' $HEADERS_PATH/executorch/runtime/core/portable_type/c10/macros/Macros.h
+sed -i '' '1i\
+#define C10_USING_CUSTOM_GENERATED_MACROS
+' $HEADERS_PATH/executorch/runtime/core/portable_type/c10/macros/Export.h
+ln -s $HEADERS_PATH/executorch/runtime/core/portable_type/c10 "$HEADERS_PATH/"
+
+
 cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.h "$HEADERS_PATH/executorch"
 cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.modulemap "$HEADERS_PATH"
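The sed invocations above prepend a `#define` to the two macro headers copied
into the framework. Both headers only include the CMake-generated
cmake_macros.h when C10_USING_CUSTOM_GENERATED_MACROS is undefined (see the
`#ifndef` guards in Export.h and Macros.h below), so baking the define into
line one spares framework consumers from needing the generated header or an
exported compiler flag. Roughly what the patched Macros.h starts with after
the script runs (a sketch of the resulting file, not part of the patch):

    #define C10_USING_CUSTOM_GENERATED_MACROS /* prepended by the sed above */
    #ifndef C10_MACROS_MACROS_H_
    #define C10_MACROS_MACROS_H_
    /* ... */
    #ifndef C10_USING_CUSTOM_GENERATED_MACROS /* now always false here */
    #include <c10/macros/cmake_macros.h>      /* never reached */
    #endif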
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 96e6390b6db..40c28d0b961 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -26,20 +26,21 @@
 
 cmake_minimum_required(VERSION 3.19)
 
-set(_root "${CMAKE_CURRENT_LIST_DIR}/../..")
+set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..")
 set(required_lib_list executorch executorch_core portable_kernels)
 set(EXECUTORCH_LIBRARIES)
-set(EXECUTORCH_INCLUDE_DIRS ${_root})
+set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type ${_root}/lib)
 foreach(lib ${required_lib_list})
   set(lib_var "LIB_${lib}")
   add_library(${lib} STATIC IMPORTED)
   find_library(
     ${lib_var} ${lib}
-    HINTS "${_root}"
+    HINTS "${_root}/lib"
     CMAKE_FIND_ROOT_PATH_BOTH
   )
   set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}")
-  target_include_directories(${lib} INTERFACE ${_root})
+  target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS)
+  target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type ${_root}/lib)
   list(APPEND EXECUTORCH_LIBRARIES ${lib})
 endforeach()
 
@@ -93,7 +94,7 @@ foreach(lib ${lib_list})
     set(lib_var "LIB_${lib}")
     find_library(
       ${lib_var} ${lib}
-      HINTS "${_root}"
+      HINTS "${_root}/lib"
      CMAKE_FIND_ROOT_PATH_BOTH
    )
    if(NOT ${lib_var})
@@ -109,7 +110,7 @@ foreach(lib ${lib_list})
       add_library(${lib} STATIC IMPORTED)
     endif()
     set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}")
-    target_include_directories(${lib} INTERFACE ${_root})
+    target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type ${_root}/lib)
     list(APPEND EXECUTORCH_LIBRARIES ${lib})
   endif()
 endforeach()
diff --git a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj
index 31e6eba6f1e..71301f113b2 100644
--- a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj
+++ b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj
@@ -214,6 +214,7 @@
 				GCC_OPTIMIZATION_LEVEL = 0;
 				GCC_PREPROCESSOR_DEFINITIONS = (
 					"DEBUG=1",
+					"C10_USING_CUSTOM_GENERATED_MACROS",
 					"$(inherited)",
 				);
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
@@ -271,6 +272,10 @@
 				ENABLE_USER_SCRIPT_SANDBOXING = YES;
 				GCC_C_LANGUAGE_STANDARD = gnu17;
 				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"C10_USING_CUSTOM_GENERATED_MACROS",
+					"$(inherited)",
+				);
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
 				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
 				GCC_WARN_UNDECLARED_SELECTOR = YES;
@@ -291,7 +296,10 @@
 				CODE_SIGN_STYLE = Automatic;
 				DEVELOPMENT_TEAM = "";
 				ENABLE_HARDENED_RUNTIME = YES;
-				HEADER_SEARCH_PATHS = "$(SRCROOT)/include";
+				HEADER_SEARCH_PATHS = (
+					"$(SRCROOT)/include",
+					"$(SRCROOT)/include/executorch/runtime/core/portable_type",
+				);
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				LIBRARY_SEARCH_PATHS = (
 					"$(SRCROOT)/libraries",
@@ -310,7 +318,10 @@
 				CODE_SIGN_STYLE = Automatic;
 				DEVELOPMENT_TEAM = "";
 				ENABLE_HARDENED_RUNTIME = YES;
-				HEADER_SEARCH_PATHS = "$(SRCROOT)/include";
+				HEADER_SEARCH_PATHS = (
+					"$(SRCROOT)/include",
+					"$(SRCROOT)/include/executorch/runtime/core/portable_type",
+				);
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				LIBRARY_SEARCH_PATHS = (
 					"$(SRCROOT)/libraries",
diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt
index c17888ca0b3..5621576fd70 100644
--- a/examples/arm/executor_runner/CMakeLists.txt
+++ b/examples/arm/executor_runner/CMakeLists.txt
@@ -382,8 +382,9 @@ target_link_options( arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.
 # ET headers and generated headers includes
 target_include_directories(
-  arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR}
+  arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${ET_DIR_PATH}/runtime/core/portable_type ${CMAKE_CURRENT_BINARY_DIR}
 )
+target_compile_definitions(arm_executor_runner PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
 
 if(SEMIHOSTING)
   target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING)
diff --git a/runtime/core/portable_type/bfloat16.h b/runtime/core/portable_type/bfloat16.h
index c1ff250885a..233d571478e 100644
--- a/runtime/core/portable_type/bfloat16.h
+++ b/runtime/core/portable_type/bfloat16.h
@@ -8,260 +8,15 @@
 
 #pragma once
 
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-#include <limits>
-#include <ostream>
-
-namespace executorch {
-namespace runtime {
-namespace etensor {
+#include <c10/util/BFloat16.h>
 
+namespace executorch::runtime::etensor {
+using c10::BFloat16;
 namespace internal {
-inline float f32_from_bits(uint16_t src) {
-  float res = 0;
-  uint32_t tmp = src;
-  tmp <<= 16;
-  std::memcpy(&res, &tmp, sizeof(tmp));
-  return res;
-}
-
-inline uint16_t round_to_nearest_even(float src) {
-  if (std::isnan(src)) {
-    return UINT16_C(0x7FC0);
-  }
-  uint32_t U32 = 0;
-  std::memcpy(&U32, &src, sizeof(U32));
-  uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
-  return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
-}
+using c10::detail::f32_from_bits;
+using c10::detail::round_to_nearest_even;
 } // namespace internal
-
-/**
- * The "brain floating-point" type, compatible with c10/util/BFloat16.h from
- * pytorch core.
- *
- * This representation uses 1 bit for the sign, 8 bits for the exponent and 7
- * bits for the mantissa.
- */
-struct alignas(2) BFloat16 {
-  uint16_t x;
-
-  BFloat16() = default;
-  struct from_bits_t {};
-  static constexpr from_bits_t from_bits() {
-    return from_bits_t();
-  }
-
-  constexpr BFloat16(unsigned short bits, from_bits_t) : x(bits) {}
-  /* implicit */ BFloat16(float value)
-      : x(internal::round_to_nearest_even(value)) {}
-  operator float() const {
-    return internal::f32_from_bits(x);
-  }
-};
-
-inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) {
-  out << (float)value;
-  return out;
-}
-
-/// Arithmetic
-
-inline BFloat16 operator+(const BFloat16& a, const BFloat16& b) {
-  return static_cast<float>(a) + static_cast<float>(b);
-}
-
-inline BFloat16 operator-(const BFloat16& a, const BFloat16& b) {
-  return static_cast<float>(a) - static_cast<float>(b);
-}
-
-inline BFloat16 operator*(const BFloat16& a, const BFloat16& b) {
-  return static_cast<float>(a) * static_cast<float>(b);
-}
-
-inline BFloat16 operator/(const BFloat16& a, const BFloat16& b) {
-  return static_cast<float>(a) / static_cast<float>(b);
-}
-
-inline BFloat16 operator-(const BFloat16& a) {
-  return -static_cast<float>(a);
-}
-
-inline BFloat16& operator+=(BFloat16& a, const BFloat16& b) {
-  a = a + b;
-  return a;
-}
-
-inline BFloat16& operator-=(BFloat16& a, const BFloat16& b) {
-  a = a - b;
-  return a;
-}
-
-inline BFloat16& operator*=(BFloat16& a, const BFloat16& b) {
-  a = a * b;
-  return a;
-}
-
-inline BFloat16& operator/=(BFloat16& a, const BFloat16& b) {
-  a = a / b;
-  return a;
-}
-
-inline BFloat16& operator|(BFloat16& a, const BFloat16& b) {
-  a.x = a.x | b.x;
-  return a;
-}
-
-inline BFloat16& operator^(BFloat16& a, const BFloat16& b) {
-  a.x = a.x ^ b.x;
-  return a;
-}
-
-inline BFloat16& operator&(BFloat16& a, const BFloat16& b) {
-  a.x = a.x & b.x;
-  return a;
-}
-
-/// Arithmetic with floats
-
-inline float operator+(BFloat16 a, float b) {
-  return static_cast<float>(a) + b;
-}
-inline float operator-(BFloat16 a, float b) {
-  return static_cast<float>(a) - b;
-}
-inline float operator*(BFloat16 a, float b) {
-  return static_cast<float>(a) * b;
-}
-inline float operator/(BFloat16 a, float b) {
-  return static_cast<float>(a) / b;
-}
-
-inline float operator+(float a, BFloat16 b) {
-  return a + static_cast<float>(b);
-}
-inline float operator-(float a, BFloat16 b) {
-  return a - static_cast<float>(b);
-}
-inline float operator*(float a, BFloat16 b) {
-  return a * static_cast<float>(b);
-}
-inline float operator/(float a, BFloat16 b) {
-  return a / static_cast<float>(b);
-}
-
-inline float& operator+=(float& a, const BFloat16& b) {
-  return a += static_cast<float>(b);
-}
-inline float& operator-=(float& a, const BFloat16& b) {
-  return a -= static_cast<float>(b);
-}
-inline float& operator*=(float& a, const BFloat16& b) {
-  return a *= static_cast<float>(b);
-}
-inline float& operator/=(float& a, const BFloat16& b) {
-  return a /= static_cast<float>(b);
-}
-
-/// Arithmetic with doubles
-
-inline double operator+(BFloat16 a, double b) {
-  return static_cast<double>(a) + b;
-}
-inline double operator-(BFloat16 a, double b) {
-  return static_cast<double>(a) - b;
-}
-inline double operator*(BFloat16 a, double b) {
-  return static_cast<double>(a) * b;
-}
-inline double operator/(BFloat16 a, double b) {
-  return static_cast<double>(a) / b;
-}
-
-inline double operator+(double a, BFloat16 b) {
-  return a + static_cast<double>(b);
-}
-inline double operator-(double a, BFloat16 b) {
-  return a - static_cast<double>(b);
-}
-inline double operator*(double a, BFloat16 b) {
-  return a * static_cast<double>(b);
-}
-inline double operator/(double a, BFloat16 b) {
-  return a / static_cast<double>(b);
-}
-
-/// Arithmetic with ints
-
-inline BFloat16 operator+(BFloat16 a, int b) {
-  return a + static_cast<BFloat16>(b);
-}
-inline BFloat16 operator-(BFloat16 a, int b) {
-  return a - static_cast<BFloat16>(b);
-}
-inline BFloat16 operator*(BFloat16 a, int b) {
-  return a * static_cast<BFloat16>(b);
-}
-inline BFloat16 operator/(BFloat16 a, int b) {
-  return a / static_cast<BFloat16>(b);
-}
-
-inline BFloat16 operator+(int a, BFloat16 b) {
-  return static_cast<BFloat16>(a) + b;
-}
-inline BFloat16 operator-(int a, BFloat16 b) {
-  return static_cast<BFloat16>(a) - b;
-}
-inline BFloat16 operator*(int a, BFloat16 b) {
-  return static_cast<BFloat16>(a) * b;
-}
-inline BFloat16 operator/(int a, BFloat16 b) {
-  return static_cast<BFloat16>(a) / b;
-}
-
-//// Arithmetic with int64_t
-
-inline BFloat16 operator+(BFloat16 a, int64_t b) {
-  return a + static_cast<BFloat16>(b);
-}
-inline BFloat16 operator-(BFloat16 a, int64_t b) {
-  return a - static_cast<BFloat16>(b);
-}
-inline BFloat16 operator*(BFloat16 a, int64_t b) {
-  return a * static_cast<BFloat16>(b);
-}
-inline BFloat16 operator/(BFloat16 a, int64_t b) {
-  return a / static_cast<BFloat16>(b);
-}
-
-inline BFloat16 operator+(int64_t a, BFloat16 b) {
-  return static_cast<BFloat16>(a) + b;
-}
-inline BFloat16 operator-(int64_t a, BFloat16 b) {
-  return static_cast<BFloat16>(a) - b;
-}
-inline BFloat16 operator*(int64_t a, BFloat16 b) {
-  return static_cast<BFloat16>(a) * b;
-}
-inline BFloat16 operator/(int64_t a, BFloat16 b) {
-  return static_cast<BFloat16>(a) / b;
-}
-
-// Overloading < and > operators, because std::max and std::min use them.
-
-inline bool operator>(BFloat16& lhs, BFloat16& rhs) {
-  return float(lhs) > float(rhs);
-}
-
-inline bool operator<(BFloat16& lhs, BFloat16& rhs) {
-  return float(lhs) < float(rhs);
-}
-
-} // namespace etensor
-} // namespace runtime
-} // namespace executorch
+} // namespace executorch::runtime::etensor
 
 namespace torch {
 namespace executor {
@@ -270,74 +25,3 @@ namespace executor {
 using ::executorch::runtime::etensor::BFloat16;
 } // namespace executor
 } // namespace torch
-
-namespace std {
-
-template <>
-class numeric_limits<torch::executor::BFloat16> {
- public:
-  static constexpr bool is_signed = true;
-  static constexpr bool is_specialized = true;
-  static constexpr bool is_integer = false;
-  static constexpr bool is_exact = false;
-  static constexpr bool has_infinity = true;
-  static constexpr bool has_quiet_NaN = true;
-  static constexpr bool has_signaling_NaN = true;
-  static constexpr auto has_denorm = numeric_limits<float>::has_denorm;
-  static constexpr auto has_denorm_loss =
-      numeric_limits<float>::has_denorm_loss;
-  static constexpr auto round_style = numeric_limits<float>::round_style;
-  static constexpr bool is_iec559 = false;
-  static constexpr bool is_bounded = true;
-  static constexpr bool is_modulo = false;
-  static constexpr int digits = 8;
-  static constexpr int digits10 = 2;
-  static constexpr int max_digits10 = 4;
-  static constexpr int radix = 2;
-  static constexpr int min_exponent = -125;
-  static constexpr int min_exponent10 = -37;
-  static constexpr int max_exponent = 128;
-  static constexpr int max_exponent10 = 38;
-  static constexpr auto traps = numeric_limits<float>::traps;
-  static constexpr auto tinyness_before =
-      numeric_limits<float>::tinyness_before;
-
-  static constexpr torch::executor::BFloat16 min() {
-    return torch::executor::BFloat16(
-        0x0080, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 lowest() {
-    return torch::executor::BFloat16(
-        0xFF7F, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 max() {
-    return torch::executor::BFloat16(
-        0x7F7F, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 epsilon() {
-    return torch::executor::BFloat16(
-        0x3C00, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 round_error() {
-    return torch::executor::BFloat16(
-        0x3F00, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 infinity() {
-    return torch::executor::BFloat16(
-        0x7F80, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 quiet_NaN() {
-    return torch::executor::BFloat16(
-        0x7FC0, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 signaling_NaN() {
-    return torch::executor::BFloat16(
-        0x7F80, torch::executor::BFloat16::from_bits());
-  }
-  static constexpr torch::executor::BFloat16 denorm_min() {
-    return torch::executor::BFloat16(
-        0x0001, torch::executor::BFloat16::from_bits());
-  }
-};
-
-} // namespace std
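The `round_to_nearest_even` helper deleted above (ExecuTorch now reaches the
identically-behaved c10::detail version) implements round-to-nearest-even by
adding a bias derived from bit 16 of the float representation. A
self-contained sketch that replicates the deleted helper and checks the two
halfway cases; the `from_bits` helper is ad hoc, for illustration only:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // Copy of the rounding helper deleted above.
    static uint16_t round_to_nearest_even(float src) {
      if (std::isnan(src)) {
        return UINT16_C(0x7FC0);
      }
      uint32_t U32 = 0;
      std::memcpy(&U32, &src, sizeof(U32));
      uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
      return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
    }

    static float from_bits(uint32_t bits) {
      float f = 0;
      std::memcpy(&f, &bits, sizeof(bits));
      return f;
    }

    int main() {
      assert(round_to_nearest_even(1.0f) == 0x3F80); // exactly representable
      // Halfway between 0x3F80 and 0x3F81; kept mantissa is even: round down.
      assert(round_to_nearest_even(from_bits(0x3F808000u)) == 0x3F80);
      // Halfway between 0x3F81 and 0x3F82; kept mantissa is odd: round up.
      assert(round_to_nearest_even(from_bits(0x3F818000u)) == 0x3F82);
      return 0;
    }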
diff --git a/runtime/core/portable_type/bfloat16_math.h b/runtime/core/portable_type/bfloat16_math.h
index 68ee77cf340..3f6bf14a464 100644
--- a/runtime/core/portable_type/bfloat16_math.h
+++ b/runtime/core/portable_type/bfloat16_math.h
@@ -11,280 +11,4 @@
 #include <executorch/runtime/core/portable_type/bfloat16.h>
 #include <executorch/runtime/core/portable_type/half.h>
 
-namespace std {
-
-template <typename T>
-struct is_reduced_floating_point
-    : std::integral_constant<
-          bool,
-          std::is_same<T, torch::executor::Half>::value ||
-              std::is_same<T, torch::executor::BFloat16>::value> {};
-
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T acos(T a) {
-  return std::acos(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T asin(T a) {
-  return std::asin(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T atan(T a) {
-  return std::atan(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T atanh(T a) {
-  return std::atanh(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T erf(T a) {
-  return std::erf(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T erfc(T a) {
-  return std::erfc(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T exp(T a) {
-  return std::exp(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T expm1(T a) {
-  return std::expm1(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline bool isfinite(T a) {
-  return std::isfinite(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T log(T a) {
-  return std::log(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T log10(T a) {
-  return std::log10(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T log1p(T a) {
-  return std::log1p(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T log2(T a) {
-  return std::log2(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T ceil(T a) {
-  return std::ceil(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T cos(T a) {
-  return std::cos(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T floor(T a) {
-  return std::floor(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T nearbyint(T a) {
-  return std::nearbyint(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T sin(T a) {
-  return std::sin(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T tan(T a) {
-  return std::tan(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T sinh(T a) {
-  return std::sinh(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T cosh(T a) {
-  return std::cosh(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T tanh(T a) {
-  return std::tanh(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T trunc(T a) {
-  return std::trunc(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T lgamma(T a) {
-  return std::lgamma(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T sqrt(T a) {
-  return std::sqrt(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T rsqrt(T a) {
-  return 1.0 / std::sqrt(float(a));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T abs(T a) {
-  return std::abs(float(a));
-}
-#if defined(_MSC_VER) && defined(__CUDACC__)
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T pow(T a, double b) {
-  return std::pow(float(a), float(b));
-}
-#else
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T pow(T a, double b) {
-  return std::pow(float(a), b);
-}
-#endif
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T pow(T a, T b) {
-  return std::pow(float(a), float(b));
-}
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T fmod(T a, T b) {
-  return std::fmod(float(a), float(b));
-}
-
-/*
-  The following function is inspired from the implementation in `musl`
-  Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
-  ----------------------------------------------------------------------
-  Copyright © 2005-2020 Rich Felker, et al.
-
-  Permission is hereby granted, free of charge, to any person obtaining
-  a copy of this software and associated documentation files (the
-  "Software"), to deal in the Software without restriction, including
-  without limitation the rights to use, copy, modify, merge, publish,
-  distribute, sublicense, and/or sell copies of the Software, and to
-  permit persons to whom the Software is furnished to do so, subject to
-  the following conditions:
-
-  The above copyright notice and this permission notice shall be
-  included in all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  ----------------------------------------------------------------------
- */
-template <
-    typename T,
-    typename std::enable_if<is_reduced_floating_point<T>::value, int>::type = 0>
-inline T nextafter(T from, T to) {
-  // Reference:
-  // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c
-  using int_repr_t = uint16_t;
-  constexpr uint8_t bits = 16;
-  union {
-    T f;
-    int_repr_t i;
-  } ufrom = {from}, uto = {to};
-
-  // get a mask to get the sign bit i.e. MSB
-  int_repr_t sign_mask = int_repr_t{1} << (bits - 1);
-
-  // short-circuit: if either is NaN, return NaN
-  if (from != from || to != to) {
-    return from + to;
-  }
-
-  // short-circuit: if they are exactly the same.
-  if (ufrom.i == uto.i) {
-    return from;
-  }
-
-  // mask the sign-bit to zero i.e. positive
-  // equivalent to abs(x)
-  int_repr_t abs_from = ufrom.i & ~sign_mask;
-  int_repr_t abs_to = uto.i & ~sign_mask;
-  if (abs_from == 0) {
-    // if both are zero but with different sign,
-    // preserve the sign of `to`.
-    if (abs_to == 0) {
-      return to;
-    }
-    // smallest subnormal with sign of `to`.
-    ufrom.i = (uto.i & sign_mask) | int_repr_t{1};
-    return ufrom.f;
-  }
-
-  // if abs(from) > abs(to) or sign(from) != sign(to)
-  if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) {
-    ufrom.i--;
-  } else {
-    ufrom.i++;
-  }
-
-  return ufrom.f;
-}
-
-} // namespace std
+#include <c10/util/BFloat16-math.h>
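The musl-derived `nextafter` deleted above steps the 16-bit integer
representation by one, which is exactly one bfloat16 ULP once the NaN, zero,
and sign-change cases have been peeled off. A tiny sketch of the common
bit-stepping case at 1.0 (the full routine above also handles the special
cases):

    #include <cassert>
    #include <cstdint>

    int main() {
      // bfloat16 bit patterns: 1.0f -> 0x3F80, 2.0f -> 0x4000.
      uint16_t from = 0x3F80;
      // Same sign and |from| < |to|: nextafter increments the integer
      // representation, moving one bfloat16 ULP toward 2.0.
      uint16_t next = static_cast<uint16_t>(from + 1);
      assert(next == 0x3F81); // 1 + 2^-7 == 1.0078125
      return 0;
    }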
diff --git a/runtime/core/portable_type/c10/TARGETS b/runtime/core/portable_type/c10/TARGETS
new file mode 100644
index 00000000000..2341af9282f
--- /dev/null
+++ b/runtime/core/portable_type/c10/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/runtime/core/portable_type/c10/macros/Export.h b/runtime/core/portable_type/c10/macros/Export.h
new file mode 100644
index 00000000000..cb68060ed81
--- /dev/null
+++ b/runtime/core/portable_type/c10/macros/Export.h
@@ -0,0 +1,160 @@
+#ifndef C10_MACROS_EXPORT_H_
+#define C10_MACROS_EXPORT_H_
+
+/* Header file to define the common scaffolding for exported symbols.
+ *
+ * Export is by itself a quite tricky situation to deal with, and if you are
+ * hitting this file, make sure you start with the background here:
+ * - Linux: https://gcc.gnu.org/wiki/Visibility
+ * - Windows:
+ *   https://docs.microsoft.com/en-us/cpp/cpp/dllexport-dllimport?view=vs-2017
+ *
+ * Do NOT include this file directly. Instead, use c10/macros/Macros.h
+ */
+
+// You do not need to edit this part of file unless you are changing the core
+// pytorch export abstractions.
+//
+// This part defines the C10 core export and import macros. This is controlled
+// by whether we are building shared libraries or not, which is determined
+// during build time and codified in c10/core/cmake_macros.h.
+// When the library is built as a shared lib, EXPORT and IMPORT will contain
+// visibility attributes. If it is being built as a static lib, then EXPORT
+// and IMPORT basically have no effect.
+
+// As a rule of thumb, you should almost NEVER mix static and shared builds for
+// libraries that depend on c10. AKA, if c10 is built as a static library, we
+// recommend everything dependent on c10 to be built statically. If c10 is
+// built as a shared library, everything dependent on it should be built as
+// shared. In the PyTorch project, all native libraries shall use the macro
+// C10_BUILD_SHARED_LIB to check whether pytorch is building shared or static
+// libraries.
+
+// For build systems that do not directly depend on CMake and directly build
+// from the source directory (such as Buck), one may not have a cmake_macros.h
+// file at all. In this case, the build system is responsible for providing
+// correct macro definitions corresponding to the cmake_macros.h.in file.
+//
+// In such scenarios, one should define the macro
+//     C10_USING_CUSTOM_GENERATED_MACROS
+// to inform this header that it does not need to include the cmake_macros.h
+// file.
+
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/macros/cmake_macros.h>
+#endif // C10_USING_CUSTOM_GENERATED_MACROS
+
+#ifdef _WIN32
+#define C10_HIDDEN
+#if defined(C10_BUILD_SHARED_LIBS)
+#define C10_EXPORT __declspec(dllexport)
+#define C10_IMPORT __declspec(dllimport)
+#else
+#define C10_EXPORT
+#define C10_IMPORT
+#endif
+#else // _WIN32
+#if defined(__GNUC__)
+#define C10_EXPORT __attribute__((__visibility__("default")))
+#define C10_HIDDEN __attribute__((__visibility__("hidden")))
+#else // defined(__GNUC__)
+#define C10_EXPORT
+#define C10_HIDDEN
+#endif // defined(__GNUC__)
+#define C10_IMPORT C10_EXPORT
+#endif // _WIN32
+
+#ifdef NO_EXPORT
+#undef C10_EXPORT
+#define C10_EXPORT
+#endif
+
+// Definition of an adaptive XX_API macro, that depends on whether you are
+// building the library itself or not, routes to XX_EXPORT and XX_IMPORT.
+// Basically, you will need to do this for each shared library that you are
+// building, and the instruction is as follows: assuming that you are building
+// a library called libawesome.so. You should:
+// (1) for your cmake target (usually done by "add_library(awesome, ...)"),
+//     define a macro called AWESOME_BUILD_MAIN_LIB using
+//     target_compile_options.
+// (2) define the AWESOME_API macro similar to the one below.
+// And in the source file of your awesome library, use AWESOME_API to
+// annotate public symbols.
+
+// Here, for the C10 library, we will define the macro C10_API for both import
+// and export.
+
+// This one is being used by libc10.so
+#ifdef C10_BUILD_MAIN_LIB
+#define C10_API C10_EXPORT
+#else
+#define C10_API C10_IMPORT
+#endif
+
+// This one is being used by libtorch.so
+#ifdef CAFFE2_BUILD_MAIN_LIB
+#define TORCH_API C10_EXPORT
+#else
+#define TORCH_API C10_IMPORT
+#endif
+
+// You may be wondering: Whose brilliant idea was it to split torch_cuda into
+// two pieces with confusing names?
+// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
+// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker
+// issues when linking big binaries.
+// (https://github.com/pytorch/pytorch/issues/39968) We had two choices:
+// (1) Stop supporting so many GPU architectures
+// (2) Do something else
+// We chose #2 and decided to split the behemoth that was torch_cuda into two
+// smaller libraries, one with most of the core kernel functions
+// (torch_cuda_cu) and the other that had..well..everything else
+// (torch_cuda_cpp). The idea was this: instead of linking our static
+// libraries (like the hefty libcudnn_static.a) with another huge library,
+// torch_cuda, and run into pesky relocation marker issues, we could link our
+// static libraries to a smaller part of torch_cuda (torch_cuda_cpp) and
+// avoid the issues.
+
+// libtorch_cuda_cu.so
+#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB
+#define TORCH_CUDA_CU_API C10_EXPORT
+#elif defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif
+
+// libtorch_cuda_cpp.so
+#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#elif defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CPP_API C10_IMPORT
+#endif
+
+// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the
+// same api)
+#ifdef TORCH_CUDA_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#define TORCH_CUDA_CU_API C10_EXPORT
+#elif !defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CPP_API C10_IMPORT
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif
+
+#if defined(TORCH_HIP_BUILD_MAIN_LIB)
+#define TORCH_HIP_API C10_EXPORT
+#else
+#define TORCH_HIP_API C10_IMPORT
+#endif
+
+#if defined(TORCH_XPU_BUILD_MAIN_LIB)
+#define TORCH_XPU_API C10_EXPORT
+#else
+#define TORCH_XPU_API C10_IMPORT
+#endif
+
+// Enums only need to be exported on windows for non-CUDA files
+#if defined(_WIN32) && defined(__CUDACC__)
+#define C10_API_ENUM C10_API
+#else
+#define C10_API_ENUM
+#endif
+
+#endif // C10_MACROS_MACROS_H_
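Export.h's comment above spells out the per-library recipe; a minimal sketch
for the hypothetical libawesome.so the comment uses as its example:

    // awesome_api.h: adaptive export macro for a library named "awesome".
    #include <c10/macros/Export.h>

    // AWESOME_BUILD_MAIN_LIB is defined (e.g. via target_compile_options)
    // only while compiling libawesome.so itself, so the macro expands to
    // C10_EXPORT inside the library and C10_IMPORT everywhere else.
    #ifdef AWESOME_BUILD_MAIN_LIB
    #define AWESOME_API C10_EXPORT
    #else
    #define AWESOME_API C10_IMPORT
    #endif

    AWESOME_API void awesome_entry_point(); // visible across the .so boundary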
diff --git a/runtime/core/portable_type/c10/macros/Macros.h b/runtime/core/portable_type/c10/macros/Macros.h
new file mode 100644
index 00000000000..919eb6c8567
--- /dev/null
+++ b/runtime/core/portable_type/c10/macros/Macros.h
@@ -0,0 +1,511 @@
+#ifndef C10_MACROS_MACROS_H_
+#define C10_MACROS_MACROS_H_
+#include <cassert>
+
+/* Main entry for c10/macros.
+ *
+ * In your code, include c10/macros/Macros.h directly, instead of individual
+ * files in this folder.
+ */
+
+// For build systems that do not directly depend on CMake and directly build
+// from the source directory (such as Buck), one may not have a cmake_macros.h
+// file at all. In this case, the build system is responsible for providing
+// correct macro definitions corresponding to the cmake_macros.h.in file.
+//
+// In such scenarios, one should define the macro
+//     C10_USING_CUSTOM_GENERATED_MACROS
+// to inform this header that it does not need to include the cmake_macros.h
+// file.
+
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/macros/cmake_macros.h>
+#endif // C10_USING_CUSTOM_GENERATED_MACROS
+
+#include <c10/macros/Export.h>
+
+#if defined(__clang__)
+#define __ubsan_ignore_float_divide_by_zero__ \
+  __attribute__((no_sanitize("float-divide-by-zero")))
+#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined")))
+#define __ubsan_ignore_signed_int_overflow__ \
+  __attribute__((no_sanitize("signed-integer-overflow")))
+#define __ubsan_ignore_pointer_overflow__ \
+  __attribute__((no_sanitize("pointer-overflow")))
+#define __ubsan_ignore_function__ __attribute__((no_sanitize("function")))
+#define __ubsan_ignore_float_cast_overflow__ \
+  __attribute__((no_sanitize("float-cast-overflow")))
+#else
+#define __ubsan_ignore_float_divide_by_zero__
+#define __ubsan_ignore_undefined__
+#define __ubsan_ignore_signed_int_overflow__
+#define __ubsan_ignore_pointer_overflow__
+#define __ubsan_ignore_function__
+#define __ubsan_ignore_float_cast_overflow__
+#endif
+
+// Detect address sanitizer as some stuff doesn't work with it
+#undef C10_ASAN_ENABLED
+
+// for clang
+#if defined(__has_feature)
+#if ((__has_feature(address_sanitizer)))
+#define C10_ASAN_ENABLED 1
+#endif
+#endif
+
+// for gcc
+#if defined(__SANITIZE_ADDRESS__)
+#if __SANITIZE_ADDRESS__
+#if !defined(C10_ASAN_ENABLED)
+#define C10_ASAN_ENABLED 1
+#endif
+#endif
+#endif
+
+#if !defined(C10_ASAN_ENABLED)
+#define C10_ASAN_ENABLED 0
+#endif
+
+// Detect undefined-behavior sanitizer (UBSAN)
+#undef C10_UBSAN_ENABLED
+
+// for clang or gcc >= 14
+// NB: gcc 14 adds support for Clang's __has_feature
+// https://gcc.gnu.org/gcc-14/changes.html
+// gcc < 14 doesn't have a macro for UBSAN
+// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc)
+// https://github.com/google/sanitizers/issues/765
+#if defined(__has_feature)
+#if ((__has_feature(undefined_behavior_sanitizer)))
+#define C10_UBSAN_ENABLED 1
+#endif
+#endif
+
+#if !defined(C10_UBSAN_ENABLED)
+#define C10_UBSAN_ENABLED 0
+#endif
+
+// Disable the copy and assignment operator for a class. Note that this will
+// disable the usage of the class in std containers.
+#define C10_DISABLE_COPY_AND_ASSIGN(classname) \
+  classname(const classname&) = delete;        \
+  classname& operator=(const classname&) = delete
+
+#define C10_CONCATENATE_IMPL(s1, s2) s1##s2
+#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2)
+
+#define C10_MACRO_EXPAND(args) args
+
+#define C10_STRINGIZE_IMPL(x) #x
+#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x)
+
+/**
+ * C10_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts with
+ * str and ends with a unique number.
+ */
+#ifdef __COUNTER__
+#define C10_UID __COUNTER__
+#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__)
+#else
+#define C10_UID __LINE__
+#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__)
+#endif
+
+#ifdef __has_cpp_attribute
+#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+#define C10_HAS_CPP_ATTRIBUTE(x) (0)
+#endif
+
+#ifndef FBCODE_CAFFE2
+/// DEPRECATED: Warn if a type or return value is discarded.
+#define C10_NODISCARD [[nodiscard]]
+
+/// DEPRECATED: Suppress an unused variable.
+#define C10_UNUSED [[maybe_unused]]
+#endif
+
+#if !defined(__has_attribute)
+#define __has_attribute(x) 0
+#endif
+
+// Direct port of LLVM_ATTRIBUTE_USED.
+#if __has_attribute(used)
+#define C10_USED __attribute__((__used__))
+#else
+#define C10_USED
+#endif
+
+#define C10_RESTRICT __restrict
+
+// Simply define the namespace, in case a dependent library want to refer to
+// the c10 namespace but not any nontrivial files.
+namespace c10 {}
+namespace c10::cuda {}
+namespace c10::hip {}
+namespace c10::xpu {}
+
+// Since C10 is the core library for caffe2 (and aten), we will simply reroute
+// all abstractions defined in c10 to be available in caffe2 as well.
+// This is only for backwards compatibility. Please use the symbols from the
+// c10 namespace where possible.
+namespace caffe2 {
+using namespace c10;
+}
+namespace at {
+using namespace c10;
+}
+namespace at::cuda {
+using namespace c10::cuda;
+} // namespace at::cuda
+
+// WARNING!!! THIS IS A GIANT HACK!!!
+// This line means you cannot simultaneously include c10/hip
+// and c10/cuda and then use them from the at::cuda namespace.
+// This is true in practice, because HIPIFY works inplace on
+// files in ATen/cuda, so it assumes that c10::hip is available
+// from at::cuda. This namespace makes that happen. When
+// HIPIFY is no longer out-of-place, we can switch the cuda
+// here to hip and everyone is happy.
+namespace at::cuda {
+using namespace c10::hip;
+} // namespace at::cuda
+
+namespace at::xpu {
+using namespace c10::xpu;
+} // namespace at::xpu
+
+// C10_LIKELY/C10_UNLIKELY
+//
+// These macros provide parentheses, so you can use these macros as:
+//
+//    if C10_LIKELY(some_expr) {
+//      ...
+//    }
+//
+// NB: static_cast to boolean is mandatory in C++, because __builtin_expect
+// takes a long argument, which means you may trigger the wrong conversion
+// without it.
+//
+#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
+#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
+#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
+#else
+#define C10_LIKELY(expr) (expr)
+#define C10_UNLIKELY(expr) (expr)
+#endif
+
+/// C10_NOINLINE - Functions whose declaration is annotated with this will not
+/// be inlined.
+#ifdef __GNUC__
+#define C10_NOINLINE __attribute__((noinline))
+#elif _MSC_VER
+#define C10_NOINLINE __declspec(noinline)
+#else
+#define C10_NOINLINE
+#endif
+
+#if defined(_MSC_VER)
+#define C10_ALWAYS_INLINE __forceinline
+#elif __has_attribute(always_inline) || defined(__GNUC__)
+#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline
+#else
+#define C10_ALWAYS_INLINE inline
+#endif
+
+// Unlike C10_ALWAYS_INLINE, C10_ALWAYS_INLINE_ATTRIBUTE can be used
+// on a lambda.
+#if defined(_MSC_VER)
+// MSVC 14.39 is reasonably recent and doesn't like
+// [[msvc::forceinline]] on a lambda, so don't try to use it.
+#define C10_ALWAYS_INLINE_ATTRIBUTE
+#elif __has_attribute(always_inline) || defined(__GNUC__)
+#define C10_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__))
+#else
+#define C10_ALWAYS_INLINE_ATTRIBUTE
+#endif
+
+#if defined(_MSC_VER)
+#define C10_ATTR_VISIBILITY_HIDDEN
+#elif defined(__GNUC__)
+#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden")))
+#else
+#define C10_ATTR_VISIBILITY_HIDDEN
+#endif
+
+#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN
+
+#include <cstdint>
+
+#ifdef __HIPCC__
+// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work.
+// We do this #include here so that C10_HOST_DEVICE and friends will Just Work.
+// See https://github.com/ROCm-Developer-Tools/HIP/issues/441
+#include <hip/hip_runtime.h>
+#endif
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define C10_HOST_DEVICE __host__ __device__
+#define C10_DEVICE __device__
+#define C10_HOST __host__
+// constants from
+// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
+// The maximum number of threads per multiprocessor is 1024 for Turing
+// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and
+// 2048 for all other architectures. You'll get warnings if you exceed these
+// constants. Hence, the following macros adjust the input values from the user
+// to resolve potential warnings.
+#if __CUDA_ARCH__ == 750
+constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
+#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890
+constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536;
+#else
+constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
+#endif
+// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently
+constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
+// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block
+// size. 256 is a good number for this fallback and should give good occupancy
+// and versatility across all architectures.
+constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
+// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it
+// turns out that although __launch_bounds__ can take constexpr, it
+// can't take a constexpr that has anything to do with templates.
+// Currently we use launch_bounds that depend on template arguments in
+// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK
+// and C10_MIN_BLOCKS_PER_SM are kept as macros.
+// Suppose you were planning to write __launch_bounds__(a, b), based on your
+// performance tuning on a modern GPU. Instead, you should write
+// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
+// which will also properly respect limits on old architectures.
+#define C10_MAX_THREADS_PER_BLOCK(val)           \
+  (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \
+                                         : CUDA_THREADS_PER_BLOCK_FALLBACK)
+#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm)        \
+  ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \
+        ? (blocks_per_sm)                                              \
+        : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) /         \
+           (threads_per_block))))
+// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__
+#define C10_LAUNCH_BOUNDS_0 \
+  __launch_bounds__(        \
+      256, 4) // default launch bounds that should give good occupancy and
+              // versatility across all architectures.
+#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \
+  __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))))
+#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \
+  __launch_bounds__(                                                  \
+      (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))),           \
+      (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm))))
+#else
+#define C10_HOST_DEVICE
+#define C10_HOST
+#define C10_DEVICE
+#endif
+
+#if defined(USE_ROCM)
+#define C10_HIP_HOST_DEVICE __host__ __device__
+#else
+#define C10_HIP_HOST_DEVICE
+#endif
+
+#if defined(USE_ROCM)
+#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h)
+#else
+#define C10_WARP_SIZE 32
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+#define __func__ __FUNCTION__
+#endif
+
+// CUDA_KERNEL_ASSERT checks the assertion
+// even when NDEBUG is defined. This is useful for important assertions in CUDA
+// code that would otherwise be suppressed when building Release.
+#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__)
+// Those platforms do not support assert()
+#define CUDA_KERNEL_ASSERT(cond)
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)
+#define SYCL_KERNEL_ASSERT(cond)
+#elif defined(_MSC_VER)
+#if defined(NDEBUG)
+extern "C" {
+C10_IMPORT
+#if defined(__SYCL_DEVICE_ONLY__)
+extern SYCL_EXTERNAL void _wassert(
+    const wchar_t* wexpr,
+    const wchar_t* wfile,
+    unsigned line);
+#else
+#if defined(__CUDA_ARCH__)
+__host__ __device__
+#endif // __CUDA_ARCH__
+    void
+    _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line);
+#endif // __SYCL_DEVICE_ONLY__
+}
+#endif // NDEBUG
+#define CUDA_KERNEL_ASSERT(cond)                 \
+  if (C10_UNLIKELY(!(cond))) {                   \
+    (void)(_wassert(                             \
+               _CRT_WIDE(#cond),                 \
+               _CRT_WIDE(__FILE__),              \
+               static_cast<unsigned>(__LINE__)), \
+           0);                                   \
+  }
+// TODO: This doesn't assert the message because I (chilli) couldn't figure out
+// a nice way to convert a char* to a wchar_t*
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)        \
+  if (C10_UNLIKELY(!(cond))) {                   \
+    (void)(_wassert(                             \
+               _CRT_WIDE(#cond),                 \
+               _CRT_WIDE(__FILE__),              \
+               static_cast<unsigned>(__LINE__)), \
+           0);                                   \
+  }
+#define SYCL_KERNEL_ASSERT(cond)                 \
+  if (C10_UNLIKELY(!(cond))) {                   \
+    (void)(_wassert(                             \
+               _CRT_WIDE(#cond),                 \
+               _CRT_WIDE(__FILE__),              \
+               static_cast<unsigned>(__LINE__)), \
+           0);                                   \
+  }
+#else // __APPLE__, _MSC_VER
+#if defined(NDEBUG)
+extern "C" {
+#if defined(__SYCL_DEVICE_ONLY__)
+extern SYCL_EXTERNAL void __assert_fail(
+    const char* expr,
+    const char* file,
+    unsigned int line,
+    const char* func);
+#else // __SYCL_DEVICE_ONLY__
+#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
+// CUDA supports __assert_fail function which are common for both device
+// and host side code.
+__host__ __device__
+#endif
+
+    // This forward declaration matching the declaration of __assert_fail
+    // exactly how it is in glibc in case parts of the program are compiled with
+    // different NDEBUG settings. Otherwise we might get 'ambiguous declaration'
+    // error. Note: On ROCm - this declaration serves for host side compilation.
+    void
+    __assert_fail(
+        const char* assertion,
+        const char* file,
+        unsigned int line,
+        const char* function) noexcept __attribute__((__noreturn__));
+
+#endif // __SYCL_DEVICE_ONLY__
+}
+#endif // NDEBUG
+// ROCm disable kernel assert by default
+#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM)
+#define CUDA_KERNEL_ASSERT(cond)
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)
+#define SYCL_KERNEL_ASSERT(cond)
+#else
+#define CUDA_KERNEL_ASSERT(cond)                                         \
+  if (C10_UNLIKELY(!(cond))) {                                           \
+    __assert_fail(                                                       \
+        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
+#define CUDA_KERNEL_ASSERT_MSG(cond, msg)                              \
+  if (C10_UNLIKELY(!(cond))) {                                         \
+    __assert_fail(                                                     \
+        msg, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
+#define SYCL_KERNEL_ASSERT(cond)                                         \
+  if (C10_UNLIKELY(!(cond))) {                                           \
+    __assert_fail(                                                       \
+        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
+#endif // C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM
+#endif // __APPLE__
+
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
+
+#if defined(__ANDROID__)
+#define C10_ANDROID 1
+#define C10_MOBILE 1
+#elif (                   \
+    defined(__APPLE__) && \
+    (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
+#define C10_IOS 1
+#define C10_MOBILE 1
+#endif // ANDROID / IOS
+
+#if defined(C10_MOBILE) && C10_MOBILE
+#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline
+#else
+#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE
+#endif
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr
+#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr
+
+#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \
+  static constexpr const char field[] = val;
+#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val)
+#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+
+#ifndef HAS_DEMANGLE
+#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__)
+#define HAS_DEMANGLE 0
+#elif defined(__APPLE__) && \
+    (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)
+#define HAS_DEMANGLE 0
+#else
+#define HAS_DEMANGLE 1
+#endif
+#endif // HAS_DEMANGLE
+
+#define _C10_PRAGMA__(string) _Pragma(#string)
+#define _C10_PRAGMA_(string) _C10_PRAGMA__(string)
+
+#ifdef __clang__
+#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push")
+#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop")
+#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \
+  _C10_PRAGMA_(clang diagnostic ignored flag)
+#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag)
+#else
+#define C10_CLANG_DIAGNOSTIC_PUSH()
+#define C10_CLANG_DIAGNOSTIC_POP()
+#define C10_CLANG_DIAGNOSTIC_IGNORE(flag)
+#define C10_CLANG_HAS_WARNING(flag) 0
+#endif
+
+#ifdef __clang__
+
+#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning)         \
+  _C10_PRAGMA_(clang diagnostic push)                               \
+  _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \
+  _C10_PRAGMA_(clang diagnostic ignored warning)
+
+#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop)
+
+#elif __GNUC__
+
+#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \
+  _C10_PRAGMA_(GCC diagnostic push)                         \
+  _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas")          \
+  _C10_PRAGMA_(GCC diagnostic ignored warning)
+
+#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC diagnostic pop)
+
+#else
+
+#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning)
+#define C10_DIAGNOSTIC_POP()
+
+#endif
+
+#endif // C10_MACROS_MACROS_H_
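To make the launch-bounds clamping in Macros.h concrete, here is a plain C++
mirror of the macro arithmetic, assuming an SM 8.6 part where
CUDA_MAX_THREADS_PER_SM is 1536; the constexpr functions are illustrative
stand-ins (the real versions stay macros so they can feed __launch_bounds__):

    #include <cstdint>

    constexpr uint32_t kMaxThreadsPerSm = 1536;  // __CUDA_ARCH__ == 860
    constexpr uint32_t kMaxThreadsPerBlock = 1024;
    constexpr uint32_t kFallback = 256;  // CUDA_THREADS_PER_BLOCK_FALLBACK

    // Mirrors C10_MAX_THREADS_PER_BLOCK.
    constexpr uint32_t max_threads(uint32_t val) {
      return val <= kMaxThreadsPerBlock ? val : kFallback;
    }
    // Mirrors C10_MIN_BLOCKS_PER_SM.
    constexpr uint32_t min_blocks(uint32_t tpb, uint32_t bpsm) {
      return tpb * bpsm <= kMaxThreadsPerSm
          ? bpsm
          : (kMaxThreadsPerSm + tpb - 1) / tpb;
    }

    // __launch_bounds__(512, 4) tuned on a 2048-thread/SM GPU would demand
    // 2048 resident threads on an 8.6 part; the macros clamp it to (512, 3).
    static_assert(max_threads(512) == 512, "within the per-block limit");
    static_assert(min_blocks(512, 4) == 3, "(1536 + 511) / 512 == 3");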
diff --git a/runtime/core/portable_type/c10/targets.bzl b/runtime/core/portable_type/c10/targets.bzl
new file mode 100644
index 00000000000..1e60b70a4b8
--- /dev/null
+++ b/runtime/core/portable_type/c10/targets.bzl
@@ -0,0 +1,100 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def get_sleef_preprocessor_flags():
+    if runtime.is_oss:
+        return []
+    return ["-DAT_BUILD_ARM_VEC256_WITH_SLEEF"]
+
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+    runtime.cxx_library(
+        name = "c10",
+        header_namespace = "c10",
+        exported_headers = [
+            "macros/Export.h",
+            "macros/Macros.h",
+            "util/BFloat16.h",
+            "util/BFloat16-inl.h",
+            "util/BFloat16-math.h",
+            "util/Half.h",
+            "util/Half-inl.h",
+            "util/TypeSafeSignMath.h",
+            "util/bit_cast.h",
+            "util/floating_point_utils.h",
+        ],
+        exported_preprocessor_flags = [
+            # NOTE: If we define C10_EMBEDDED to prevent Half and
+            # BFloat16 from supporting streams, non-ExecuTorch-core
+            # uses of other ATen headers that try to print ATen
+            # primitive types fail to build because, apparently, there
+            # are implicit conversions from Half/BFloat16 to a variety
+            # of primitive types, not just float. Since merely
+            # including <ostream> shouldn't result in any runtime
+            # artifacts if stream code is never actually called, it
+            # seems best to just not define C10_EMBEDDED, but if you
+            # need it, it's there.
+            # "-DC10_EMBEDDED",
+            "-DC10_USE_GLOG",
+            "-DC10_USE_MINIMAL_GLOG",
+            "-DC10_USING_CUSTOM_GENERATED_MACROS",
+        ],
+        visibility = [
+            "//executorch/runtime/core/portable_type/...",
+        ],
+        deps = select({
+            "DEFAULT": [],
+            # Half-inl.h depends on vec_half.h from ATen, but only when building for x86.
+            "ovr_config//cpu:x86_64": [
+                ":aten_headers_for_executorch",
+            ],
+        }),
+    )
+
+    runtime.cxx_library(
+        name = "aten_headers_for_executorch",
+        srcs = [],
+        visibility = ["//executorch/kernels/optimized/..."],
+        exported_deps = select({
+            "DEFAULT": [],
+            "ovr_config//cpu:arm64": [
+                "fbsource//third-party/sleef:sleef_arm",
+            ] if not runtime.is_oss else [],
+            # fbsource//third-party/sleef:sleef currently fails to
+            # link with missing symbols, hence the fbcode-specific dep below.
+        }),
+        fbcode_exported_deps = ([
+            "//caffe2:aten-headers-cpu",
+            "//caffe2:generated-config-header",
+            "//caffe2/c10:c10_headers",
+        ] + select({
+            "DEFAULT": [],
+            "ovr_config//cpu:x86_64": [
+                "third-party//sleef:sleef",
+            ]
+        })) if not runtime.is_oss else [],
+        fbcode_exported_preprocessor_flags = [
+            # We don't -DCPU_CAPABILITY=AVX2 because that trips
+            # -Wmacro-redefined, and we only care about getting
+            # reasonable vectorization and Sleef support.
+            "-DCPU_CAPABILITY_AVX2",
+            "-DHAVE_AVX2_CPU_DEFINITION",
+            "-DSTANDALONE_TORCH_HEADER",
+        ] + get_sleef_preprocessor_flags(),
+        xplat_exported_deps = [
+            "//xplat/caffe2:aten_header",
+            "//xplat/caffe2:generated_aten_config_header",
+            "//xplat/caffe2/c10:c10_headers",
+        ],
+        exported_preprocessor_flags = select({
+            # Intentionally punting on non-fbcode x86 sleef support
+            # for now because of fbsource//third-party/sleef:sleef
+            # linker failure.
+ "ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), + "DEFAULT": [], + }) + ["-DSTANDALONE_TORCH_HEADER"], + ) diff --git a/runtime/core/portable_type/c10/util/BFloat16-inl.h b/runtime/core/portable_type/c10/util/BFloat16-inl.h new file mode 100644 index 00000000000..10ab0c828d7 --- /dev/null +++ b/runtime/core/portable_type/c10/util/BFloat16-inl.h @@ -0,0 +1,343 @@ +#pragma once + +#include +#include + +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#else +#include // for SYCL 2020 +#endif +#include +#endif + +namespace c10 { + +/// Constructors +inline C10_HOST_DEVICE BFloat16::BFloat16(float value) + : +#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \ + __CUDA_ARCH__ >= 800 + x(__bfloat16_as_ushort(__float2bfloat16(value))) +#elif defined(__SYCL_DEVICE_ONLY__) && \ + defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + x(c10::bit_cast(sycl::ext::oneapi::bfloat16(value))) +#else + // RNE by default + x(detail::round_to_nearest_even(value)) +#endif +{ +} + +/// Implicit conversions +inline C10_HOST_DEVICE BFloat16::operator float() const { +#if defined(__CUDACC__) && !defined(USE_ROCM) + return __bfloat162float(*reinterpret_cast(&x)); +#elif defined(__SYCL_DEVICE_ONLY__) && \ + defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + return float(*reinterpret_cast(&x)); +#else + return detail::f32_from_bits(x); +#endif +} + +#if defined(__CUDACC__) && !defined(USE_ROCM) +inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator __nv_bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +inline C10_HOST_DEVICE BFloat16::BFloat16( + const sycl::ext::oneapi::bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator sycl::ext::oneapi::bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_DEVICE BFloat16 __ldg(const BFloat16* ptr) { +#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __ldg(reinterpret_cast(ptr)); +#else + return *ptr; +#endif +} +#endif + +/// Arithmetic + +inline C10_HOST_DEVICE BFloat16 +operator+(const BFloat16& a, const BFloat16& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 +operator-(const BFloat16& a, const BFloat16& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 +operator*(const BFloat16& a, const BFloat16& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator/(const BFloat16& a, const BFloat16& b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator-(const BFloat16& a) { + return -static_cast(a); +} + +inline C10_HOST_DEVICE BFloat16& operator+=(BFloat16& a, const BFloat16& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator-=(BFloat16& a, const BFloat16& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator*=(BFloat16& a, const BFloat16& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator/=(BFloat16& a, const 
+/// Arithmetic
+
+inline C10_HOST_DEVICE BFloat16
+operator+(const BFloat16& a, const BFloat16& b) {
+  return static_cast<float>(a) + static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16
+operator-(const BFloat16& a, const BFloat16& b) {
+  return static_cast<float>(a) - static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16
+operator*(const BFloat16& a, const BFloat16& b) {
+  return static_cast<float>(a) * static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator/(const BFloat16& a, const BFloat16& b)
+    __ubsan_ignore_float_divide_by_zero__ {
+  return static_cast<float>(a) / static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator-(const BFloat16& a) {
+  return -static_cast<float>(a);
+}
+
+inline C10_HOST_DEVICE BFloat16& operator+=(BFloat16& a, const BFloat16& b) {
+  a = a + b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator-=(BFloat16& a, const BFloat16& b) {
+  a = a - b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator*=(BFloat16& a, const BFloat16& b) {
+  a = a * b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator/=(BFloat16& a, const BFloat16& b) {
+  a = a / b;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator|(BFloat16& a, const BFloat16& b) {
+  a.x = a.x | b.x;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator^(BFloat16& a, const BFloat16& b) {
+  a.x = a.x ^ b.x;
+  return a;
+}
+
+inline C10_HOST_DEVICE BFloat16& operator&(BFloat16& a, const BFloat16& b) {
+  a.x = a.x & b.x;
+  return a;
+}
+
+/// Arithmetic with floats
+
+inline C10_HOST_DEVICE float operator+(BFloat16 a, float b) {
+  return static_cast<float>(a) + b;
+}
+inline C10_HOST_DEVICE float operator-(BFloat16 a, float b) {
+  return static_cast<float>(a) - b;
+}
+inline C10_HOST_DEVICE float operator*(BFloat16 a, float b) {
+  return static_cast<float>(a) * b;
+}
+inline C10_HOST_DEVICE float operator/(BFloat16 a, float b) {
+  return static_cast<float>(a) / b;
+}
+
+inline C10_HOST_DEVICE float operator+(float a, BFloat16 b) {
+  return a + static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator-(float a, BFloat16 b) {
+  return a - static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator*(float a, BFloat16 b) {
+  return a * static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float operator/(float a, BFloat16 b) {
+  return a / static_cast<float>(b);
+}
+
+inline C10_HOST_DEVICE float& operator+=(float& a, const BFloat16& b) {
+  return a += static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator-=(float& a, const BFloat16& b) {
+  return a -= static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator*=(float& a, const BFloat16& b) {
+  return a *= static_cast<float>(b);
+}
+inline C10_HOST_DEVICE float& operator/=(float& a, const BFloat16& b) {
+  return a /= static_cast<float>(b);
+}
+
+/// Arithmetic with doubles
+
+inline C10_HOST_DEVICE double operator+(BFloat16 a, double b) {
+  return static_cast<double>(a) + b;
+}
+inline C10_HOST_DEVICE double operator-(BFloat16 a, double b) {
+  return static_cast<double>(a) - b;
+}
+inline C10_HOST_DEVICE double operator*(BFloat16 a, double b) {
+  return static_cast<double>(a) * b;
+}
+inline C10_HOST_DEVICE double operator/(BFloat16 a, double b) {
+  return static_cast<double>(a) / b;
+}
+
+inline C10_HOST_DEVICE double operator+(double a, BFloat16 b) {
+  return a + static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator-(double a, BFloat16 b) {
+  return a - static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator*(double a, BFloat16 b) {
+  return a * static_cast<double>(b);
+}
+inline C10_HOST_DEVICE double operator/(double a, BFloat16 b) {
+  return a / static_cast<double>(b);
+}
+
+/// Arithmetic with ints
+
+inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int b) {
+  return a + static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int b) {
+  return a - static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int b) {
+  return a * static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int b) {
+  return a / static_cast<BFloat16>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator+(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) + b;
+}
+inline C10_HOST_DEVICE BFloat16 operator-(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) - b;
+}
+inline C10_HOST_DEVICE BFloat16 operator*(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) * b;
+}
+inline C10_HOST_DEVICE BFloat16 operator/(int a, BFloat16 b) {
+  return static_cast<BFloat16>(a) / b;
+}
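The mixed-type overloads above (and the int64_t overloads just below) encode the promotion rules: combining BFloat16 with float or double computes and returns float/double, while combining with an integer converts the integer to BFloat16 and stays in BFloat16. A compile-time sketch, assuming the same include path as the earlier examples:

    #include <type_traits>
    #include <c10/util/BFloat16.h>

    int main() {
      c10::BFloat16 b(2.0f);
      auto f = b + 1.0f; // float overload: computed and returned as float
      auto g = b + 1;    // int overload: 1 becomes BFloat16(1); result is BFloat16
      static_assert(std::is_same_v<decltype(f), float>, "promotes to float");
      static_assert(std::is_same_v<decltype(g), c10::BFloat16>, "stays bf16");
      return 0;
    }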
+//// Arithmetic with int64_t
+
+inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int64_t b) {
+  return a + static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int64_t b) {
+  return a - static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int64_t b) {
+  return a * static_cast<BFloat16>(b);
+}
+inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int64_t b) {
+  return a / static_cast<BFloat16>(b);
+}
+
+inline C10_HOST_DEVICE BFloat16 operator+(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) + b;
+}
+inline C10_HOST_DEVICE BFloat16 operator-(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) - b;
+}
+inline C10_HOST_DEVICE BFloat16 operator*(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) * b;
+}
+inline C10_HOST_DEVICE BFloat16 operator/(int64_t a, BFloat16 b) {
+  return static_cast<BFloat16>(a) / b;
+}
+
+// Overloading < and > operators, because std::max and std::min use them.
+
+inline C10_HOST_DEVICE bool operator>(BFloat16& lhs, BFloat16& rhs) {
+  return float(lhs) > float(rhs);
+}
+
+inline C10_HOST_DEVICE bool operator<(BFloat16& lhs, BFloat16& rhs) {
+  return float(lhs) < float(rhs);
+}
+
+} // namespace c10
+
+namespace std {
+
+template <>
+class numeric_limits<c10::BFloat16> {
+ public:
+  static constexpr bool is_signed = true;
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_exact = false;
+  static constexpr bool has_infinity = true;
+  static constexpr bool has_quiet_NaN = true;
+  static constexpr bool has_signaling_NaN = true;
+  static constexpr auto has_denorm = numeric_limits<float>::has_denorm;
+  static constexpr auto has_denorm_loss =
+      numeric_limits<float>::has_denorm_loss;
+  static constexpr auto round_style = numeric_limits<float>::round_style;
+  static constexpr bool is_iec559 = false;
+  static constexpr bool is_bounded = true;
+  static constexpr bool is_modulo = false;
+  static constexpr int digits = 8;
+  static constexpr int digits10 = 2;
+  static constexpr int max_digits10 = 4;
+  static constexpr int radix = 2;
+  static constexpr int min_exponent = -125;
+  static constexpr int min_exponent10 = -37;
+  static constexpr int max_exponent = 128;
+  static constexpr int max_exponent10 = 38;
+  static constexpr auto traps = numeric_limits<float>::traps;
+  static constexpr auto tinyness_before =
+      numeric_limits<float>::tinyness_before;
+
+  static constexpr c10::BFloat16 min() {
+    return c10::BFloat16(0x0080, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 lowest() {
+    return c10::BFloat16(0xFF7F, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 max() {
+    return c10::BFloat16(0x7F7F, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 epsilon() {
+    return c10::BFloat16(0x3C00, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 round_error() {
+    return c10::BFloat16(0x3F00, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 infinity() {
+    return c10::BFloat16(0x7F80, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 quiet_NaN() {
+    return c10::BFloat16(0x7FC0, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 signaling_NaN() {
+    return c10::BFloat16(0x7F80, c10::BFloat16::from_bits());
+  }
+  static constexpr c10::BFloat16 denorm_min() {
+    return c10::BFloat16(0x0001, c10::BFloat16::from_bits());
+  }
+};
+
+} // namespace std
+
+C10_CLANG_DIAGNOSTIC_POP()
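With the specialization above, generic code can query bfloat16 properties through std::numeric_limits as usual. A quick sketch of a few of the constants and the floats their bit patterns decode to, under the same include-path assumption as before:

    #include <cstdio>
    #include <limits>
    #include <c10/util/BFloat16.h>

    int main() {
      using L = std::numeric_limits<c10::BFloat16>;
      // epsilon 0x3C00 -> 2^-7 = 0.0078125 (7 explicit mantissa bits)
      // min     0x0080 -> 2^-126, the smallest normal value
      // max     0x7F7F -> (255/128) * 2^127, roughly 3.3895314e38
      std::printf(
          "eps=%g min=%g max=%g\n",
          static_cast<float>(L::epsilon()),
          static_cast<float>(L::min()),
          static_cast<float>(L::max()));
      return 0;
    }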
diff --git a/runtime/core/portable_type/c10/util/BFloat16-math.h b/runtime/core/portable_type/c10/util/BFloat16-math.h
new file mode 100644
index 00000000000..8291cd74481
--- /dev/null
+++ b/runtime/core/portable_type/c10/util/BFloat16-math.h
@@ -0,0 +1,299 @@
+#pragma once
+
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
+#endif
+
+namespace c10 {
+template <typename T>
+struct is_reduced_floating_point
+    : std::integral_constant<
+          bool,
+          std::is_same_v<T, Half> || std::is_same_v<T, BFloat16>> {};
+
+template <typename T>
+constexpr bool is_reduced_floating_point_v =
+    is_reduced_floating_point<T>::value;
+} // namespace c10
+
+namespace std {
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+using c10::is_reduced_floating_point;
+using c10::is_reduced_floating_point_v;
+#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T acos(T a) {
+  return std::acos(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T asin(T a) {
+  return std::asin(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T atan(T a) {
+  return std::atan(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T atanh(T a) {
+  return std::atanh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T erf(T a) {
+  return std::erf(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T erfc(T a) {
+  return std::erfc(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T exp(T a) {
+  return std::exp(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T expm1(T a) {
+  return std::expm1(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline bool isfinite(T a) {
+  return std::isfinite(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log(T a) {
+  return std::log(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log10(T a) {
+  return std::log10(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log1p(T a) {
+  return std::log1p(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T log2(T a) {
+  return std::log2(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T ceil(T a) {
+  return std::ceil(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T cos(T a) {
+  return std::cos(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T floor(T a) {
+  return std::floor(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T nearbyint(T a) {
+  return std::nearbyint(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T sin(T a) {
+  return std::sin(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T tan(T a) {
+  return std::tan(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T sinh(T a) {
+  return std::sinh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T cosh(T a) {
+  return std::cosh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T tanh(T a) {
+  return std::tanh(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T trunc(T a) {
+  return std::trunc(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T lgamma(T a) {
+  return std::lgamma(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T sqrt(T a) {
+  return std::sqrt(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T rsqrt(T a) {
+  return 1.0 / std::sqrt(float(a));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T abs(T a) {
+  return std::abs(float(a));
+}
+#if defined(_MSC_VER) && defined(__CUDACC__)
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T pow(T a, double b) {
+  return std::pow(float(a), float(b));
+}
+#else
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T pow(T a, double b) {
+  return std::pow(float(a), b);
+}
+#endif
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T pow(T a, T b) {
+  return std::pow(float(a), float(b));
+}
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+inline T fmod(T a, T b) {
+  return std::fmod(float(a), float(b));
+}
+
+/*
+  The following function is inspired from the implementation in `musl`
+  Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+  ----------------------------------------------------------------------
+  Copyright © 2005-2020 Rich Felker, et al.
+
+  Permission is hereby granted, free of charge, to any person obtaining
+  a copy of this software and associated documentation files (the
+  "Software"), to deal in the Software without restriction, including
+  without limitation the rights to use, copy, modify, merge, publish,
+  distribute, sublicense, and/or sell copies of the Software, and to
+  permit persons to whom the Software is furnished to do so, subject to
+  the following conditions:
+
+  The above copyright notice and this permission notice shall be
+  included in all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  ----------------------------------------------------------------------
+ */
+template <
+    typename T,
+    typename std::enable_if_t<c10::is_reduced_floating_point_v<T>, int> = 0>
+C10_HOST_DEVICE inline T nextafter(T from, T to) {
+  // Reference:
+  // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c
+  using int_repr_t = uint16_t;
+  constexpr uint8_t bits = 16;
+  union {
+    T f;
+    int_repr_t i;
+  } ufrom = {from}, uto = {to};
+
+  // get a mask to get the sign bit i.e. MSB
+  int_repr_t sign_mask = int_repr_t{1} << (bits - 1);
+
+  // short-circuit: if either is NaN, return NaN
+  if (from != from || to != to) {
+    return from + to;
+  }
+
+  // short-circuit: if they are exactly the same.
+  if (ufrom.i == uto.i) {
+    return from;
+  }
+
+  // mask the sign-bit to zero i.e. positive
+  // equivalent to abs(x)
+  int_repr_t abs_from = ufrom.i & ~sign_mask;
+  int_repr_t abs_to = uto.i & ~sign_mask;
+  if (abs_from == 0) {
+    // if both are zero but with different sign,
+    // preserve the sign of `to`.
+    if (abs_to == 0) {
+      return to;
+    }
+    // smallest subnormal with sign of `to`.
+    ufrom.i = (uto.i & sign_mask) | int_repr_t{1};
+    return ufrom.f;
+  }
+
+  // if abs(from) > abs(to) or sign(from) != sign(to)
+  if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) {
+    ufrom.i--;
+  } else {
+    ufrom.i++;
+  }
+
+  return ufrom.f;
+}
+
+} // namespace std
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/runtime/core/portable_type/c10/util/BFloat16.h b/runtime/core/portable_type/c10/util/BFloat16.h
new file mode 100644
index 00000000000..ad1271fc729
--- /dev/null
+++ b/runtime/core/portable_type/c10/util/BFloat16.h
@@ -0,0 +1,130 @@
+#pragma once
+
+// Defines the bfloat16 type (brain floating-point). This representation uses
+// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
+
+#include <c10/macros/Macros.h>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#ifndef C10_EMBEDDED
+#include <ostream>
+#endif // C10_EMBEDDED
+
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+#include <cuda_bf16.h>
+#endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#else
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+#include <ext/oneapi/bfloat16.hpp>
+#endif
+
+namespace c10 {
+
+namespace detail {
+inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) {
+  float res = 0;
+  uint32_t tmp = src;
+  tmp <<= 16;
+
+#if defined(USE_ROCM)
+  float* tempRes;
+
+  // We should be using memcpy in order to respect the strict aliasing rule
+  // but it fails in the HIP environment.
+  tempRes = reinterpret_cast<float*>(&tmp);
+  res = *tempRes;
+#else
+  std::memcpy(&res, &tmp, sizeof(tmp));
+#endif
+
+  return res;
+}
+
+inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) {
+  uint32_t res = 0;
+
+#if defined(USE_ROCM)
+  // We should be using memcpy in order to respect the strict aliasing rule
+  // but it fails in the HIP environment.
+  uint32_t* tempRes = reinterpret_cast<uint32_t*>(&src);
+  res = *tempRes;
+#else
+  std::memcpy(&res, &src, sizeof(res));
+#endif
+
+  return res >> 16;
+}
+
+inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
+#if defined(USE_ROCM)
+  if (src != src) {
+#elif defined(_MSC_VER)
+  if (isnan(src)) {
+#else
+  if (std::isnan(src)) {
+#endif
+    return UINT16_C(0x7FC0);
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+    union {
+      uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
+      float F32; // NOLINT(facebook-hte-BadMemberName)
+    };
+
+    F32 = src;
+    uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
+    return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
+  }
+}
+} // namespace detail
+
+struct alignas(2) BFloat16 {
+  uint16_t x;
+
+  // HIP wants __host__ __device__ tag, CUDA does not
+#if defined(USE_ROCM)
+  C10_HOST_DEVICE BFloat16() = default;
+#else
+  BFloat16() = default;
+#endif
+
+  struct from_bits_t {};
+  static constexpr C10_HOST_DEVICE from_bits_t from_bits() {
+    return from_bits_t();
+  }
+
+  constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t)
+      : x(bits) {}
+  /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
+  inline C10_HOST_DEVICE operator float() const;
+
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+  inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
+  explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
+#endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+  inline C10_HOST_DEVICE BFloat16(const sycl::ext::oneapi::bfloat16& value);
+  explicit inline C10_HOST_DEVICE operator sycl::ext::oneapi::bfloat16() const;
+#endif
+};
+
+#ifndef C10_EMBEDDED
+C10_API inline std::ostream& operator<<(
+    std::ostream& out,
+    const BFloat16& value) {
+  out << (float)value;
+  return out;
+} +#endif // C10_EMBEDDED + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/runtime/core/portable_type/c10/util/Half-inl.h b/runtime/core/portable_type/c10/util/Half-inl.h new file mode 100644 index 00000000000..ae4469e5636 --- /dev/null +++ b/runtime/core/portable_type/c10/util/Half-inl.h @@ -0,0 +1,350 @@ +#pragma once + +#include +#include + +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifdef __HIPCC__ +#include +#endif + +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#elif defined(SYCL_LANGUAGE_VERSION) +#include // for SYCL 2020 +#endif + +#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) +#include +#endif + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +#if defined(__aarch64__) && !defined(__CUDACC__) +/// Constructors +inline Half::Half(float16_t value) : x(detail::fp16_to_bits(value)) {} +inline Half::operator float16_t() const { + return detail::fp16_from_bits(x); +} +#else + +inline C10_HOST_DEVICE Half::Half(float value) + : +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x(__half_as_short(__float2half(value))) +#elif defined(__SYCL_DEVICE_ONLY__) + x(c10::bit_cast(sycl::half(value))) +#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) + x(at::vec::float2half_scalar(value)) +#else + x(detail::fp16_ieee_from_fp32_value(value)) +#endif +{ +} + +/// Implicit conversions + +inline C10_HOST_DEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#elif defined(__SYCL_DEVICE_ONLY__) + return float(c10::bit_cast(x)); +#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) + return at::vec::half2float_scalar(x); +#elif defined(__aarch64__) && !defined(__CUDACC__) + return detail::native_fp16_to_fp32_value(x); +#else + return detail::fp16_ieee_to_fp32_value(x); +#endif +} + +#endif /* !defined(__aarch64__) || defined(__CUDACC__) \ + */ + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_HOST_DEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +#ifdef SYCL_LANGUAGE_VERSION +inline C10_HOST_DEVICE Half::Half(const sycl::half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator sycl::half() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)) || \ + (defined(__clang__) && defined(__CUDA__)) +inline __device__ Half __ldg(const Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#endif + +/// Arithmetic + +inline C10_HOST_DEVICE Half operator+(const Half& a, const Half& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE Half operator-(const Half& a, const Half& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE Half operator*(const Half& a, const Half& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE Half operator/(const Half& a, const Half& b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator-(const Half& a) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || 
\ + defined(__HIP_DEVICE_COMPILE__) + return __hneg(a); +#elif defined(__SYCL_DEVICE_ONLY__) + return -c10::bit_cast(a); +#else + return -static_cast(a); +#endif +} + +inline C10_HOST_DEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(Half a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(Half a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(Half a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(Half a, float b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float operator+(float a, Half b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, Half b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, Half b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, Half b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const Half& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const Half& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const Half& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const Half& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(Half a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(Half a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(Half a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(Half a, double b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, Half b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, Half b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, Half b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, Half b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE Half operator+(Half a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Half operator-(Half a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Half operator*(Half a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Half operator/(Half a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator+(int a, Half b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Half operator-(int a, Half b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Half operator*(int a, Half b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Half operator/(int a, Half b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE Half operator+(Half a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Half 
operator-(Half a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Half operator*(Half a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Half operator/(Half a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator+(int64_t a, Half b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Half operator-(int64_t a, Half b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Half operator*(int64_t a, Half b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Half operator/(int64_t a, Half b) { + return static_cast(a) / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from c10::Half to float. + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + static constexpr c10::Half min() { + return c10::Half(0x0400, c10::Half::from_bits()); + } + static constexpr c10::Half lowest() { + return c10::Half(0xFBFF, c10::Half::from_bits()); + } + static constexpr c10::Half max() { + return c10::Half(0x7BFF, c10::Half::from_bits()); + } + static constexpr c10::Half epsilon() { + return c10::Half(0x1400, c10::Half::from_bits()); + } + static constexpr c10::Half round_error() { + return c10::Half(0x3800, c10::Half::from_bits()); + } + static constexpr c10::Half infinity() { + return c10::Half(0x7C00, c10::Half::from_bits()); + } + static constexpr c10::Half quiet_NaN() { + return c10::Half(0x7E00, c10::Half::from_bits()); + } + static constexpr c10::Half signaling_NaN() { + return c10::Half(0x7D00, c10::Half::from_bits()); + } + static constexpr c10::Half denorm_min() { + return c10::Half(0x0001, c10::Half::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/runtime/core/portable_type/c10/util/Half.h b/runtime/core/portable_type/c10/util/Half.h new file mode 100644 index 00000000000..5625d4c3403 --- /dev/null +++ b/runtime/core/portable_type/c10/util/Half.h @@ -0,0 +1,423 @@ +#pragma once + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinsics. 
+/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinsics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. + +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +#include +#elif !defined(__OPENCL_VERSION__) +#include +#endif + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include +#ifndef C10_EMBEDDED +#include +#endif // C10_EMBEDDED + +#ifdef __CUDACC__ +#include +#endif + +#ifdef __HIPCC__ +#include +#endif + +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#elif defined(SYCL_LANGUAGE_VERSION) +#include // for SYCL 2020 +#endif + +#if defined(__aarch64__) && !defined(__CUDACC__) +#include +#endif + +#if defined(__GNUC__) || defined(__clang__) +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \ + defined(_M_IX86) +#if defined(__F16C__) && \ + !(defined(__CUDA_ARCH__) || defined(__CUDACC__) || \ + defined(__HIP_DEVICE_COMPILE__)) +#define C10_X86_F16 1 +#include // import conversion ops from f16cintrin.h +#endif // defined(__F16C__) && !(defined(__CUDA_ARCH__) || defined(__CUDACC__) + // || defined(__HIP_DEVICE_COMPILE__)) +#endif // __x86_64__ || _M_X64 || __i386 || _M_IX86 +#endif // __GNUC__ || __clang__ + +namespace c10 { + +namespace detail { + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. + */ +inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. + */ + const uint32_t w = (uint32_t)h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 + * of the 32-bit word: + * + * +---+-----+------------+-------------------+ + * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 30 27-31 17-26 0-16 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the + * half-precision number normalized. If the initial number is normalized, some + * of its high 6 bits (sign == 0 and 5-bit exponent) equals one. In this case + * renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note + * that if we shift denormalized nonsign by renorm_shift, the unit bit of + * mantissa will shift into exponent, turning the biased exponent into 1, and + * making mantissa normalized (i.e. without leading 1). 
+ */ +#ifdef _MSC_VER + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign); + uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(nonsign); +#endif + renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; + /* + * Iff half-precision number has exponent of 15, the addition overflows + * it into bit 31, and the subsequent shift turns the high 9 bits + * into 1. Thus inf_nan_mask == 0x7F800000 if the half-precision number + * had exponent of 15 (i.e. was NaN or infinity) 0x00000000 otherwise + */ + const int32_t inf_nan_mask = + ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 + * into 1. Otherwise, bit 31 remains 0. The signed shift right by 31 + * broadcasts bit 31 into all bits of the zero_mask. Thus zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input + * was denormal) + * 2. Shift nonsign right by 3 so the exponent (5 bits originally) + * becomes an 8-bit field and 10-bit mantissa shifts into the 10 high + * bits of the 23-bit mantissa of IEEE single-precision number. + * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the + * different in exponent bias (0x7F for single-precision number less 0xF + * for half-precision number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to + * account for renormalization. As renorm_shift is less than 0x70, this + * can be combined with step 3. + * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the + * input was NaN or infinity. + * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent + * into zero if the input was zero. + * 7. Combine with the sign of the input number. + */ + return sign | + ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | + inf_nan_mask) & + ~zero_mask); +} + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format. + * + * @note The implementation relies on IEEE-like (no assumption about rounding + * mode and no operations on denormals) floating-point operations and bitcasts + * between integer and floating-point variables. + */ +C10_HOST_DEVICE inline float fp16_ieee_to_fp32_value(uint16_t h) { +#ifdef C10_X86_F16 + return _cvtsh_ss(h); +#else + /* + * Extend the half-precision floating-point number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. 
+ */ + const uint32_t w = (uint32_t)h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the high bits + * of the 32-bit word: + * + * +-----+------------+---------------------+ + * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| + * +-----+------------+---------------------+ + * Bits 27-31 17-26 0-16 + */ + const uint32_t two_w = w + w; + + /* + * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become + * mantissa and exponent of a single-precision floating-point number: + * + * S|Exponent | Mantissa + * +-+---+-----+------------+----------------+ + * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| + * +-+---+-----+------------+----------------+ + * Bits | 23-31 | 0-22 + * + * Next, there are some adjustments to the exponent: + * - The exponent needs to be corrected by the difference in exponent bias + * between single-precision and half-precision formats (0x7F - 0xF = 0x70) + * - Inf and NaN values in the inputs should become Inf and NaN values after + * conversion to the single-precision number. Therefore, if the biased + * exponent of the half-precision input was 0x1F (max possible value), the + * biased exponent of the single-precision output must be 0xFF (max possible + * value). We do this correction in two steps: + * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset + * below) rather than by 0x70 suggested by the difference in the exponent bias + * (see above). + * - Then we multiply the single-precision result of exponent adjustment by + * 2**(-112) to reverse the effect of exponent adjustment by 0xE0 less the + * necessary exponent adjustment by 0x70 due to difference in exponent bias. + * The floating-point multiplication hardware would ensure than Inf and + * NaN would retain their value on at least partially IEEE754-compliant + * implementations. + * + * Note that the above operations do not handle denormal inputs (where biased + * exponent == 0). However, they also do not operate on denormal inputs, and + * do not produce denormal results. + */ + constexpr uint32_t exp_offset = UINT32_C(0xE0) << 23; + // const float exp_scale = 0x1.0p-112f; + constexpr uint32_t scale_bits = (uint32_t)15 << 23; + float exp_scale_val = 0; + std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val)); + const float exp_scale = exp_scale_val; + const float normalized_value = + fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + /* + * Convert denormalized half-precision inputs into single-precision results + * (always normalized). Zero inputs are also handled here. + * + * In a denormalized number the biased exponent is zero, and mantissa has + * on-zero bits. First, we shift mantissa into bits 0-9 of the 32-bit word. + * + * zeros | mantissa + * +---------------------------+------------+ + * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| + * +---------------------------+------------+ + * Bits 10-31 0-9 + * + * Now, remember that denormalized half-precision numbers are represented as: + * FP16 = mantissa * 2**(-24). + * The trick is to construct a normalized single-precision number with the + * same mantissa and thehalf-precision input and with an exponent which would + * scale the corresponding mantissa bits to 2**(-24). 
A normalized + * single-precision floating-point number is represented as: FP32 = (1 + + * mantissa * 2**(-23)) * 2**(exponent - 127) Therefore, when the biased + * exponent is 126, a unit change in the mantissa of the input denormalized + * half-precision number causes a change of the constructed single-precision + * number by 2**(-24), i.e. the same amount. + * + * The last step is to adjust the bias of the constructed single-precision + * number. When the input half-precision number is zero, the constructed + * single-precision number has the value of FP32 = 1 * 2**(126 - 127) = + * 2**(-1) = 0.5 Therefore, we need to subtract 0.5 from the constructed + * single-precision number to get the numerical equivalent of the input + * half-precision number. + */ + constexpr uint32_t magic_mask = UINT32_C(126) << 23; + constexpr float magic_bias = 0.5f; + const float denormalized_value = + fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + /* + * - Choose either results of conversion of input as a normalized number, or + * as a denormalized number, depending on the input exponent. The variable + * two_w contains input exponent in bits 27-31, therefore if its smaller than + * 2**27, the input is either a denormal number, or zero. + * - Combine the result of conversion of exponent and mantissa with the sign + * of the input number. + */ + constexpr uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) + : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +#endif // C10_X86_F16 +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a + * 16-bit floating-point number in IEEE half-precision format, in bit + * representation. + * + * @note The implementation relies on IEEE-like (no assumption about rounding + * mode and no operations on denormals) floating-point operations and bitcasts + * between integer and floating-point variables. + */ +inline uint16_t fp16_ieee_from_fp32_value(float f) { +#ifdef C10_X86_F16 + return _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT); +#else + // const float scale_to_inf = 0x1.0p+112f; + // const float scale_to_zero = 0x1.0p-110f; + constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23; + constexpr uint32_t scale_to_zero_bits = (uint32_t)17 << 23; + float scale_to_inf_val = 0, scale_to_zero_val = 0; + std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val)); + std::memcpy( + &scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val)); + const float scale_to_inf = scale_to_inf_val; + const float scale_to_zero = scale_to_zero_val; + +#if defined(_MSC_VER) && _MSC_VER == 1916 + float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero; +#else + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; +#endif + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return static_cast( + (sign >> 16) | + (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign)); +#endif // C10_X86_F16 +} + +#ifdef C10_X86_F16 +#undef C10_X86_F16 +#endif // C10_X86_F16 + +#if defined(__aarch64__) && !defined(__CUDACC__) +inline float16_t fp16_from_bits(uint16_t h) { + return c10::bit_cast(h); +} + +inline uint16_t fp16_to_bits(float16_t f) { + return c10::bit_cast(f); +} + +// According to https://godbolt.org/z/frExdbsWG it would translate to single +// fcvt s0, h0 +inline float native_fp16_to_fp32_value(uint16_t h) { + return static_cast(fp16_from_bits(h)); +} + +inline uint16_t native_fp16_from_fp32_value(float f) { + return fp16_to_bits(static_cast(f)); +} +#endif + +} // namespace detail + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + C10_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + // HIP wants __host__ __device__ tag, CUDA does not +#if defined(USE_ROCM) + C10_HOST_DEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits) {} +#if defined(__aarch64__) && !defined(__CUDACC__) + inline Half(float16_t value); + inline operator float16_t() const; +#else + inline C10_HOST_DEVICE Half(float value); + inline C10_HOST_DEVICE operator float() const; +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_HOST_DEVICE Half(const __half& value); + inline C10_HOST_DEVICE operator __half() const; +#endif +#ifdef SYCL_LANGUAGE_VERSION + inline C10_HOST_DEVICE Half(const sycl::half& value); + inline C10_HOST_DEVICE operator sycl::half() const; +#endif +}; + +#ifndef C10_EMBEDDED +C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) { + out << (float)value; + return out; +} +#endif // C10_EMBEDDED + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/runtime/core/portable_type/c10/util/TypeSafeSignMath.h b/runtime/core/portable_type/c10/util/TypeSafeSignMath.h new file mode 100644 index 00000000000..2853ff48d18 --- /dev/null +++ b/runtime/core/portable_type/c10/util/TypeSafeSignMath.h @@ -0,0 +1,140 @@ +#pragma once + +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wstring-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wstring-conversion") +#endif +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +/// Returns false since we cannot have x < 0 if x is unsigned. +template +inline constexpr bool is_negative( + const T& /*x*/, + std::true_type /*is_unsigned*/) { + return false; +} + +/// Returns true if a signed variable x < 0 +template +inline constexpr bool is_negative(const T& x, std::false_type /*is_unsigned*/) { + return x < T(0); +} + +/// Returns true if x < 0 +/// NOTE: Will fail on an unsigned custom type +/// For the most part it's possible to fix this if +/// the custom type has a constexpr constructor. 
+/// However, notably, c10::Half does not :-(
+template <typename T>
+inline constexpr bool is_negative(const T& x) {
+  return is_negative(x, std::is_unsigned<T>());
+}
+
+/// Returns the sign of an unsigned variable x as 0, 1
+template <typename T>
+inline constexpr int signum(const T& x, std::true_type /*is_unsigned*/) {
+  return T(0) < x;
+}
+
+/// Returns the sign of a signed variable x as -1, 0, 1
+template <typename T>
+inline constexpr int signum(const T& x, std::false_type /*is_unsigned*/) {
+  return (T(0) < x) - (x < T(0));
+}
+
+/// Returns the sign of x as -1, 0, 1
+/// NOTE: Will fail on an unsigned custom type
+/// For the most part it's possible to fix this if
+/// the custom type has a constexpr constructor.
+/// However, notably, c10::Half does not :-(
+template <typename T>
+inline constexpr int signum(const T& x) {
+  return signum(x, std::is_unsigned<T>());
+}
+
+/// Returns true if a and b are not both negative
+template <typename T, typename U>
+inline constexpr bool signs_differ(const T& a, const U& b) {
+  return is_negative(a) != is_negative(b);
+}
+
+// Suppress sign compare warning when compiling with GCC
+// as later does not account for short-circuit rule before
+// raising the warning, see https://godbolt.org/z/Tr3Msnz99
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#endif
+
+/// Returns true if x is greater than the greatest value of the type Limit
+template <typename Limit, typename T>
+inline constexpr bool greater_than_max(const T& x) {
+  constexpr bool can_overflow =
+      std::numeric_limits<T>::digits > std::numeric_limits<Limit>::digits;
+  return can_overflow && x > std::numeric_limits<Limit>::max();
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+/// Returns true if x < lowest(Limit). Standard comparison
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& x,
+    std::false_type /*limit_is_unsigned*/,
+    std::false_type /*x_is_unsigned*/) {
+  return x < std::numeric_limits<Limit>::lowest();
+}
+
+/// Returns false since the limit is signed and therefore includes
+/// negative values, but x cannot be negative because it is unsigned
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& /*x*/,
+    std::false_type /*limit_is_unsigned*/,
+    std::true_type /*x_is_unsigned*/) {
+  return false;
+}
+
+/// Returns true if x < 0, where 0 is constructed from T.
+/// Limit is not signed, so its lower value is zero
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& x,
+    std::true_type /*limit_is_unsigned*/,
+    std::false_type /*x_is_unsigned*/) {
+  return x < T(0);
+}
+
+/// Returns false since both types are unsigned
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(
+    const T& /*x*/,
+    std::true_type /*limit_is_unsigned*/,
+    std::true_type /*x_is_unsigned*/) {
+  return false;
+}
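These guards are the sign-safe building blocks for range checks: greater_than_max and the less_than_lowest dispatchers (completed just below) avoid signed/unsigned comparison pitfalls by dispatching on std::is_unsigned. A compile-time sketch with concrete types, assuming the include path configured earlier:

    #include <cstdint>
    #include <c10/util/TypeSafeSignMath.h>

    int main() {
      // 300 does not fit in int8_t (max 127):
      static_assert(c10::greater_than_max<int8_t>(300), "overflows int8_t");
      // An unsigned value can never be below a signed type's lowest:
      static_assert(!c10::less_than_lowest<int32_t>(0u), "never below INT32_MIN");
      // A negative int is below uint8_t's lowest (0):
      static_assert(c10::less_than_lowest<uint8_t>(-1), "below unsigned range");
      return 0;
    }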
+/// Returns true if x is less than the lowest value of type T
+/// NOTE: Will fail on an unsigned custom type
+/// For the most part it's possible to fix this if
+/// the custom type has a constexpr constructor.
+/// However, notably, c10::Half does not :-(
+template <typename Limit, typename T>
+inline constexpr bool less_than_lowest(const T& x) {
+  return less_than_lowest<Limit>(
+      x, std::is_unsigned<Limit>(), std::is_unsigned<T>());
+}
+
+} // namespace c10
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/runtime/core/portable_type/c10/util/bit_cast.h b/runtime/core/portable_type/c10/util/bit_cast.h
new file mode 100644
index 00000000000..c1d2c102886
--- /dev/null
+++ b/runtime/core/portable_type/c10/util/bit_cast.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <cstring>
+#include <type_traits>
+
+#if __has_include(<bit>) && (__cplusplus >= 202002L || (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L))
+#include <bit>
+#define C10_HAVE_STD_BIT_CAST 1
+#else
+#define C10_HAVE_STD_BIT_CAST 0
+#endif // __has_include(<bit>) && (__cplusplus >= 202002L ||
+       // (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L))
+
+namespace c10 {
+
+#if C10_HAVE_STD_BIT_CAST
+using std::bit_cast;
+#else
+// Implementations of std::bit_cast() from C++ 20.
+//
+// This is a less sketchy version of reinterpret_cast.
+//
+// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
+// information as well as the source of our implementations.
+template <class To, class From>
+std::enable_if_t<
+    sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
+        std::is_trivially_copyable_v<To>,
+    To>
+// constexpr support needs compiler magic
+bit_cast(const From& src) noexcept {
+  static_assert(
+      std::is_trivially_constructible_v<To>,
+      "This implementation additionally requires "
+      "destination type to be trivially constructible");
+
+  To dst;
+  std::memcpy(&dst, &src, sizeof(To));
+  return dst;
+}
+#endif // C10_HAVE_STD_BIT_CAST
+#undef C10_HAVE_STD_BIT_CAST
+
+} // namespace c10
diff --git a/runtime/core/portable_type/c10/util/floating_point_utils.h b/runtime/core/portable_type/c10/util/floating_point_utils.h
new file mode 100644
index 00000000000..b240c4ea232
--- /dev/null
+++ b/runtime/core/portable_type/c10/util/floating_point_utils.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
+#include <cstdint>
+
+namespace c10::detail {
+
+C10_HOST_DEVICE inline float fp32_from_bits(uint32_t w) {
+#if defined(__OPENCL_VERSION__)
+  return as_float(w);
+#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+  return __uint_as_float((unsigned int)w);
+#elif defined(__INTEL_COMPILER)
+  return _castu32_f32(w);
+#else
+  return c10::bit_cast<float>(w);
+#endif
+}
+
+C10_HOST_DEVICE inline uint32_t fp32_to_bits(float f) {
+#if defined(__OPENCL_VERSION__)
+  return as_uint(f);
+#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+  return (uint32_t)__float_as_uint(f);
+#elif defined(__INTEL_COMPILER)
+  return _castf32_u32(f);
+#else
+  return c10::bit_cast<uint32_t>(f);
+#endif
+}
+
+} // namespace c10::detail
diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h
index 267d17bdba8..bf4c676ce82 100644
--- a/runtime/core/portable_type/half.h
+++ b/runtime/core/portable_type/half.h
@@ -8,757 +8,20 @@
 
 #pragma once
 
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-#include <limits>
-#include <ostream>
-
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
-#ifndef __ARM_V8_ONLY__
-#define NATIVE_FP16 1
-#endif // __ARM_V8_ONLY__
-#endif // __aarch64__
-#endif // GNUC or clang
-
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
-    defined(_M_IX86)
-#if defined(__AVX2__)
-#define X86_F16 1
-#include <immintrin.h> // import conversion ops from f16cintrin.h
-#endif // __AVX2__
-#endif // __x86_64__ || _M_X64 || __i386 || _M_IX86
-#endif // __GNUC__ || __clang__ - -namespace executorch { -namespace runtime { -namespace etensor { - -/** - * A half-precision floating point type, compatible with c10/util/Half.h from - * pytorch core. - */ -struct alignas(2) Half { - union { -#ifdef NATIVE_FP16 - _Float16 y; -#endif - uint16_t x; - }; - - struct from_bits_t {}; - static constexpr from_bits_t from_bits() { - return from_bits_t(); - } - - Half() = default; - - constexpr Half(uint16_t bits, from_bits_t) : x(bits) {} - /* implicit */ inline Half(float value); - inline operator float() const; -}; +#include +namespace executorch::runtime::etensor { +using c10::Half; namespace internal { - -inline float fp32_from_bits(uint32_t w) { - static_assert(sizeof(float) == sizeof(uint32_t)); - union { - uint32_t as_bits; - float as_value; - } fp32 = {w}; - return fp32.as_value; -} - -inline uint32_t fp32_to_bits(float f) { - static_assert(sizeof(float) == sizeof(uint32_t)); - union { - float as_value; - uint32_t as_bits; - } fp32 = {f}; - return fp32.as_bits; -} - -/* - * Convert a 16-bit floating-point number in IEEE half-precision format, in bit - * representation, to a 32-bit floating-point number in IEEE single-precision - * format, in bit representation. - * - * @note The implementation doesn't use any floating-point operations. - */ -inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { - /* - * Extend the half-precision floating-point number to 32 bits and shift to the - * upper part of the 32-bit word: - * +---+-----+------------+-------------------+ - * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| - * +---+-----+------------+-------------------+ - * Bits 31 26-30 16-25 0-15 - * - * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - * - zero bits. - */ - const uint32_t w = (uint32_t)h << 16; - /* - * Extract the sign of the input number into the high bit of the 32-bit word: - * - * +---+----------------------------------+ - * | S |0000000 00000000 00000000 00000000| - * +---+----------------------------------+ - * Bits 31 0-31 - */ - const uint32_t sign = w & UINT32_C(0x80000000); - /* - * Extract mantissa and biased exponent of the input number into the bits 0-30 - * of the 32-bit word: - * - * +---+-----+------------+-------------------+ - * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| - * +---+-----+------------+-------------------+ - * Bits 30 27-31 17-26 0-16 - */ - const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); - /* - * Renorm shift is the number of bits to shift mantissa left to make the - * half-precision number normalized. If the initial number is normalized, some - * of its high 6 bits (sign == 0 and 5-bit exponent) equals one. In this case - * renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note - * that if we shift denormalized nonsign by renorm_shift, the unit bit of - * mantissa will shift into exponent, turning the biased exponent into 1, and - * making mantissa normalized (i.e. without leading 1). - */ -#ifdef _MSC_VER - unsigned long nonsign_bsr; - _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign); - uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31; -#else - uint32_t renorm_shift = __builtin_clz(nonsign); -#endif - renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; - /* - * Iff half-precision number has exponent of 15, the addition overflows - * it into bit 31, and the subsequent shift turns the high 9 bits - * into 1. Thus inf_nan_mask == 0x7F800000 if the half-precision number - * had exponent of 15 (i.e. 
was NaN or infinity) 0x00000000 otherwise - */ - const int32_t inf_nan_mask = - ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); - /* - * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 - * into 1. Otherwise, bit 31 remains 0. The signed shift right by 31 - * broadcasts bit 31 into all bits of the zero_mask. Thus zero_mask == - * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) - * 0x00000000 otherwise - */ - const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; - /* - * 1. Shift nonsign left by renorm_shift to normalize it (if the input - * was denormal) - * 2. Shift nonsign right by 3 so the exponent (5 bits originally) - * becomes an 8-bit field and 10-bit mantissa shifts into the 10 high - * bits of the 23-bit mantissa of IEEE single-precision number. - * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the - * different in exponent bias (0x7F for single-precision number less 0xF - * for half-precision number). - * 4. Subtract renorm_shift from the exponent (starting at bit 23) to - * account for renormalization. As renorm_shift is less than 0x70, this - * can be combined with step 3. - * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the - * input was NaN or infinity. - * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent - * into zero if the input was zero. - * 7. Combine with the sign of the input number. - */ - return sign | - ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | - inf_nan_mask) & - ~zero_mask); -} - -/* - * Convert a 16-bit floating-point number in IEEE half-precision format, in bit - * representation, to a 32-bit floating-point number in IEEE single-precision - * format. - * - * @note The implementation relies on IEEE-like (no assumption about rounding - * mode and no operations on denormals) floating-point operations and bitcasts - * between integer and floating-point variables. - */ -inline float fp16_ieee_to_fp32_value(uint16_t h) { -#ifdef X86_F16 - return _cvtsh_ss(h); -#else - - /* - * Extend the half-precision floating-point number to 32 bits and shift to the - * upper part of the 32-bit word: - * +---+-----+------------+-------------------+ - * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| - * +---+-----+------------+-------------------+ - * Bits 31 26-30 16-25 0-15 - * - * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - * - zero bits. 
- */ - const uint32_t w = (uint32_t)h << 16; - /* - * Extract the sign of the input number into the high bit of the 32-bit word: - * - * +---+----------------------------------+ - * | S |0000000 00000000 00000000 00000000| - * +---+----------------------------------+ - * Bits 31 0-31 - */ - const uint32_t sign = w & UINT32_C(0x80000000); - /* - * Extract mantissa and biased exponent of the input number into the high bits - * of the 32-bit word: - * - * +-----+------------+---------------------+ - * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| - * +-----+------------+---------------------+ - * Bits 27-31 17-26 0-16 - */ - const uint32_t two_w = w + w; - - /* - * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become - * mantissa and exponent of a single-precision floating-point number: - * - * S|Exponent | Mantissa - * +-+---+-----+------------+----------------+ - * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| - * +-+---+-----+------------+----------------+ - * Bits | 23-31 | 0-22 - * - * Next, there are some adjustments to the exponent: - * - The exponent needs to be corrected by the difference in exponent bias - * between single-precision and half-precision formats (0x7F - 0xF = 0x70) - * - Inf and NaN values in the inputs should become Inf and NaN values after - * conversion to the single-precision number. Therefore, if the biased - * exponent of the half-precision input was 0x1F (max possible value), the - * biased exponent of the single-precision output must be 0xFF (max possible - * value). We do this correction in two steps: - * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset - * below) rather than by 0x70 suggested by the difference in the exponent bias - * (see above). - * - Then we multiply the single-precision result of exponent adjustment by - * 2**(-112) to reverse the effect of exponent adjustment by 0xE0 less the - * necessary exponent adjustment by 0x70 due to difference in exponent bias. - * The floating-point multiplication hardware would ensure than Inf and - * NaN would retain their value on at least partially IEEE754-compliant - * implementations. - * - * Note that the above operations do not handle denormal inputs (where biased - * exponent == 0). However, they also do not operate on denormal inputs, and - * do not produce denormal results. - */ - constexpr uint32_t exp_offset = UINT32_C(0xE0) << 23; - // const float exp_scale = 0x1.0p-112f; - constexpr uint32_t scale_bits = (uint32_t)15 << 23; - float exp_scale_val = 0; - std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val)); - const float exp_scale = exp_scale_val; - const float normalized_value = - fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - /* - * Convert denormalized half-precision inputs into single-precision results - * (always normalized). Zero inputs are also handled here. - * - * In a denormalized number the biased exponent is zero, and mantissa has - * on-zero bits. First, we shift mantissa into bits 0-9 of the 32-bit word. - * - * zeros | mantissa - * +---------------------------+------------+ - * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| - * +---------------------------+------------+ - * Bits 10-31 0-9 - * - * Now, remember that denormalized half-precision numbers are represented as: - * FP16 = mantissa * 2**(-24). - * The trick is to construct a normalized single-precision number with the - * same mantissa and thehalf-precision input and with an exponent which would - * scale the corresponding mantissa bits to 2**(-24). 
A normalized - * single-precision floating-point number is represented as: FP32 = (1 + - * mantissa * 2**(-23)) * 2**(exponent - 127) Therefore, when the biased - * exponent is 126, a unit change in the mantissa of the input denormalized - * half-precision number causes a change of the constructed single-precision - * number by 2**(-24), i.e. the same amount. - * - * The last step is to adjust the bias of the constructed single-precision - * number. When the input half-precision number is zero, the constructed - * single-precision number has the value of FP32 = 1 * 2**(126 - 127) = - * 2**(-1) = 0.5 Therefore, we need to subtract 0.5 from the constructed - * single-precision number to get the numerical equivalent of the input - * half-precision number. - */ - constexpr uint32_t magic_mask = UINT32_C(126) << 23; - constexpr float magic_bias = 0.5f; - const float denormalized_value = - fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - /* - * - Choose either results of conversion of input as a normalized number, or - * as a denormalized number, depending on the input exponent. The variable - * two_w contains input exponent in bits 27-31, therefore if its smaller than - * 2**27, the input is either a denormal number, or zero. - * - Combine the result of conversion of exponent and mantissa with the sign - * of the input number. - */ - constexpr uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) - : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); - -#endif // not X86_F16 -} - -/* - * Convert a 32-bit floating-point number in IEEE single-precision format to a - * 16-bit floating-point number in IEEE half-precision format, in bit - * representation. - * - * @note The implementation relies on IEEE-like (no assumption about rounding - * mode and no operations on denormals) floating-point operations and bitcasts - * between integer and floating-point variables. - */ -inline uint16_t fp16_ieee_from_fp32_value(float f) { -#ifdef X86_F16 - return _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT); -#else - - // const float scale_to_inf = 0x1.0p+112f; - // const float scale_to_zero = 0x1.0p-110f; - constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23; - constexpr uint32_t scale_to_zero_bits = (uint32_t)17 << 23; - float scale_to_inf_val = 0, scale_to_zero_val = 0; - std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val)); - std::memcpy( - &scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val)); - const float scale_to_inf = scale_to_inf_val; - const float scale_to_zero = scale_to_zero_val; - -#if defined(_MSC_VER) && _MSC_VER == 1916 - float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero; -#else - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; -#endif - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return static_cast( - (sign >> 16) | - (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign)); -#endif // not X86_F16 -} - +using c10::detail::fp16_ieee_from_fp32_value; +using c10::detail::fp16_ieee_to_fp32_bits; +using c10::detail::fp16_ieee_to_fp32_value; +using c10::detail::fp32_from_bits; +using c10::detail::fp32_to_bits; } // namespace internal - -/// Constructors -#ifdef NATIVE_FP16 -inline Half::Half(float value) : y(value) {} -#else -inline Half::Half(float value) - : x(internal::fp16_ieee_from_fp32_value(value)) {} -#endif - -/// Implicit conversions -#ifdef NATIVE_FP16 -inline Half::operator float() const { - return (float)y; -} -#else -inline Half::operator float() const { - return internal::fp16_ieee_to_fp32_value(x); -} -#endif - -/// Arithmetic - -#ifdef NATIVE_FP16 - -#define return_half(r) \ - do { \ - Half ret; \ - ret.y = r; \ - return ret; \ - } while (0) - -inline Half operator+(const Half& a, const Half& b) { - return_half(a.y + b.y); -} - -inline Half operator-(const Half& a, const Half& b) { - return_half(a.y - b.y); - return static_cast(a) - static_cast(b); -} - -inline Half operator*(const Half& a, const Half& b) { - return_half(a.y * b.y); -} - -inline Half operator/(const Half& a, const Half& b) { - return_half(a.y / b.y); -} - -inline Half operator-(const Half& a) { - return_half(-a.y); -} - -inline Half& operator+=(Half& a, const Half& b) { - a.y += b.y; - return a; -} - -inline Half& operator-=(Half& a, const Half& b) { - a.y -= b.y; - return a; -} - -inline Half& operator*=(Half& a, const Half& b) { - a.y *= b.y; - return a; -} - -inline Half& operator/=(Half& a, const Half& b) { - a.y /= b.y; - return a; -} - -#else - -inline Half operator+(const Half& a, const Half& b) { - return static_cast(a) + static_cast(b); -} - -inline Half operator-(const Half& a, const Half& b) { - return static_cast(a) - static_cast(b); -} - -inline Half operator*(const Half& a, const Half& b) { - return static_cast(a) * static_cast(b); -} - -inline Half operator/(const Half& a, const Half& b) { - return static_cast(a) / static_cast(b); -} - -inline Half operator-(const Half& a) { - return -static_cast(a); -} - -inline Half& operator+=(Half& a, const Half& b) { - a = a + b; - return a; -} - -inline Half& operator-=(Half& a, const Half& b) { - a = a - b; - return a; -} - -inline Half& operator*=(Half& a, const Half& b) { - a = a * b; - return a; -} - -inline Half& operator/=(Half& a, const Half& b) { - a = a / b; - return a; -} - -#endif - -/// Arithmetic with floats - -inline float operator+(Half a, float b) { - return static_cast(a) + b; -} -inline float operator-(Half a, float b) { - return static_cast(a) - b; -} -inline float operator*(Half a, float b) { - return static_cast(a) * b; -} -inline float operator/(Half a, float b) { - return static_cast(a) / b; -} - -inline float operator+(float a, Half b) { - return a + static_cast(b); -} -inline float operator-(float a, Half b) { - return a - static_cast(b); -} -inline float operator*(float a, Half b) { - return a * static_cast(b); -} -inline float operator/(float a, Half b) { - return a / static_cast(b); -} - -inline float& operator+=(float& a, const Half& b) { - return a += static_cast(b); -} -inline float& operator-=(float& a, const Half& b) { - return a -= static_cast(b); -} -inline float& operator*=(float& a, const Half& b) { - return a *= static_cast(b); -} -inline float& operator/=(float& a, const Half& b) { - return a /= static_cast(b); -} - -/// Arithmetic with doubles - -inline double operator+(Half a, double b) { - return static_cast(a) + b; -} -inline double operator-(Half a, double 
b) { - return static_cast(a) - b; -} -inline double operator*(Half a, double b) { - return static_cast(a) * b; -} -inline double operator/(Half a, double b) { - return static_cast(a) / b; -} - -inline double operator+(double a, Half b) { - return a + static_cast(b); -} -inline double operator-(double a, Half b) { - return a - static_cast(b); -} -inline double operator*(double a, Half b) { - return a * static_cast(b); -} -inline double operator/(double a, Half b) { - return a / static_cast(b); -} - -/// Arithmetic with ints - -#ifdef NATIVE_FP16 - -inline Half operator+(Half a, int32_t b) { - return_half(a.y + b); -} -inline Half operator-(Half a, int32_t b) { - return_half(a.y - b); -} -inline Half operator*(Half a, int32_t b) { - return_half(a.y * b); -} -inline Half operator/(Half a, int32_t b) { - return_half(a.y / b); -} - -inline Half operator+(int32_t a, Half b) { - return_half(a + b.y); -} -inline Half operator-(int32_t a, Half b) { - return_half(a - b.y); -} -inline Half operator*(int32_t a, Half b) { - return_half(a * b.y); -} -inline Half operator/(int32_t a, Half b) { - return_half(a / b.y); -} - -#else - -inline Half operator+(Half a, int32_t b) { - return a + static_cast(b); -} -inline Half operator-(Half a, int32_t b) { - return a - static_cast(b); -} -inline Half operator*(Half a, int32_t b) { - return a * static_cast(b); -} -inline Half operator/(Half a, int32_t b) { - return a / static_cast(b); -} - -inline Half operator+(int32_t a, Half b) { - return static_cast(a) + b; -} -inline Half operator-(int32_t a, Half b) { - return static_cast(a) - b; -} -inline Half operator*(int32_t a, Half b) { - return static_cast(a) * b; -} -inline Half operator/(int32_t a, Half b) { - return static_cast(a) / b; -} - -#endif - -//// Arithmetic with int64_t - -#ifdef NATIVE_FP16 - -inline Half operator+(Half a, int64_t b) { - return_half(a.y + b); -} -inline Half operator-(Half a, int64_t b) { - return_half(a.y - b); -} -inline Half operator*(Half a, int64_t b) { - return_half(a.y * b); -} -inline Half operator/(Half a, int64_t b) { - return_half(a.y / b); -} - -inline Half operator+(int64_t a, Half b) { - return_half(a + b.y); -} -inline Half operator-(int64_t a, Half b) { - return_half(a - b.y); -} -inline Half operator*(int64_t a, Half b) { - return_half(a * b.y); -} -inline Half operator/(int64_t a, Half b) { - return_half(a / b.y); -} - -#else - -inline Half operator+(Half a, int64_t b) { - return a + static_cast(b); -} -inline Half operator-(Half a, int64_t b) { - return a - static_cast(b); -} -inline Half operator*(Half a, int64_t b) { - return a * static_cast(b); -} -inline Half operator/(Half a, int64_t b) { - return a / static_cast(b); -} - -inline Half operator+(int64_t a, Half b) { - return static_cast(a) + b; -} -inline Half operator-(int64_t a, Half b) { - return static_cast(a) - b; -} -inline Half operator*(int64_t a, Half b) { - return static_cast(a) * b; -} -inline Half operator/(int64_t a, Half b) { - return static_cast(a) / b; -} - -#endif - -/// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion Half to float. 
- -static inline std::ostream& operator<<( - std::ostream& out, - const executorch::runtime::etensor::Half& value) { - out << (float)value; - return out; -} - -} // namespace etensor -} // namespace runtime -} // namespace executorch -namespace torch { -namespace executor { +} // namespace executorch::runtime::etensor +namespace torch::executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. using ::executorch::runtime::etensor::Half; -} // namespace executor -} // namespace torch - -namespace std { - -template <> -class numeric_limits { - public: - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr auto has_denorm = numeric_limits::has_denorm; - static constexpr auto has_denorm_loss = - numeric_limits::has_denorm_loss; - static constexpr auto round_style = numeric_limits::round_style; - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - static constexpr int radix = 2; - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - static constexpr auto traps = numeric_limits::traps; - static constexpr auto tinyness_before = - numeric_limits::tinyness_before; - static constexpr executorch::runtime::etensor::Half min() { - return executorch::runtime::etensor::Half( - 0x0400, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half lowest() { - return executorch::runtime::etensor::Half( - 0xFBFF, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half max() { - return executorch::runtime::etensor::Half( - 0x7BFF, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half epsilon() { - return executorch::runtime::etensor::Half( - 0x1400, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half round_error() { - return executorch::runtime::etensor::Half( - 0x3800, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half infinity() { - return executorch::runtime::etensor::Half( - 0x7C00, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half quiet_NaN() { - return executorch::runtime::etensor::Half( - 0x7E00, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half signaling_NaN() { - return executorch::runtime::etensor::Half( - 0x7D00, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half denorm_min() { - return executorch::runtime::etensor::Half( - 0x0001, executorch::runtime::etensor::Half::from_bits()); - } -}; - -} // namespace std +} // namespace torch::executor diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index b8ccbe602ed..0532def9307 100644 --- a/runtime/core/portable_type/targets.bzl +++ 
b/runtime/core/portable_type/targets.bzl @@ -50,6 +50,9 @@ def define_common_targets(): "qint_types.h", "bits_types.h", ], + exported_deps = [ + "//executorch/runtime/core/portable_type/c10:c10", + ], visibility = [ "//executorch/extension/...", "//executorch/runtime/core/exec_aten/util/...", diff --git a/runtime/kernel/test/CMakeLists.txt b/runtime/kernel/test/CMakeLists.txt index b1b6044791c..4e8c24776f1 100644 --- a/runtime/kernel/test/CMakeLists.txt +++ b/runtime/kernel/test/CMakeLists.txt @@ -47,14 +47,11 @@ add_executable( ) target_link_libraries( operator_registry_max_kernel_num_test GTest::gtest GTest::gtest_main - GTest::gmock + GTest::gmock executorch ) target_compile_definitions( operator_registry_max_kernel_num_test PRIVATE "-DMAX_KERNEL_NUM=1" ) -target_include_directories( - operator_registry_max_kernel_num_test PRIVATE ${EXECUTORCH_ROOT}/.. -) # TODO: This is currently not working! # add_test(operator_registry_max_kernel_num_test operator_registry_max_kernel_num_test) diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim/xplat/executorch/build/env_interface.bzl index c4111c744b8..d32e62c6109 100644 --- a/shim/xplat/executorch/build/env_interface.bzl +++ b/shim/xplat/executorch/build/env_interface.bzl @@ -119,7 +119,7 @@ def _remove_platform_specific_args(kwargs): keys = [] for key in kwargs: if (key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps") or - key.startswith("fbobjc") or key.endswith("_platform_compiler_flags")): + key.startswith("fbobjc") or key.endswith("_platform_compiler_flags") or key == "fbcode_exported_preprocessor_flags"): keys.append(key) for key in keys: kwargs.pop(key) @@ -140,7 +140,8 @@ def _patch_headers(kwargs): # header_namespace is to workaround the fact that all C++ source files are having the pattern: # `include ` but BUCK2 root is at executorch/ so the `executorch/` prefix is redundant. - kwargs["header_namespace"] = "executorch/" + native.package_name() + if "header_namespace" not in kwargs: + kwargs["header_namespace"] = "executorch/" + native.package_name() return kwargs def _patch_pp_flags(kwargs): From 5704dae7cc9f61f9d096eaa3e58f0d21c742ea05 Mon Sep 17 00:00:00 2001 From: Github Executorch Date: Thu, 6 Feb 2025 19:14:57 -0800 Subject: [PATCH 2/9] Integrate torchgen exception boundary with ExecuTorch Pull Request resolved: https://github.com/pytorch/executorch/pull/7546 As of #7746, we build with exceptions by default, so we just need to use them. 
ghstack-source-id: 265190625 @exported-using-ghexport Differential Revision: [D67904052](https://our.internmc.facebook.com/intern/diff/D67904052/) --- build/Codegen.cmake | 9 +++++++-- configurations/CMakeLists.txt | 2 +- kernels/optimized/CMakeLists.txt | 1 + shim/xplat/executorch/codegen/codegen.bzl | 22 ++++++++++++++++++---- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/build/Codegen.cmake b/build/Codegen.cmake index 435b3d24802..c36b1a26b40 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -59,13 +59,15 @@ endfunction() # Invoked as generate_bindings_for_kernels( LIB_NAME lib_name FUNCTIONS_YAML # functions_yaml CUSTOM_OPS_YAML custom_ops_yaml ) function(generate_bindings_for_kernels) + set(options ADD_EXCEPTION_BOUNDARY) set(arg_names LIB_NAME FUNCTIONS_YAML CUSTOM_OPS_YAML) - cmake_parse_arguments(GEN "" "${arg_names}" "" ${ARGN}) + cmake_parse_arguments(GEN "${options}" "${arg_names}" "" ${ARGN}) message(STATUS "Generating kernel bindings:") message(STATUS " LIB_NAME: ${GEN_LIB_NAME}") message(STATUS " FUNCTIONS_YAML: ${GEN_FUNCTIONS_YAML}") message(STATUS " CUSTOM_OPS_YAML: ${GEN_CUSTOM_OPS_YAML}") + message(STATUS " ADD_EXCEPTION_BOUNDARY: ${GEN_ADD_EXCEPTION_BOUNDARY}") # Command to generate selected_operators.yaml from custom_ops.yaml. file(GLOB_RECURSE _codegen_templates "${EXECUTORCH_ROOT}/codegen/templates/*") @@ -93,7 +95,10 @@ function(generate_bindings_for_kernels) --tags-path=${site-packages-out}/torchgen/packaged/ATen/native/tags.yaml --aten-yaml-path=${site-packages-out}/torchgen/packaged/ATen/native/native_functions.yaml --op-selection-yaml-path=${_oplist_yaml} - ) + ) + if(GEN_ADD_EXCEPTION_BOUNDARY) + set(_gen_command "${_gen_command}" --add-exception-boundary) + endif() set(_gen_command_sources ${_out_dir}/RegisterCodegenUnboxedKernelsEverything.cpp diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index 9c618001964..eddb8b2a12c 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -42,7 +42,7 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) generate_bindings_for_kernels( LIB_NAME "optimized_native_cpu_ops_lib" FUNCTIONS_YAML - ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml + ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml ADD_EXCEPTION_BOUNDARY ) message("Generated files ${gen_command_sources}") diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index abdeeb73453..99e388095f6 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -55,6 +55,7 @@ gen_selected_ops(LIB_NAME "optimized_ops_lib" OPS_SCHEMA_YAML "${_yaml}") generate_bindings_for_kernels( LIB_NAME "optimized_ops_lib" FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/optimized-oss.yaml + ADD_EXCEPTION_BOUNDARY ) message("Generated files ${gen_command_sources}") diff --git a/shim/xplat/executorch/codegen/codegen.bzl b/shim/xplat/executorch/codegen/codegen.bzl index 4b69a2cf4a0..210999b633c 100644 --- a/shim/xplat/executorch/codegen/codegen.bzl +++ b/shim/xplat/executorch/codegen/codegen.bzl @@ -116,7 +116,8 @@ def _prepare_genrule_and_lib( custom_ops_yaml_path = None, custom_ops_requires_runtime_registration = True, manual_registration = False, - aten_mode = False): + aten_mode = False, + support_exceptions = True): """ This function returns two dicts `genrules` and `libs`, derived from the arguments being passed to `executorch_generated_lib`. 
`genrules` contains all information related to what genrules to @@ -156,6 +157,10 @@ def _prepare_genrule_and_lib( # actually-generated files matches GENERATED_FILES. ] + if support_exceptions: + genrule_cmd.append("--add-exception-boundary") + + # Sources for generated kernel registration lib sources = MANUAL_REGISTRATION_SOURCES if manual_registration else GENERATED_SOURCES @@ -217,6 +222,7 @@ def _prepare_genrule_and_lib( def _prepare_custom_ops_genrule_and_lib( name, custom_ops_yaml_path = None, + support_exceptions = True, deps = [], kernels = []): """Similar to _prepare_genrule_and_lib but for custom ops.""" @@ -250,6 +256,8 @@ def _prepare_custom_ops_genrule_and_lib( "--install_dir=${OUT}", "--op_selection_yaml_path=$(location :{}[selected_operators.yaml])".format(oplist_dir_name), ] + if support_exceptions: + genrule_cmd.append("--add-exception-boundary") # Determine what sources custom_ops_ target should include custom_ops_sources = CUSTOM_OPS_SCHEMA_REGISTRATION_SOURCES + ( @@ -281,6 +289,7 @@ def exir_custom_ops_aot_lib( deps = [], compiler_flags = [], define_static_target = False, + support_exceptions = True, platforms = get_default_executorch_platforms()): """Generates a C++ library that helps to register the custom ops into PyTorch, so they are visible to EXIR. To use this, we need to load the generated so file: @@ -297,11 +306,13 @@ def exir_custom_ops_aot_lib( visibility: visibility of the generated library. kernels: C++ kernels for these custom ops. They need to be implemented using ATen/c10 basics. deps: dependencies of the generated library. + support_exceptions: enable try/catch wrapper around operator implemntations to make sure exceptions thrown will not bring down the process. Disable if your use case disables exceptions in the build. """ genrules, libs = _prepare_custom_ops_genrule_and_lib( name = name, custom_ops_yaml_path = selects.apply(yaml_target, lambda y: "$(location {})".format(y)), kernels = kernels, + support_exceptions = support_exceptions, deps = deps, ) for genrule in genrules: @@ -368,7 +379,7 @@ def copy_portable_header_files(name): ) def build_portable_lib(name, oplist_header_name, feature = None, expose_operator_symbols = False): - """Build portable lib from source. We build from source so that the generated header file, + """Build portable lib from source. We build from source so that the generated header file, selected_op_variants.h, can be used to selectively build the lib for different dtypes. """ @@ -446,7 +457,8 @@ def executorch_generated_lib( kernel_deps = [], dtype_selective_build = False, feature = None, - expose_operator_symbols = False): + expose_operator_symbols = False, + support_exceptions = True): """Emits 0-3 C++ library targets (in fbcode or xplat) containing code to dispatch the operators specified in the provided yaml files. @@ -495,6 +507,7 @@ def executorch_generated_lib( compiler_flags: compiler_flags args to runtime.cxx_library dtype_selective_build: In additional to operator selection, dtype selective build further selects the dtypes for each operator. Can be used with model or dict selective build APIs, where dtypes can be specified. Note: this is only available in xplat. feature: Product-Feature Hierarchy (PFH). For internal use only, required for FoA in production. See: https://fburl.com/wiki/2wzjpyqy + support_exceptions: enable try/catch wrapper around operator implemntations to make sure exceptions thrown will not bring down the process. Disable if your use case disables exceptions in the build. 
""" if functions_yaml_target and aten_mode: fail("{} is providing functions_yaml_target in ATen mode, it will be ignored. `native_functions.yaml` will be the source of truth.".format(name)) @@ -534,6 +547,7 @@ def executorch_generated_lib( custom_ops_requires_runtime_registration = custom_ops_requires_runtime_registration, aten_mode = aten_mode, manual_registration = manual_registration, + support_exceptions = support_exceptions, ) # genrule for selective build from static operator list @@ -672,7 +686,7 @@ def executorch_generated_lib( platforms = platforms, ) -# Util macro that takes in a binary or a shared library, find targets ending with `_et_oplist` in the transitive closure of deps, +# Util macro that takes in a binary or a shared library, find targets ending with `_et_oplist` in the transitive closure of deps, # get the `selected_operators.yaml` from those targets, try to merge them into a single yaml. This target will fail to build, if # there are intersections of all `selected_operators.yaml` the `target` is depending on. # From 250d2fef69cd7817e379205caabfee2d07104da6 Mon Sep 17 00:00:00 2001 From: Github Executorch Date: Thu, 6 Feb 2025 19:14:58 -0800 Subject: [PATCH 3/9] Reuse GELU implementation from PyTorch core Pull Request resolved: https://github.com/pytorch/executorch/pull/7041 kernels/optimized doesn't need to support embedded systems, so it can just take a header-only dep on PyTorch. Note that, because we will pick up Sleef internally and ignore it externally thanks to ATen vec, this PR gets to enable optimized GELU in OSS. Testing: CI to make sure this doesn't break mobile build modes; happy to take advice on anything not currently covered that might break. ghstack-source-id: 265190627 @exported-using-ghexport Differential Revision: [D66335522](https://our.internmc.facebook.com/intern/diff/D66335522/) --- .ci/scripts/build_llama_android.sh | 8 +++ .ci/scripts/test_llama.sh | 1 + .ci/scripts/test_llava.sh | 10 ++-- .ci/scripts/test_model.sh | 5 +- .ci/scripts/test_phi_3_mini.sh | 4 ++ .ci/scripts/utils.sh | 1 + .github/workflows/pull.yml | 2 + .github/workflows/trunk.yml | 2 + build/Utils.cmake | 17 +++++++ build/build_android_llm_demo.sh | 8 +++ kernels/optimized/CMakeLists.txt | 2 + kernels/optimized/cpu/op_gelu.cpp | 51 ++++++------------- kernels/optimized/cpu/targets.bzl | 15 +++--- kernels/optimized/op_registration_util.bzl | 11 ++-- kernels/optimized/optimized-oss.yaml | 9 +++- .../optimized/op_registration_util.bzl | 2 +- test/run_oss_cpp_tests.sh | 7 +++ 17 files changed, 102 insertions(+), 53 deletions(-) diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 6b8f851d772..d37c65aa8ec 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -10,6 +10,12 @@ set -exu # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" +CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" + install_executorch_and_backend_lib() { echo "Installing executorch and xnnpack backend" clean_executorch_install_folders @@ -22,6 +28,7 @@ install_executorch_and_backend_lib() { -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -47,6 +54,7 @@ 
build_llama_runner() { -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -Bcmake-android-out/examples/models/llama examples/models/llama cmake --build cmake-android-out/examples/models/llama -j4 --config Release diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 9bb881ce8eb..9735e26798d 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -154,6 +154,7 @@ cmake_install_executorch_libraries() { rm -rf cmake-out retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" \ -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index a9e1313756c..c511942be91 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -30,9 +30,11 @@ fi NPROC=8 if hash nproc &> /dev/null; then NPROC=$(nproc); fi +python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') +CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ @@ -46,6 +48,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \ cmake_install_executorch_libraries() { cmake \ ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \ -B${BUILD_DIR} . cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} @@ -56,6 +59,7 @@ cmake_install_executorch_libraries_for_android() { -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=arm64-v8a \ ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \ -B${BUILD_DIR} . cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} @@ -76,7 +80,7 @@ cmake_build_llava_runner() { cmake \ ${LLAVA_COMMON_CMAKE_ARGS} \ - -DCMAKE_PREFIX_PATH="$python_lib" \ + -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \ -B${BUILD_DIR}/${dir} \ ${dir} @@ -92,7 +96,7 @@ cmake_build_llava_runner_for_android() { -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=arm64-v8a \ ${LLAVA_COMMON_CMAKE_ARGS} \ - -DCMAKE_PREFIX_PATH="$python_lib" \ + -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \ -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \ -B${BUILD_DIR}/${dir} \ ${dir} diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index b4fbc4486a2..ef4859135c6 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -50,10 +50,12 @@ prepare_artifacts_upload() { build_cmake_executor_runner() { echo "Building executor_runner" + CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" rm -rf ${CMAKE_OUTPUT_DIR} cmake -DCMAKE_BUILD_TYPE=Debug \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ -B${CMAKE_OUTPUT_DIR} . 
cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug @@ -98,8 +100,7 @@ test_model() { build_cmake_xnn_executor_runner() { echo "Building xnn_executor_runner" - SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" + CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" (rm -rf ${CMAKE_OUTPUT_DIR} \ && mkdir ${CMAKE_OUTPUT_DIR} \ diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh index 40767013e23..64dd6b829d8 100644 --- a/.ci/scripts/test_phi_3_mini.sh +++ b/.ci/scripts/test_phi_3_mini.sh @@ -22,8 +22,10 @@ NPROC=8 if hash nproc &> /dev/null; then NPROC=$(nproc); fi cmake_install_executorch_libraries() { + CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" cmake -DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ @@ -39,8 +41,10 @@ cmake_install_executorch_libraries() { } cmake_build_phi_3_mini() { + CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index be684b7bfa2..c21d0bb604e 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -136,6 +136,7 @@ cmake_install_executorch_lib() { clean_executorch_install_folders retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')" \ -DCMAKE_BUILD_TYPE=Release \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 59467e5d047..075ba39430a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -147,6 +147,8 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + source .ci/scripts/utils.sh + install_executorch "use-pt-pinned-commit" BUILD_TOOL="cmake" PYTHON_EXECUTABLE=python \ bash .ci/scripts/build_llama_android.sh "${BUILD_TOOL}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 04a6c96f3ec..18e34bff72a 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -394,6 +394,7 @@ jobs: rm -rf cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ @@ -411,6 +412,7 @@ jobs: cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/build/Utils.cmake b/build/Utils.cmake index 47bc46b18d3..a2f2b61225f 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -321,3 +321,20 @@ function(resolve_python_executable) ) endif() endfunction() + +# find_package(Torch CONFIG REQUIRED) replacement for targets that +# have a header-only Torch dependency. 
Because find_package sets +# variables in the parent scope, we use a macro to preserve this +# rather than maintaining our own list of those variables. +macro(find_package_torch_headers) + # We cannot simply use CMAKE_FIND_ROOT_PATH_BOTH, because that does + # not propagate into TorchConfig.cmake. + foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE) + set(OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}}) + set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} BOTH) + endforeach() + find_package(Torch CONFIG REQUIRED) + foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE) + set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}}) + endforeach() +endmacro() diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index f8ded210996..b72968037c1 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -7,6 +7,12 @@ set -ex +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" +CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" + build_jar() { pushd extension/android ./gradlew build @@ -36,6 +42,7 @@ build_android_native_library() { fi cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ -DANDROID_PLATFORM=android-26 \ @@ -69,6 +76,7 @@ build_android_native_library() { -DANDROID_ABI="${ANDROID_ABI}" \ -DANDROID_PLATFORM=android-26 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 99e388095f6..3aecc0e1b33 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -61,6 +61,8 @@ message("Generated files ${gen_command_sources}") list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(optimized_kernels ${_optimized_kernels__srcs}) +find_package_torch_headers() +target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS}) target_link_libraries( optimized_kernels PRIVATE executorch_core cpublas extension_threadpool ) diff --git a/kernels/optimized/cpu/op_gelu.cpp b/kernels/optimized/cpu/op_gelu.cpp index 88591323397..dcb6bbc4279 100644 --- a/kernels/optimized/cpu/op_gelu.cpp +++ b/kernels/optimized/cpu/op_gelu.cpp @@ -13,6 +13,7 @@ #include +#include #include #include #include @@ -47,48 +48,26 @@ void gelu( CTYPE* out_data = output.mutable_data_ptr(); size_t lim = input.numel(); - // TODO: Add fast path for tanh using sleef's tanh if (approximate == "tanh") { - // 0.5 * x * (1 + Tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)) - for (size_t i = 0; i < lim; ++i) { - const CTYPE x = in_data[i]; - const CTYPE kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; - const CTYPE kKappa = 0.044715; - auto x_cube = x * x * x; - auto inner = kBeta * (x + kKappa * x_cube); - out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::tanh(inner)); + using Vec = at::vec::Vectorized; + int i = 0; + for (; i < lim - (lim % Vec::size()); i += Vec::size()) { + Vec x = Vec::loadu(in_data + i); + at::native::vectorized_gelu_approximated_with_tanh(x).store(out_data + i); } - } else if (approximate == "none") { // dont appx - // GELU(x) = x * Φ(x) where Φ(x) is the is the Cumulative Distribution - // Function for Gaussian 
Distribution. - -#ifndef __aarch64__ - for (size_t i = 0; i < lim; ++i) { - const CTYPE x = in_data[i]; - out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::erf(x * M_SQRT1_2)); + for (; i < lim; ++i) { + out_data[i] = at::native::scalar_gelu_approximated_with_tanh(in_data[i]); } -#else - size_t i = 0; - if constexpr (std::is_same_v) { - for (; i + 4 < lim; i += 4) { - const float32x4_t in = - vld1q_f32(static_cast(&in_data[i])); - const float32x4_t m_sqrt1_2x4 = { - M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2}; - const float32x4_t ones = vmovq_n_f32(1.0); - const float32x4_t halves = vmovq_n_f32(0.5); - float32x4_t out = Sleef_erff4_u10(vmulq_f32(in, m_sqrt1_2x4)); - vst1q_f32( - static_cast(&out_data[i]), - vmulq_f32(vmulq_f32(vaddq_f32(out, ones), in), halves)); - } + } else if (approximate == "none") { + using Vec = at::vec::Vectorized; + int i = 0; + for (; i < lim - (lim % Vec::size()); i += Vec::size()) { + Vec x = Vec::loadu(in_data + i); + at::native::vectorized_gelu(x).store(out_data + i); } for (; i < lim; ++i) { - const CTYPE x = in_data[i]; - out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::erf(x * M_SQRT1_2)); + out_data[i] = at::native::scalar_gelu(in_data[i]); } -#endif // __aarch64__ - } else { ET_KERNEL_CHECK_MSG( context, diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index d97e1eb5122..d8d6cfdec71 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -28,13 +28,9 @@ _OPTIMIZED_ATEN_OPS = ( op_target(name = "op_sigmoid"), op_target( name = "op_gelu", - deps = select({ - "DEFAULT": [], - "ovr_config//cpu:arm64": [ - "fbsource//third-party/sleef:sleef_arm", - ], - }) + [ + deps = [ "//executorch/kernels/portable/cpu/util:activation_ops_util", + "//executorch/runtime/core/portable_type/c10:aten_headers_for_executorch", ], ), op_target( @@ -96,6 +92,13 @@ _OPTIMIZED_ATEN_OPS = ( ), ) + +def get_sleef_preprocessor_flags(): + if runtime.is_oss: + return [] + return ["-DAT_BUILD_ARM_VEC256_WITH_SLEEF"] + + def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. diff --git a/kernels/optimized/op_registration_util.bzl b/kernels/optimized/op_registration_util.bzl index 6839454be29..3af20680b5b 100644 --- a/kernels/optimized/op_registration_util.bzl +++ b/kernels/optimized/op_registration_util.bzl @@ -90,9 +90,14 @@ def define_op_library(name, deps): "//executorch/kernels/test/...", "@EXECUTORCH_CLIENTS", ], - # kernels often have helpers with no prototypes just disabling the warning here as the headers - # are codegend and linked in later - compiler_flags = ["-Wno-missing-prototypes"] + get_compiler_optimization_flags(), + compiler_flags = [ + # kernels often have helpers with no prototypes just disabling the warning here as the headers + # are codegend and linked in later + "-Wno-missing-prototypes", + # pragma unroll fails with -Os, don't need to warn us and + # fail Werror builds; see https://godbolt.org/z/zvf85vTsr + "-Wno-pass-failed", + ] + get_compiler_optimization_flags(), deps = [ "//executorch/runtime/kernel:kernel_includes", ] + augmented_deps + get_vec_deps(), diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml index 52262e2dd53..28f1d595272 100644 --- a/kernels/optimized/optimized-oss.yaml +++ b/kernels/optimized/optimized-oss.yaml @@ -1,8 +1,8 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # # This yaml file contains operators that have optimized kernels available. 
-# Note that this is a copy of optimized.yaml that does not include gelu and -# log_softmax, due to the OSS build not currently including sleef. +# Note that this is a copy of optimized.yaml that does not include log_softmax, +# due to the OSS build not currently including sleef. # TODO (T183193812) - op: add.out @@ -40,6 +40,11 @@ - arg_meta: null kernel_name: torch::executor::opt_sigmoid_out +- op: gelu.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_gelu_out + - op: le.Scalar_out kernels: - arg_meta: null diff --git a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl index 37a68abaa07..c079b97f634 100644 --- a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl @@ -134,5 +134,5 @@ def define_op_target(name, deps): def is_op_disabled(name): # TODO (gjcomer) Enable ops with sleef dependency in OSS - disabled_ops = ["op_gelu", "op_log_softmax"] + disabled_ops = ["op_log_softmax"] return name in disabled_ops diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 2c8685ea5b7..f8d0a361733 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -22,13 +22,20 @@ elif [[ $(uname) == "Linux" ]]; then export LLVM_COV="${LLVM_COV:-llvm-cov}" fi +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" + build_executorch() { BUILD_VULKAN="OFF" if [ -x "$(command -v glslc)" ]; then BUILD_VULKAN="ON" fi + CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" cmake . \ -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ From 2e2155314b2989b713ccd008fac844a55767d5e8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Feb 2025 11:27:08 -0800 Subject: [PATCH 4/9] Update CMakeLists.txt typo while merging, apparently --- kernels/optimized/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index d424f124cff..5338307c2b1 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -60,7 +60,7 @@ generate_bindings_for_kernels( message("Generated files ${gen_command_sources}") list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") -add_library(optimized_kernels ${_optimized_kernels__srcs} +add_library(optimized_kernels ${_optimized_kernels__srcs}) find_package_torch_headers() target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft") target_link_libraries( From d7b7aea41997f91024c5b5d9a19c49aa4444dc75 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 11 Feb 2025 16:28:52 -0800 Subject: [PATCH 5/9] Find Torch in module mode https://cmake.org/cmake/help/latest/command/find_package.html --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30f1f5c42cf..06eb1bdead8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -750,7 +750,7 @@ if(EXECUTORCH_BUILD_PYBIND) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output - find_package(Torch CONFIG REQUIRED) + find_package(Torch REQUIRED) find_library( TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" ) From 
e37a0f3b8bda89323163c86bd191246cd338f4d2 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 11 Feb 2025 16:40:31 -0800 Subject: [PATCH 6/9] Change some more --- examples/models/llava/CMakeLists.txt | 2 +- extension/llm/custom_ops/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index f22b4471538..30db61a8836 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -79,7 +79,7 @@ if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1) message("Buidling the runner without Torch, feeding a dummy image!") else() - find_package(Torch CONFIG REQUIRED) + find_package(Torch REQUIRED) endif() # diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 16ca4fff805..f511340563c 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -69,7 +69,7 @@ install(TARGETS custom_ops DESTINATION lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) # Add a AOT library - find_package(Torch CONFIG REQUIRED) + find_package(Torch REQUIRED) add_library( custom_ops_aot_lib SHARED ${_custom_ops__srcs} From 211131f9f4010e86ab70cc44e242bcc2fad52f20 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 11 Feb 2025 19:06:14 -0800 Subject: [PATCH 7/9] Revert "Change some more" This reverts commit e37a0f3b8bda89323163c86bd191246cd338f4d2. --- examples/models/llava/CMakeLists.txt | 2 +- extension/llm/custom_ops/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 30db61a8836..f22b4471538 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -79,7 +79,7 @@ if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1) message("Buidling the runner without Torch, feeding a dummy image!") else() - find_package(Torch REQUIRED) + find_package(Torch CONFIG REQUIRED) endif() # diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index f511340563c..16ca4fff805 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -69,7 +69,7 @@ install(TARGETS custom_ops DESTINATION lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) # Add a AOT library - find_package(Torch REQUIRED) + find_package(Torch CONFIG REQUIRED) add_library( custom_ops_aot_lib SHARED ${_custom_ops__srcs} From 169c57d8b11eb6f1cf08f84e51a6eecd556b27ae Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 11 Feb 2025 19:06:34 -0800 Subject: [PATCH 8/9] Revert "Find Torch in module mode" This reverts commit d7b7aea41997f91024c5b5d9a19c49aa4444dc75. 
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06eb1bdead8..30f1f5c42cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -750,7 +750,7 @@ if(EXECUTORCH_BUILD_PYBIND) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output - find_package(Torch REQUIRED) + find_package(Torch CONFIG REQUIRED) find_library( TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" ) From f705b01a51486e1cbe318d142015da35a83e2578 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 11 Feb 2025 20:08:39 -0800 Subject: [PATCH 9/9] Fix multiple MKL targets --- CMakeLists.txt | 6 +++++- build/Codegen.cmake | 4 +++- build/Utils.cmake | 4 +++- extension/llm/custom_ops/CMakeLists.txt | 4 +++- kernels/optimized/CMakeLists.txt | 1 - 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30f1f5c42cf..c37dcbc7a1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -614,6 +614,8 @@ if(BUILD_EXECUTORCH_PORTABLE_OPS) endif() if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + # find pytorch lib here to make it available to all sub-directories + find_package_torch_headers() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized) endif() @@ -750,7 +752,9 @@ if(EXECUTORCH_BUILD_PYBIND) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output - find_package(Torch CONFIG REQUIRED) + if(NOT TARGET torch) + find_package(Torch CONFIG REQUIRED) + endif() find_library( TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" ) diff --git a/build/Codegen.cmake b/build/Codegen.cmake index c36b1a26b40..f2da23baeaa 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -146,7 +146,9 @@ function(gen_custom_ops_aot_lib) ${_out_dir}/CustomOpsNativeFunctions.h "${GEN_KERNEL_SOURCES}" ) # Find `Torch`. - find_package(Torch REQUIRED) + if(NOT TARGET torch) + find_package(Torch REQUIRED) + endif() # This lib uses ATen lib, so we explicitly enable rtti and exceptions. 
target_compile_options(${GEN_LIB_NAME} PRIVATE -frtti -fexceptions) target_compile_definitions(${GEN_LIB_NAME} PRIVATE USE_ATEN_LIB=1) diff --git a/build/Utils.cmake b/build/Utils.cmake index 991591e207e..dca3f189ec4 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -336,7 +336,9 @@ macro(find_package_torch_headers) set(OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}}) set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} BOTH) endforeach() - find_package(Torch CONFIG REQUIRED) + if(NOT TARGET torch) + find_package(Torch CONFIG REQUIRED) + endif() foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE) set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}}) endforeach() diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 16ca4fff805..4b793905339 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -69,7 +69,9 @@ install(TARGETS custom_ops DESTINATION lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) # Add a AOT library - find_package(Torch CONFIG REQUIRED) + if(NOT TARGET torch) + find_package(Torch CONFIG REQUIRED) + endif() add_library( custom_ops_aot_lib SHARED ${_custom_ops__srcs} diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 5338307c2b1..1f3aff57ecf 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -61,7 +61,6 @@ message("Generated files ${gen_command_sources}") list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(optimized_kernels ${_optimized_kernels__srcs}) -find_package_torch_headers() target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft") target_link_libraries( optimized_kernels PRIVATE executorch_core cpublas extension_threadpool