pytorch · swolchok · Apr 23, 2025 · Mar 19, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # find pytorch lib here to make it available to all
+  # sub-directories. Find it before including portable so that
+  # optimized_portable_kernels can use it.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
 

diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
@@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
 )

diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
@@ -66,13 +66,13 @@ gen_operators_lib(
 # Portable kernels support optional parallelization (and, in the
 # future, perhaps other performance features). If support is present,
 # produce an optimized version.
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
-
-if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   add_library(optimized_portable_kernels ${_portable_kernels__srcs})
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
+  target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
+  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
   install(
     TARGETS optimized_portable_kernels
     DESTINATION lib

@@ -52,7 +52,10 @@ Tensor& mul_out(
       out);
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+    utils::apply_bitensor_elementwise_fn<
+        CTYPE_COMPUTE,
+        op_name,
+        utils::SupportedTensorDtypes::REALHBBF16>(
         [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
           return val_a * val_b;
         },
@@ -61,8 +64,7 @@ Tensor& mul_out(
         utils::SupportedTensorDtypes::REALHBBF16,
         b,
         utils::SupportedTensorDtypes::REALHBBF16,
-        out,
-        utils::SupportedTensorDtypes::REALHBBF16);
+        out);
   });
 
   return out;

@@ -290,6 +290,25 @@ bool check_tensor_dtype(
     SupportedTensorDtypes dtypes,
     const ScalarType compute_type);
 
+/// Return the one output type we are willing to emit specialized code
+/// to handle, given a compute type of CTYPE_COMMON and supported
+/// output types of out_dtypes.
+template <typename CTYPE_COMMON>
+inline constexpr ScalarType specialized_output_scalar_type(
+    SupportedTensorDtypes out_dtypes) {
+  switch (out_dtypes) {
+    case SupportedTensorDtypes::BOOL_OR_BYTE:
+      return ScalarType::Bool;
+    case SupportedTensorDtypes::REALHBBF16:
+    case SupportedTensorDtypes::REALHBF16:
+    case SupportedTensorDtypes::FLOATHBF16:
+    case SupportedTensorDtypes::INTB:
+    case SupportedTensorDtypes::SAME_AS_COMPUTE:
+    case SupportedTensorDtypes::SAME_AS_COMMON:
+      return CppTypeToScalarType<CTYPE_COMMON>::value;
+  }
+}
+
 } // namespace internal
 } // namespace utils
 } // namespace native

@@ -53,10 +53,44 @@ inline int64_t scalar_to<int64_t>(const Scalar& s) {
 namespace internal {
 template <
     typename CTYPE_COMPUTE,
-    const char* op_name,
+    typename CTYPE_OUT,
     typename Op,
     typename... Args>
-inline void apply_elementwise_fn(
+inline void dtype_specialized_elementwise_fn_impl(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& out,
+    Args... inputs) {
+  constexpr auto kNumInputs = sizeof...(inputs);
+  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMPUTE)) && ...));
+
+  ::executorch::extension::parallel_for(
+      0,
+      out.numel(),
+      ::executorch::extension::internal::GRAIN_SIZE,
+      [&](const auto begin, const auto end) {
+        std::array<const CTYPE_COMPUTE*, kNumInputs> inputs_data_ptrs = {
+            inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
+
+        CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
+
+        const auto range =
+            BroadcastIndexesRange<kNumInputs>(out, (*inputs.first)...);
+        auto begin_it = range.begin();
+        begin_it += begin;
+        for (; (*begin_it)[0] < end; ++begin_it) {
+          const auto& indexes = *begin_it;
+          std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
+          for (const auto idx : c10::irange(kNumInputs)) {
+            loaded_inputs[idx] = inputs_data_ptrs[idx][indexes[idx + 1]];
+          }
+          data_out[indexes[0]] = std::apply(compute_fun, loaded_inputs);
+        }
+      });
+}
+
+template <typename CTYPE_COMPUTE, typename Op, typename... Args>
+inline bool validate_elementwise_fn_inputs(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
     const Tensor& out,
@@ -65,7 +99,6 @@ inline void apply_elementwise_fn(
   static_assert(
       (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
        ...));
-  constexpr auto kNumInputs = sizeof...(inputs);
   constexpr auto compute_type = CppTypeToScalarType<CTYPE_COMPUTE>::value;
   const auto check_input_dtype = [](auto input, auto compute_type) {
     return internal::check_tensor_dtype(
@@ -75,7 +108,24 @@ inline void apply_elementwise_fn(
       ctx,
       (check_input_dtype(inputs, compute_type) && ...) &&
           internal::check_tensor_dtype(out, out_dtypes, compute_type),
-      InvalidArgument, );
+      InvalidArgument,
+      false);
+
+  return true;
+}
+
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    typename Op,
+    typename... Args>
+inline void apply_elementwise_fn_generic_impl(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes,
+    Args... inputs) {
+  constexpr auto kNumInputs = sizeof...(inputs);
 
   struct InputInfo {
     load_to_compute_fn<CTYPE_COMPUTE> load_to_compute;
@@ -120,6 +170,64 @@ inline void apply_elementwise_fn(
       });
 }
 
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    typename Op,
+    typename... Args>
+inline void apply_elementwise_fn_runtime_out_dtypes(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes,
+    Args... inputs) {
+  const bool inputs_valid = validate_elementwise_fn_inputs<CTYPE_COMPUTE>(
+      compute_fun, ctx, out, out_dtypes, inputs...);
+  if (!inputs_valid) {
+    return;
+  }
+
+  apply_elementwise_fn_generic_impl<CTYPE_COMPUTE, op_name>(
+      compute_fun, ctx, out, out_dtypes, inputs...);
+}
+
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op,
+    typename... Args>
+inline void apply_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& out,
+    Args... inputs) {
+  const bool inputs_valid = validate_elementwise_fn_inputs<CTYPE_COMPUTE>(
+      compute_fun, ctx, out, out_dtypes, inputs...);
+  if (!inputs_valid) {
+    return;
+  }
+
+  constexpr auto compute_type = CppTypeToScalarType<CTYPE_COMPUTE>::value;
+  const bool all_inputs_compute_dtype =
+      ((inputs.first->scalar_type() == compute_type) && ...);
+
+  constexpr ScalarType out_specialized_scalar_type =
+      specialized_output_scalar_type<CTYPE_COMPUTE>(out_dtypes);
+  if (all_inputs_compute_dtype &&
+      out.scalar_type() == out_specialized_scalar_type) {
+    using CTYPE_OUT =
+        typename ScalarTypeToCppType<out_specialized_scalar_type>::type;
+    dtype_specialized_elementwise_fn_impl<CTYPE_COMPUTE, CTYPE_OUT>(
+        compute_fun, ctx, out, inputs...);
+    return;
+  }
+
+  apply_elementwise_fn_generic_impl<CTYPE_COMPUTE, op_name>(
+      compute_fun, ctx, out, out_dtypes, inputs...);
+}
+
+/// DEPRECATED: prefer the variant with out_dtypes in the template argument.
 template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
 inline void apply_unitensor_elementwise_fn(
     const Op& compute_fun,
@@ -128,32 +236,96 @@ inline void apply_unitensor_elementwise_fn(
     SupportedTensorDtypes a_dtypes,
     const Tensor& out,
     SupportedTensorDtypes out_dtypes) {
-  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+  internal::apply_elementwise_fn_runtime_out_dtypes<CTYPE_COMPUTE, op_name>(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
+inline void apply_unitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name, out_dtypes>(
+      compute_fun, ctx, out, std::make_pair(&a, a_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+inline void apply_bitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes) {
+  internal::apply_elementwise_fn_runtime_out_dtypes<CTYPE_COMPUTE, op_name>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
 /**
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
  */
-template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
 inline void apply_bitensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
     const Tensor& a,
     SupportedTensorDtypes a_dtypes,
     const Tensor& b,
     SupportedTensorDtypes b_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name, out_dtypes>(
+      compute_fun,
+      ctx,
+      out,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+inline void apply_tritensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& c,
+    SupportedTensorDtypes c_dtypes,
     const Tensor& out,
     SupportedTensorDtypes out_dtypes) {
-  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+  internal::apply_elementwise_fn_runtime_out_dtypes<CTYPE_COMPUTE, op_name>(
       compute_fun,
       ctx,
       out,
       out_dtypes,
       std::make_pair(&a, a_dtypes),
-      std::make_pair(&b, b_dtypes));
+      std::make_pair(&b, b_dtypes),
+      std::make_pair(&c, c_dtypes));
 }
 
 /**
@@ -176,7 +348,11 @@ inline void apply_bitensor_elementwise_fn(
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMPUTE, op_name>.
  */
-template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
 inline void apply_tritensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
@@ -186,13 +362,11 @@ inline void apply_tritensor_elementwise_fn(
     SupportedTensorDtypes b_dtypes,
     const Tensor& c,
     SupportedTensorDtypes c_dtypes,
-    const Tensor& out,
-    SupportedTensorDtypes out_dtypes) {
-  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name, out_dtypes>(
       compute_fun,
       ctx,
       out,
-      out_dtypes,
       std::make_pair(&a, a_dtypes),
       std::make_pair(&b, b_dtypes),
       std::make_pair(&c, c_dtypes));

diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -73,6 +73,7 @@ def define_common_targets():
             # -Wmacro-redefined, and we only care about getting
             # reasonable vectorization and Sleef support.
             "-DCPU_CAPABILITY_AVX2",
+            "-DET_USE_PYTORCH_HEADERS",
             "-DHAVE_AVX2_CPU_DEFINITION",
             "-DSTANDALONE_TORCH_HEADER",
         ] + get_sleef_preprocessor_flags(),
@@ -86,5 +87,5 @@ def define_common_targets():
             # linker failure.
             "ovr_config//cpu:arm64": get_sleef_preprocessor_flags(),
             "DEFAULT": [],
-        }) + ["-DSTANDALONE_TORCH_HEADER"],
+        }) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]),
     )