diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index a846b87c198..ee800549518 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-7ae0ce6360b6e4f944906502d20da24c04debee5
+59d5cf083b4f860dea76fe8936076177f9367f10
diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py
index dc5ecc7ca97..b293555e66b 100644
--- a/backends/arm/test/models/test_conformer.py
+++ b/backends/arm/test/models/test_conformer.py
@@ -31,7 +31,7 @@ class TestConformer(unittest.TestCase):
     # .to_executorch step, i.e. after Arm partitioner.
     ops_after_partitioner = {
         "executorch_exir_dialects_edge__ops_aten_max_default": 1,
-        "torch.ops.aten._assert_scalar.default": 10,
+        "torch.ops.aten._assert_scalar.default": 7,
         "torch.ops.aten._local_scalar_dense.default": 1,
     }
 
diff --git a/install_requirements.py b/install_requirements.py
index 6c3b4186697..4c468b979d1 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -71,7 +71,7 @@ def python_is_compatible():
 #
 # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
 # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-NIGHTLY_VERSION = "dev20250310"
+NIGHTLY_VERSION = "dev20250325"
 
 
 def install_requirements(use_pytorch_nightly):
@@ -80,7 +80,7 @@ def install_requirements(use_pytorch_nightly):
         # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
         # that we don't need to set any version number there because they have already
         # been installed on CI before this step, so pip won't reinstall them
-        f"torch==2.7.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
+        f"torch==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
         (
             f"torchvision==0.22.0.{NIGHTLY_VERSION}"
             if use_pytorch_nightly
diff --git a/kernels/optimized/cpu/op_elu.cpp b/kernels/optimized/cpu/op_elu.cpp
new file mode 100644
index 00000000000..d6c3f2b4840
--- /dev/null
+++ b/kernels/optimized/cpu/op_elu.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/native/cpu/Elu.h>
+
+#include <executorch/extension/threadpool/threadpool.h>
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
+
+namespace torch::executor::native {
+
+namespace {
+template <typename CTYPE>
+void elu(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Scalar& alpha,
+    const Scalar& scale,
+    const Scalar& input_scale,
+    Tensor& out) {
+  const CTYPE* in_data = input.const_data_ptr<CTYPE>();
+  CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
+  using MathT =
+      std::conditional_t<c10::is_reduced_floating_point_v<CTYPE>, float, CTYPE>;
+  MathT math_alpha = 0;
+  MathT math_scale = 0;
+  MathT math_input_scale = 0;
+  ET_EXTRACT_SCALAR(alpha, math_alpha);
+  ET_EXTRACT_SCALAR(scale, math_scale);
+  ET_EXTRACT_SCALAR(input_scale, math_input_scale);
+  const auto scalar_func =
+      at::native::get_scalar_elu_elementwise_func<CTYPE, MathT>(
+          math_alpha, math_scale, math_input_scale);
+  const auto vec_func = at::native::get_vectorized_elu_elementwise_func<CTYPE>(
+      math_alpha, math_scale, math_input_scale);
+
+  ::executorch::extension::parallel_for(
+      0,
+      out.numel(),
+      ::executorch::extension::internal::GRAIN_SIZE,
+      [&](const auto begin, const auto end) {
+        using Vec = at::vec::Vectorized<CTYPE>;
+        const auto vectorized_begin =
+            begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+        const auto vectorized_end = end - (end % Vec::size());
+        // Scalar prologue.
+        for (const auto idx : c10::irange(begin, vectorized_begin)) {
+          out_data[idx] = scalar_func(in_data[idx]);
+        }
+
+        // Main vectorized loop.
+        for (auto idx = vectorized_begin; idx < vectorized_end;
+             idx += Vec::size()) {
+          auto result_vec = vec_func(Vec::loadu(&in_data[idx]));
+          result_vec.store(&out_data[idx]);
+        }
+
+        // Scalar epilogue.
+        for (const auto idx : c10::irange(vectorized_end, end)) {
+          out_data[idx] = scalar_func(in_data[idx]);
+        }
+      });
+}
+} // namespace
+
+Tensor& opt_elu_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Scalar& alpha,
+    const Scalar& scale,
+    const Scalar& input_scale,
+    Tensor& out) {
+  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
+
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);
+
+  ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, "elu.out", CTYPE, [&]() {
+    elu<CTYPE>(ctx, in, alpha, scale, input_scale, out);
+  });
+  return out;
+}
+
+} // namespace torch::executor::native
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index 017dff8a127..cf7cb2f00e1 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -25,6 +25,14 @@ _OPTIMIZED_ATEN_OPS = (
             "//executorch/kernels/portable/cpu/util:broadcast_util",
         ],
     ),
+    op_target(
+        name = "op_elu",
+        deps = [
+            "//executorch/extension/threadpool:threadpool",
+            "//executorch/kernels/portable/cpu:scalar_utils",
+            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
+        ],
+    ),
     op_target(name = "op_exp"),
     op_target(
         name = "op_fft_r2c",
diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml
index 4f90059aa93..864c3ed5780 100644
--- a/kernels/optimized/optimized.yaml
+++ b/kernels/optimized/optimized.yaml
@@ -37,6 +37,11 @@
     - arg_meta: null
       kernel_name: torch::executor::opt_div_scalar_out
 
+- op: elu.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_elu_out
+
 - op: exp.out
   kernels:
     - arg_meta: null
diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt
index 42578acbedd..2d497dfc124 100644
--- a/kernels/test/CMakeLists.txt
+++ b/kernels/test/CMakeLists.txt
@@ -274,6 +274,7 @@ set(_optimized_kernels_test_sources
   "op_add_test.cpp"
   "op_bmm_test.cpp"
   "op_div_test.cpp"
+  "op_elu_test.cpp"
   "op_exp_test.cpp"
   "op_fft_r2c_test.cpp"
   "op_gelu_test.cpp"
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 3824551a46b..05e678c6229 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -215,7 +215,7 @@ def define_common_targets():
     _common_op_test("op_detach_copy_test", ["aten", "portable"])
     _common_op_test("op_diagonal_copy_test", ["aten", "portable"])
     _common_op_test("op_div_test", ["aten", "portable", "optimized"])
-    _common_op_test("op_elu_test", ["aten", "portable"])
+    _common_op_test("op_elu_test", ["aten", "portable", "optimized"])
     _common_op_test("op_embedding_test", ["aten", "portable"])
     _common_op_test("op_empty_test", ["aten", "portable"])
     _common_op_test("op_eq_test", ["aten", "portable"])
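
A note on the new `op_elu.cpp` (commentary, not part of the diff): the heart of the kernel is how each `parallel_for` work chunk `[begin, end)` is split into a scalar prologue, a run of full vectors, and a scalar epilogue, so the vectorized loop only ever loads and stores whole `Vec::size()`-wide groups that fit inside the chunk. The sketch below replays that index arithmetic as a standalone C++ program; `kVecSize`, the sample `begin`/`end` bounds, and `main` are hypothetical stand-ins for `at::vec::Vectorized<CTYPE>::size()` and a real work chunk, and like the kernel it assumes chunks are at least one vector wide (which `GRAIN_SIZE`-sized chunks are in practice).

```cpp
// Standalone sketch of the prologue/vector/epilogue split in op_elu.cpp.
// kVecSize and the [begin, end) bounds are illustrative stand-ins only.
#include <cstdio>

constexpr long kVecSize = 8; // e.g. Vectorized<float>::size() with AVX2

int main() {
  const long begin = 3, end = 29; // assumes end - begin >= kVecSize
  // Round begin up to the next multiple of kVecSize...
  const long vectorized_begin =
      begin + (kVecSize - begin % kVecSize) % kVecSize;
  // ...and round end down to the previous multiple.
  const long vectorized_end = end - (end % kVecSize);

  std::printf("scalar prologue: [%ld, %ld)\n", begin, vectorized_begin);
  for (long idx = vectorized_begin; idx < vectorized_end; idx += kVecSize) {
    std::printf("full vector at:  [%ld, %ld)\n", idx, idx + kVecSize);
  }
  std::printf("scalar epilogue: [%ld, %ld)\n", vectorized_end, end);
  return 0;
}
```

With `begin = 3`, `end = 29`, and `kVecSize = 8`, this prints a `[3, 8)` prologue, full vectors at `[8, 16)` and `[16, 24)`, and a `[24, 29)` epilogue: the same index ranges the kernel's three loops would touch for that chunk. Since the real kernel uses `loadu`/`store`, unaligned vector accesses are legal; rounding `end` down is what keeps whole vectors inside the chunk, while rounding `begin` up keeps the vector loop's accesses at consistent width-aligned offsets.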