diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index a846b87c198..ee800549518 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-7ae0ce6360b6e4f944906502d20da24c04debee5
+59d5cf083b4f860dea76fe8936076177f9367f10
diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py
index dc5ecc7ca97..b293555e66b 100644
--- a/backends/arm/test/models/test_conformer.py
+++ b/backends/arm/test/models/test_conformer.py
@@ -31,7 +31,7 @@ class TestConformer(unittest.TestCase):
     # .to_executorch step, i.e. after Arm partitioner.
     ops_after_partitioner = {
         "executorch_exir_dialects_edge__ops_aten_max_default": 1,
-        "torch.ops.aten._assert_scalar.default": 10,
+        "torch.ops.aten._assert_scalar.default": 7,
         "torch.ops.aten._local_scalar_dense.default": 1,
     }
 
diff --git a/install_requirements.py b/install_requirements.py
index 6c3b4186697..4c468b979d1 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -71,7 +71,7 @@ def python_is_compatible():
 #
 # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
 # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-NIGHTLY_VERSION = "dev20250310"
+NIGHTLY_VERSION = "dev20250325"
 
 
 def install_requirements(use_pytorch_nightly):
@@ -80,7 +80,7 @@ def install_requirements(use_pytorch_nightly):
         # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
         # that we don't need to set any version number there because they have already
         # been installed on CI before this step, so pip won't reinstall them
-        f"torch==2.7.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
+        f"torch==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
         (
             f"torchvision==0.22.0.{NIGHTLY_VERSION}"
             if use_pytorch_nightly
diff --git a/kernels/optimized/cpu/op_elu.cpp b/kernels/optimized/cpu/op_elu.cpp
new file mode 100644
index 00000000000..d6c3f2b4840
--- /dev/null
+++ b/kernels/optimized/cpu/op_elu.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/native/cpu/Elu.h>
+
+#include <executorch/extension/threadpool/threadpool.h>
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
+
+namespace torch::executor::native {
+
+namespace {
+template <typename CTYPE>
+void elu(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Scalar& alpha,
+    const Scalar& scale,
+    const Scalar& input_scale,
+    Tensor& out) {
+  const CTYPE* in_data = input.const_data_ptr<CTYPE>();
+  CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
+  using MathT =
+      std::conditional_t<c10::is_reduced_floating_point_v<CTYPE>, float, CTYPE>;
+  MathT math_alpha = 0;
+  MathT math_scale = 0;
+  MathT math_input_scale = 0;
+  ET_EXTRACT_SCALAR(alpha, math_alpha);
+  ET_EXTRACT_SCALAR(scale, math_scale);
+  ET_EXTRACT_SCALAR(input_scale, math_input_scale);
+  const auto scalar_func =
+      at::native::get_scalar_elu_elementwise_func<CTYPE, MathT>(
+          math_alpha, math_scale, math_input_scale);
+  const auto vec_func = at::native::get_vectorized_elu_elementwise_func<CTYPE>(
+      math_alpha, math_scale, math_input_scale);
+
+  ::executorch::extension::parallel_for(
+      0,
+      out.numel(),
+      ::executorch::extension::internal::GRAIN_SIZE,
+      [&](const auto begin, const auto end) {
+        using Vec = at::vec::Vectorized<CTYPE>;
+        const auto vectorized_begin =
+            begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+        const auto vectorized_end = end - (end % Vec::size());
+        // Scalar prologue.
+        for (const auto idx : c10::irange(begin, vectorized_begin)) {
+          out_data[idx] = scalar_func(in_data[idx]);
+        }
+
+        // Main vectorized loop.
+        for (auto idx = vectorized_begin; idx < vectorized_end;
+             idx += Vec::size()) {
+          auto result_vec = vec_func(Vec::loadu(&in_data[idx]));
+          result_vec.store(&out_data[idx]);
+        }
+
+        // Scalar epilogue.
+        for (const auto idx : c10::irange(vectorized_end, end)) {
+          out_data[idx] = scalar_func(in_data[idx]);
+        }
+      });
+}
+} // namespace
+
+Tensor& opt_elu_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Scalar& alpha,
+    const Scalar& scale,
+    const Scalar& input_scale,
+    Tensor& out) {
+  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
+
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);
+
+  ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, "elu.out", CTYPE, [&]() {
+    elu<CTYPE>(ctx, in, alpha, scale, input_scale, out);
+  });
+  return out;
+}
+
+} // namespace torch::executor::native
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index 017dff8a127..cf7cb2f00e1 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -25,6 +25,14 @@ _OPTIMIZED_ATEN_OPS = (
             "//executorch/kernels/portable/cpu/util:broadcast_util",
         ],
     ),
+    op_target(
+        name = "op_elu",
+        deps = [
+            "//executorch/extension/threadpool:threadpool",
+            "//executorch/kernels/portable/cpu:scalar_utils",
+            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
+        ],
+    ),
     op_target(name = "op_exp"),
     op_target(
         name = "op_fft_r2c",
diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml
index 4f90059aa93..864c3ed5780 100644
--- a/kernels/optimized/optimized.yaml
+++ b/kernels/optimized/optimized.yaml
@@ -37,6 +37,11 @@
     - arg_meta: null
       kernel_name: torch::executor::opt_div_scalar_out
 
+- op: elu.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_elu_out
+
 - op: exp.out
   kernels:
     - arg_meta: null
diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt
index 42578acbedd..2d497dfc124 100644
--- a/kernels/test/CMakeLists.txt
+++ b/kernels/test/CMakeLists.txt
@@ -274,6 +274,7 @@ set(_optimized_kernels_test_sources
   "op_add_test.cpp"
   "op_bmm_test.cpp"
   "op_div_test.cpp"
+  "op_elu_test.cpp"
   "op_exp_test.cpp"
   "op_fft_r2c_test.cpp"
   "op_gelu_test.cpp"
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 3824551a46b..05e678c6229 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -215,7 +215,7 @@ def define_common_targets():
     _common_op_test("op_detach_copy_test", ["aten", "portable"])
     _common_op_test("op_diagonal_copy_test", ["aten", "portable"])
     _common_op_test("op_div_test", ["aten", "portable", "optimized"])
-    _common_op_test("op_elu_test", ["aten", "portable"])
+    _common_op_test("op_elu_test", ["aten", "portable", "optimized"])
     _common_op_test("op_embedding_test", ["aten", "portable"])
     _common_op_test("op_empty_test", ["aten", "portable"])
     _common_op_test("op_eq_test", ["aten", "portable"])
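
A note on the new `op_elu.cpp` (commentary, not part of the diff): the heart of the kernel is how each `parallel_for` work chunk `[begin, end)` is split into a scalar prologue, a run of full vectors, and a scalar epilogue, so the vectorized loop only ever loads and stores whole `Vec::size()`-wide groups that fit inside the chunk. The sketch below replays that index arithmetic as a standalone C++ program; `kVecSize`, the sample `begin`/`end` bounds, and `main` are hypothetical stand-ins for `at::vec::Vectorized<CTYPE>::size()` and a real work chunk, and like the kernel it assumes chunks are at least one vector wide (which `GRAIN_SIZE`-sized chunks are in practice).

```cpp
// Standalone sketch of the prologue/vector/epilogue split in op_elu.cpp.
// kVecSize and the [begin, end) bounds are illustrative stand-ins only.
#include <cstdio>

constexpr long kVecSize = 8; // e.g. Vectorized<float>::size() with AVX2

int main() {
  const long begin = 3, end = 29; // assumes end - begin >= kVecSize
  // Round begin up to the next multiple of kVecSize...
  const long vectorized_begin =
      begin + (kVecSize - begin % kVecSize) % kVecSize;
  // ...and round end down to the previous multiple.
  const long vectorized_end = end - (end % kVecSize);

  std::printf("scalar prologue: [%ld, %ld)\n", begin, vectorized_begin);
  for (long idx = vectorized_begin; idx < vectorized_end; idx += kVecSize) {
    std::printf("full vector at:  [%ld, %ld)\n", idx, idx + kVecSize);
  }
  std::printf("scalar epilogue: [%ld, %ld)\n", vectorized_end, end);
  return 0;
}
```

With `begin = 3`, `end = 29`, and `kVecSize = 8`, this prints a `[3, 8)` prologue, full vectors at `[8, 16)` and `[16, 24)`, and a `[24, 29)` epilogue: the same index ranges the kernel's three loops would touch for that chunk. Since the real kernel uses `loadu`/`store`, unaligned vector accesses are legal; rounding `end` down is what keeps whole vectors inside the chunk, while rounding `begin` up keeps the vector loop's accesses at consistent width-aligned offsets.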