diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index ee800549518..75a95d0522b 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-59d5cf083b4f860dea76fe8936076177f9367f10
+01f1cc44cbbfdf6307aa01b803a4ee22f9ade946
diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake
index fef63badf23..64d2409fb61 100644
--- a/backends/xnnpack/cmake/Dependencies.cmake
+++ b/backends/xnnpack/cmake/Dependencies.cmake
@@ -35,7 +35,11 @@ set(XNNPACK_BUILD_TESTS
 set(XNNPACK_ENABLE_AVXVNNI
     OFF
     CACHE BOOL ""
-)
+  )
+# Disable AVX512VNNIGFNI to work around the failure observed in https://github.com/pytorch/executorch/pull/10362#issuecomment-2906391232
+set(XNNPACK_ENABLE_AVX512VNNIGFNI
+  OFF
+  CACHE BOOL "")
 
 if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
     set(XNNPACK_ENABLE_KLEIDIAI
diff --git a/install_requirements.py b/install_requirements.py
index 567dca4ebf1..2fcd65ea338 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -71,7 +71,7 @@ def python_is_compatible():
 #
 # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
 # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-NIGHTLY_VERSION = "dev20250524"
+NIGHTLY_VERSION = "dev20250422"
 
 
 def install_requirements(use_pytorch_nightly):
diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h
index 919eb6c8567..7e61ad7e26b 100644
--- a/runtime/core/portable_type/c10/c10/macros/Macros.h
+++ b/runtime/core/portable_type/c10/c10/macros/Macros.h
@@ -508,4 +508,14 @@ __host__ __device__
 
 #endif
 
+// Set to 1 on older C++ compilers (__GNUC__ < 13 or __clang_major__ < 13)
+// that don't support move optimization for return values, and to 0 otherwise.
+
+#if (defined(__GNUC__) && __GNUC__ < 13) || \
+    (defined(__clang_major__) && __clang_major__ < 13)
+#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
+#else
+#define C10_RETURN_MOVE_IF_OLD_COMPILER 0
+#endif
+
 #endif // C10_MACROS_MACROS_H_
diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16.h b/runtime/core/portable_type/c10/c10/util/BFloat16.h
index 09d3051ab71..93d0ec54fb0 100644
--- a/runtime/core/portable_type/c10/c10/util/BFloat16.h
+++ b/runtime/core/portable_type/c10/c10/util/BFloat16.h
@@ -31,7 +31,7 @@ inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) {
   uint32_t tmp = src;
   tmp <<= 16;
 
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && defined(__HIPCC__)
   float* tempRes;
 
   // We should be using memcpy in order to respect the strict aliasing rule
@@ -48,7 +48,7 @@ inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) {
 inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) {
   uint32_t res = 0;
 
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && defined(__HIPCC__)
   // We should be using memcpy in order to respect the strict aliasing rule
   // but it fails in the HIP environment.
   uint32_t* tempRes = reinterpret_cast<uint32_t*>(&src);
@@ -61,7 +61,7 @@ inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) {
 }
 
 inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && defined(__HIPCC__)
   if (src != src) {
 #elif defined(_MSC_VER)
   if (isnan(src)) {
@@ -87,7 +87,7 @@ struct alignas(2) BFloat16 {
   uint16_t x;
 
   // HIP wants __host__ __device__ tag, CUDA does not
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && defined(__HIPCC__)
   C10_HOST_DEVICE BFloat16() = default;
 #else
   BFloat16() = default;