bump up pytorch nightly version (#12407)

Gasoonjia · facebook-github-bot · commit 0944897f0edd · 2025-07-11T15:59:07.000-07:00
Summary:

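As the title says: bumps the PyTorch nightly pin to dev20250711 (commit 87b2ac5d1992c9b4f4167517251b99b513c328a8) and updates the c10 headers under runtime/core/portable_type (BFloat16.h, Half.h, bit_cast.h).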

Reviewed By: shoumikhin

Differential Revision: D78181215
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-7cda4017ddda554752e89069ae205be5e8388f59
+87b2ac5d1992c9b4f4167517251b99b513c328a8
diff --git a/install_requirements.py b/install_requirements.py
@@ -71,7 +71,7 @@ def python_is_compatible():
 #
 # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
 # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-NIGHTLY_VERSION = "dev20250706"
+NIGHTLY_VERSION = "dev20250711"
 
 
 def install_requirements(use_pytorch_nightly):
diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16.h b/runtime/core/portable_type/c10/c10/util/BFloat16.h
@@ -4,6 +4,7 @@
 // 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
 
 #include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
 #include <cmath>
 #include <cstdint>
 #include <cstring>
@@ -67,13 +68,123 @@ inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
 #endif
     return UINT16_C(0x7FC0);
   } else {
-    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-    union {
-      uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
-      float F32; // NOLINT(facebook-hte-BadMemberName)
-    };
+    const uint32_t U32 = c10::bit_cast<uint32_t>(src);
+    uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
+    return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
+  }
+}
+} // namespace detail
+
+struct alignas(2) BFloat16 {
+  uint16_t x;
+
+  // HIP wants __host__ __device__ tag, CUDA does not
+#if defined(USE_ROCM) && defined(__HIPCC__)
+  C10_HOST_DEVICE BFloat16() = default;
+#else
+  BFloat16() = default;
+#endif
+
+  struct from_bits_t {};
+  static constexpr C10_HOST_DEVICE from_bits_t from_bits() {
+    return from_bits_t();
+  }
+
+  constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t)
+      : x(bits) {}
+  /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
+  inline C10_HOST_DEVICE operator float() const;
+
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+  inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
+  explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
+#endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+  inline C10_HOST_DEVICE BFloat16(const sycl::ext::oneapi::bfloat16& value);
+  explicit inline C10_HOST_DEVICE operator sycl::ext::oneapi::bfloat16() const;
+#endif
+};
+
+inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) {
+  out << (float)value;
+  return out;
+}
+
+} // namespace c10
+
+#include <c10/util/BFloat16-inl.h> // IWYU pragma: keep
+#pragma once
+
+// Defines the bfloat16 type (brain floating-point). This representation uses
+// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
+
+#include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <ostream>
+
+#if defined(__CUDACC__) && !defined(USE_ROCM)
+#include <cuda_bf16.h>
+#endif
+
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#elif defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+
+namespace c10 {
+
+namespace detail {
+inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) {
+  float res = 0;
+  uint32_t tmp = src;
+  tmp <<= 16;
+
+#if defined(USE_ROCM) && defined(__HIPCC__)
+  float* tempRes;
+
+  // We should be using memcpy in order to respect the strict aliasing rule
+  // but it fails in the HIP environment.
+  tempRes = reinterpret_cast<float*>(&tmp);
+  res = *tempRes;
+#else
+  std::memcpy(&res, &tmp, sizeof(tmp));
+#endif
 
-    F32 = src;
+  return res;
+}
+
+inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) {
+  uint32_t res = 0;
+
+#if defined(USE_ROCM) && defined(__HIPCC__)
+  // We should be using memcpy in order to respect the strict aliasing rule
+  // but it fails in the HIP environment.
+  uint32_t* tempRes = reinterpret_cast<uint32_t*>(&src);
+  res = *tempRes;
+#else
+  std::memcpy(&res, &src, sizeof(res));
+#endif
+
+  return res >> 16;
+}
+
+inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
+#if defined(USE_ROCM) && defined(__HIPCC__)
+  if (src != src) {
+#elif defined(_MSC_VER)
+  if (isnan(src)) {
+#else
+  if (std::isnan(src)) {
+#endif
+    return UINT16_C(0x7FC0);
+  } else {
+    const uint32_t U32 = c10::bit_cast<uint32_t>(src);
     uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
     return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
   }
@@ -111,9 +222,7 @@ struct alignas(2) BFloat16 {
 #endif
 };
 
-C10_API inline std::ostream& operator<<(
-    std::ostream& out,
-    const BFloat16& value) {
+inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) {
   out << (float)value;
   return out;
 }
diff --git a/runtime/core/portable_type/c10/c10/util/Half.h b/runtime/core/portable_type/c10/c10/util/Half.h
@@ -414,7 +414,7 @@ struct alignas(2) Half {
 #endif
 };
 
-C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) {
+inline std::ostream& operator<<(std::ostream& out, const Half& value) {
   out << (float)value;
   return out;
 }
diff --git a/runtime/core/portable_type/c10/c10/util/bit_cast.h b/runtime/core/portable_type/c10/c10/util/bit_cast.h
@@ -3,6 +3,8 @@
 #include <cstring>
 #include <type_traits>
 
+#include <c10/macros/Macros.h>
+
 #if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
 #include <bit>
 #define C10_HAVE_STD_BIT_CAST 1
@@ -23,7 +25,7 @@ using std::bit_cast;
 // See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
 // information as well as the source of our implementations.
 template <class To, class From>
-std::enable_if_t<
+C10_HOST_DEVICE std::enable_if_t<
     sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
         std::is_trivially_copyable_v<To>,
     To>

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-7cda4017ddda554752e89069ae205be5e8388f59`
	`1`	`+87b2ac5d1992c9b4f4167517251b99b513c328a8`
Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@ def python_is_compatible():`
`71`	`71`	`#`
`72`	`72`	`# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt`
`73`	`73`	`# by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/`
`74`		`-NIGHTLY_VERSION = "dev20250706"`
	`74`	`+NIGHTLY_VERSION = "dev20250711"`
`75`	`75`
`76`	`76`
`77`	`77`	`def install_requirements(use_pytorch_nightly):`
Original file line number	Diff line number	Diff line change
`@@ -414,7 +414,7 @@ struct alignas(2) Half {`
`414`	`414`	`#endif`
`415`	`415`	`};`
`416`	`416`
`417`		`-C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) {`
	`417`	`+inline std::ostream& operator<<(std::ostream& out, const Half& value) {`
`418`	`418`	`out << (float)value;`
`419`	`419`	`return out;`
`420`	`420`	`}`