|
| 1 | +From c0be238cae714bd5e3ad67554bdf8068026073a1 Mon Sep 17 00:00:00 2001 |
| 2 | +From: oqs <2227-loqs@users.noreply.gitlab.archlinux.org> |
| 3 | +Date: Wed, 6 Aug 2025 19:34:12 +0000 |
| 4 | +Subject: [PATCH 40/40] Support cuda 12.8 |
| 5 | + |
| 6 | +Turn the explicit ThreadLoadVolatilePointer specializations for |
| 7 | +Eigen::half and Eigen::bfloat16 into plain overloads that take a |
| 8 | +pointer-to-const. Both types get the same signature so that neither |
| 9 | +overload is skipped when the caller passes a const pointer. |
| 10 | + |
| 11 | +--- |
| 12 | + tensorflow/core/kernels/gpu_prim.h | 15 ++++++--------- |
| 13 | + 1 file changed, 6 insertions(+), 9 deletions(-) |
| 14 | + |
| 15 | +diff --git a/tensorflow/core/kernels/gpu_prim.h b/tensorflow/core/kernels/gpu_prim.h |
| 16 | +index bef22b50..f80bd54d 100644 |
| 17 | +--- a/tensorflow/core/kernels/gpu_prim.h |
| 18 | ++++ b/tensorflow/core/kernels/gpu_prim.h |
| 19 | +@@ -44,10 +44,9 @@ __device__ __forceinline__ void ThreadStoreVolatilePtr<Eigen::half>( |
| 20 | + Eigen::numext::bit_cast<uint16_t>(val); |
| 21 | + } |
| 22 | + |
| 23 | +-template <> |
| 24 | +-__device__ __forceinline__ Eigen::half ThreadLoadVolatilePointer<Eigen::half>( |
| 25 | +- Eigen::half *ptr, Int2Type<true> /*is_primitive*/) { |
| 26 | +- uint16_t result = *reinterpret_cast<volatile uint16_t *>(ptr); |
| 27 | ++__device__ __forceinline__ Eigen::half ThreadLoadVolatilePointer( |
| 28 | ++ const Eigen::half *ptr, Int2Type<true> /*is_primitive*/) { |
| 29 | ++ uint16_t result = *reinterpret_cast<volatile const uint16_t *>(ptr); |
| 30 | + return Eigen::numext::bit_cast<Eigen::half>(result); |
| 31 | + } |
| 32 | + |
| 33 | +@@ -59,10 +58,8 @@ __device__ __forceinline__ void ThreadStoreVolatilePtr<Eigen::bfloat16>( |
| 34 | + Eigen::numext::bit_cast<uint16_t>(val); |
| 35 | + } |
| 36 | + |
| 37 | +-template <> |
| 38 | +-__device__ __forceinline__ Eigen::bfloat16 |
| 39 | +-ThreadLoadVolatilePointer<Eigen::bfloat16>(Eigen::bfloat16 *ptr, |
| 40 | +- Int2Type<true> /*is_primitive*/) { |
| 41 | +- uint16_t result = *reinterpret_cast<volatile uint16_t *>(ptr); |
| 42 | ++__device__ __forceinline__ Eigen::bfloat16 ThreadLoadVolatilePointer( |
| 43 | ++ const Eigen::bfloat16 *ptr, Int2Type<true> /*is_primitive*/) { |
| 44 | ++ uint16_t result = *reinterpret_cast<volatile const uint16_t *>(ptr); |
| 45 | + return Eigen::numext::bit_cast<Eigen::bfloat16>(result); |
| 46 | + } |
0 commit comments