 #include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIP)
 
-#ifndef GGML_CUDA_WARP_MASK
-#define GGML_CUDA_WARP_MASK 0xffffffff
-#endif // GGML_CUDA_WARP_MASK
-
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
 
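
The STRINGIZE pair kept by this hunk is the standard two-level stringization idiom: the indirection through `STRINGIZE_IMPL` forces macro arguments to be expanded before `#` turns them into a string literal. A minimal sketch of the behavior (the `WIDTH` macro is illustrative, not from this file):

```cuda
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...)      STRINGIZE_IMPL(__VA_ARGS__)

#define WIDTH 32
// STRINGIZE_IMPL(WIDTH) expands to "WIDTH": '#' stringizes the raw argument.
// STRINGIZE(WIDTH)      expands to "32":    WIDTH is expanded first, then stringized.
```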
@@ -379,22 +375,22 @@ struct ggml_cuda_unroll<1> {
 
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
-    return __reduce_add_sync(GGML_CUDA_WARP_MASK, x);
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+    return __reduce_add_sync(0xFFFFFFFF, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
+        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, width);
     }
     return x;
-#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
+        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, width);
     }
     return x;
 }
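
Both overloads above use the same XOR butterfly: at each step, every lane exchanges its value with the lane whose index differs in one bit (`lane ^ offset`), so after log2(width) steps every lane holds the full sum and no final broadcast is needed. The `0xFFFFFFFF` mask tells the `__*_sync` intrinsics that all 32 lanes of the warp participate; on Ampere and newer CUDA devices the `int` overload instead uses the hardware `__reduce_add_sync` reduction. A minimal usage sketch, assuming one warp per block (kernel and buffer names are illustrative, not from this patch):

```cuda
// Hypothetical caller: each of the 32 lanes loads one element, the butterfly
// reduction leaves the row total in every lane, and lane 0 writes it out.
__global__ void row_sum(const float * x, float * out) {
    const float v   = x[blockIdx.x*WARP_SIZE + threadIdx.x];
    const float sum = warp_reduce_sum(v); // offsets 16, 8, 4, 2, 1
    if (threadIdx.x == 0) {
        out[blockIdx.x] = sum;
    }
}
```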
@@ -403,8 +399,8 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, a.x, offset, width);
-        a.y += __shfl_xor_sync(GGML_CUDA_WARP_MASK, a.y, offset, width);
+        a.x += __shfl_xor_sync(0xFFFFFFFF, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xFFFFFFFF, a.y, offset, width);
     }
     return a;
 }
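
The `float2` overload runs two independent butterflies, one per component; nothing crosses between `.x` and `.y`. A common reason to want this (a sketch under that assumption, not code from this patch) is carrying a sum and a sum of squares through one reduction, e.g. for a mean/variance pass:

```cuda
// Illustrative helper: pack two accumulators into one float2 so a single
// warp_reduce_sum call reduces both (sum in .x, sum of squares in .y).
static __device__ float2 warp_sum_and_sumsq(float v) {
    float2 acc = make_float2(v, v*v);
    return warp_reduce_sum(acc);
}
```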
@@ -414,7 +410,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(GGML_CUDA_WARP_MASK, a, offset, width));
+        a = __hadd2(a, __shfl_xor_sync(0xFFFFFFFF, a, offset, width));
     }
     return a;
 
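
In the FP16 path above, `__hadd2` adds the two packed `half` lanes element-wise, so the `half2` reduction likewise keeps the low and high 16-bit channels independent. If a single scalar is wanted, the channels still have to be folded afterwards; a hedged sketch (helper name is illustrative):

```cuda
// Illustrative: reduce a packed half2 across the warp, then combine the two
// independent 16-bit sums with a final __hadd. Needs the same FP16 support
// as the overload above.
static __device__ half warp_reduce_sum_scalar(half2 v) {
    const half2 s = warp_reduce_sum(v);
    return __hadd(__low2half(s), __high2half(s));
}
```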
@@ -449,20 +445,20 @@ static __device__ __forceinline__ int warp_reduce_all(int x) {
 #ifdef GGML_USE_HIP
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = x && __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
+        x = x && __shfl_xor_sync(0xFFFFFFFF, x, offset, width);
     }
     return x;
 #else
     static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
-    return __all_sync(GGML_CUDA_WARP_MASK, x);
+    return __all_sync(0xFFFFFFFF, x);
 #endif // GGML_USE_HIP
 }
 
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width));
+        x = fmaxf(x, __shfl_xor_sync(0xFFFFFFFF, x, offset, width));
     }
     return x;
 }
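
`warp_reduce_max` is the same butterfly with `fmaxf` as the combiner, so every lane ends up with the warp-wide maximum. The classic consumer is a numerically stable softmax, sketched below for the case where each lane holds one logit (function name is illustrative, not from this patch):

```cuda
// Illustrative per-warp softmax step: subtracting the warp max before expf
// keeps the exponent <= 0 so it cannot overflow; both reductions leave their
// result in every lane, so no extra broadcast is needed.
static __device__ float warp_softmax(float logit) {
    const float m = warp_reduce_max(logit);
    const float e = expf(logit - m);
    return e / warp_reduce_sum(e);
}
```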
@@ -505,7 +501,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width));
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xFFFFFFFF, x, offset, width));
     }
     return x;
 #else