 #include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIP)
 
-#ifndef GGML_WARP_SYNC_MASK
-#define GGML_WARP_SYNC_MASK 0xffffffff
-#endif
+#ifndef GGML_CUDA_WARP_MASK
+#define GGML_CUDA_WARP_MASK 0xffffffff
+#endif // GGML_CUDA_WARP_MASK
 
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -380,11 +380,11 @@ struct ggml_cuda_unroll<1> {
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
-    return __reduce_add_sync(GGML_WARP_SYNC_MASK, x);
+    return __reduce_add_sync(GGML_CUDA_WARP_MASK, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
     }
     return x;
 #endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
@@ -394,7 +394,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
     }
     return x;
 }
@@ -403,8 +403,8 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, a.x, offset, width);
-        a.y += __shfl_xor_sync(GGML_WARP_SYNC_MASK, a.y, offset, width);
+        a.x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, a.x, offset, width);
+        a.y += __shfl_xor_sync(GGML_CUDA_WARP_MASK, a.y, offset, width);
     }
     return a;
 }
@@ -414,7 +414,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(GGML_WARP_SYNC_MASK, a, offset, width));
+        a = __hadd2(a, __shfl_xor_sync(GGML_CUDA_WARP_MASK, a, offset, width));
     }
     return a;
 
@@ -449,20 +449,20 @@ static __device__ __forceinline__ int warp_reduce_all(int x) {
 #ifdef GGML_USE_HIP
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = x && __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x = x && __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
     }
     return x;
 #else
     static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
-    return __all_sync(GGML_WARP_SYNC_MASK, x);
+    return __all_sync(GGML_CUDA_WARP_MASK, x);
 #endif // GGML_USE_HIP
 }
 
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width));
+        x = fmaxf(x, __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width));
     }
     return x;
 }
@@ -505,7 +505,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width));
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width));
     }
     return x;
 #else