@@ -375,22 +375,22 @@ struct ggml_cuda_unroll<1> {

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
-    return __reduce_add_sync(GGML_WARP_SYNC_MASK, x);
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+    return __reduce_add_sync(0xFFFFFFFF, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, width);
     }
     return x;
-#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, width);
     }
     return x;
 }
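Context for the hunk above: `0xFFFFFFFF` is the full-warp participation mask, so all 32 lanes take part in each sync. On Ampere and newer (`__CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE`) the integer sum maps to the single-instruction `__reduce_add_sync`; everywhere else it falls back to an XOR-shuffle butterfly: log2(width) steps with offsets width/2, ..., 2, 1, after which every lane holds the total. A minimal standalone sketch of that fallback, assuming a full 32-lane warp (the kernel and helper names here are illustrative, not from the patch):

```cuda
// Minimal sketch of the shuffle fallback, assuming width == 32.
// Kernel/helper names are illustrative, not part of the patch.
#include <cstdio>

__device__ __forceinline__ int warp_sum_sketch(int x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // Lane i exchanges with lane i ^ offset; after 5 steps every
        // lane holds the sum over all 32 lanes.
        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, 32);
    }
    return x;
}

__global__ void sum_kernel() {
    const int lane  = (int) threadIdx.x;       // one warp: lanes 0..31
    const int total = warp_sum_sketch(lane);
    if (lane == 0) {
        printf("sum of 0..31 = %d\n", total);  // expect 496
    }
}

int main() {
    sum_kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}
```

Each step pairs lane i with lane i ^ offset, so the coverage of every partial sum doubles per step; for a width of 32 that is five shuffles in total.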
@@ -399,8 +399,8 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, a.x, offset, width);
-        a.y += __shfl_xor_sync(GGML_WARP_SYNC_MASK, a.y, offset, width);
+        a.x += __shfl_xor_sync(0xFFFFFFFF, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xFFFFFFFF, a.y, offset, width);
     }
     return a;
 }
@@ -410,7 +410,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(GGML_WARP_SYNC_MASK, a, offset, width));
+        a = __hadd2(a, __shfl_xor_sync(0xFFFFFFFF, a, offset, width));
     }
     return a;

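The `half2` overload above runs the same butterfly but combines with `__hadd2`, packed fp16 addition, so two independent half-precision sums travel in one 32-bit register. A minimal sketch, assuming fp16 arithmetic is available (cc 5.3+; the kernel name is illustrative):

```cuda
// Sketch of the packed-half butterfly; requires fp16 arithmetic (cc 5.3+).
// Kernel name is illustrative, not part of the patch.
#include <cuda_fp16.h>
#include <cstdio>

__global__ void half2_sum_kernel() {
    const int lane = (int) threadIdx.x;
    // Two independent sums packed in one register: lane in the low half,
    // 2*lane in the high half.
    half2 a = __floats2half2_rn((float) lane, (float) (2*lane));
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xFFFFFFFF, a, offset, 32));
    }
    if (lane == 0) {
        // Expect 496 and 992; both are exactly representable in fp16.
        printf("%g %g\n", __low2float(a), __high2float(a));
    }
}

int main() {
    half2_sum_kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}
```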
@@ -445,20 +445,20 @@ static __device__ __forceinline__ int warp_reduce_all(int x) {
 #ifdef GGML_USE_HIP
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = x && __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x = x && __shfl_xor_sync(0xFFFFFFFF, x, offset, width);
     }
     return x;
 #else
     static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
-    return __all_sync(GGML_WARP_SYNC_MASK, x);
+    return __all_sync(0xFFFFFFFF, x);
 #endif // GGML_USE_HIP
 }

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width));
+        x = fmaxf(x, __shfl_xor_sync(0xFFFFFFFF, x, offset, width));
     }
     return x;
 }
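On HIP, `warp_reduce_all` reuses the butterfly with logical AND as the combiner; on CUDA it defers to the hardware vote `__all_sync`, which returns non-zero only if the predicate is non-zero on every lane in the mask. `warp_reduce_max` is the same butterfly with `fmaxf`. A small sketch of both, again with a full-warp mask (the kernel name is illustrative):

```cuda
// Sketch contrasting the fmaxf butterfly with the __all_sync vote.
// Kernel name is illustrative, not part of the patch.
#include <cstdio>

__global__ void max_and_all_kernel() {
    const int lane = (int) threadIdx.x;
    float x = (float) lane;
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // Same butterfly as the sums, with fmaxf as the combiner.
        x = fmaxf(x, __shfl_xor_sync(0xFFFFFFFF, x, offset, 32));
    }
    // Hardware vote: non-zero iff the predicate holds on every lane.
    const int agree = __all_sync(0xFFFFFFFF, x == 31.0f);
    if (lane == 0) {
        printf("max = %g, all lanes agree: %d\n", x, agree);  // 31, 1
    }
}

int main() {
    max_and_all_kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}
```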
@@ -501,7 +501,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width));
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xFFFFFFFF, x, offset, width));
     }
     return x;
 #else