@@ -359,14 +359,56 @@ struct ggml_cuda_unroll<1> {
     }
 };
 
+#ifdef GGML_USE_HIP
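+// Helpers for lane-to-lane data movement on AMD GPUs. A DPP (data parallel
+// primitives) modifier lets a VALU instruction read its source operand from
+// another lane of the wavefront; dpp_ctrl selects the movement pattern and
+// row_mask/bank_mask select which lanes participate.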
+template <int dpp_ctrl, typename T, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = true>
+static __device__ __forceinline__ T hip_move_dpp(T old, T v) {
+    return __builtin_bit_cast(
+        T,
+        __builtin_amdgcn_update_dpp(
+            __builtin_bit_cast(int, old),
+            __builtin_bit_cast(int, v),
+            dpp_ctrl,
+            row_mask,
+            bank_mask,
+            bound_ctrl
+        )
+    );
+}
+
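+// ds_swizzle exchanges values between lanes within each group of 32; the lane
+// mapping is encoded in the immediate `mask` operand.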
+template <int mask, typename T>
+static __device__ __forceinline__ T hip_ds_swizzle(T v) {
+    return __builtin_bit_cast(T, __builtin_amdgcn_ds_swizzle(__builtin_bit_cast(int, v), mask));
+}
+#endif // GGML_USE_HIP
+
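+// Butterfly (XOR) shuffle that replaces __shfl_xor_sync in the reductions
+// below: on AMD the power-of-two offsets map to DPP/swizzle instructions,
+// otherwise it forwards to __shfl_xor_sync.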
+template <int width = WARP_SIZE, typename T>
+static __device__ __forceinline__ T ggml_cuda_shfl_xor_sync(T x, int offset) {
+#if defined(GGML_USE_HIP)
+    static T old;
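+    // `old` is only a dummy destination: with bound_ctrl = true, lanes with an
+    // invalid source read 0 rather than `old`, so its value is never observed.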
+
+    // clang (v20) will not unroll loops with just the plain `offset` in the switch
+    switch (~offset) {
+        // subgroups (width) should not make a difference for a butterfly shuffle pattern
+        case ~1:  return hip_move_dpp<0x160 + 1>(old, x); // row_xor_mask: offset
+        case ~2:  return hip_move_dpp<0x160 + 2>(old, x);
+        case ~4:  return hip_move_dpp<0x160 + 4>(old, x);
+        case ~8:  return hip_move_dpp<0x160 + 8>(old, x);
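+        // bit-masks-mode swizzle 0x401f: and_mask = 0x1f, or_mask = 0, xor_mask = 0x10,
+        // i.e. lane ^ 16 (a DPP row spans only 16 lanes, hence the swizzle here)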
+        case ~16: return hip_ds_swizzle<0x401f>(x); // swap neighboring groups of 16
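+        // remaining offsets (e.g. 32 on wave64) cross the 32-wide groups that
+        // ds_swizzle can address, so fall back to the generic shuffle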
+        default:  return __shfl_xor(x, offset, width);
+    }
+#else
+    return __shfl_xor_sync(0xffffffff, x, offset, width);
+#endif // GGML_USE_HIP
+}
+
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+        x += ggml_cuda_shfl_xor_sync<width>(x, offset);
     }
     return x;
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -376,7 +418,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+        x += ggml_cuda_shfl_xor_sync<width>(x, offset);
     }
     return x;
 }
@@ -385,8 +427,8 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
+        a.x += ggml_cuda_shfl_xor_sync<width>(a.x, offset);
+        a.y += ggml_cuda_shfl_xor_sync<width>(a.y, offset);
     }
     return a;
 }
@@ -396,7 +438,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
+        a = __hadd2(a, ggml_cuda_shfl_xor_sync<width>(a, offset));
     }
     return a;
 
@@ -413,7 +455,7 @@ static __device__ __forceinline__ int warp_reduce_all(int x) {
     } else {
 #pragma unroll
         for (int offset = width/2; offset > 0; offset >>= 1) {
-            x = __shfl_xor_sync(0xffffffff, x, offset, width) && x;
+            x = ggml_cuda_shfl_xor_sync<width>(x, offset) && x;
         }
         return x;
     }
@@ -426,7 +468,7 @@ static __device__ __forceinline__ int warp_reduce_any(int x) {
     } else {
 #pragma unroll
         for (int offset = width/2; offset > 0; offset >>= 1) {
-            x = __shfl_xor_sync(0xffffffff, x, offset, width) || x;
+            x = ggml_cuda_shfl_xor_sync<width>(x, offset) || x;
         }
         return x;
     }
@@ -436,7 +478,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+        x = fmaxf(x, ggml_cuda_shfl_xor_sync<width>(x, offset));
     }
     return x;
 }
@@ -475,7 +517,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+        x = ggml_cuda_hmax2(x, ggml_cuda_shfl_xor_sync<width>(x, offset));
     }
     return x;
 #else