@@ -359,14 +359,56 @@ struct ggml_cuda_unroll<1> {
     }
 };
 
+#ifdef GGML_USE_HIP
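+// HIP helpers: wrap the AMD DPP and ds_swizzle builtins, which operate on 32-bit
+// integers, so that values of other 4-byte types can be passed via __builtin_bit_cast.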
+template <int dpp_ctrl, typename T, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = true>
+static __device__ __forceinline__ T hip_move_dpp(T old, T v) {
+    return __builtin_bit_cast(
+        T,
+        __builtin_amdgcn_update_dpp(
+            __builtin_bit_cast(int, old),
+            __builtin_bit_cast(int, v),
+            dpp_ctrl,
+            row_mask,
+            bank_mask,
+            bound_ctrl
+        )
+    );
+}
+
+template <int mask, typename T>
+static __device__ __forceinline__ T hip_ds_swizzle(T v) {
+    return __builtin_bit_cast(T, __builtin_amdgcn_ds_swizzle(__builtin_bit_cast(int, v), mask));
+}
+#endif // GGML_USE_HIP
+
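+// Butterfly (XOR) shuffle across the warp: same semantics as __shfl_xor_sync on CUDA;
+// on HIP, small power-of-two offsets are lowered to DPP/ds_swizzle lane operations.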
+template <int width = WARP_SIZE, typename T>
+static __device__ __forceinline__ T ggml_cuda_shfl_xor_sync(T x, int offset) {
+#if defined(GGML_USE_HIP)
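+    // placeholder passed as the `old` operand required by the DPP builtin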
+    static T old;
+
+    // clang (v20) will not unroll loops with just the plain `offset` in switch
+    switch (~offset) {
+        // subgroups (width) should not make a difference for a butterfly shuffle pattern
+        case ~1:  return hip_move_dpp<0x160 + 1>(old, x); // row_xor_mask: offset
+        case ~2:  return hip_move_dpp<0x160 + 2>(old, x);
+        case ~4:  return hip_move_dpp<0x160 + 4>(old, x);
+        case ~8:  return hip_move_dpp<0x160 + 8>(old, x);
+        case ~16: return hip_ds_swizzle<0x401f>(x);       // swap neighboring groups of 16
+        default:  return __shfl_xor(x, offset, width);
+    }
+#else
+    return __shfl_xor_sync(0xffffffff, x, offset, width);
+#endif // GGML_USE_HIP
+}
+
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+        x += ggml_cuda_shfl_xor_sync<width>(x, offset);
     }
     return x;
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -376,7 +418,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+        x += ggml_cuda_shfl_xor_sync<width>(x, offset);
     }
     return x;
 }
@@ -385,8 +427,8 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
+        a.x += ggml_cuda_shfl_xor_sync<width>(a.x, offset);
+        a.y += ggml_cuda_shfl_xor_sync<width>(a.y, offset);
     }
     return a;
 }
@@ -396,7 +438,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
+        a = __hadd2(a, ggml_cuda_shfl_xor_sync<width>(a, offset));
     }
     return a;
 
@@ -413,7 +455,7 @@ static __device__ __forceinline__ int warp_reduce_all(int x) {
     } else {
 #pragma unroll
         for (int offset = width/2; offset > 0; offset >>= 1) {
-            x = __shfl_xor_sync(0xffffffff, x, offset, width) && x;
+            x = ggml_cuda_shfl_xor_sync<width>(x, offset) && x;
         }
         return x;
     }
@@ -426,7 +468,7 @@ static __device__ __forceinline__ int warp_reduce_any(int x) {
     } else {
 #pragma unroll
         for (int offset = width/2; offset > 0; offset >>= 1) {
-            x = __shfl_xor_sync(0xffffffff, x, offset, width) || x;
+            x = ggml_cuda_shfl_xor_sync<width>(x, offset) || x;
        }
         return x;
     }
@@ -436,7 +478,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+        x = fmaxf(x, ggml_cuda_shfl_xor_sync<width>(x, offset));
     }
     return x;
 }
@@ -475,7 +517,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+        x = ggml_cuda_hmax2(x, ggml_cuda_shfl_xor_sync<width>(x, offset));
     }
     return x;
 #else