@@ -359,14 +359,56 @@ struct ggml_cuda_unroll<1> {
     }
 };
 
+#ifdef GGML_USE_HIP
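+// Helpers for lane-to-lane data movement on AMD GPUs. A DPP (data parallel
+// primitives) modifier lets a VALU instruction read its source operand from
+// another lane of the wavefront; dpp_ctrl selects the movement pattern and
+// row_mask/bank_mask select which lanes participate.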
+template <int dpp_ctrl, typename T, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = true>
+static __device__ __forceinline__ T hip_move_dpp(T old, T v) {
+    return __builtin_bit_cast(
+        T,
+        __builtin_amdgcn_update_dpp(
+            __builtin_bit_cast(int, old),
+            __builtin_bit_cast(int, v),
+            dpp_ctrl,
+            row_mask,
+            bank_mask,
+            bound_ctrl
+        )
+    );
+}
+
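+// ds_swizzle exchanges values between lanes within each group of 32; the lane
+// mapping is encoded in the immediate `mask` operand.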
+template <int mask, typename T>
+static __device__ __forceinline__ T hip_ds_swizzle(T v) {
+    return __builtin_bit_cast(T, __builtin_amdgcn_ds_swizzle(__builtin_bit_cast(int, v), mask));
+}
+#endif // GGML_USE_HIP
+
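+// Butterfly (XOR) shuffle that replaces __shfl_xor_sync in the reductions
+// below: on AMD the power-of-two offsets map to DPP/swizzle instructions,
+// otherwise it forwards to __shfl_xor_sync.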
+template <int width = WARP_SIZE, typename T>
+static __device__ __forceinline__ T ggml_cuda_shfl_xor_sync(T x, int offset) {
+#if defined(GGML_USE_HIP)
+    static T old;
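+    // `old` is only a dummy destination: with bound_ctrl = true, lanes with an
+    // invalid source read 0 rather than `old`, so its value is never observed.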
+
+    // clang (v20) will not unroll loops with just the plain `offset` in the switch
+    switch (~offset) {
+        // subgroups (width) should not make a difference for a butterfly shuffle pattern
+        case ~1:  return hip_move_dpp<0x160 + 1>(old, x); // row_xor_mask: offset
+        case ~2:  return hip_move_dpp<0x160 + 2>(old, x);
+        case ~4:  return hip_move_dpp<0x160 + 4>(old, x);
+        case ~8:  return hip_move_dpp<0x160 + 8>(old, x);
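+        // bit-masks-mode swizzle 0x401f: and_mask = 0x1f, or_mask = 0, xor_mask = 0x10,
+        // i.e. lane ^ 16 (a DPP row spans only 16 lanes, hence the swizzle here)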
+        case ~16: return hip_ds_swizzle<0x401f>(x); // swap neighboring groups of 16
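+        // remaining offsets (e.g. 32 on wave64) cross the 32-wide groups that
+        // ds_swizzle can address, so fall back to the generic shuffle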
+        default:  return __shfl_xor(x, offset, width);
+    }
+#else
+    return __shfl_xor_sync(0xffffffff, x, offset, width);
+#endif // GGML_USE_HIP
+}
+
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+        x += ggml_cuda_shfl_xor_sync<width>(x, offset);
     }
     return x;
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -376,7 +418,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+        x += ggml_cuda_shfl_xor_sync<width>(x, offset);
     }
     return x;
 }
@@ -385,8 +427,8 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
+        a.x += ggml_cuda_shfl_xor_sync<width>(a.x, offset);
+        a.y += ggml_cuda_shfl_xor_sync<width>(a.y, offset);
     }
     return a;
 }
@@ -396,7 +438,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
+        a = __hadd2(a, ggml_cuda_shfl_xor_sync<width>(a, offset));
     }
     return a;
 
@@ -413,7 +455,7 @@ static __device__ __forceinline__ int warp_reduce_all(int x) {
     } else {
 #pragma unroll
         for (int offset = width/2; offset > 0; offset >>= 1) {
-            x = __shfl_xor_sync(0xffffffff, x, offset, width) && x;
+            x = ggml_cuda_shfl_xor_sync<width>(x, offset) && x;
         }
         return x;
     }
@@ -426,7 +468,7 @@ static __device__ __forceinline__ int warp_reduce_any(int x) {
     } else {
 #pragma unroll
         for (int offset = width/2; offset > 0; offset >>= 1) {
-            x = __shfl_xor_sync(0xffffffff, x, offset, width) || x;
+            x = ggml_cuda_shfl_xor_sync<width>(x, offset) || x;
         }
         return x;
     }
@@ -436,7 +478,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+        x = fmaxf(x, ggml_cuda_shfl_xor_sync<width>(x, offset));
     }
     return x;
 }
@@ -475,7 +517,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+        x = ggml_cuda_hmax2(x, ggml_cuda_shfl_xor_sync<width>(x, offset));
     }
     return x;
 #else