CUDA/HIP: add support for selectable warp size to mmv

IMbackK · IMbackK · commit f5dd31f4a947 · 2025-01-30T18:40:37.000+01:00
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu
@@ -18,8 +18,8 @@ static __global__ void mul_mat_vec(
     extern __shared__ char data_mmv[];
     float * buf_iw = (float *) data_mmv;
 
-    if (block_size > WARP_SIZE) {
-        if (tid < WARP_SIZE) {
+    if (block_size > GGML_TRUE_WARP_SIZE) {
+        if (tid < GGML_TRUE_WARP_SIZE) {
             buf_iw[tid] = 0.0f;
         }
         __syncthreads();
@@ -67,16 +67,16 @@ static __global__ void mul_mat_vec(
         static_assert(std::is_same<T, void>::value, "unsupported type");
     }
 
-    sumf = warp_reduce_sum(sumf);
+    sumf = warp_reduce_sum<GGML_TRUE_WARP_SIZE>(sumf);
 
-    if (block_size > WARP_SIZE) {
-        buf_iw[tid/WARP_SIZE] = sumf;
+    if (block_size > GGML_TRUE_WARP_SIZE) {
+        buf_iw[tid/GGML_TRUE_WARP_SIZE] = sumf;
         __syncthreads();
-        if (tid >= WARP_SIZE) {
+        if (tid >= GGML_TRUE_WARP_SIZE) {
             return;
         }
         sumf = buf_iw[tid];
-        sumf = warp_reduce_sum(sumf);
+        sumf = warp_reduce_sum<GGML_TRUE_WARP_SIZE>(sumf);
     }
 
     if (tid != 0) {
@@ -96,18 +96,27 @@ static void launch_mul_mat_vec_cuda(
     GGML_ASSERT(stride_row % 2 == 0);
     GGML_ASSERT(nchannels_y % nchannels_x == 0);
     const int64_t channel_ratio = nchannels_y / nchannels_x;
+    int device;
+    int warp_size;
 
-    int64_t block_size_best = WARP_SIZE;
-    int64_t niter_best      = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
-    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
+    CUDA_CHECK(cudaGetDevice(&device));
+    warp_size = ggml_cuda_info().devices[device].warp_size;
+
+    int64_t block_size_best = warp_size;
+    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
+    int64_t max_block_size  = 256;
+    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
+        max_block_size = 128;
+    }
+    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
         const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
         if (niter < niter_best) {
             niter_best      = niter;
             block_size_best = block_size;
         }
     }
 
-    const int smem = WARP_SIZE*sizeof(float);
+    const int smem = warp_size*sizeof(float);
     const dim3 block_nums(nrows, 1, nchannels_y);
     const dim3 block_dims(block_size_best, 1, 1);
     switch (block_size_best) {
diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -13,3 +13,5 @@
 #define CUBLAS_COMPUTE_32F CUDA_R_32F
 #define cublasComputeType_t cudaDataType_t
 #endif // CUDART_VERSION < 11020
+
+#define GGML_TRUE_WARP_SIZE 32 // Compile-time warp size for this vendor header (fixed at 32). Only use this in device code
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
@@ -8,6 +9,7 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
@@ -137,6 +139,8 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 
+#define GGML_TRUE_WARP_SIZE __AMDGCN_WAVEFRONT_SIZE // Compile-time warp size, forwarded from __AMDGCN_WAVEFRONT_SIZE (presumably the wavefront size of the GPU target being compiled - TODO confirm). Only use this in device code
+
 #define __CUDA_ARCH__ 1300
 
 #if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
@@ -135,3 +135,5 @@
 #define cudaStreamEndCapture musaStreamEndCapture
 
 typedef mt_bfloat16 nv_bfloat16;
+
+#define GGML_TRUE_WARP_SIZE 32 // Compile-time warp size for this vendor header (fixed at 32). Only use this in device code