Commit ad4a700

HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and shapes (#14949)
1 parent e32a4ec commit ad4a700

3 files changed: +25 / -10 lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 3 additions & 4 deletions
@@ -227,9 +227,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
-#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -293,10 +293,9 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }
 
-// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
 static bool amd_mfma_available(const int cc) {
 #if !defined(GGML_HIP_NO_MMQ_MFMA)
-    return GGML_CUDA_CC_IS_CDNA3(cc);
+    return GGML_CUDA_CC_IS_CDNA(cc);
 #else
     return false;
 #endif //!defined(GGML_HIP_NO_MMQ_MFMA)
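
The widened gate is what brings gfx908 (CDNA1, MI100) and gfx90a (CDNA2, MI200 series) into the MFMA MMQ path alongside CDNA3. As a quick way to see which devices the new CDNA-wide gate matches at runtime, here is a small host-side HIP probe; it is a sketch for illustration only, and the gcnArchName prefix check is mine, not part of this commit:

// Sketch: report which devices would pass the CDNA-wide MFMA gate.
// Build with: hipcc probe.cpp -o probe
#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstring>

int main() {
    int ndev = 0;
    if (hipGetDeviceCount(&ndev) != hipSuccess) {
        return 1;
    }
    for (int i = 0; i < ndev; ++i) {
        hipDeviceProp_t prop;
        if (hipGetDeviceProperties(&prop, i) != hipSuccess) {
            continue;
        }
        // gcnArchName looks like "gfx90a:sramecc+:xnack-".
        // CDNA1 = gfx908, CDNA2 = gfx90a, CDNA3 = gfx94x; all now pass the gate.
        const bool is_cdna = strncmp(prop.gcnArchName, "gfx908", 6) == 0 ||
                             strncmp(prop.gcnArchName, "gfx90a", 6) == 0 ||
                             strncmp(prop.gcnArchName, "gfx94",  5) == 0;
        printf("device %d: %s -> MFMA MMQ gate %s\n", i, prop.gcnArchName,
               is_cdna ? "enabled" : "disabled");
    }
    return 0;
}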

ggml/src/ggml-cuda/mmq.cu

Lines changed: 20 additions & 4 deletions
@@ -109,8 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3 = dst->nb[3] / ts_dst;
 
-    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc);
 
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -252,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
     const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+        || GGML_CUDA_CC_IS_CDNA(cc))
         && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
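
Taken together, the two hunks above widen stream-k from CDNA3-only to every CDNA part (the GGML_CUDA_CC_IS_AMD conjunct is dropped, presumably because GGML_CUDA_CC_IS_CDNA already implies an AMD device), while the per-op path still suppresses it whenever the op covers only part of the batch, since parallel streams would race on the shared fixup buffer. A condensed restatement with stand-in booleans, not ggml's actual API:

#include <cstdint>

// Sketch only: the real code derives both flags from the device's compute capability.
static bool use_stream_k(bool nvidia_volta_or_newer, bool is_cdna,
                         int64_t src1_ncols, int64_t ne11) {
    // src1_ncols == ne11 means this op sees the whole batch; partial batches run
    // on multiple CUDA streams that would race on the shared stream-k fixup buffer.
    return (nvidia_volta_or_newer || is_cdna) && src1_ncols == ne11;
}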
@@ -306,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return false;
     }
 
-    if (new_mma_available(cc) || amd_mfma_available(cc)) {
+    if (new_mma_available(cc)) {
         return true;
     }
 
@@ -322,5 +322,21 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
+    if (amd_mfma_available(cc)) {
+        // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
+        // performs better but is currently suffering from a crash on this architecture.
+        // TODO: Revisit when hipblaslt is fixed on CDNA3
+        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
+            return true;
+        }
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+        return false;
+    }
+
     return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
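
The new amd_mfma_available branch is where the "select datatypes and shapes" from the commit title live: CDNA3 always prefers MMQ (per the comment, rocblas/tensile is slow there and hipblaslt currently crashes), while gfx908/gfx90a use it only for small batches or for the listed 4/5-bit types. A standalone restatement with a few probe points; the qtype enum and harness are hypothetical, written only to exercise the cutoffs:

#include <cstdint>
#include <cstdio>

enum class qtype { q4_0, q4_1, q5_0, q5_1, q8_0, q4_K, q5_K, q6_K }; // illustrative subset

// Mirrors the MFMA branch above for a device where amd_mfma_available(cc) is true.
static bool use_mfma_mmq(bool is_cdna3, qtype type, int64_t ne11) {
    if (is_cdna3) {
        return true; // CDNA3: MMQ always beats the BLAS fallback for now
    }
    // gfx908/gfx90a: small batch, or legacy 4/5-bit quants at any batch size
    if (ne11 <= 128 || type == qtype::q4_0 || type == qtype::q4_1 ||
        type == qtype::q5_0 || type == qtype::q5_1) {
        return true;
    }
    // 4/5-bit K-quants get a higher batch-size cutoff
    if (ne11 <= 256 && (type == qtype::q4_K || type == qtype::q5_K)) {
        return true;
    }
    return false; // everything else falls back to dequantize + BLAS
}

int main() {
    printf("%d %d %d\n",
           use_mfma_mmq(false, qtype::q6_K, 512),  // 0: large-batch Q6_K -> BLAS
           use_mfma_mmq(false, qtype::q4_0, 512),  // 1: Q4_0 -> MMQ at any batch
           use_mfma_mmq(false, qtype::q5_K, 256)); // 1: Q5_K within its cutoff
    return 0;
}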

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 2 additions & 2 deletions
@@ -3096,8 +3096,8 @@ static __global__ void mul_mat_q(
     }
     __syncthreads();
 
-    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
+#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;
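
For context on what the relaxed #if buys: with conventional tiling each block owns whole output tiles, so the last wave of a launch can sit partially idle, while stream-k splits the total K-loop iteration count evenly across a fixed grid and accepts that some tiles straddle two blocks and need a fixup/reduction pass afterwards. A toy partitioner (illustrative numbers, not ggml's scheduler) showing where those partial tiles come from:

#include <cstdio>

int main() {
    const int ntiles         = 10; // output tiles of the matrix product
    const int iters_per_tile = 8;  // K-loop iterations needed per tile
    const int nblocks        = 4;  // persistent blocks in the stream-k grid

    // Stream-k: divide ntiles*iters_per_tile iterations evenly over nblocks.
    // (Conventional tiling would instead give block b tiles b, b+nblocks, ...)
    const int total = ntiles * iters_per_tile;
    for (int b = 0; b < nblocks; ++b) {
        const int lo = (total *  b)      / nblocks;
        const int hi = (total * (b + 1)) / nblocks;
        printf("block %d: iters [%2d, %2d) -> tiles %d..%d%s\n",
               b, lo, hi, lo / iters_per_tile, (hi - 1) / iters_per_tile,
               (lo % iters_per_tile || hi % iters_per_tile) ? " (partial tile, needs fixup)" : "");
    }
    return 0;
}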
