Commit c278a68

Revert "HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and shapes (ggml-org#14949)"
This reverts commit ad4a700.
1 parent 70a81cd · commit c278a68

File tree (3 files changed: +10 -25 lines changed)

  ggml/src/ggml-cuda/common.cuh
  ggml/src/ggml-cuda/mmq.cu
  ggml/src/ggml-cuda/mmq.cuh
ggml/src/ggml-cuda/common.cuh (4 additions, 3 deletions)
@@ -231,9 +231,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

-#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)

 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -297,9 +297,10 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }

+// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
 static bool amd_mfma_available(const int cc) {
 #if !defined(GGML_HIP_NO_MMQ_MFMA)
-    return GGML_CUDA_CC_IS_CDNA(cc);
+    return GGML_CUDA_CC_IS_CDNA3(cc);
 #else
     return false;
 #endif //!defined(GGML_HIP_NO_MMQ_MFMA)
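
Taken together, the two hunks narrow both the compile-time gate (AMD_MFMA_AVAILABLE) and the runtime check (amd_mfma_available) from all CDNA generations back to CDNA3 only. A minimal standalone sketch of the resulting runtime behavior follows; the amd_arch enum and the gfx-name-to-generation mapping are assumptions made for this example, not repo code:

    // Illustrative sketch only: after this revert, MFMA MMQ is reported
    // available solely on CDNA3 (e.g. gfx942 / MI300-class); gfx908 (CDNA1)
    // and gfx90a (CDNA2) no longer take the MFMA path for any type or shape.
    #include <cstdio>

    enum class amd_arch { gfx908_cdna1, gfx90a_cdna2, gfx942_cdna3 };

    static bool amd_mfma_available_after_revert(amd_arch a) {
    #if !defined(GGML_HIP_NO_MMQ_MFMA)       // build-time opt-out, as in common.cuh
        return a == amd_arch::gfx942_cdna3;  // CDNA3 only, per this commit
    #else
        return false;
    #endif
    }

    int main() {
        printf("gfx908 -> %d\n", (int) amd_mfma_available_after_revert(amd_arch::gfx908_cdna1)); // 0
        printf("gfx90a -> %d\n", (int) amd_mfma_available_after_revert(amd_arch::gfx90a_cdna2)); // 0
        printf("gfx942 -> %d\n", (int) amd_mfma_available_after_revert(amd_arch::gfx942_cdna3)); // 1
        return 0;
    }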

ggml/src/ggml-cuda/mmq.cu (4 additions, 20 deletions)
@@ -109,8 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3 = dst->nb[3] / ts_dst;

-    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || GGML_CUDA_CC_IS_CDNA(cc);
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));

     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -252,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
     const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || GGML_CUDA_CC_IS_CDNA(cc))
+        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
         && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
@@ -308,7 +308,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return false;
     }

-    if (new_mma_available(cc)) {
+    if (new_mma_available(cc) || amd_mfma_available(cc)) {
         return true;
     }

@@ -324,21 +324,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }

-    if (amd_mfma_available(cc)) {
-        // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
-        // performs better but is currently suffering from a crash on this architecture.
-        // TODO: Revisit when hipblaslt is fixed on CDNA3
-        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
-            return true;
-        }
-        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
-            return true;
-        }
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
-            return true;
-        }
-        return false;
-    }
-
     return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
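
The bulk of the removal above is the shape and datatype heuristic that ggml-org#14949 had introduced for pre-CDNA3 GPUs. Restated as a self-contained sketch for readability (the qtype enum and the function name are stand-ins invented for this example; the real code uses ggml_type and amd_mfma_available):

    // Simplified restatement of the heuristic deleted by this revert. On
    // gfx908/gfx90a it enabled MFMA MMQ only for small batches (ne11) or the
    // listed quant types; after the revert those GPUs never reach this path.
    #include <cstdint>
    #include <cstdio>

    enum class qtype { q4_0, q4_1, q5_0, q5_1, q4_k, q5_k, other };  // stand-in for ggml_type

    static bool removed_mfma_heuristic(bool is_cdna3, qtype t, int64_t ne11) {
        if (is_cdna3) {
            return true;  // CDNA3: always MMQ (rocblas/hipblaslt issues, per the deleted comment)
        }
        if (ne11 <= 128 || t == qtype::q4_0 || t == qtype::q4_1 ||
                           t == qtype::q5_0 || t == qtype::q5_1) {
            return true;  // small batch, or 4/5-bit legacy quants at any batch size
        }
        if (ne11 <= 256 && (t == qtype::q4_k || t == qtype::q5_k)) {
            return true;  // medium batch with Q4_K/Q5_K
        }
        return false;     // otherwise fall back to the non-MFMA paths
    }

    int main() {
        printf("%d\n", (int) removed_mfma_heuristic(false, qtype::q4_k,  256));  // 1
        printf("%d\n", (int) removed_mfma_heuristic(false, qtype::other, 512));  // 0
        return 0;
    }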

ggml/src/ggml-cuda/mmq.cuh (2 additions, 2 deletions)
@@ -3097,8 +3097,8 @@ static __global__ void mul_mat_q(
     }
     __syncthreads();

-    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
+#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;
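
The #if above picks between the two scheduling schemes named in the comment. Conventional tiling computes each output tile end-to-end in one thread block; stream-k instead splits the total inner-loop work evenly across the resident blocks, so a tile's partial results can span blocks and require the fixup pass mentioned in mmq.cu. A toy 1-D sketch of the difference, with assumed numbers rather than the kernel's real 2-D, batched partitioning:

    // Toy contrast of conventional tiling vs. stream-k partitioning.
    // All numbers here are assumptions for illustration.
    #include <cstdio>

    int main() {
        const int ntiles         = 10;  // output tiles to compute
        const int nblocks        = 4;   // resident thread blocks (one per SM/CU)
        const int iters_per_tile = 8;   // inner-loop iterations per tile

        // Conventional tiling: tile t is owned end-to-end by a single block.
        for (int t = 0; t < ntiles; ++t) {
            printf("tile %2d -> block %d (whole tile)\n", t, t % nblocks);
        }

        // Stream-k: iterations are divided evenly, so a block boundary can
        // fall inside a tile; such tiles are finished by a separate fixup pass.
        const int total = ntiles * iters_per_tile;
        for (int b = 0; b < nblocks; ++b) {
            const int lo = ( total * b      ) / nblocks;
            const int hi = ( total * (b + 1)) / nblocks;
            printf("block %d -> iterations [%2d, %2d), tiles %d..%d\n",
                   b, lo, hi, lo / iters_per_tile, (hi - 1) / iters_per_tile);
        }
        return 0;
    }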
