Skip to content

Commit 78a76c7

Browse files
committed
musa: enable fp16 mma (all) and cublas on qy2
Signed-off-by: Xiaodong Ye <[email protected]>
1 parent 73e53dc commit 78a76c7

File tree

3 files changed

+27
-20
lines changed

3 files changed

+27
-20
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,9 @@
7676
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
7777

7878
// Moore Threads
79-
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
80-
81-
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
82-
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
83-
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
79+
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
80+
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
81+
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
8482

8583
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
8684
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
@@ -211,6 +209,10 @@ typedef float2 dfloat2;
211209
#define FP16_MMA_AVAILABLE
212210
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
213211

212+
#if defined(GGML_USE_MUSA)
213+
#define FP16_MMA_AVAILABLE
214+
#endif // defined(GGML_USE_MUSA)
215+
214216
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
215217
#define NEW_MMA_AVAILABLE
216218
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -219,9 +221,9 @@ typedef float2 dfloat2;
219221
#define CP_ASYNC_AVAILABLE
220222
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
221223

222-
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
224+
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
223225
#define FLASH_ATTN_AVAILABLE
224-
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
226+
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
225227

226228
static bool fp16_available(const int cc) {
227229
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
@@ -233,7 +235,8 @@ static bool fast_fp16_available(const int cc) {
233235

234236
// To be used for feature selection of external libraries, e.g. cuBLAS.
235237
static bool fast_fp16_hardware_available(const int cc) {
236-
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
238+
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
239+
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
237240
}
238241

239242
// Any FP16 tensor core instructions are available for ggml code.
@@ -242,7 +245,8 @@ static bool fp16_mma_available(const int cc) {
242245
return false;
243246
#else
244247
if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
245-
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
248+
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
249+
GGML_CUDA_CC_IS_MTHREADS(cc)) {
246250
return true;
247251
} else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
248252
#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
@@ -259,7 +263,8 @@ static bool fp16_mma_available(const int cc) {
259263
// To be used for feature selection of external libraries, e.g. cuBLAS.
260264
static bool fp16_mma_hardware_available(const int cc) {
261265
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
262-
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
266+
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
267+
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
263268
}
264269

265270
static bool bf16_mma_hardware_available(const int cc) {

ggml/src/ggml-cuda/fattn-wmma-f16.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
#ifdef FP16_MMA_AVAILABLE
1010
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
1111
#include <mma.h>
12+
#ifdef GGML_USE_MUSA
13+
namespace wmma = mtmusa::wmma;
14+
#else // GGML_USE_MUSA
1215
namespace wmma = nvcuda::wmma;
16+
#endif // GGML_USE_MUSA
1317
#elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
1418
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
1519
#include <rocwmma/rocwmma.hpp>

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1229,7 +1229,7 @@ static void ggml_cuda_op_mul_mat_cublas(
12291229

12301230
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
12311231

1232-
if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
1232+
if ((GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) || (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2)) && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
12331233
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
12341234
if (src1->type != GGML_TYPE_BF16) {
12351235
const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@@ -1257,7 +1257,7 @@ static void ggml_cuda_op_mul_mat_cublas(
12571257

12581258
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
12591259
to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1260-
} else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
1260+
} else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc) || (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2)) && use_fp16) {
12611261
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
12621262
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
12631263
if (src0->type != GGML_TYPE_F16) {
@@ -3061,9 +3061,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
30613061
return false;
30623062
}
30633063
#ifdef GGML_USE_MUSA
3064-
if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
3065-
!ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3066-
return false;
3064+
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3065+
if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3066+
if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
3067+
a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
3068+
return false;
3069+
}
30673070
}
30683071
#endif // GGML_USE_MUSA
30693072
switch (a->type) {
@@ -3090,11 +3093,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
30903093
case GGML_TYPE_IQ4_NL:
30913094
case GGML_TYPE_IQ4_XS:
30923095
case GGML_TYPE_BF16:
3093-
#ifdef GGML_USE_MUSA
3094-
if (a->type == GGML_TYPE_Q3_K) {
3095-
return false;
3096-
}
3097-
#endif // GGML_USE_MUSA
30983096
return true;
30993097
default:
31003098
return false;

0 commit comments

Comments (0)