
Commit f85dbd2 (1 parent: 9070365)

musa: enable

Signed-off-by: Xiaodong Ye <[email protected]>

File tree: 3 files changed (+72, -28 lines)

ggml/src/ggml-cuda/common.cuh (19 additions, 13 deletions)
@@ -76,12 +76,11 @@
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
-#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
-
-#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 220) // MTT S4000
+#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 310) // TBD
 
+#define GGML_CUDA_CC_TO_MTHREADS(cc) ((cc) - GGML_CUDA_CC_OFFSET_MTHREADS)
 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
 #define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
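
A quick standalone sketch of how the new decimal encoding round-trips (the GGML_CUDA_CC_OFFSET_MTHREADS value below is an assumption for illustration only; the real offset is defined elsewhere in common.cuh):

#include <cassert>

// Assumed illustration value; common.cuh defines the real offset.
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_QY2             (GGML_CUDA_CC_OFFSET_MTHREADS + 220) // MTT S4000
#define GGML_CUDA_CC_TO_MTHREADS(cc) ((cc) - GGML_CUDA_CC_OFFSET_MTHREADS)

int main() {
    // A QY2 device reports major = 2, minor = 2, i.e. __MUSA_ARCH__ == 220.
    const int cc = GGML_CUDA_CC_OFFSET_MTHREADS + 100*2 + 10*2;
    assert(cc == GGML_CUDA_CC_QY2);
    // GGML_CUDA_CC_TO_MTHREADS recovers the raw arch number, which is what
    // the FLASH_ATTN_AVAILABLE gate below compares against __MUSA_ARCH__.
    assert(GGML_CUDA_CC_TO_MTHREADS(cc) == 220);
    return 0;
}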
@@ -203,9 +202,9 @@ typedef float2 dfloat2;
 #define FP16_AVAILABLE
 #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 
-#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != GGML_CUDA_CC_DP4A
 #define FAST_FP16_AVAILABLE
-#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != GGML_CUDA_CC_DP4A
 
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #define FP16_MMA_AVAILABLE
@@ -215,6 +214,10 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
 
+#if defined(GGML_USE_MUSA)
+#define FP16_MMA_AVAILABLE
+#endif // defined(GGML_USE_MUSA)
+
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -223,21 +226,22 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < GGML_CUDA_CC_TO_MTHREADS(GGML_CUDA_CC_QY2))
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < GGML_CUDA_CC_TO_MTHREADS(GGML_CUDA_CC_QY2))
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
 }
 
 static bool fast_fp16_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != GGML_CUDA_CC_DP4A) || GGML_CUDA_CC_IS_AMD(cc);
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) ||
+           GGML_CUDA_CC_IS_AMD(cc) || GGML_CUDA_CC_IS_MTHREADS(cc);
 }
 
 // Any FP16 tensor core instructions are available for ggml code.
@@ -246,14 +250,16 @@ static bool fp16_mma_available(const int cc) {
     return false;
 #else
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
+           GGML_CUDA_CC_IS_MTHREADS(cc);
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
-           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
+           GGML_CUDA_CC_IS_MTHREADS(cc);
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
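
The practical effect of the two added GGML_CUDA_CC_IS_MTHREADS(cc) terms is that every Moore Threads cc now passes the hardware checks used for cuBLAS feature selection. A minimal standalone sketch (the offset values here are assumptions for illustration; only their ordering matters):

#include <cassert>

// Assumed illustration values; only OFFSET_MTHREADS <= cc < OFFSET_AMD matters.
constexpr int OFFSET_MTHREADS = 0x0100000;
constexpr int OFFSET_AMD      = 0x1000000;

static bool is_mthreads(int cc) {
    return cc >= OFFSET_MTHREADS && cc < OFFSET_AMD;
}

int main() {
    const int cc_s80   = OFFSET_MTHREADS + 210; // QY1: MTT S80, MTT S3000
    const int cc_s4000 = OFFSET_MTHREADS + 220; // QY2: MTT S4000
    // Both now satisfy fast_fp16_hardware_available() and
    // fp16_mma_hardware_available(), since each ORs in the MTHREADS range check.
    assert(is_mthreads(cc_s80) && is_mthreads(cc_s4000));
    return 0;
}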

ggml/src/ggml-cuda/fattn-wmma-f16.cu (4 additions, 0 deletions)
@@ -9,7 +9,11 @@
 #ifdef FP16_MMA_AVAILABLE
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 #include <mma.h>
+#ifdef GGML_USE_MUSA
+namespace wmma = mtmusa::wmma;
+#else // GGML_USE_MUSA
 namespace wmma = nvcuda::wmma;
+#endif // GGML_USE_MUSA
 #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
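
The alias keeps downstream kernels namespace-agnostic: the same wmma:: spellings compile against nvcuda::wmma on CUDA and mtmusa::wmma on MUSA. An illustrative device-side sketch of the kind of fragment pipeline this enables (standard 16x16x16 half-precision WMMA; not code from this commit):

#include <cuda_fp16.h>
#include <mma.h>
#ifdef GGML_USE_MUSA
namespace wmma = mtmusa::wmma;
#else
namespace wmma = nvcuda::wmma;
#endif

// One warp computes a 16x16 tile: C = A*B, half inputs, float accumulator.
__global__ void wmma_tile(const half * A, const half * B, float * C) {
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> c;
    wmma::fill_fragment(c, 0.0f);
    wmma::load_matrix_sync(a, A, 16); // leading dimension 16
    wmma::load_matrix_sync(b, B, 16);
    wmma::mma_sync(c, a, b, c);
    wmma::store_matrix_sync(C, c, 16, wmma::mem_row_major);
}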

ggml/src/ggml-cuda/ggml-cuda.cu (49 additions, 15 deletions)
@@ -268,8 +268,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
         info.devices[id].warp_size = 32;
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
-        info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
-        info.devices[id].cc += prop.minor * 0x10;
+        info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + 100*prop.major + 10*prop.minor;
         GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
                       id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 #else
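
For illustration, here is why the one-line decimal form was needed once the CC constants in common.cuh switched from 0x210/0x220 to 210/220: the old hex arithmetic no longer lines up with them (standalone sketch, not commit code):

#include <cstdio>

int main() {
    const int major = 2, minor = 2;               // e.g. an MTT S4000 (QY2)
    const int old_enc = 0x100*major + 0x10*minor; // 0x220 == 544, matched the old hex defines
    const int new_enc = 100*major + 10*minor;     // 220, matches the new decimal GGML_CUDA_CC_QY2 offset
    std::printf("hex-style encoding: %d, decimal encoding: %d\n", old_enc, new_enc);
    return 0;
}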
@@ -1200,7 +1199,8 @@
 
     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
 
-    if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (!(GGML_CUDA_CC_IS_MTHREADS(cc) && cc < GGML_CUDA_CC_QY2) &&
+        src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
         if (src1->type != GGML_TYPE_BF16) {
             const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@@ -1870,13 +1870,24 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         // use cublasGemmBatchedEx
         const int64_t ne23 = ne12*ne13;
 
+#ifdef GGML_USE_MUSA
+        const void ** ptrs_src;
+        void ** ptrs_dst;
+        CUDA_CHECK(cudaMalloc((void **)&ptrs_src, sizeof(void *)*2*ne23));
+        CUDA_CHECK(cudaMalloc((void **)&ptrs_dst, sizeof(void *)*1*ne23));
+#else // GGML_USE_MUSA
         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
+#endif // GGML_USE_MUSA
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
                 src0_f16, src1_f16, dst_t,
+#ifdef GGML_USE_MUSA
+                ptrs_src, ptrs_dst,
+#else // GGML_USE_MUSA
                 ptrs_src.get(), ptrs_dst.get(),
+#endif // GGML_USE_MUSA
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
@@ -1886,15 +1897,31 @@
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
 
-        CUBLAS_CHECK(
+#ifdef GGML_USE_MUSA
+        cudaDeviceSynchronize();
+        const void **Aarray = (const void **) (ptrs_src + 0*ne23);
+        const void **Barray = (const void **) (ptrs_src + 1*ne23);
+        void       **Carray = (      void **) (ptrs_dst + 0*ne23);
+#else // GGML_USE_MUSA
+        const void **Aarray = (const void **) (ptrs_src.get() + 0*ne23);
+        const void **Barray = (const void **) (ptrs_src.get() + 1*ne23);
+        void       **Carray = (      void **) (ptrs_dst.get() + 0*ne23);
+#endif // GGML_USE_MUSA
+
+        CUBLAS_CHECK(
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
+                alpha, Aarray, CUDA_R_16F, nb01/nb00,
+                       Barray, CUDA_R_16F, s11,
+                beta,  Carray, cu_data_type, ne0,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+#ifdef GGML_USE_MUSA
+        CUDA_CHECK(cudaFree(ptrs_src));
+        CUDA_CHECK(cudaFree(ptrs_dst));
+#endif // GGML_USE_MUSA
     }
 #endif

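Condensed, the MUSA-only branch above swaps the pooled allocations for plain cudaMalloc/cudaFree and inserts a cudaDeviceSynchronize() so the pointer tables written by k_compute_batched_ptrs are complete before the batched GEMM reads them. A sketch of just that shape, with the GEMM arguments elided (identifiers as in the diff):

const void ** ptrs_src = nullptr;
void       ** ptrs_dst = nullptr;
CUDA_CHECK(cudaMalloc((void **)&ptrs_src, sizeof(void *)*2*ne23)); // A and B pointer tables
CUDA_CHECK(cudaMalloc((void **)&ptrs_dst, sizeof(void *)*1*ne23)); // C pointer table

// k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(..., ptrs_src, ptrs_dst, ...);
CUDA_CHECK(cudaGetLastError());
cudaDeviceSynchronize(); // ensure the freshly written pointer tables are visible

// cublasGemmBatchedEx(..., ptrs_src + 0*ne23, ..., ptrs_src + 1*ne23, ..., ptrs_dst, ...);

CUDA_CHECK(cudaFree(ptrs_src));
CUDA_CHECK(cudaFree(ptrs_dst));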
@@ -1918,6 +1945,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
     bool any_gpus_with_slow_fp16 = false;
     bool any_gpus_without_fp16_mma = false;
+    bool any_gpus_without_batched_cublas = false;
 
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1932,12 +1960,14 @@
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
             any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+            any_gpus_without_batched_cublas = any_gpus_without_batched_cublas || !(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+        any_gpus_without_batched_cublas = any_gpus_without_batched_cublas || !(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
     }
 
     // debug helpers
@@ -1956,7 +1986,8 @@
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+    } else if (!split && !any_gpus_without_batched_cublas && src0->type == GGML_TYPE_F16 &&
+               (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
                !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
@@ -2996,9 +3027,17 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return false;
             }
 #ifdef GGML_USE_MUSA
-            if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
+            const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+            if (GGML_CUDA_CC_IS_MTHREADS(cc) && b->ne[2]*b->ne[3] > 1 &&
                 !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
-                return false;
+                if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
+                    b->type == GGML_TYPE_F16) {
+                    return false;
+                }
+                if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
+                    a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
+                    return false;
+                }
             }
 #endif // GGML_USE_MUSA
             switch (a->type) {
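
Restated as a standalone predicate for readability (a hypothetical helper mirroring the gate above, not part of the commit; ggml types assumed from ggml.h):

// Hypothetical restatement of the MUSA gate in supports_op.
static bool musa_batched_mul_mat_supported(int cc, const ggml_tensor * a,
                                           const ggml_tensor * b, enum ggml_op op) {
    if (!(GGML_CUDA_CC_IS_MTHREADS(cc) && b->ne[2]*b->ne[3] > 1 &&
          !ggml_is_transposed(a) && !ggml_is_transposed(b))) {
        return true; // gate only applies to batched, non-transposed MUSA cases
    }
    if (GGML_CUDA_CC_IS_QY1(cc) && op == GGML_OP_MUL_MAT && b->type == GGML_TYPE_F16) {
        return false; // QY1 (MTT S80/S3000): batched F16 MUL_MAT still rejected
    }
    if (GGML_CUDA_CC_IS_QY2(cc) && op == GGML_OP_MUL_MAT_ID &&
        a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
        return false; // QY2 (MTT S4000): this MUL_MAT_ID combination rejected
    }
    return true;
}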
@@ -3025,11 +3064,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_BF16:
-#ifdef GGML_USE_MUSA
-            if (a->type == GGML_TYPE_Q3_K) {
-                return false;
-            }
-#endif // GGML_USE_MUSA
             return true;
         default:
             return false;
