// Moore Threads
// True when compiling device code for the first-generation QY1 arch (MTT S80 / S3000).
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)

// Moore Threads compute capabilities, biased by GGML_CUDA_CC_OFFSET_MTHREADS so they
// occupy their own range and never collide with NVIDIA (unbiased) or AMD (offset) values.
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

// Vendor/generation predicates. Note: no space between the macro name and (cc) —
// a space would make these object-like macros and break every call site.
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)
9090#ifdef __CUDA_ARCH_LIST__
@@ -215,6 +215,10 @@ typedef float2 dfloat2;
215215#define FP16_MMA_AVAILABLE
216216#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
217217
// FP16 tensor-core style MMA is available on Moore Threads hardware newer than QY1.
#if defined(GGML_USE_MUSA) && !GGML_CUDA_MUSA_ARCH_IS_QY1
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_USE_MUSA) && !GGML_CUDA_MUSA_ARCH_IS_QY1

#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define NEW_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -232,12 +236,12 @@ static bool fp16_available(const int cc) {
232236}
233237
// Fast FP16 arithmetic is available for ggml device code on this compute capability.
// Vendor offsets place AMD and Moore Threads cc values far above any NVIDIA cc, so a
// plain fp16_available() check covers all vendors; cc 610 is excluded — presumably
// because of its reduced FP16 throughput (TODO confirm against kernel selection code).
static bool fast_fp16_available(const int cc) {
    return fp16_available(cc) && cc != 610;
}
237241
// To be used for feature selection of external libraries, e.g. cuBLAS.
// Hardware-level check (independent of which archs ggml was compiled for): everything
// from Pascal up qualifies — the vendor cc offsets make all AMD/Moore Threads values
// exceed GGML_CUDA_CC_PASCAL — except cc 610 and the Moore Threads QY1 parts.
static bool fast_fp16_hardware_available(const int cc) {
    return cc >= GGML_CUDA_CC_PASCAL && cc != 610 && cc != GGML_CUDA_CC_QY1;
}
242246
// Any FP16 tensor core instructions are available for ggml code.
static bool fp16_mma_available(const int cc) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
    // HIP on AMD without rocWMMA flash-attention support: no usable FP16 MMA path.
    return false;
#else
    // NVIDIA needs Volta+ actually compiled in; Moore Threads needs QY2 or newer;
    // AMD CDNA and RDNA3/RDNA4 always qualify.
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
           (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2) ||
           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
}
252257
// To be used for feature selection of external libraries, e.g. cuBLAS.
// Hardware-level counterpart of fp16_mma_available(): ignores which archs ggml was
// compiled for and asks only whether the physical device has FP16 MMA units.
static bool fp16_mma_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
           (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2) ||
           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
}
258264
0 commit comments