// True when compute capability `cc` lies in the half-open range
// [GGML_CUDA_CC_CDNA, GGML_CUDA_CC_RDNA1).
// The argument is parenthesized so that an expression argument (e.g. `a ? b : c`)
// is compared as a whole. `cc` is evaluated twice, so it must be free of side effects.
#define GGML_CUDA_CC_IS_CDNA(cc) ((cc) >= GGML_CUDA_CC_CDNA && (cc) < GGML_CUDA_CC_RDNA1)
7777
// Moore Threads
//
// Compute capability values for Moore Threads devices. Each value is the
// vendor offset GGML_CUDA_CC_OFFSET_MTHREADS plus a device-specific constant.

// Compile-time check on __MUSA_ARCH__ (only meaningful where that macro is defined).
// NOTE(review): this macro is marked for removal in the change under review; it is
// kept so that any remaining `#if` that names it keeps its meaning (an undefined
// identifier would silently evaluate to 0). Delete once confirmed unused.
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)

#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
8482
// True when compute capability `cc` lies in the half-open range
// [GGML_CUDA_CC_OFFSET_MTHREADS, GGML_CUDA_CC_OFFSET_AMD).
// The argument is parenthesized so that an expression argument is compared as a
// whole. `cc` is evaluated twice, so it must be free of side effects.
#define GGML_CUDA_CC_IS_MTHREADS(cc) ((cc) >= GGML_CUDA_CC_OFFSET_MTHREADS && (cc) < GGML_CUDA_CC_OFFSET_AMD)
// True when compute capability `cc` lies in the half-open range
// [GGML_CUDA_CC_QY1, GGML_CUDA_CC_QY2).
// The argument is parenthesized so that an expression argument is compared as a
// whole. `cc` is evaluated twice, so it must be free of side effects.
#define GGML_CUDA_CC_IS_QY1(cc) ((cc) >= GGML_CUDA_CC_QY1 && (cc) < GGML_CUDA_CC_QY2)
@@ -203,9 +201,9 @@ typedef float2 dfloat2;
203201#define  FAST_FP16_AVAILABLE 
204202#endif  //  defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
205203
// FP16_MMA_AVAILABLE is defined in two cases:
//   - non-HIP builds whose __CUDA_ARCH__ is at least GGML_CUDA_CC_VOLTA, or
//   - any build with GGML_USE_MUSA defined (no architecture check is applied there).
#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
#define FP16_MMA_AVAILABLE
#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
209207
210208#if  defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
211209#define  FP16_MMA_AVAILABLE 
@@ -219,9 +217,9 @@ typedef float2 dfloat2;
219217#define  CP_ASYNC_AVAILABLE 
220218#endif  //  !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
221219
// FLASH_ATTN_AVAILABLE is defined unless either
//   - GGML_CUDA_NO_FA is defined, or
//   - this is a MUSA build (GGML_USE_MUSA) compiled for __MUSA_ARCH__ below 220.
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
225223
226224static  bool  fp16_available (const  int  cc) {
227225    return  ggml_cuda_highest_compiled_arch (cc) >= GGML_CUDA_CC_PASCAL;
@@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) {
233231
234232//  To be used for feature selection of external libraries, e.g. cuBLAS.
235233static  bool  fast_fp16_hardware_available (const  int  cc) {
236-     return  (GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 ) || GGML_CUDA_CC_IS_AMD (cc);
234+     return  (GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 ) || GGML_CUDA_CC_IS_AMD (cc) ||
235+         (GGML_CUDA_CC_IS_MTHREADS (cc) && cc >= GGML_CUDA_CC_QY2);
237236}
238237
239238//  Any FP16 tensor core instructions are available for ggml code.
@@ -242,7 +241,8 @@ static bool fp16_mma_available(const int cc) {
242241    return  false ;
243242#else 
244243    if  ((GGML_CUDA_CC_IS_NVIDIA (cc) && ggml_cuda_highest_compiled_arch (cc) >= GGML_CUDA_CC_VOLTA) ||
245-         GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc)) {
244+         GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc) ||
245+         GGML_CUDA_CC_IS_MTHREADS (cc)) {
246246        return  true ;
247247    } else  if  (GGML_CUDA_CC_IS_RDNA4 (cc)) {
248248#if  defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
@@ -259,7 +259,8 @@ static bool fp16_mma_available(const int cc) {
259259//  To be used for feature selection of external libraries, e.g. cuBLAS.
260260static  bool  fp16_mma_hardware_available (const  int  cc) {
261261    return  (GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_VOLTA) ||
262-         GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc) || GGML_CUDA_CC_IS_RDNA4 (cc);
262+         GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc) || GGML_CUDA_CC_IS_RDNA4 (cc) ||
263+         (GGML_CUDA_CC_IS_MTHREADS (cc) && cc >= GGML_CUDA_CC_QY2);
263264}
264265
265266static  bool  bf16_mma_hardware_available (const  int  cc) {
0 commit comments