|
41 | 41 | #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) |
42 | 42 | #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons |
43 | 43 |
|
44 | | -#define GGML_CUDA_CC_PASCAL 600 |
45 | | -#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products |
46 | | -#define GGML_CUDA_CC_VOLTA 700 |
47 | | -#define GGML_CUDA_CC_TURING 750 |
48 | | -#define GGML_CUDA_CC_AMPERE 800 |
49 | | -#define GGML_CUDA_CC_ADA_LOVELACE 890 |
50 | | -#define GGML_CUDA_CC_OFFSET_AMD 0x1000000 |
51 | | - |
| 44 | +#define GGML_CUDA_CC_PASCAL 600 |
| 45 | +#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products |
| 46 | +#define GGML_CUDA_CC_VOLTA 700 |
| 47 | +#define GGML_CUDA_CC_TURING 750 |
| 48 | +#define GGML_CUDA_CC_AMPERE 800 |
| 49 | +#define GGML_CUDA_CC_ADA_LOVELACE 890 |
| 50 | +#define GGML_CUDA_CC_OFFSET_AMD 0x1000000 |
| 51 | +#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000 |
| 52 | + |
| 53 | +// AMD |
52 | 54 | // GCN/CDNA, wave size is 64
53 | 55 | #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16 |
54 | 56 | #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue |
|
70 | 72 | #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA) |
71 | 73 | #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) |
72 | 74 |
|
73 | | -#define GGML_CUDA_CC_QY1 210 |
74 | | -#define GGML_CUDA_CC_QY2 220 |
| 75 | +// Moore Threads |
| 76 | +#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) |
| 77 | + |
| 78 | +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
| 79 | +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
| 80 | +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
| 81 | + |
| 82 | +#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS) |
| 83 | +#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) |
| 84 | +#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
| 85 | +#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG) |
75 | 86 |
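A rough illustration of how these constants line up (my own sketch, not part of this change): assuming the MUSA backend packs a device's (major, minor) compute capability into the GGML_CUDA_CC_* numbering the same way the vendor offsets suggest, the named values above fall out directly.

```c
// Hypothetical helper, for illustration only -- not taken from this diff.
// Assumes the backend encodes a MUSA device's (major, minor) on top of
// GGML_CUDA_CC_OFFSET_MTHREADS in the same 0xMmm style used elsewhere.
static int ggml_musa_cc_from_props(int major, int minor) {
    return GGML_CUDA_CC_OFFSET_MTHREADS + major*0x100 + minor*0x10;
}
// e.g. (2, 1) -> 0x0100210 == GGML_CUDA_CC_QY1  (MTT S80 / MTT S3000)
//      (2, 2) -> 0x0100220 == GGML_CUDA_CC_QY2  (MTT S4000)
//      (3, 1) -> 0x0100310 == GGML_CUDA_CC_NG
```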
|
76 | 87 | #ifdef __CUDA_ARCH_LIST__ |
77 | 88 | constexpr bool ggml_cuda_has_arch_impl(int) { |
@@ -209,42 +220,42 @@ typedef float2 dfloat2; |
209 | 220 | #define CP_ASYNC_AVAILABLE |
210 | 221 | #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
211 | 222 |
|
212 | | -#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) |
| 223 | +#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) |
213 | 224 | #define FLASH_ATTN_AVAILABLE |
214 | | -#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) |
| 225 | +#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) |
215 | 226 |
|
216 | 227 | static bool fp16_available(const int cc) { |
217 | 228 | return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL; |
218 | 229 | } |
219 | 230 |
|
220 | 231 | static bool fast_fp16_available(const int cc) { |
221 | | - return fp16_available(cc) && cc != 610; |
| 232 | + return (!GGML_CUDA_CC_IS_MTHREADS(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); |
222 | 233 | } |
223 | 234 |
|
224 | 235 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
225 | 236 | static bool fast_fp16_hardware_available(const int cc) { |
226 | | - return cc >= GGML_CUDA_CC_PASCAL && cc != 610; |
| 237 | + return (!GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); |
227 | 238 | } |
228 | 239 |
|
229 | 240 | // Any FP16 tensor core instructions are available for ggml code. |
230 | 241 | static bool fp16_mma_available(const int cc) { |
231 | 242 | #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) |
232 | 243 | return false; |
233 | 244 | #else |
234 | | - return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA || |
235 | | - GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; |
| 245 | + return (!GGML_CUDA_CC_IS_MTHREADS(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
| 246 | + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
236 | 247 | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) |
237 | 248 | } |
238 | 249 |
|
239 | 250 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
240 | 251 | static bool fp16_mma_hardware_available(const int cc) { |
241 | | - return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA || |
242 | | - GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; |
| 252 | + return (!GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
| 253 | + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
243 | 254 | } |
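For context, a minimal sketch of how a dispatch site might consult these helpers (the function and its cases are illustrative, not part of this diff; `cc` is the compute capability reported by the backend's device info):

```c
// Illustrative dispatch sketch -- the function name and branches are hypothetical.
static void choose_matmul_path(const int cc) {
    if (fp16_mma_available(cc)) {
        // tensor-core (mma/wmma) path: Volta+ NVIDIA, CDNA, or RDNA3
    } else if (fast_fp16_available(cc)) {
        // packed half2 arithmetic path
    } else {
        // plain fp32 fallback
    }
}
```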
244 | 255 |
|
245 | 256 | // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. |
246 | 257 | static bool new_mma_available(const int cc) { |
247 | | - return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; |
| 258 | + return !GGML_CUDA_CC_IS_MTHREADS(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; |
248 | 259 | } |
249 | 260 |
|
250 | 261 | static bool cp_async_available(const int cc) { |
@@ -422,13 +433,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i |
422 | 433 |
|
423 | 434 | #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) |
424 | 435 |
|
425 | | -#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
| 436 | +#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A |
426 | 437 | return __dp4a(a, b, c); |
427 | | -#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
| 438 | +#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A |
428 | 439 | const int8_t * a8 = (const int8_t *) &a; |
429 | 440 | const int8_t * b8 = (const int8_t *) &b; |
430 | 441 | return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; |
431 | | -#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
| 442 | +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A |
432 | 443 |
|
433 | 444 | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) |
434 | 445 | } |
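A small worked example of what both branches above compute (illustration only, not from the diff):

```c
// Both branches of ggml_cuda_dp4a return c plus the dot product of the four
// signed bytes packed into a and b, e.g. (little-endian byte order):
//   a = 0x01020304  -> bytes {4, 3, 2, 1}
//   b = 0x01010101  -> bytes {1, 1, 1, 1}
//   ggml_cuda_dp4a(a, b, 5) == 5 + 4*1 + 3*1 + 2*1 + 1*1 == 15
```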
|