|
49 | 49 | #define GGML_CUDA_CC_ADA_LOVELACE 890 |
50 | 50 | #define GGML_CUDA_CC_OFFSET_AMD 0x1000000 |
51 | 51 | #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000 |
| 52 | +#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS) |
52 | 53 |
|
53 | 54 | // AMD |
54 | 55 | // GCN/CDNA, wave size is 64 |
|
79 | 80 | #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 |
80 | 81 | #define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD |
81 | 82 |
|
82 | | -#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS) |
| 83 | +#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) |
83 | 84 | #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) |
84 | 85 | #define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG) |
85 | 86 | #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG) |
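
Together with the AMD check (GGML_CUDA_CC_IS_AMD, presumably cc >= GGML_CUDA_CC_OFFSET_AMD elsewhere in this header), the two offsets now split the compute-capability space into three disjoint vendor ranges: NVIDIA below 0x0100000, Moore Threads in [0x0100000, 0x1000000), and AMD from 0x1000000 up. A minimal standalone sketch, with the constants copied from the lines above, illustrating that partition:

    // Standalone sketch; constants copied from the header above, illustrative only.
    #define GGML_CUDA_CC_OFFSET_AMD      0x1000000
    #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
    #define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
    #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
    #define GGML_CUDA_CC_QY2             (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000

    // An MTT S4000 cc lands in [OFFSET_MTHREADS, OFFSET_AMD): Moore Threads, not NVIDIA.
    static_assert(!GGML_CUDA_CC_IS_NVIDIA(GGML_CUDA_CC_QY2),  "MUSA cc must not be treated as NVIDIA");
    static_assert( GGML_CUDA_CC_IS_MTHREADS(GGML_CUDA_CC_QY2), "MUSA cc is Moore Threads");
    // A plain NVIDIA cc such as Ada Lovelace (890) stays below the MTHREADS offset.
    static_assert( GGML_CUDA_CC_IS_NVIDIA(890),                "NVIDIA cc stays NVIDIA");
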
@@ -229,33 +230,33 @@ static bool fp16_available(const int cc) { |
229 | 230 | } |
230 | 231 |
|
231 | 232 | static bool fast_fp16_available(const int cc) { |
232 | | - return (!GGML_CUDA_CC_IS_MTHREADS(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); |
| 233 | + return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); |
233 | 234 | } |
234 | 235 |
|
235 | 236 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
236 | 237 | static bool fast_fp16_hardware_available(const int cc) { |
237 | | - return (!GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); |
| 238 | + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); |
238 | 239 | } |
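
The "feature selection of external libraries" comment refers to call sites outside this header. A hypothetical sketch of such a call site (the helper name and flow are assumptions, not from this PR; only the cuBLAS enum values are real) showing how the check could steer GEMM precision:

    #include <cublas_v2.h>

    // Hypothetical helper (not part of this PR): choose the cuBLAS GEMM compute
    // type based on whether the device's compute capability has fast FP16 math.
    static cublasComputeType_t pick_gemm_compute_type(const int cc) {
        return fast_fp16_hardware_available(cc) ? CUBLAS_COMPUTE_16F : CUBLAS_COMPUTE_32F;
    }
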
239 | 240 |
|
240 | 241 | // Any FP16 tensor core instructions are available for ggml code. |
241 | 242 | static bool fp16_mma_available(const int cc) { |
242 | 243 | #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) |
243 | 244 | return false; |
244 | 245 | #else |
245 | | - return !GGML_CUDA_CC_IS_MTHREADS(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA || |
| 246 | + return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA || |
246 | 247 | GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |
247 | 248 | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) |
248 | 249 | } |
249 | 250 |
|
250 | 251 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
251 | 252 | static bool fp16_mma_hardware_available(const int cc) { |
252 | | - return !GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_VOLTA || |
| 253 | + return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA || |
253 | 254 | GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |
254 | 255 | } |
255 | 256 |
|
256 | 257 | // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. |
257 | 258 | static bool new_mma_available(const int cc) { |
258 | | - return !GGML_CUDA_CC_IS_MTHREADS(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; |
| 259 | + return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; |
259 | 260 | } |
260 | 261 |
|
261 | 262 | static bool cp_async_available(const int cc) { |
@@ -433,13 +434,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
433 | 434 |
|
434 | 435 | #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) |
435 | 436 |
|
436 | | -#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A |
| 437 | +#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
437 | 438 | return __dp4a(a, b, c); |
438 | | -#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A |
| 439 | +#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
439 | 440 | const int8_t * a8 = (const int8_t *) &a; |
440 | 441 | const int8_t * b8 = (const int8_t *) &b; |
441 | 442 | return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; |
442 | | -#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A |
| 443 | +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
443 | 444 |
|
444 | 445 | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) |
445 | 446 | } |
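
For reference, a small host-side sketch (illustrative only; the function name and main() harness are not from the PR) of what both branches compute: a dot product of the four packed signed bytes of a and b, accumulated into c:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Reference for ggml_cuda_dp4a: multiply the four int8 lanes of a and b
    // pairwise and accumulate into c, matching the __dp4a intrinsic.
    static int dp4a_ref(const int a, const int b, const int c) {
        int8_t a8[4], b8[4];
        std::memcpy(a8, &a, 4);
        std::memcpy(b8, &b, 4);
        return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
    }

    int main() {
        const int8_t a8[4] = {1, -2, 3, -4}, b8[4] = {5, 6, -7, 8};
        int a, b;
        std::memcpy(&a, a8, 4);
        std::memcpy(&b, b8, 4);
        std::printf("%d\n", dp4a_ref(a, b, 10)); // 1*5 - 2*6 - 3*7 - 4*8 + 10 = -50
    }
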
|