|
49 | 49 | #define GGML_CUDA_CC_ADA_LOVELACE 890 |
50 | 50 | #define GGML_CUDA_CC_OFFSET_AMD 0x1000000 |
51 | 51 |
|
52 | | -// GCN/CNDA, wave size is 64 |
| 52 | +// GCN/CDNA, wave size is 64 |
53 | 53 | #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16 |
54 | 54 | #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue |
55 | 55 | #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a |
56 | 56 | #define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers |
57 | 57 | #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum for acc register renaming |
58 | 58 | #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300 |
59 | 59 |
|
60 | | -// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
| 60 | +// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
61 | 61 | #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 |
62 | 62 | #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a |
63 | 63 | #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA |
| 64 | +#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 |
64 | 65 |
|
65 | 66 | #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) |
66 | 67 | #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) |
67 | 68 | #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) |
68 | 69 | #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) |
69 | 70 | #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3) |
| 71 | +#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) |
70 | 72 | #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA) |
71 | 73 | #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) |
72 | 74 |
|
@@ -197,9 +199,9 @@ typedef float2 dfloat2; |
197 | 199 | #define FP16_MMA_AVAILABLE |
198 | 200 | #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA |
199 | 201 |
|
200 | | -#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3)) |
| 202 | +#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) |
201 | 203 | #define FP16_MMA_AVAILABLE |
202 | | -#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3)) |
| 204 | +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) |
203 | 205 |
|
204 | 206 | #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING |
205 | 207 | #define NEW_MMA_AVAILABLE |
@@ -232,14 +234,14 @@ static bool fp16_mma_available(const int cc) { |
232 | 234 | return false; |
233 | 235 | #else |
234 | 236 | return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA || |
235 | | - GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; |
| 237 | + GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3 || cc >= GGML_CUDA_CC_RDNA4; |
236 | 238 | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) |
237 | 239 | } |
238 | 240 |
|
239 | 241 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
240 | 242 | static bool fp16_mma_hardware_available(const int cc) { |
241 | 243 | return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA || |
242 | | - GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; |
| 244 | + GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3 || cc >= GGML_CUDA_CC_RDNA4; |
243 | 245 | } |
244 | 246 |
|
245 | 247 | // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. |
@@ -397,7 +399,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i |
397 | 399 | #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) |
398 | 400 | #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__) |
399 | 401 | c = __builtin_amdgcn_sdot4(a, b, c, false); |
400 | | -#elif defined(RDNA3) |
| 402 | +#elif defined(RDNA3) || defined(RDNA4) |
401 | 403 | c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); |
402 | 404 | #elif defined(RDNA1) || defined(__gfx900__) |
403 | 405 | int tmp1; |
|
0 commit comments