|
46 | 46 | #define GGML_CUDA_CC_VOLTA 700 |
47 | 47 | #define GGML_CUDA_CC_TURING 750 |
48 | 48 | #define GGML_CUDA_CC_AMPERE 800 |
49 | | -#define CC_OFFSET_AMD 1000000 |
| 49 | +#define GGML_CUDA_CC_OFFSET_AMD 1000000 |
50 | 50 |
|
51 | 51 | // GCN/CDNA, wave size is 64 |
52 | | -#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 |
53 | | -#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue |
54 | | -#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a |
55 | | -#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers |
56 | | -#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing |
57 | | -#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300 |
| 52 | +#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 |
| 53 | +#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue |
| 54 | +#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a |
| 55 | +#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers |
| 56 | +#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming |
| 57 | +#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300 |
58 | 58 |
|
59 | 59 | // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
60 | | -#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000 |
61 | | -#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a |
62 | | -#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA |
| 60 | +#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000 |
| 61 | +#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a |
| 62 | +#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA |
63 | 63 |
|
64 | | -#define CC_QY1 210 |
65 | | -#define CC_QY2 220 |
| 64 | +#define GGML_CUDA_CC_QY1 210 |
| 65 | +#define GGML_CUDA_CC_QY2 220 |
66 | 66 |
|
67 | 67 | #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses |
68 | 68 |
|
@@ -147,20 +147,20 @@ typedef float2 dfloat2; |
147 | 147 | #define INT8_MMA_AVAILABLE |
148 | 148 | #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING |
149 | 149 |
|
150 | | -#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) |
| 150 | +#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) |
151 | 151 | #define FLASH_ATTN_AVAILABLE |
152 | | -#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) |
| 152 | +#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) |
153 | 153 |
|
154 | 154 | static constexpr bool fast_fp16_available(const int cc) { |
155 | 155 | return cc >= GGML_CUDA_CC_PASCAL && cc != 610; |
156 | 156 | } |
157 | 157 |
|
158 | 158 | static constexpr bool fp16_mma_available(const int cc) { |
159 | | - return cc < CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA; |
| 159 | + return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA; |
160 | 160 | } |
161 | 161 |
|
162 | 162 | static constexpr bool int8_mma_available(const int cc) { |
163 | | - return cc < CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING; |
| 163 | + return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING; |
164 | 164 | } |
165 | 165 |
|
166 | 166 | [[noreturn]] |
|
0 commit comments