 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define CC_PASCAL     600
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_TURING     750
-#define CC_AMPERE     800
-#define CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_PASCAL     600
+#define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA      700
+#define GGML_CUDA_CC_TURING     750
+#define GGML_CUDA_CC_AMPERE     800
+#define GGML_CUDA_CC_OFFSET_AMD 1000000

 // GCN/CDNA, wave size is 64
-#define CC_GCN4   (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
-#define CC_VEGA   (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
-#define CC_VEGA20 (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
-#define CC_CDNA   (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
-#define CC_CDNA2  (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renaming
-#define CC_CDNA3  (CC_OFFSET_AMD + 942)  // MI300
+#define GGML_CUDA_CC_GCN4   (GGML_CUDA_CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA   (GGML_CUDA_CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA   (GGML_CUDA_CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2  (GGML_CUDA_CC_OFFSET_AMD + 910)  // MI210, minimum acc register renaming
+#define GGML_CUDA_CC_CDNA3  (GGML_CUDA_CC_OFFSET_AMD + 942)  // MI300

 // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
-#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA

-#define CC_QY1 210
-#define CC_QY2 220
+#define GGML_CUDA_CC_QY1 210
+#define GGML_CUDA_CC_QY2 220

 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

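The AMD compute capabilities are kept in the same integer space as the NVIDIA ones by adding GGML_CUDA_CC_OFFSET_AMD to the GCN/CDNA/RDNA architecture number, so a single comparison against the offset tells the vendors apart. A minimal sketch of how the scheme behaves; the is_amd/is_nvidia helpers are illustrative only and not part of this header:

```cpp
// Illustrative helpers (not defined in the patch): vendor detection plus a few
// compile-time checks against the renamed constants.
constexpr bool is_amd   (int cc) { return cc >= GGML_CUDA_CC_OFFSET_AMD; }
constexpr bool is_nvidia(int cc) { return cc <  GGML_CUDA_CC_OFFSET_AMD; }

static_assert(is_nvidia(GGML_CUDA_CC_TURING));  // 750
static_assert(is_amd(GGML_CUDA_CC_RDNA2));      // 1000000 + 1030
static_assert(GGML_CUDA_CC_VEGA20 == 1000906);  // MI50 / Radeon VII
```
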
@@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

 static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= CC_PASCAL && cc != 610;
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }

 static constexpr bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }

 static constexpr bool int8_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }

 [[noreturn]]
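The `*_AVAILABLE` macros above gate device code at compile time per architecture, while the constexpr helpers take a device's compute capability (with the AMD offset applied) so host code can pick a kernel variant at runtime. A hedged sketch of how such helpers are typically consumed; the dispatch function below is hypothetical, not part of the patch:

```cpp
// Hypothetical dispatch sketch: choose a matmul path from the compute capability.
void choose_mul_mat_path(const int cc) {
    if (int8_mma_available(cc)) {
        // Turing+ NVIDIA: int8 tensor core (MMA) path
    } else if (fp16_mma_available(cc)) {
        // Volta+ NVIDIA: fp16 WMMA/MMA path
    } else if (fast_fp16_available(cc)) {
        // Pascal and newer, except cc 6.1 where fp16 throughput is very low
    } else {
        // fall back to fp32 arithmetic
    }
}
```
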
@@ -187,15 +187,15 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__

 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

 static __device__ __forceinline__ float warp_reduce_sum(float x) {
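warp_reduce_sum(int) uses the single-instruction __reduce_add_sync on Ampere and newer, and otherwise a five-step XOR-shuffle butterfly (offsets 16, 8, 4, 2, 1) that leaves every lane holding the warp-wide sum. A small usage sketch with an illustrative kernel name, assuming the 32-thread warp size used above:

```cpp
// Illustrative kernel (not from the patch): count non-zero elements, reducing
// each warp's partial count without shared memory, then combining warps atomically.
static __global__ void count_nonzero(const int * x, int * out, const int n) {
    int local = 0;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x) {
        local += x[i] != 0;
    }
    local = warp_reduce_sum(local);   // every lane now holds the warp total
    if (threadIdx.x % 32 == 0) {
        atomicAdd(out, local);
    }
}
```
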
@@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 }

 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }

 #if CUDART_VERSION < CUDART_HMASK
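warp_reduce_max(half2) follows the same butterfly pattern but reduces the two fp16 lanes of each half2 independently via ggml_cuda_hmax2, which is why on NVIDIA it requires cc >= GGML_CUDA_CC_PASCAL (half2 intrinsics). A brief sketch of the intended use, with illustrative names and assuming one warp per block:

```cpp
// Illustrative only: per-warp maxima of two interleaved fp16 streams.
static __global__ void rowmax_h2(const half2 * x, half2 * out, const int n2) {
    half2 m = __float2half2_rn(-1e30f);  // effectively -inf in fp16
    for (int i = threadIdx.x; i < n2; i += 32) {
        m = ggml_cuda_hmax2(m, x[i]);    // element-wise max per lane pair
    }
    m = warp_reduce_max(m);              // .x and .y hold separate maxima
    if (threadIdx.x == 0) {
        out[blockIdx.x] = m;             // assumes blockDim.x == 32 (one warp)
    }
}
```
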
@@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i

 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

-#if __CUDA_ARCH__ >= MIN_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A

 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
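Whichever branch is compiled, ggml_cuda_dp4a computes the same thing: c plus the dot product of the four signed bytes packed into a and b. Below is a host-side mirror of the scalar fallback for checking results off-device; dp4a_ref is an illustrative helper, not part of the header:

```cpp
#include <cstdint>

// Reference implementation matching the scalar fallback shown above.
static int dp4a_ref(const int a, const int b, const int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

// Example: a = {1, -2, 3, 4}, b = {5, 6, -7, 8}, c = 10
//          10 + 1*5 + (-2)*6 + 3*(-7) + 4*8 = 10 + 5 - 12 - 21 + 32 = 14
```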