 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
 
-#define CC_PASCAL     600
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_TURING     750
-#define CC_AMPERE     800
-#define CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_PASCAL     600
+#define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA      700
+#define GGML_CUDA_CC_TURING     750
+#define GGML_CUDA_CC_AMPERE     800
+#define GGML_CUDA_CC_OFFSET_AMD 1000000
 
 // GCN/CDNA, wave size is 64
-#define CC_GCN4       (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
-#define CC_VEGA       (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
-#define CC_VEGA20     (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
-#define CC_CDNA       (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
-#define CC_CDNA2      (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
-#define CC_CDNA3      (CC_OFFSET_AMD + 942)  // MI300
+#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 910)  // MI210, minimum for acc register renaming
+#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 942)  // MI300
 
 // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010) // RX 5000
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
 
-#define CC_QY1        210
-#define CC_QY2        220
+#define GGML_CUDA_CC_QY1        210
+#define GGML_CUDA_CC_QY2        220
 
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
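For context on the values being renamed here: NVIDIA compute capabilities fit in three digits (600, 700, ...), so adding GGML_CUDA_CC_OFFSET_AMD (1000000) to an AMD gfx version keeps the two vendors' targets in disjoint ranges of a single int. A minimal sketch of how such an encoding can be decoded; the two macros are copied from the header, but the helper functions are illustrative only, not part of this file:

```cpp
#define GGML_CUDA_CC_OFFSET_AMD 1000000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030)

// Hypothetical helpers: NVIDIA CCs are 3 digits, AMD values sit above the offset.
static constexpr bool cc_is_amd(const int cc) {
    return cc >= GGML_CUDA_CC_OFFSET_AMD;
}

static constexpr int cc_raw_version(const int cc) {
    // Recover the vendor-local version, e.g. 1030 (gfx1030) for RDNA2.
    return cc_is_amd(cc) ? cc - GGML_CUDA_CC_OFFSET_AMD : cc;
}

static_assert(cc_raw_version(GGML_CUDA_CC_RDNA2) == 1030, "offset decoding");
static_assert(!cc_is_amd(750), "CC 7.5 (Turing) stays in the NVIDIA range");
```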
@@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16
 
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
 static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= CC_PASCAL && cc != 610;
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }
 
 static constexpr bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }
 
 static constexpr bool int8_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }
 
 [[noreturn]]
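The three constexpr helpers mirror the preprocessor guards above but operate on a runtime compute-capability value, so host code can pick a kernel per device. A hedged sketch of that pattern; the dispatch function and the way `cc` is obtained are assumptions, not code from this file:

```cpp
// Hypothetical host-side dispatch built on the helpers defined above.
// Where `cc` comes from (e.g. a device-info query) is assumed here.
static const char * choose_mul_mat_path(const int cc) {
    if (int8_mma_available(cc)) {
        return "int8 tensor core path";   // NVIDIA Turing (CC 7.5) or newer
    }
    if (fp16_mma_available(cc)) {
        return "fp16 tensor core path";   // NVIDIA Volta (CC 7.0) or newer
    }
    if (fast_fp16_available(cc)) {
        return "fp16 vector path";        // CC 6.0+ except 6.1; AMD values pass too
    }
    return "fp32 fallback path";
}
```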
@@ -187,15 +187,15 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__
 
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
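The #else branch is the generic warp reduction: `__shfl_xor_sync` exchanges values between lanes whose IDs differ in one bit, so the five butterfly steps (offsets 16, 8, 4, 2, 1) leave every lane of the 32-wide warp holding the full sum. A standalone sketch of the same pattern, assuming a launch with exactly one 32-thread warp:

```cpp
// Minimal demo kernel (not from the header): each lane contributes one
// value; after the butterfly every lane holds the warp-wide sum.
__global__ void warp_sum_demo(const int * in, int * out) {
    int x = in[threadIdx.x];                      // assumes blockDim.x == 32
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    if (threadIdx.x == 0) {
        *out = x;                                 // any lane could write this
    }
}
```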
@@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 }
 
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }
 
 #if CUDART_VERSION < CUDART_HMASK
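warp_reduce_max follows the same butterfly shape as warp_reduce_sum, but combines lanes with ggml_cuda_hmax2, i.e. an elementwise max over both halves of each half2. A hedged usage sketch; the kernel and its names are illustrative only, and it assumes this header is included and FP16 is available on the target:

```cpp
// Illustrative only: find the elementwise max of 32 half2 values, e.g. the
// row-max step before a softmax. Assumes blockDim.x == 32.
__global__ void row_max_demo(const half2 * row, half2 * out) {
    half2 m = row[threadIdx.x];
    m = warp_reduce_max(m);       // the reduction defined above
    if (threadIdx.x == 0) {
        *out = m;                 // max of lows and max of highs, packed
    }
}
```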
@@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 
 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
 
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
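The scalar fallback makes the semantics of __dp4a explicit: treat each int as four signed bytes and accumulate their pairwise products into c. A small host-side check of that arithmetic, written as plain C++ independent of the header:

```cpp
#include <cstdint>
#include <cstdio>

// Reference implementation of the fallback branch above.
static int dp4a_ref(const int a, const int b, const int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    // On a little-endian machine 0x01020304 holds the bytes 4, 3, 2, 1.
    printf("%d\n", dp4a_ref(0x01020304, 0x01010101, 0)); // 4+3+2+1 = 10
    return 0;
}
```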