@@ -176,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 #define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
     do { \
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \
@@ -191,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
     do { \
         GGML_UNUSED(nbytes); \
     } while (0)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 
 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
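
For context: the CUDA branch of CUDA_SET_SHARED_MEMORY_LIMIT (its body is elided from the hunk above) raises the opt-in dynamic shared-memory cap once per device before a kernel launch with a large nbytes. A minimal sketch of the pattern, assuming a helper like ggml_cuda_get_device() for the active device id:

    // sketch only; names besides the CUDA runtime API are assumptions
    const int id = ggml_cuda_get_device();
    if (!shared_memory_limit_raised[id]) {
        CUDA_CHECK(cudaFuncSetAttribute(kernel,
            cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes));
        shared_memory_limit_raised[id] = true;
    }

On HIP and MUSA builds the macro expands to a no-op that only consumes nbytes, as the hunk shows.
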
@@ -211,9 +211,9 @@ typedef float2 dfloat2;
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
 
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
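
For context: FP16_AVAILABLE gates any half-precision device code, while FAST_FP16_AVAILABLE additionally excludes __CUDA_ARCH__ == 610, the Pascal consumer parts whose native FP16 throughput is severely limited. A hedged sketch of the usual consumption pattern; ggml_example_hadd is a hypothetical helper, not a function from this file:

    static __device__ __forceinline__ half ggml_example_hadd(const half a, const half b) {
    #ifdef FAST_FP16_AVAILABLE
        return __hadd(a, b);                                     // native half add
    #else
        return __float2half(__half2float(a) + __half2float(b));  // float fallback
    #endif
    }
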
@@ -227,17 +227,17 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
 #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
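
For context, a hedged sketch of how these compile-time feature macros are typically consumed: a guarded kernel compiles the fast path where the macro is defined and otherwise falls back to NO_DEVICE_CODE (used elsewhere in this file), which traps if the kernel is ever launched on an unsupported arch. Illustrative only:

    #ifdef NEW_MMA_AVAILABLE
        // Turing+ mma/tensor-core path would go here.
    #else
        NO_DEVICE_CODE;
    #endif
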
@@ -259,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {
 
 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
     if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -275,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
     } else {
         return false;
     }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -312,25 +312,25 @@ static bool cp_async_available(const int cc) {
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
     return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
     printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
            file_name, line, function_name, arch);
     GGML_UNUSED(arch_list);
 #else
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
            file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -367,15 +367,15 @@ struct ggml_cuda_unroll<1> {
 
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
         x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
 template <int width = WARP_SIZE>
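
A usage sketch for warp_reduce_sum (hypothetical kernel, not from this file): the XOR-shuffle fallback is a butterfly reduction; with width == 32 the offsets 16, 8, 4, 2, 1 leave every lane holding the full warp sum, matching what __reduce_add_sync returns directly on Ampere and newer.

    __global__ void count_nonzero(const int * x, int * out, const int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        int v = (i < n && x[i] != 0) ? 1 : 0;
        v = warp_reduce_sum(v);                 // every lane now holds the warp total
        if (threadIdx.x % WARP_SIZE == 0) {
            atomicAdd(out, v);                  // one add per warp
        }
    }
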
@@ -444,11 +444,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
     return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
     return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
 
 #else
     NO_DEVICE_CODE;
@@ -476,7 +476,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
@@ -485,7 +485,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK
@@ -497,7 +497,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3) || defined(RDNA4)
@@ -523,7 +523,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
     return c;
 
-#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP)
 
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
@@ -533,7 +533,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
 
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
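
For reference, a worked example of the ggml_cuda_dp4a semantics implied by the scalar fallback above (sketch only):

    // a = 0x04030201 packs the signed bytes {1, 2, 3, 4} (little-endian);
    // b = 0x281E140A packs {10, 20, 30, 40}.
    // ggml_cuda_dp4a(a, b, 5) == 5 + 1*10 + 2*20 + 3*30 + 4*40 == 305
    // i.e. c plus the dot product of the four packed int8 lanes, which is what
    // __dp4a computes on NVIDIA (cc >= GGML_CUDA_CC_DP4A) and
    // __builtin_amdgcn_sdot4 computes on the dot-product-capable AMD archs.
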