@@ -180,7 +180,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 #define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
     do { \
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \
@@ -195,7 +195,7 @@ static const char * cu_get_error_str(CUresult err) {
     do { \
         GGML_UNUSED(nbytes); \
     } while (0)
-#endif // !(defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 
 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
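Note on the recurring change in this commit (editorial annotation, not part of the patch): under hipcc, `GGML_USE_HIP` alone does not distinguish the AMD backend from HIP-on-NVIDIA, where `__HIP_PLATFORM_NVIDIA__` is defined instead and the CUDA code paths are the correct ones. Tightening each guard to `defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)` makes the AMD-specific branches fire only when actually compiling for AMD hardware. A minimal sketch of the pattern, using a hypothetical feature macro (the real ones follow below):

    // Sketch only; EXAMPLE_FEATURE_* is hypothetical.
    #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
    // plain CUDA, or HIP targeting the NVIDIA platform: CUDA path
    #define EXAMPLE_FEATURE_CUDA
    #else
    // hipcc targeting AMD hardware: HIP/ROCm path
    #define EXAMPLE_FEATURE_HIP
    #endif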
@@ -215,9 +215,9 @@ typedef float2 dfloat2;
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
 
-#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
@@ -231,17 +231,17 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
-#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
 
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
 #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
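For context (my sketch, not from this diff): these `*_AVAILABLE` macros select device code at compile time, while host code additionally checks per-device runtime helpers such as `fp16_mma_available(cc)` and `cp_async_available(cc)` defined further down in this file. A hypothetical kernel fragment showing the common gating pattern, falling back to `NO_DEVICE_CODE` (defined elsewhere in this header) where the feature is absent:

    // Hypothetical kernel; the placeholder body stands in for an mma.sync path.
    static __global__ void example_mma_kernel(float * dst) {
    #ifdef NEW_MMA_AVAILABLE
        dst[threadIdx.x] = 0.0f; // placeholder for the tensor-core code path
    #else
        GGML_UNUSED(dst);
        NO_DEVICE_CODE;          // trap with a clear "no device code" error
    #endif
    }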
@@ -263,7 +263,7 @@ static bool fast_fp16_hardware_available(const int cc) {
 
 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
     if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -279,7 +279,7 @@ static bool fp16_mma_available(const int cc) {
     } else {
         return false;
     }
-#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -316,25 +316,25 @@ static bool cp_async_available(const int cc) {
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
     return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
            file_name, line, function_name, arch);
     GGML_UNUSED(arch_list);
 #else
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
            file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
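The constexpr warp-size helper lets kernels size their shuffle operations for the 64-wide waves of GFX8/GFX9 AMD parts versus the 32-wide default. A hedged usage sketch (the kernel is hypothetical; the helper is the one defined above):

    // Hypothetical kernel: butterfly reduction sized to the physical warp width.
    static __global__ void example_warp_sum(float * dst) {
        constexpr int warp_size = ggml_cuda_get_physical_warp_size();
        float v = dst[threadIdx.x];
    #pragma unroll
        for (int offset = warp_size/2; offset > 0; offset >>= 1) {
            v += __shfl_xor_sync(0xffffffff, v, offset, warp_size);
        }
        dst[threadIdx.x] = v; // every lane holds the warp/wave-wide sum
    }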
@@ -371,15 +371,15 @@ struct ggml_cuda_unroll<1> {
 
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
         x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
 template <int width = WARP_SIZE>
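Usage sketch for the reduction above (the block-level caller is my illustration, not in the file): each thread contributes one partial value, the warp collapses it with `warp_reduce_sum`, and one lane per warp commits the result.

    // Hypothetical caller of warp_reduce_sum from this header.
    static __global__ void example_sum(const int * in, int * out, const int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        int v = i < n ? in[i] : 0;
        v = warp_reduce_sum(v);            // every lane now holds the warp sum
        if (threadIdx.x % WARP_SIZE == 0) {
            atomicAdd(out, v);             // one atomic per warp
        }
    }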
@@ -448,11 +448,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE
 
-#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
     return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
     return __hmax(a, b);
-#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 
 #else
     NO_DEVICE_CODE;
@@ -480,7 +480,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -489,7 +489,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK
@@ -501,7 +501,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3) || defined(RDNA4)
@@ -527,7 +527,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
     return c;
 
-#else // defined(GGML_USE_HIP)
+#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
@@ -537,7 +537,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
 
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
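For verification purposes, every path of `ggml_cuda_dp4a` (AMD dot intrinsics, NVIDIA `__dp4a`, and the scalar fallback shown above) computes the same per-byte signed dot product. A host-side reference (my addition, mirroring the fallback in the diff):

    // dp4a reference: accumulate the four per-byte signed products into c.
    static int dp4a_ref(const int a, const int b, int c) {
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        for (int i = 0; i < 4; ++i) {
            c += a8[i]*b8[i];
        }
        return c;
    }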