
Commit 7b644a2

Revert "HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only AMD targets (ggml-org#14945)"
This reverts commit aa79524.
1 parent b04362f commit 7b644a2
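
For context, the change being reverted (ggml-org#14945) had collapsed guards of the form defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) down to just defined(GGML_USE_HIP); this revert restores the two-macro form throughout the diffs below. A minimal sketch of the restored guard style is shown here; EXAMPLE_FEATURE_AVAILABLE and the kernel are hypothetical names used only to illustrate the preprocessor logic, with 600 standing in for an architecture threshold such as GGML_CUDA_CC_PASCAL.

    // Hypothetical sketch of the restored guard style: the feature is compiled in
    // either when building with HIP for the AMD platform, or when the CUDA
    // architecture being compiled for is new enough. Build as a regular .cu file.
    #include <cstdio>

    #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= 600
    #define EXAMPLE_FEATURE_AVAILABLE
    #endif

    __global__ void report_feature() {
    #ifdef EXAMPLE_FEATURE_AVAILABLE
        printf("feature path compiled in\n");
    #else
        printf("fallback path compiled in\n");
    #endif
    }

    int main() {
        report_feature<<<1, 1>>>();
        cudaDeviceSynchronize();
        return 0;
    }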

10 files changed, 65 additions and 67 deletions

ggml/src/ggml-cuda/common.cuh

Lines changed: 25 additions & 25 deletions
@@ -180,7 +180,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 # define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
 do { \
 static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
@@ -195,7 +195,7 @@ static const char * cu_get_error_str(CUresult err) {
 do { \
 GGML_UNUSED(nbytes); \
 } while (0)
-#endif // !(defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)

 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -215,9 +215,9 @@ typedef float2 dfloat2;
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))

-#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
@@ -231,17 +231,17 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

-#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)

-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

 #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
@@ -263,7 +263,7 @@ static bool fast_fp16_hardware_available(const int cc) {

 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 return false;
 #else
 if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -279,7 +279,7 @@ static bool fp16_mma_available(const int cc) {
 } else {
 return false;
 }
-#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -316,25 +316,25 @@ static bool cp_async_available(const int cc) {
 }

 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
 return 64;
 #else
 return 32;
-#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
 }

 [[noreturn]]
 static __device__ void no_device_code(
 const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
 file_name, line, function_name, arch);
 GGML_UNUSED(arch_list);
 #else
 printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
 file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 __trap();

 GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -371,15 +371,15 @@ struct ggml_cuda_unroll<1> {

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
 for (int offset = width/2; offset > 0; offset >>= 1) {
 x += __shfl_xor_sync(0xffffffff, x, offset, width);
 }
 return x;
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

 template<int width = WARP_SIZE>
@@ -448,11 +448,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE

-#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
 return __hmax(a, b);
-#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX

 #else
 NO_DEVICE_CODE;
@@ -480,7 +480,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
 for (int offset = width/2; offset > 0; offset >>= 1) {
 x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -489,7 +489,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
 GGML_UNUSED(x);
 NO_DEVICE_CODE;
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }

 #if CUDART_VERSION < CUDART_HMASK
@@ -501,7 +501,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK

 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
 c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3) || defined(RDNA4)
@@ -527,7 +527,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
 return c;

-#else // defined(GGML_USE_HIP)
+#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
 return __dp4a(a, b, c);
@@ -537,7 +537,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)

-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }

 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
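
The warp-size and warp-reduction helpers touched above are typically consumed by reduction code along the following lines. This is a hypothetical, CUDA-only usage sketch (not code from this commit): it repeats the same shuffle-based butterfly pattern as warp_reduce_sum() in the hunk above with a fixed width of 32; on HIP for AMD GFX8/GFX9 the helpers above select a 64-wide wave instead.

    #include <cstdio>

    // Same butterfly-shuffle reduction pattern as in the hunk above, width fixed to 32.
    template<int width = 32>
    static __device__ __forceinline__ int warp_reduce_sum(int x) {
    #pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, width);
        }
        return x; // every lane now holds the warp-wide total
    }

    __global__ void sum32(const int * in, int * out) {
        const int total = warp_reduce_sum(in[threadIdx.x]);
        if (threadIdx.x == 0) {
            *out = total;
        }
    }

    int main() {
        int h_in[32], h_out = 0;
        for (int i = 0; i < 32; ++i) h_in[i] = i; // expected sum: 496
        int *d_in, *d_out;
        cudaMalloc(&d_in,  sizeof(h_in));
        cudaMalloc(&d_out, sizeof(int));
        cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
        sum32<<<1, 32>>>(d_in, d_out);
        cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
        printf("sum = %d\n", h_out); // prints 496
        cudaFree(d_in);
        cudaFree(d_out);
        return 0;
    }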

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 2 additions & 2 deletions
@@ -592,9 +592,9 @@ static __global__ void flash_attn_stream_k_fixup(
 }

 template<int D> // D == head size
-#if !defined(GGML_USE_HIP)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
 const float * __restrict__ VKQ_parts,
 const float2 * __restrict__ VKQ_meta,

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 4 additions & 4 deletions
@@ -1391,24 +1391,24 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
 constexpr bool use_logit_softcap = false;
 fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
 if (!shared_memory_limit_raised[id]) {
 CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
 shared_memory_limit_raised[id] = true;
 }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 } else {
 constexpr bool use_logit_softcap = true;
 fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
 if (!shared_memory_limit_raised[id]) {
 CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
 shared_memory_limit_raised[id] = true;
 }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 }

 launch_fattn<DV, ncols1, ncols2>
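
The code kept by the guard above is the standard CUDA mechanism for letting a kernel use more dynamic shared memory than the default per-block limit: call cudaFuncSetAttribute with cudaFuncAttributeMaxDynamicSharedMemorySize once, then launch with the larger dynamic allocation. A minimal standalone sketch of that pattern follows; the kernel and sizes are hypothetical, not the ggml flash-attention code, and assume a GPU that supports the opt-in limit (Volta or newer).

    #include <cstdio>

    __global__ void uses_big_smem(float * out) {
        extern __shared__ float buf[]; // dynamic shared memory
        buf[threadIdx.x] = (float) threadIdx.x;
        __syncthreads();
        if (threadIdx.x == 0) {
            out[0] = buf[blockDim.x - 1];
        }
    }

    int main() {
        // 64 KiB exceeds the 48 KiB default on most architectures, so the limit
        // must be raised first; this is the same call the fattn code above makes
        // behind its shared_memory_limit_raised[] flag.
        const int nbytes = 64 * 1024;
        cudaFuncSetAttribute(uses_big_smem, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes);

        float * d_out = nullptr;
        cudaMalloc(&d_out, sizeof(float));
        uses_big_smem<<<1, 256, nbytes>>>(d_out);

        float h_out = 0.0f;
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("last value written: %.1f\n", h_out); // prints 255.0
        cudaFree(d_out);
        return 0;
    }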

ggml/src/ggml-cuda/fattn-tile-f16.cu

Lines changed: 2 additions & 2 deletions
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F16 64

 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !defined(GGML_USE_HIP)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !defined(GGML_USE_HIP)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_tile_ext_f16(
 const char * __restrict__ Q,
 const char * __restrict__ K,

ggml/src/ggml-cuda/fattn-tile-f32.cu

Lines changed: 2 additions & 2 deletions
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F32 32

 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !defined(GGML_USE_HIP)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !defined(GGML_USE_HIP)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_tile_ext_f32(
 const char * __restrict__ Q,
 const char * __restrict__ K,

ggml/src/ggml-cuda/fattn-wmma-f16.cu

Lines changed: 4 additions & 4 deletions
@@ -7,7 +7,7 @@
 #include "fattn-wmma-f16.cuh"

 #ifdef FP16_MMA_AVAILABLE
-#if !defined(GGML_USE_HIP)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 #include <mma.h>
 #ifdef GGML_USE_MUSA
 namespace wmma = mtmusa::wmma;
@@ -18,7 +18,7 @@ namespace wmma = nvcuda::wmma;
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
 namespace wmma = rocwmma;
-#endif // !defined(GGML_USE_HIP)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 #endif // FP16_MMA_AVAILABLE

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
@@ -546,7 +546,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
 return;
 }

-#if !defined(GGML_USE_HIP)
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
 constexpr int cols_per_block = 8;
 switch (Q->ne[0]) {
@@ -568,7 +568,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
 }
 return;
 }
-#endif // !defined(GGML_USE_HIP)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))

 if (Q->ne[1] <= 32) {
 constexpr int cols_per_block = 16;

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 5 deletions
@@ -130,7 +130,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 return err;
 }

-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 static int ggml_cuda_parse_id(char devName[]) {
 // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
 // these values are not stable so this is susceptible to breakage
@@ -177,10 +177,10 @@ static int ggml_cuda_parse_id(char devName[]) {
 archNum += archMinor;
 return archNum;
 }
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

 static ggml_cuda_device_info ggml_cuda_init() {
-#if defined(GGML_USE_HIP)
+#ifdef __HIP_PLATFORM_AMD__
 // Workaround for a rocBLAS bug when using multiple graphics cards:
 // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
 {
@@ -254,7 +254,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 info.devices[id].nsm = prop.multiProcessorCount;
 info.devices[id].smpb = prop.sharedMemPerBlock;
 info.devices[id].warp_size = prop.warpSize;
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 info.devices[id].smpbo = prop.sharedMemPerBlock;

 info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
@@ -284,7 +284,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 info.devices[id].cc = 100*prop.major + 10*prop.minor;
 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }

 for (int id = 0; id < info.device_count; ++id) {
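
On the NVIDIA branch kept by this guard, the compute capability is packed as 100*prop.major + 10*prop.minor (so, for example, an 8.6 device becomes cc 860). A small standalone query illustrating that packing (hypothetical example, not ggml code):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        int device_count = 0;
        cudaGetDeviceCount(&device_count);
        for (int id = 0; id < device_count; ++id) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, id);
            const int cc = 100*prop.major + 10*prop.minor; // same packing as the hunk above
            printf("Device %d: %s, compute capability %d.%d (cc = %d), warp size %d\n",
                   id, prop.name, prop.major, prop.minor, cc, prop.warpSize);
        }
        return 0;
    }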

ggml/src/ggml-cuda/mma.cuh

Lines changed: 2 additions & 2 deletions
@@ -68,7 +68,7 @@ namespace ggml_cuda_mma {
 static constexpr int I = I_;
 static constexpr int J = J_;

-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 static constexpr int ne = I * J / 64;
 T x[ne] = {0};

@@ -132,7 +132,7 @@ namespace ggml_cuda_mma {
 static_assert(I == -1 && J == -1, "template specialization not implemented");
 }
 }
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 };

 template <int I_, int J_>
