diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 2e5d48797fa49..6f43fd9affef5 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -376,11 +376,11 @@ struct ggml_cuda_unroll<1> { template <int width = WARP_SIZE> static __device__ __forceinline__ int warp_reduce_sum(int x) { #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE - return __reduce_add_sync(0xffffffff, x); + return __reduce_add_sync(0xFFFFFFFF, x); #else #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, offset, width); + x += __shfl_xor_sync(0xFFFFFFFF, x, offset, width); } return x; #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE @@ -390,7 +390,7 @@ template <int width = WARP_SIZE> static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, offset, width); + x += __shfl_xor_sync(0xFFFFFFFF, x, offset, width); } return x; } @@ -399,8 +399,8 @@ template <int width = WARP_SIZE> static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { - a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width); - a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width); + a.x += __shfl_xor_sync(0xFFFFFFFF, a.x, offset, width); + a.y += __shfl_xor_sync(0xFFFFFFFF, a.y, offset, width); } return a; } @@ -410,7 +410,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #ifdef FP16_AVAILABLE #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width)); + a = __hadd2(a, __shfl_xor_sync(0xFFFFFFFF, a, offset, width)); } return a; @@ -445,12 +445,12 @@ static __device__ __forceinline__ int warp_reduce_all(int x) { #ifdef GGML_USE_HIP #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { - x = x && __shfl_xor_sync(0xffffffff, x, offset, width); + x = x && __shfl_xor_sync(0xFFFFFFFF, x, offset, width); }
return x; #else static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented"); - return __all_sync(0xffffffff, x); + return __all_sync(0xFFFFFFFF, x); #endif // GGML_USE_HIP } @@ -458,7 +458,7 @@ template <int width = WARP_SIZE> static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { - x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width)); + x = fmaxf(x, __shfl_xor_sync(0xFFFFFFFF, x, offset, width)); } return x; } @@ -501,7 +501,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000) #pragma unroll for (int offset = width/2; offset > 0; offset >>= 1) { - x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width)); + x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xFFFFFFFF, x, offset, width)); } return x; #else diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index fdc4d17da2da9..e390b7d733b51 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -15,7 +15,11 @@ namespace wmma = mtmusa::wmma; namespace wmma = nvcuda::wmma; #endif // GGML_USE_MUSA #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE) -#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers +#if HIP_VERSION >= 60500000 +#define HIP_DISABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers for ROCm 6.5+ +#else +#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers before ROCm 6.5 +#endif // HIP_VERSION >= 60500000 #include <rocwmma/rocwmma.hpp> namespace wmma = rocwmma; #endif // !defined(GGML_USE_HIP) diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index c31f319232252..ea13371fc7a90 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -137,7 +137,7 @@
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#if HIP_VERSION >= 70000000 +#if HIP_VERSION >= 60500000 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F @@ -149,7 +149,7 @@ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define cublasComputeType_t hipblasDatatype_t #define cudaDataType_t hipblasDatatype_t -#endif // HIP_VERSION >= 7000000 +#endif // HIP_VERSION >= 60500000 #if !defined(__HIP_PLATFORM_AMD__) #error "The HIP backend supports only AMD targets"