@@ -190,64 +190,58 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE // GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
     }
     return a;
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
-
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
-        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
-    }
-    return a;
-#else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
     }
     return a;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #else
     NO_DEVICE_CODE;
     return a;
 #endif // FP16_AVAILABLE
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 }
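The new `width` template parameter defaults to WARP_SIZE, so existing call sites keep the full-warp behavior, while a smaller power-of-two value restricts the XOR-shuffle butterfly to independent lane groups. A minimal usage sketch (hypothetical kernel, not part of this patch), assuming WARP_SIZE == 32:

// Hypothetical example: each 16-lane group computes its own partial sum.
// group_sum_kernel and its parameters are illustrative, not from the patch.
template <int group_size>
__global__ void group_sum_kernel(const float * x, float * out, const int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    float v = i < n ? x[i] : 0.0f;                 // out-of-range lanes contribute 0
    v = warp_reduce_sum<group_size>(v);            // butterfly reduction over group_size lanes
    if (i < n && threadIdx.x % group_size == 0) {
        out[i/group_size] = v;                     // lane 0 of each group writes its sum
    }
}
// launch: group_sum_kernel<16><<<(n + 255)/256, 256>>>(x, out, n);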
@@ -269,35 +263,34 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }
 
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
     half2 ret;
     reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
     reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
 #else
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
-   for (int offset = 16; offset > 0; offset >>= 1) {
-       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+   for (int offset = width/2; offset > 0; offset >>= 1) {
+       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
 #else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK
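With ggml_cuda_hmax2 now built from per-component __hmax on HIP >= 5.7, the templated half2 max reduction is usable on AMD as well as NVIDIA. A small sketch (illustrative only, not from the patch; assumes FP16_AVAILABLE and WARP_SIZE == 32):

// Hypothetical helper: every 8-lane group ends up with its group maximum
// replicated in all of its lanes, e.g. as a local max before a softmax.
// row_max_f16 is an illustrative name, not part of this patch.
static __device__ __forceinline__ half2 row_max_f16(half2 v) {
    return warp_reduce_max<8>(v); // pairwise max via ggml_cuda_hmax2 over 8 lanes
}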