@@ -592,42 +592,17 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
592592#endif  //  defined(GGML_USE_HIP) && (defined(RDNA2)  || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
593593}
594594
595- static  __device__  __forceinline__  void  ggml_cuda_mad (half2 & acc, const  half2 v, const  half2 u) {
596- #ifdef  FAST_FP16_AVAILABLE
597-     acc += v*u;
598- #else 
599-     const  float2  tmpv = __half22float2 (v);
600-     const  float2  tmpu = __half22float2 (u);
601-     float2  tmpacc = __half22float2 (acc);
602-     tmpacc.x  += tmpv.x  * tmpu.x ;
603-     tmpacc.y  += tmpv.y  * tmpu.y ;
604-     acc = make_half2 (tmpacc.x , tmpacc.y );
605- #endif  //  FAST_FP16_AVAILABLE
606- }
607- 
608595//  Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
609- template  <int  nbytes,  int  alignment =  0 >
596+ template  <int  nbytes>
610597static  __device__  __forceinline__  void  ggml_cuda_memcpy_1 (void  * __restrict__  dst, const  void  * __restrict__  src) {
611-     if  constexpr  (alignment != 0 ) {
612-         static_assert (nbytes % alignment == 0 , " bad alignment" );
613-     }
614-     constexpr  int  nb_per_cpy = alignment == 0  ? nbytes : alignment;
615- 
616- #pragma  unroll
617-     for  (int  i = 0 ; i < nbytes/nb_per_cpy; ++i) {
618-         if  constexpr  (nb_per_cpy == 1 ) {
619-             ((char  *) dst)[i] = ((const  char  *) src)[i];
620-         } else  if  constexpr  (nb_per_cpy == 2 ) {
621-             ((short  *) dst)[i] = ((const  short  *) src)[i];
622-         } else  if  constexpr  (nb_per_cpy == 4 ) {
623-             ((int  *) dst)[i] = ((const  int  *) src)[i];
624-         } else  if  constexpr  (nb_per_cpy == 8 ) {
625-             ((int2  *) dst)[i] = ((const  int2  *) src)[i];
626-         } else  if  constexpr  (nb_per_cpy == 16 ) {
627-             ((int4  *) dst)[i] = ((const  int4  *) src)[i];
628-         } else  {
629-             static_assert (nbytes == 0  && nbytes == -1 , " bad nbytes" );
630-         }
598+     if  constexpr  (nbytes == 4 ) {
599+         *(int  *) dst = *(const  int  *) src;
600+     } else  if  constexpr  (nbytes == 8 ) {
601+         *(int2  *) dst = *(const  int2  *) src;
602+     } else  if  constexpr  (nbytes == 16 ) {
603+         *(int4  *) dst = *(const  int4  *) src;
604+     } else  {
605+         static_assert (nbytes == 0  && nbytes == -1 , " bad nbytes" );
631606    }
632607}
633608
0 commit comments