@@ -563,42 +563,17 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
563563#endif  //  defined(GGML_USE_HIP) && (defined(RDNA2)  || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
564564}
565565
566- static  __device__  __forceinline__  void  ggml_cuda_mad (half2 & acc, const  half2 v, const  half2 u) {
567- #ifdef  FAST_FP16_AVAILABLE
568-     acc += v*u;
569- #else 
570-     const  float2  tmpv = __half22float2 (v);
571-     const  float2  tmpu = __half22float2 (u);
572-     float2  tmpacc = __half22float2 (acc);
573-     tmpacc.x  += tmpv.x  * tmpu.x ;
574-     tmpacc.y  += tmpv.y  * tmpu.y ;
575-     acc = make_half2 (tmpacc.x , tmpacc.y );
576- #endif  //  FAST_FP16_AVAILABLE
577- }
578- 
579566//  Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
580- template  <int  nbytes,  int  alignment =  0 >
567+ template  <int  nbytes>
581568static  __device__  __forceinline__  void  ggml_cuda_memcpy_1 (void  * __restrict__  dst, const  void  * __restrict__  src) {
582-     if  constexpr  (alignment != 0 ) {
583-         static_assert (nbytes % alignment == 0 , " bad alignment" 
584-     }
585-     constexpr  int  nb_per_cpy = alignment == 0  ? nbytes : alignment;
586- 
587- #pragma  unroll
588-     for  (int  i = 0 ; i < nbytes/nb_per_cpy; ++i) {
589-         if  constexpr  (nb_per_cpy == 1 ) {
590-             ((char  *) dst)[i] = ((const  char  *) src)[i];
591-         } else  if  constexpr  (nb_per_cpy == 2 ) {
592-             ((short  *) dst)[i] = ((const  short  *) src)[i];
593-         } else  if  constexpr  (nb_per_cpy == 4 ) {
594-             ((int  *) dst)[i] = ((const  int  *) src)[i];
595-         } else  if  constexpr  (nb_per_cpy == 8 ) {
596-             ((int2  *) dst)[i] = ((const  int2  *) src)[i];
597-         } else  if  constexpr  (nb_per_cpy == 16 ) {
598-             ((int4  *) dst)[i] = ((const  int4  *) src)[i];
599-         } else  {
600-             static_assert (nbytes == 0  && nbytes == -1 , " bad nbytes" 
601-         }
569+     if  constexpr  (nbytes == 4 ) {
570+         *(int  *) dst = *(const  int  *) src;
571+     } else  if  constexpr  (nbytes == 8 ) {
572+         *(int2  *) dst = *(const  int2  *) src;
573+     } else  if  constexpr  (nbytes == 16 ) {
574+         *(int4  *) dst = *(const  int4  *) src;
575+     } else  {
576+         static_assert (nbytes == 0  && nbytes == -1 , " bad nbytes" 
602577    }
603578}
604579
0 commit comments