Skip to content

Commit adc3e30

Browse files
committed
Revert "CUDA: refactor and deduplicate vector FA kernels (ggml-org#16208)"
This reverts commit 75a3a6c.
1 parent 408c78e commit adc3e30

File tree

129 files changed

+1902
-1307
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

129 files changed

+1902
-1307
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -563,42 +563,17 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
563563
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
564564
}
565565

566-
static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
567-
#ifdef FAST_FP16_AVAILABLE
568-
acc += v*u;
569-
#else
570-
const float2 tmpv = __half22float2(v);
571-
const float2 tmpu = __half22float2(u);
572-
float2 tmpacc = __half22float2(acc);
573-
tmpacc.x += tmpv.x * tmpu.x;
574-
tmpacc.y += tmpv.y * tmpu.y;
575-
acc = make_half2(tmpacc.x, tmpacc.y);
576-
#endif // FAST_FP16_AVAILABLE
577-
}
578-
579566
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
580-
template <int nbytes, int alignment = 0>
567+
template <int nbytes>
581568
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
582-
if constexpr (alignment != 0) {
583-
static_assert(nbytes % alignment == 0, "bad alignment");
584-
}
585-
constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;
586-
587-
#pragma unroll
588-
for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
589-
if constexpr (nb_per_cpy == 1) {
590-
((char *) dst)[i] = ((const char *) src)[i];
591-
} else if constexpr (nb_per_cpy == 2) {
592-
((short *) dst)[i] = ((const short *) src)[i];
593-
} else if constexpr (nb_per_cpy == 4) {
594-
((int *) dst)[i] = ((const int *) src)[i];
595-
} else if constexpr (nb_per_cpy == 8) {
596-
((int2 *) dst)[i] = ((const int2 *) src)[i];
597-
} else if constexpr (nb_per_cpy == 16) {
598-
((int4 *) dst)[i] = ((const int4 *) src)[i];
599-
} else {
600-
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
601-
}
569+
if constexpr (nbytes == 4) {
570+
*(int *) dst = *(const int *) src;
571+
} else if constexpr (nbytes == 8) {
572+
*(int2 *) dst = *(const int2 *) src;
573+
} else if constexpr (nbytes == 16) {
574+
*(int4 *) dst = *(const int4 *) src;
575+
} else {
576+
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
602577
}
603578
}
604579

0 commit comments

Comments
 (0)