Commit 094a52d

Revert "CUDA: faster tile FA, add oob checks, more HSs (ggml-org#16492)"
1 parent 412021b · commit 094a52d

15 files changed: +786 -1353 lines

ggml/src/ggml-cuda/common.cuh

Lines changed: 2 additions & 6 deletions
@@ -250,8 +250,8 @@ static bool fp16_available(const int cc) {
 }
 
 static bool fast_fp16_available(const int cc) {
-    return GGML_CUDA_CC_IS_AMD(cc) ||
-        (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc > 610);
+    return fp16_available(cc) && cc > 610;
+    // return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); LCPP
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
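
For reference, a brief sketch of how the boundary in the restored predicate plays out; this is hypothetical usage, not code from the commit. In ggml-cuda, NVIDIA compute capability X.Y is encoded as 100*X + 10*Y, so cc 610 is compute 6.1 (GTX 10xx / P40), which exposes FP16 but runs it far slower than FP32, hence the cc > 610 exclusion. The result also depends on fp16_available(), i.e. on which CUDA architectures were compiled in.

// Hypothetical illustration only (not part of this diff): exercising the
// predicate on ggml-cuda architecture codes. Assumes the corresponding
// architectures were compiled in, since fp16_available() depends on that.
static void check_fast_fp16_boundary() {
    GGML_ASSERT(!fast_fp16_available(610)); // compute 6.1: FP16 supported but much slower than FP32
    GGML_ASSERT( fast_fp16_available(700)); // compute 7.0 (Volta) and newer: fast FP16 path
}
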
@@ -577,10 +577,6 @@ static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v,
 }
 
 // Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
-// Important: do not use this function if dst and src both point at registers.
-// Due to the strict aliasing rule the compiler can do incorrect optimizations if src and dst have different types.
-// The function is intended for copies between registers and SRAM/VRAM to make the compiler emit the right instructions.
-// If dst and src point at different address spaces then they are guaranteed to not be aliased.
 template <int nbytes, int alignment = 0>
 static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
     if constexpr (alignment != 0) {
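
The four comment lines removed above documented the strict-aliasing caveat for ggml_cuda_memcpy_1. As a rough sketch of the idea behind such a helper (an illustration under assumptions, not the actual body in common.cuh): copying a compile-time byte count through the widest type the stated alignment allows lets the compiler emit single 8- or 16-byte transfers instead of several 4-byte ones.

// Sketch only: one possible shape for an aligned copy between registers and
// shared/global memory. The chunk type is chosen from the alignment so the
// compiler can emit 8/16-byte load/store instructions. Copying through int*
// types is exactly the kind of type punning the removed comment warns about
// when both pointers refer to registers.
template <int nbytes, int alignment>
static __device__ __forceinline__ void memcpy_aligned_sketch(void * __restrict__ dst, const void * __restrict__ src) {
    static_assert(nbytes % alignment == 0, "nbytes must be a multiple of alignment");
    if constexpr (alignment == 16) {
#pragma unroll
        for (int i = 0; i < nbytes/16; ++i) {
            ((int4 *) dst)[i] = ((const int4 *) src)[i];
        }
    } else if constexpr (alignment == 8) {
#pragma unroll
        for (int i = 0; i < nbytes/8; ++i) {
            ((int2 *) dst)[i] = ((const int2 *) src)[i];
        }
    } else {
        static_assert(alignment == 4, "unsupported alignment in this sketch");
#pragma unroll
        for (int i = 0; i < nbytes/4; ++i) {
            ((int *) dst)[i] = ((const int *) src)[i];
        }
    }
}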

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 6 additions & 3 deletions
@@ -793,6 +793,8 @@ void launch_fattn(
     GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
         "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
 
+    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
+
     ggml_cuda_pool & pool = ctx.pool();
     cudaStream_t main_stream = ctx.stream();
     const int id = ggml_cuda_get_device();
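
The added assert makes KV-cache padding a hard precondition of launch_fattn rather than a condition that merely disabled the mask-scan optimization in the next hunk. A minimal sketch of the rounding the caller is expected to apply, with a hypothetical length and ggml's GGML_PAD macro:

// Illustration only, hypothetical length: the KV length K->ne[1] passed to
// launch_fattn() must be a multiple of FATTN_KQ_STRIDE, i.e. the KV cache is
// expected to be padded. GGML_PAD(x, n) rounds x up to a multiple of n.
const int64_t n_kv        = 1000;                             // unpadded KV length (example value)
const int64_t n_kv_padded = GGML_PAD(n_kv, FATTN_KQ_STRIDE);  // rounded up to the tile stride
GGML_ASSERT(n_kv_padded % FATTN_KQ_STRIDE == 0);              // the precondition added above now holds
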
@@ -876,7 +878,7 @@ void launch_fattn(
     // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
     // Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
     // multiple sequences of possibly different lengths.
-    if (mask && K->ne[1] % FATTN_KQ_STRIDE == 0 && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
+    if (mask && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
         const int s31 = mask->nb[1] / sizeof(half2);
         const int s33 = mask->nb[3] / sizeof(half2);
 
@@ -914,7 +916,8 @@ void launch_fattn(
 
         dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
     } else {
-        const int ntiles_KQ = (K->ne[1] + KQ_row_granularity - 1) / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
+        GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
+        const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
 
         // parallel_blocks must not be larger than what the tensor size allows:
         parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
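
Given the divisibility assert, the exact division restored here and the ceil-division it replaces yield the same tile count; they only differ for KV lengths that the asserts reject anyway. A small worked check with hypothetical values:

// Illustration only, hypothetical values: with ne1 a multiple of the
// granularity, ceil-division and exact division agree.
const int64_t ne1  = 4096;                      // stand-in for K->ne[1] (already padded)
const int     gran = 128;                       // stand-in for KQ_row_granularity
const int ntiles_ceil  = (ne1 + gran - 1)/gran; // removed formulation -> 32
const int ntiles_exact = ne1/gran;              // restored formulation -> 32
GGML_ASSERT(ntiles_ceil == ntiles_exact);       // holds whenever ne1 % gran == 0
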
@@ -943,7 +946,7 @@ void launch_fattn(
 
     blocks_num.x = ntiles_x;
     blocks_num.y = parallel_blocks;
-    blocks_num.z = (Q->ne[2]/ncols2)*Q->ne[3];
+    blocks_num.z = Q->ne[2]*Q->ne[3];
 
     if (parallel_blocks > 1) {
         dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
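
For context on the last change, a small arithmetic illustration with hypothetical sizes (not code from the diff): the removed form launched one block in grid.z per group of ncols2 slices of Q's dimension 2, while the restored form launches one block per slice of dimensions 2 and 3.

// Illustration only, hypothetical sizes: grid.z block counts before/after the revert.
const int64_t ne2    = 32; // stand-in for Q->ne[2]
const int64_t ne3    = 4;  // stand-in for Q->ne[3]
const int     ncols2 = 4;  // per-block grouping factor used by the reverted kernels

const int64_t grid_z_removed  = (ne2/ncols2)*ne3; // 32 blocks: ncols2 slices of dim 2 share a block
const int64_t grid_z_restored = ne2*ne3;          // 128 blocks: one block per (dim 2, dim 3) pair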
