Skip to content

Commit 2a6fe6e

Browse files
committed
Revert "CUDA: use mma FA kernel for gqa > 4 on RTX 4000 (ggml-org#15035)"
This reverts commit 03d4698.
1 parent ddbba17 commit 2a6fe6e

File tree

1 file changed

+2
-3
lines changed

1 file changed

+2
-3
lines changed

ggml/src/ggml-cuda/fattn.cu

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -315,9 +315,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
315 315

316 316
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
317 317
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
318-
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192);
319-
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
320-
(cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
318+
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies &&
319+
(Q->ne[3] > 1 || cc < GGML_CUDA_CC_ADA_LOVELACE) && !mma_needs_data_conversion;
321 320
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
322 321
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
323 322
if (prec == GGML_PREC_DEFAULT) {

0 commit comments

Comments
 (0)