Commit ce71aba

Update ggml/src/ggml-cuda/fattn.cu
Co-authored-by: Johannes Gäßler <[email protected]>
1 parent: f7d07dd

1 file changed: 1 addition, 1 deletion

ggml/src/ggml-cuda/fattn.cu

Lines changed: 1 addition & 1 deletion
@@ -299,7 +299,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
     const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
     const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
-    const bool can_use_vector_kernel = (Q->ne[0] % (2*warp_size) == 0) && (Q->ne[0] <= 256);
+    const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0;
     if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
         if (prec == GGML_PREC_DEFAULT) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
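The change removes the old upper bound of 256 on the head size (Q->ne[0]) and keeps only the requirement that it be a multiple of 2*warp_size. Below is a minimal standalone sketch comparing the predicate before and after this commit, assuming a warp size of 32 and an illustrative list of head sizes (neither the warp size constant nor the head-size list is taken from the commit):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t warp_size = 32; // assumption: typical NVIDIA warp size

    // Illustrative head sizes (Q->ne[0]); chosen for demonstration only.
    const int64_t head_sizes[] = {64, 96, 128, 256, 576};

    for (const int64_t ne0 : head_sizes) {
        const bool before = (ne0 % (2*warp_size) == 0) && (ne0 <= 256); // predicate before this commit
        const bool after  =  ne0 % (2*warp_size) == 0;                  // predicate after this commit
        std::printf("head size %4lld: before=%d after=%d\n", (long long) ne0, before, after);
    }
    return 0;
}

With these sample values, a head size such as 576 is rejected by the old predicate (it exceeds 256) but accepted by the new one, since it is still a multiple of 2*warp_size.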
