
Commit ef49b83

Merge branch 'ikawrakow:main' into main
2 parents 3a55c53 + c029d97 commit ef49b83

16 files changed: +223 / -33 lines

examples/server/utils.hpp

Lines changed: 11 additions & 0 deletions
@@ -591,6 +591,17 @@ static json oaicompat_chat_params_parse(
 
     // Apply chat template to the list of messages
     auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
+
+    /* Append assistant prefilled message */
+    if (prefill_assistant_message) {
+        if (!last_message.content_parts.empty()) {
+            for (auto & p : last_message.content_parts) {
+                chat_params.prompt += p.text;
+            }
+        } else {
+            chat_params.prompt += last_message.content;
+        }
+    }
 
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;

ggml/src/ggml-cuda.cu

Lines changed: 1 addition & 25 deletions
@@ -4386,31 +4386,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
             return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
 #else
-            if (op->src[0]->ne[0] == 128) {
-                return true;
-            }
-            if (op->src[1]->ne[0] == 256 && op->src[2]->ne[0] == 256 &&
-                (op->src[1]->type == GGML_TYPE_F16 || op->src[1]->type == GGML_TYPE_Q8_0) &&
-                (op->src[2]->type == GGML_TYPE_F16 || op->src[2]->type == GGML_TYPE_Q8_0)) {
-                return true;
-            }
-            if (op->src[1]->ne[0] == 192 && op->src[2]->ne[0] == 128) {
-                return (op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) ||
-                       (op->src[1]->type == GGML_TYPE_Q8_0 && op->src[2]->type == GGML_TYPE_Q8_0);
-            }
-            if (op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512) {
-                const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc;
-                int gqa = op->src[0]->ne[2]/op->src[1]->ne[2];
-                return (new_mma_available(cc) && cc >= CC_AMPERE && op->src[3] && gqa%16 == 0);
-            }
-            if (op->src[1]->ne[0] > 256) {
-                return false;
-            }
-            if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
-                return true;
-            }
-            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
-                op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+            return ggml_cuda_fattn_is_supported(*cuda_ctx, op);
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         default:
             return false;
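
The definition of ggml_cuda_fattn_is_supported is not part of the hunks shown here; the per-kernel *_is_supported helpers added in the files below suggest it forwards to them. A rough sketch under that assumption (the real routine presumably also covers the vec-f32 path and the 576/512 head-size case handled by the removed code):

// Hypothetical composition of the per-kernel checks declared in this commit.
bool ggml_cuda_fattn_is_supported(ggml_backend_cuda_context & ctx, const ggml_tensor * op) {
    return ggml_cuda_fattn_mma_f16_is_supported (ctx, op) ||
           ggml_cuda_fattn_tile_f16_is_supported(ctx, op) ||
           ggml_cuda_fattn_tile_f32_is_supported(ctx, op) ||
           ggml_cuda_fattn_vec_f16_is_supported (ctx, op);
}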

ggml/src/ggml-cuda/fattn-mma-f16-interface.cuh

Lines changed: 2 additions & 0 deletions
@@ -3,3 +3,5 @@
 #include "common.cuh"
 
 void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+bool ggml_cuda_fattn_mma_f16_is_supported(ggml_backend_cuda_context & ctx, const ggml_tensor * dst);

ggml/src/ggml-cuda/fattn-mma-f16.cu

Lines changed: 7 additions & 0 deletions
@@ -83,3 +83,10 @@ void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tens
 
     ggml_cuda_flash_attn_ext_mma_f16_switch_hs<1>(ctx, dst);
 }
+
+bool ggml_cuda_fattn_mma_f16_is_supported([[maybe_unused]] ggml_backend_cuda_context & ctx, const ggml_tensor * dst) {
+    auto K = dst->src[1];
+    auto V = dst->src[2];
+    if (K->ne[0] != V->ne[0]) return false;
+    return K->ne[0] == 64 || K->ne[0] == 80 || K->ne[0] == 96 || K->ne[0] == 112 || K->ne[0] == 128 || K->ne[0] == 256;
+}

ggml/src/ggml-cuda/fattn-tile-f16.cu

Lines changed: 7 additions & 0 deletions
@@ -353,3 +353,10 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
         launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
     }
 }
+
+bool ggml_cuda_fattn_tile_f16_is_supported([[maybe_unused]] ggml_backend_cuda_context & ctx, const ggml_tensor * dst) {
+    auto K = dst->src[1];
+    auto V = dst->src[2];
+    if (K->ne[0] != V->ne[0]) return false;
+    return K->ne[0] == 64 || K->ne[0] == 128;
+}

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
 #include "common.cuh"
 
 void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+bool ggml_cuda_fattn_tile_f16_is_supported(ggml_backend_cuda_context & ctx, const ggml_tensor * dst);

ggml/src/ggml-cuda/fattn-tile-f32.cu

Lines changed: 7 additions & 0 deletions
@@ -352,3 +352,10 @@ void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_ten
         launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, true>(ctx, dst);
     }
 }
+
+bool ggml_cuda_fattn_tile_f32_is_supported([[maybe_unused]] ggml_backend_cuda_context & ctx, const ggml_tensor * dst) {
+    auto K = dst->src[1];
+    auto V = dst->src[2];
+    if (K->ne[0] != V->ne[0]) return false;
+    return K->ne[0] == 64 || K->ne[0] == 128;
+}

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
 #include "common.cuh"
 
 void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+bool ggml_cuda_fattn_tile_f32_is_supported(ggml_backend_cuda_context & ctx, const ggml_tensor * dst);

ggml/src/ggml-cuda/fattn-vec-f16-interface.cuh

Lines changed: 2 additions & 0 deletions
@@ -3,3 +3,5 @@
 #include "common.cuh"
 
 void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+bool ggml_cuda_fattn_vec_f16_is_supported(ggml_backend_cuda_context & ctx, const ggml_tensor * dst);

ggml/src/ggml-cuda/fattn-vec-f16.cu

Lines changed: 47 additions & 1 deletion
@@ -102,4 +102,50 @@ void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tens
     on_no_fattn_vec_case(Q->ne[0], V->ne[0]);
 }
 
-
+bool ggml_cuda_fattn_vec_f16_is_supported([[maybe_unused]] ggml_backend_cuda_context & ctx, const ggml_tensor * dst) {
+    auto K = dst->src[1];
+    auto V = dst->src[2];
+    if (K->ne[0] != V->ne[0]) {
+        if (K->ne[0] != 192 || V->ne[0] != 128) return false;
+        if (K->type != V->type) return false;
+        return K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q8_0;
+    }
+#ifdef GGML_CUDA_FA_ALL_QUANTS
+    if (K->ne[0] == 64) {
+        return K->type == GGML_TYPE_F16 &&
+              (V->type == GGML_TYPE_F16 || V->type == GGML_TYPE_Q4_0 || V->type == GGML_TYPE_Q4_1 ||
+               V->type == GGML_TYPE_Q5_0 || V->type == GGML_TYPE_Q5_1 || V->type == GGML_TYPE_Q8_0);
+    }
+    if (K->ne[0] == 256) {
+        return K->type == V->type && (K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q8_0);
+    }
+    if (K->ne[0] != 128 || V->ne[0] != 128) return false;
+    if ((K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q4_1 || K->type == GGML_TYPE_Q5_0 || K->type == GGML_TYPE_Q5_1 ||
+         K->type == GGML_TYPE_Q8_0 || K->type == GGML_TYPE_F16) &&
+        (V->type == GGML_TYPE_Q4_0 || V->type == GGML_TYPE_Q4_1 || V->type == GGML_TYPE_Q5_0 || V->type == GGML_TYPE_Q5_1 ||
+         V->type == GGML_TYPE_Q8_0 || V->type == GGML_TYPE_F16)) return true;
+    return (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_IQ4_NL) ||
+           (K->type == GGML_TYPE_Q6_0 && V->type == GGML_TYPE_Q5_0) ||
+           (K->type == GGML_TYPE_Q6_0 && V->type == GGML_TYPE_Q6_0) ||
+           (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_Q6_0) ||
+           (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_IQ4_NL);
+#else
+    if (K->ne[0] == 128) {
+        if (K->type == V->type) {
+            return K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0 || K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_IQ4_NL;
+        }
+        return (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_IQ4_NL) ||
+               (K->type == GGML_TYPE_Q6_0 && V->type == GGML_TYPE_Q5_0) ||
+               (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_Q6_0) ||
+               (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_IQ4_NL);
+    }
+    if (K->type != V->type) return false;
+    if (K->ne[0] == 64) {
+        return K->type == GGML_TYPE_F16;
+    }
+    if (K->ne[0] == 256) {
+        return K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q8_0;
+    }
+    return false;
+#endif
+}
