@@ -633,6 +633,7 @@ struct vk_flash_attn_push_constants {
633633 uint32_t nev2;
634634 uint32_t nev3;
635635 uint32_t nem1;
636+ uint32_t nem2;
636637
637638 uint32_t nb01;
638639 uint32_t nb02;
@@ -643,7 +644,6 @@ struct vk_flash_attn_push_constants {
643644 uint32_t nb21;
644645 uint32_t nb22;
645646 uint32_t nb23;
646- uint32_t nb31;
647647
648648 float scale;
649649 float max_bias;
@@ -658,6 +658,7 @@ struct vk_flash_attn_push_constants {
658658 uint32_t split_kv;
659659 uint32_t k_num;
660660};
661+ static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128");
661662
662663struct vk_op_push_constants {
663664 uint32_t KX;
@@ -756,6 +757,14 @@ struct vk_op_rope_push_constants {
756757struct vk_op_soft_max_push_constants {
757758 uint32_t KX;
758759 uint32_t KY;
760+ uint32_t ne00;
761+ uint32_t ne01;
762+ uint32_t ne02;
763+ uint32_t ne12;
764+ uint32_t ne13;
765+ uint32_t nb11;
766+ uint32_t nb12;
767+ uint32_t nb13;
759768 float scale;
760769 float max_bias;
761770 float m0;
@@ -6040,7 +6049,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
60406049 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
60416050
60426051 const uint32_t nem1 = mask ? mask->ne[1] : 0;
6043- const uint32_t nbm1 = mask ? mask->nb[1 ] : 0;
6052+ const uint32_t nem2 = mask ? mask->ne[2 ] : 0;
60446053
60456054 const uint32_t D = neq0;
60466055 uint32_t N = neq1;
@@ -6203,7 +6212,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
62036212 // Try to use split_k when KV is large enough to be worth the overhead
62046213 if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) {
62056214 // Try to run two workgroups per SM.
6206- split_k = ctx->device->shader_core_count * 2 / workgroups_y;
6215+ split_k = ctx->device->shader_core_count * 2 / ( workgroups_y * workgroups_z) ;
62076216 if (split_k > 1) {
62086217 // Try to evenly split KV into split_k chunks, but it needs to be a multiple
62096218 // of "align", so recompute split_k based on that.
@@ -6213,9 +6222,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
62136222 }
62146223 }
62156224
6216- // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
6217- // and the per-row m and L values (ne1 rows).
6218- const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
6225+ // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
6226+ // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
6227+ const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
62196228 if (split_k_size > ctx->device->max_memory_allocation_size) {
62206229 GGML_ABORT("Requested preallocation size is too large");
62216230 }
@@ -6307,11 +6316,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
63076316 (uint32_t)neq2, (uint32_t)neq3,
63086317 (uint32_t)nek2, (uint32_t)nek3,
63096318 (uint32_t)nev2, (uint32_t)nev3,
6310- nem1,
6319+ nem1, nem2,
63116320 q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
63126321 k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
63136322 v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
6314- nbm1,
63156323 scale, max_bias, logit_softcap,
63166324 mask != nullptr, n_head_log2, m0, m1,
63176325 gqa_ratio, split_kv, split_k };
@@ -6334,13 +6342,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
63346342 pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
63356343
63366344 ggml_vk_sync_buffers(subctx);
6337- const std::array<uint32_t, 3 > pc2 = { D, (uint32_t)ne1, split_k };
6345+ const std::array<uint32_t, 4 > pc2 = { D, (uint32_t)ne1, (uint32_t)ne3 , split_k };
63386346 ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
63396347 {
63406348 vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
63416349 vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
63426350 },
6343- pc2, { (uint32_t)ne1, 1, 1 });
6351+ pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 });
63446352 } else {
63456353 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
63466354 {
@@ -7666,7 +7674,13 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
76667674 const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
76677675 const uint32_t nrows_y = (uint32_t)src0->ne[1];
76687676
7669- const uint32_t n_head_kv = nrows_x/nrows_y;
7677+ const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u;
7678+ const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u;
7679+ const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u;
7680+ const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u;
7681+ const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u;
7682+
7683+ const uint32_t n_head_kv = src0->ne[2];
76707684 const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
76717685
76727686 const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
@@ -7675,6 +7689,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
76757689 ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
76767690 ncols,
76777691 src1 != nullptr ? nrows_y : (uint32_t)0,
7692+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
7693+ ne12, ne13,
7694+ nb11, nb12, nb13,
76787695 scale, max_bias,
76797696 m0, m1,
76807697 n_head_log2,
@@ -10248,11 +10265,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
1024810265 if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
1024910266 return false;
1025010267 }
10251- // TODO: support broadcast
10252- // ref: https://github.com/ggml-org/llama.cpp/pull/14435
10253- if (op->src[0]->ne[3] != 1) {
10254- return false;
10255- }
1025610268 // It's straightforward to support different K/V dequant, but would
1025710269 // significantly increase the number of pipelines
1025810270 if (op->src[1]->type != op->src[2]->type) {
@@ -10413,13 +10425,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
1041310425 case GGML_OP_DIAG_MASK_INF:
1041410426 return true;
1041510427 case GGML_OP_SOFT_MAX:
10416- // TODO: support batching
10417- if (op->src[0]->ne[3] != 1) {
10418- return false;
10419- }
10420- // TODO: support broadcast
10421- // ref: https://github.com/ggml-org/llama.cpp/pull/14435
10422- return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
1042310428 case GGML_OP_SOFT_MAX_BACK:
1042410429 case GGML_OP_ARGSORT:
1042510430 case GGML_OP_SUM:
0 commit comments