@@ -1193,6 +1193,10 @@ struct ggml_backend_vk_context {
     vk::Fence fence, almost_ready_fence;
     bool almost_ready_fence_pending {};

+    // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
+    vk_pipeline prealloc_y_last_pipeline_used;
+    const ggml_tensor *prealloc_y_last_tensor_used;
+
     vk_buffer buffer_pool[MAX_VK_BUFFERS];

     vk_context_ref compute_ctx;
@@ -5651,10 +5655,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
     if (quantize_y) {
-        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+        if (ctx->prealloc_y_last_pipeline_used != to_q8_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+            ctx->prealloc_y_last_pipeline_used = to_q8_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     uint32_t stride_batch_x = ne00*ne01;
@@ -5829,7 +5843,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
@@ -6259,7 +6278,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     uint32_t stride_batch_x = ne00*ne01;
@@ -6447,7 +6471,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     uint32_t stride_batch_y = ne10*ne11;
@@ -6491,22 +6520,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
64916520        GGML_ASSERT(nei0 <= 4096);
64926521        const uint32_t split_size = std::min(nei1, 4096u / nei0);
64936522
6494-         ggml_tensor src1_copy = *src1;
6495-         ggml_tensor src2_copy = *src2;
6496-         ggml_tensor dst_copy = *dst;
6523+         if (split_size == nei1) {
6524+             ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
6525+         } else {
6526+             ggml_tensor src1_copy = *src1;
6527+             ggml_tensor src2_copy = *src2;
6528+             ggml_tensor dst_copy = *dst;
64976529
6498-         for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
6499-             const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
6530+              for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
6531+                  const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
65006532
6501-             src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
6502-             src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
6503-             dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
6533+                  src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
6534+                  src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
6535+                  dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
65046536
6505-             src1_copy.ne[2] = n_tokens;
6506-             src2_copy.ne[1] = n_tokens;
6507-             dst_copy.ne[2] = n_tokens;
6537+                  src1_copy.ne[2] = n_tokens;
6538+                  src2_copy.ne[1] = n_tokens;
6539+                  dst_copy.ne[2] = n_tokens;
65086540
6509-             ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
6541+                 ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
6542+                 // invalidate cached prealloc_y, can't cache based on the copy of the ggml_tensor
6543+                 ctx->prealloc_y_last_pipeline_used = {};
6544+                 ctx->prealloc_y_last_tensor_used = nullptr;
6545+             }
65106546        }
65116547    }
65126548}
@@ -10311,6 +10347,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_pool_free(ctx, buffer);
     }
     ctx->gc.temp_buffers.clear();
+    ctx->prealloc_y_last_pipeline_used = {};

     ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
     ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
@@ -10346,6 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+    ctx->prealloc_y_last_pipeline_used = {};

     for (auto& buffer : ctx->buffer_pool) {
         ggml_vk_destroy_buffer(buffer);
@@ -10894,6 +10932,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
     }

+    ctx->prealloc_y_last_pipeline_used = {};
+    ctx->prealloc_y_last_tensor_used = nullptr;
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
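
Note (not part of the diff): the change above implements a "last conversion" cache. Before dispatching a conversion of src1 into the shared prealloc_y buffer, each matmul path checks whether that same tensor was already converted with that same pipeline, and if so skips the dispatch; the cache is cleared wherever the pointer comparison would no longer be meaningful (graph cleanup, backend cleanup, the start of graph compute, and after each iteration of the mul_mat_id split loop, which operates on stack copies of the ggml_tensor). The standalone sketch below illustrates only this pattern; the names (ConversionCache, convert_into_scratch, run_pipeline) are hypothetical and are not ggml APIs.

    // Minimal sketch of a "skip redundant conversion" cache, assuming both the
    // pipeline and the source tensor can be identified by address.
    #include <cstdio>

    struct Tensor   { const char * name; };
    struct Pipeline { const char * name; };

    struct ConversionCache {
        const Pipeline * last_pipeline = nullptr; // pipeline used for the last conversion
        const Tensor   * last_tensor   = nullptr; // tensor whose converted data sits in the scratch buffer

        void invalidate() { last_pipeline = nullptr; last_tensor = nullptr; }
    };

    // Stand-in for recording a GPU dispatch that converts `src` into a shared scratch buffer.
    static void run_pipeline(const Pipeline & p, const Tensor & src) {
        std::printf("dispatch %s on %s\n", p.name, src.name);
    }

    // Only dispatch when the (pipeline, tensor) pair differs from the cached one,
    // mirroring the checks added around ggml_vk_cpy_to_contiguous / ggml_vk_quantize_q8_1.
    static void convert_into_scratch(ConversionCache & cache, const Pipeline & p, const Tensor & src) {
        if (cache.last_pipeline != &p || cache.last_tensor != &src) {
            run_pipeline(p, src);
            cache.last_pipeline = &p;
            cache.last_tensor   = &src;
        }
    }

    int main() {
        ConversionCache cache;
        Pipeline to_f16 { "to_f16" }, to_q8_1 { "to_q8_1" };
        Tensor   b      { "src1"   };

        convert_into_scratch(cache, to_f16, b);  // dispatched
        convert_into_scratch(cache, to_f16, b);  // skipped: same tensor, same pipeline
        convert_into_scratch(cache, to_q8_1, b); // dispatched: different pipeline
        cache.invalidate();                      // e.g. after cleanup, or when the tensor object no longer describes the data
        convert_into_scratch(cache, to_f16, b);  // dispatched again after invalidation
        return 0;
    }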