@@ -1193,6 +1193,10 @@ struct ggml_backend_vk_context {
11931193 vk::Fence fence, almost_ready_fence;
11941194 bool almost_ready_fence_pending {};
11951195
1196+ // Cache the most recent tensor that was converted into prealloc_y, and the pipeline that was used to convert it.
1197+ vk_pipeline_struct * prealloc_y_last_pipeline_used {};
1198+ const ggml_tensor * prealloc_y_last_tensor_used {};
1199+
11961200 vk_buffer buffer_pool[MAX_VK_BUFFERS];
11971201
11981202 vk_context_ref compute_ctx;
@@ -5651,10 +5655,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
56515655 ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
56525656 }
56535657 if (y_non_contig) {
5654- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5658+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
5659+ ctx->prealloc_y_last_tensor_used != src1) {
5660+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5661+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
5662+ ctx->prealloc_y_last_tensor_used = src1;
5663+ }
56555664 }
56565665 if (quantize_y) {
5657- ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
5666+ if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
5667+ ctx->prealloc_y_last_tensor_used != src1) {
5668+ ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
5669+ ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
5670+ ctx->prealloc_y_last_tensor_used = src1;
5671+ }
56585672 }
56595673
56605674 uint32_t stride_batch_x = ne00*ne01;
@@ -5829,7 +5843,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
58295843 }
58305844 if (y_non_contig) {
58315845 GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
5832- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5846+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
5847+ ctx->prealloc_y_last_tensor_used != src1) {
5848+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5849+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
5850+ ctx->prealloc_y_last_tensor_used = src1;
5851+ }
58335852 }
58345853
58355854 // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
@@ -6259,7 +6278,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
62596278 { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
62606279 }
62616280 if (y_non_contig) {
6262- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6281+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
6282+ ctx->prealloc_y_last_tensor_used != src1) {
6283+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6284+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
6285+ ctx->prealloc_y_last_tensor_used = src1;
6286+ }
62636287 }
62646288
62656289 uint32_t stride_batch_x = ne00*ne01;
@@ -6447,7 +6471,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
64476471 }
64486472 if (y_non_contig) {
64496473 GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
6450- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6474+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
6475+ ctx->prealloc_y_last_tensor_used != src1) {
6476+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6477+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
6478+ ctx->prealloc_y_last_tensor_used = src1;
6479+ }
64516480 }
64526481
64536482 uint32_t stride_batch_y = ne10*ne11;
@@ -6491,22 +6520,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
64916520 GGML_ASSERT(nei0 <= 4096);
64926521 const uint32_t split_size = std::min(nei1, 4096u / nei0);
64936522
6494- ggml_tensor src1_copy = *src1;
6495- ggml_tensor src2_copy = *src2;
6496- ggml_tensor dst_copy = *dst;
6523+ if (split_size == nei1) {
6524+ ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
6525+ } else {
6526+ ggml_tensor src1_copy = *src1;
6527+ ggml_tensor src2_copy = *src2;
6528+ ggml_tensor dst_copy = *dst;
64976529
6498- for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
6499- const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
6530+ for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
6531+ const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
65006532
6501- src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
6502- src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
6503- dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
6533+ src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
6534+ src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
6535+ dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
65046536
6505- src1_copy.ne[2] = n_tokens;
6506- src2_copy.ne[1] = n_tokens;
6507- dst_copy.ne[2] = n_tokens;
6537+ src1_copy.ne[2] = n_tokens;
6538+ src2_copy.ne[1] = n_tokens;
6539+ dst_copy.ne[2] = n_tokens;
65086540
6509- ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
6541+ ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
6542+ // Invalidate the cached prealloc_y state: src1_copy is a local copy that is modified on every iteration, so its address can't be used to identify the tensor that was converted.
6543+ ctx->prealloc_y_last_pipeline_used = {};
6544+ ctx->prealloc_y_last_tensor_used = nullptr;
6545+ }
65106546 }
65116547 }
65126548}
@@ -10311,6 +10347,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
1031110347 ggml_vk_pool_free(ctx, buffer);
1031210348 }
1031310349 ctx->gc.temp_buffers.clear();
10350+ ctx->prealloc_y_last_pipeline_used = {};
1031410351
1031510352 ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
1031610353 ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
@@ -10346,6 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
1034610383 ggml_vk_destroy_buffer(ctx->prealloc_x);
1034710384 ggml_vk_destroy_buffer(ctx->prealloc_y);
1034810385 ggml_vk_destroy_buffer(ctx->prealloc_split_k);
10386+ ctx->prealloc_y_last_pipeline_used = nullptr;
1034910387
1035010388 for (auto& buffer : ctx->buffer_pool) {
1035110389 ggml_vk_destroy_buffer(buffer);
@@ -10894,6 +10932,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1089410932 compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
1089510933 }
1089610934
10935+ ctx->prealloc_y_last_pipeline_used = nullptr;
10936+ ctx->prealloc_y_last_tensor_used = nullptr;
10937+
1089710938 // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
1089810939 // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
1089910940 // (and scaled down based on model size, so smaller models submit earlier).
0 commit comments