
Commit 7291c7b

vulkan: use prealloc intermediate reuse for mmvq path
1 parent: 301d79d

File tree

1 file changed (+8, −3 lines)


ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 8 additions & 3 deletions
@@ -4638,7 +4638,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 
     // heuristic to choose workgroup size
     uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
-    if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) {
+    if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA) {
         // Prefer larger workgroups when M is small, to spread the work out more
         // and keep more SMs busy.
         // q6_k seems to prefer small workgroup size even for "medium" values of M.
@@ -4654,7 +4654,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
     }
 
     if (b_type == GGML_TYPE_Q8_1) {
-        return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_SUBGROUP][a_type][num_cols-1];
+        return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1];
    }
 
     return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1];
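
Note: the q8_1 branch previously hard-coded DMMV_WG_SIZE_SUBGROUP as the first index, so the workgroup-size heuristic computed into dmmv_wg just above never applied to the mmvq pipelines. A minimal sketch of the indexing this hunk fixes, using hypothetical stand-in dimensions (Pipeline, NUM_TYPES and MAX_COLS are placeholders, not the real ggml-vulkan types):

#include <cstddef>
#include <cstdint>

enum { DMMV_WG_SIZE_SUBGROUP = 0, DMMV_WG_SIZE_LARGE = 1, DMMV_WG_COUNT = 2 };
constexpr std::size_t NUM_TYPES = 4; // stand-in for the quant-type dimension
constexpr std::size_t MAX_COLS  = 8; // stand-in for the num_cols dimension

struct Pipeline { /* handle to a compiled compute pipeline */ };

// pipelines[workgroup-size variant][a_type][num_cols-1], mirroring the
// pipeline_dequant_mul_mat_vec_q8_1_f32 indexing in the hunk above.
Pipeline pipelines[DMMV_WG_COUNT][NUM_TYPES][MAX_COLS];

Pipeline * select_pipeline(uint32_t dmmv_wg, std::size_t a_type, std::size_t num_cols) {
    // Indexing with dmmv_wg (instead of the constant DMMV_WG_SIZE_SUBGROUP)
    // lets the q8_1 path honor the same workgroup-size heuristic as the
    // f32/f16 paths below it.
    return &pipelines[dmmv_wg][a_type][num_cols - 1];
}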
@@ -6114,7 +6114,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         }
     }
     if (quantize_y) {
-        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+        if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+            ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
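
Note: this last hunk is the core of the commit. src1 is quantized into the preallocated intermediate buffer only when that buffer does not already hold the output of this quantize pipeline for this tensor, so back-to-back mat-vec dispatches sharing the same src1 skip the redundant q8_1 quantization. A minimal standalone sketch of that cache-and-skip pattern, with hypothetical stand-in types (Context, Pipeline, Tensor and quantize_to_prealloc_y are illustrative, not the real ggml-vulkan definitions):

#include <cstdio>

struct Pipeline { const char * name; };
struct Tensor   { const char * name; };

struct Context {
    // Last pipeline/tensor that wrote the preallocated intermediate buffer.
    const Pipeline * prealloc_y_last_pipeline_used = nullptr;
    const Tensor   * prealloc_y_last_tensor_used   = nullptr;
};

// Stand-in for the quantize dispatch (ggml_vk_quantize_q8_1 in the diff).
static void quantize_to_prealloc_y(const Tensor * src1) {
    std::printf("quantizing %s to q8_1\n", src1->name);
}

// Re-run the quantize dispatch only if the preallocated buffer does not
// already hold the output of this pipeline for this tensor.
static void ensure_quantized(Context & ctx, const Pipeline * to_q8_1, const Tensor * src1) {
    if (ctx.prealloc_y_last_pipeline_used != to_q8_1 ||
        ctx.prealloc_y_last_tensor_used != src1) {
        quantize_to_prealloc_y(src1);
        ctx.prealloc_y_last_pipeline_used = to_q8_1;
        ctx.prealloc_y_last_tensor_used   = src1;
    } // otherwise: buffer contents are still valid, skip the dispatch
}

int main() {
    Context  ctx;
    Pipeline q8_1 = { "to_q8_1" };
    Tensor   src1 = { "src1" };
    ensure_quantized(ctx, &q8_1, &src1); // first call: quantizes
    ensure_quantized(ctx, &q8_1, &src1); // second call: skipped, same pipeline + tensor
}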
