vulkan: reuse preallocated intermediate results in the mmvq path

0cc4m · 0cc4m · commit 7291c7b64a77 · 2025-08-31T09:18:40.000Z
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -4638,7 +4638,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 
     // heuristic to choose workgroup size
     uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
-    if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) {
+    if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA) {
         // Prefer larger workgroups when M is small, to spread the work out more
         // and keep more SMs busy.
         // q6_k seems to prefer small workgroup size even for "medium" values of M.
@@ -4654,7 +4654,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
     }
 
     if (b_type == GGML_TYPE_Q8_1) {
-        return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_SUBGROUP][a_type][num_cols-1];
+        return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1];
     }
 
     return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1];
@@ -6114,7 +6114,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         }
     }
     if (quantize_y) {
-        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+        if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+            ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride

Original file line number	Diff line number	Diff line change
`@@ -4638,7 +4638,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *`
`4638`	`4638`
`4639`	`4639`	`// heuristic to choose workgroup size`
`4640`	`4640`	`uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP;`
`4641`		`- if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA \|\| ctx->device->vendor_id == VK_VENDOR_ID_INTEL) {`
	`4641`	`+ if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA) {`
`4642`	`4642`	`// Prefer larger workgroups when M is small, to spread the work out more`
`4643`	`4643`	`// and keep more SMs busy.`
`4644`	`4644`	`// q6_k seems to prefer small workgroup size even for "medium" values of M.`
`@@ -4654,7 +4654,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *`
`4654`	`4654`	`}`
`4655`	`4655`
`4656`	`4656`	`if (b_type == GGML_TYPE_Q8_1) {`
`4657`		`- return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_SUBGROUP][a_type][num_cols-1];`
	`4657`	`+ return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1];`
`4658`	`4658`	`}`
`4659`	`4659`
`4660`	`4660`	`return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1];`
`@@ -6114,7 +6114,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&`
`6114`	`6114`	`}`
`6115`	`6115`	`}`
`6116`	`6116`	`if (quantize_y) {`
`6117`		`- ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);`
	`6117`	`+ if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() \|\|`
	`6118`	`+ ctx->prealloc_y_last_tensor_used != src1) {`
	`6119`	`+ ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);`
	`6120`	`+ ctx->prealloc_y_last_pipeline_used = to_q8_1.get();`
	`6121`	`+ ctx->prealloc_y_last_tensor_used = src1;`
	`6122`	`+ }`
`6118`	`6123`	`}`
`6119`	`6124`
`6120`	`6125`	`// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride`