@@ -1194,8 +1194,8 @@ struct ggml_backend_vk_context {
     bool almost_ready_fence_pending {};

     // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
-    vk_pipeline prealloc_y_last_pipeline_used;
-    const ggml_tensor *prealloc_y_last_tensor_used;
+    vk_pipeline_struct * prealloc_y_last_pipeline_used {};
+    const ggml_tensor * prealloc_y_last_tensor_used {};

     vk_buffer buffer_pool[MAX_VK_BUFFERS];
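The member change above swaps the owning `vk_pipeline` handle for a plain `vk_pipeline_struct *`: as the call sites below show, the cache is only ever compared against `.get()` as an identity tag for "which pipeline last filled `prealloc_y`, and for which tensor", so a non-owning raw pointer suffices and the cache no longer extends any pipeline's lifetime. A minimal sketch of the pattern, assuming `vk_pipeline` is a `std::shared_ptr<vk_pipeline_struct>`; all other names are illustrative, not the actual ggml-vulkan API:

```cpp
// Sketch of the prealloc_y caching pattern, with hypothetical names.
#include <memory>

struct pipeline_struct { /* compiled shader state */ };
using pipeline_ref = std::shared_ptr<pipeline_struct>;

struct backend_ctx {
    // Non-owning identity tag: which pipeline last filled the scratch buffer,
    // and for which tensor. A raw pointer, so the cache never keeps a pipeline alive.
    pipeline_struct * last_pipeline_used {};
    const void      * last_tensor_used  {};
};

// Re-run the conversion only when the (pipeline, tensor) pair has changed.
static void convert_if_needed(backend_ctx & ctx, const pipeline_ref & pipe, const void * tensor) {
    if (ctx.last_pipeline_used != pipe.get() || ctx.last_tensor_used != tensor) {
        // ... dispatch the conversion shader here ...
        ctx.last_pipeline_used = pipe.get();
        ctx.last_tensor_used   = tensor;
    }
}
```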
@@ -5655,18 +5655,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
     }
     if (quantize_y) {
-        if (ctx->prealloc_y_last_pipeline_used != to_q8_1 ||
+        if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
             ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
-            ctx->prealloc_y_last_pipeline_used = to_q8_1;
+            ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
     }
@@ -5843,10 +5843,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
     }
@@ -6278,10 +6278,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
     }
@@ -6471,10 +6471,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
     }
@@ -10383,7 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
-    ctx->prealloc_y_last_pipeline_used = {};
+    ctx->prealloc_y_last_pipeline_used = nullptr;

     for (auto& buffer : ctx->buffer_pool) {
         ggml_vk_destroy_buffer(buffer);
@@ -10932,7 +10932,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
     }

-    ctx->prealloc_y_last_pipeline_used = {};
+    ctx->prealloc_y_last_pipeline_used = nullptr;
     ctx->prealloc_y_last_tensor_used = nullptr;

     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
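The last two hunks adjust the cache invalidation for the new type: instead of clearing a shared pointer with `{}`, the raw pointer is set to `nullptr` when `prealloc_y` is destroyed in `ggml_vk_cleanup` and again in `ggml_backend_vk_graph_compute`, so a pointer cached from an earlier graph cannot spuriously match and skip a needed conversion.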