@@ -4079,7 +4079,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
40794079    return s;
40804080}
40814081
4082- static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
// push_constant_size: number of bytes in a push-constant payload.
//
// Three payload shapes are supported, selected by overload resolution:
//   - a single struct/class object  -> sizeof(T)
//   - std::vector<T>                -> sizeof(T) * element count
//   - std::array<T, N>              -> sizeof(T) * N
// The result is meant to be paired with push_constant_data() for the same
// object when recording the push-constant command.

// Struct/class payload: the whole object is the payload.
// Scalars and pointers are rejected so that a stray `&pc` or a bare integer
// cannot be passed by accident; non-trivially-copyable classes are rejected
// because the payload is handed on as raw bytes.
template <typename T> constexpr size_t push_constant_size(const T &) noexcept {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable");
    return sizeof(T);
}

// Vector payload: size depends on the runtime element count (0 for an empty vector).
template <typename T> size_t push_constant_size(const std::vector<T> &t) noexcept {
    return sizeof(T) * t.size();
}

// Array payload: size is fixed at compile time.
// N must be declared as size_t to match std::array's own extent parameter;
// with a different type, template argument deduction fails and the call would
// fall through to the generic struct overload instead of this one.
template <typename T, size_t N> constexpr size_t push_constant_size(const std::array<T, N> &) noexcept {
    return sizeof(T) * N;
}
4095+ 
// push_constant_data: pointer to the first byte of a push-constant payload.
//
// Mirrors the push_constant_size() overload set:
//   - a single struct/class object  -> address of the object
//   - std::vector<T>                -> pointer to the first element
//   - std::array<T, N>              -> pointer to the first element
// The returned pointer is only valid while the argument is alive, so the
// payload must outlive the call that consumes the pointer.

// Struct/class payload: the object itself is the payload.
// Same compile-time guards as the size helper: no scalars/pointers, and only
// types whose bytes can be copied verbatim.
template <typename T> const T *push_constant_data(const T &t) noexcept {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable");
    return &t;
}

// Vector payload: contiguous element storage.
template <typename T> const T *push_constant_data(const std::vector<T> &t) noexcept {
    return t.data();
}

// Array payload: contiguous element storage.
// N must be declared as size_t to match std::array's own extent parameter;
// with a different type, deduction fails and the generic overload would be
// chosen, yielding a pointer to the array object rather than to its elements.
template <typename T, size_t N> const T *push_constant_data(const std::array<T, N> &t) noexcept {
    return t.data();
}
4106+ 
4107+ template <typename T>
4108+ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
40834109    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
40844110    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
40854111    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
@@ -4095,7 +4121,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
40954121    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
40964122    ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
40974123
4098-     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size,  push_constants);
4124+     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data( push_constants) );
40994125    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
41004126    subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
41014127                                pipeline->layout,
@@ -4558,18 +4584,18 @@ static void ggml_vk_matmul(
45584584    ggml_vk_sync_buffers(subctx);
45594585    if (split_k == 1) {
45604586        const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
4561-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), & pc, { m, n, batch });
4587+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
45624588        return;
45634589    }
45644590
45654591    GGML_ASSERT(batch_stride_d == m * n);
45664592
45674593    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
45684594    // Make sure enough workgroups get assigned for split k to work
4569-     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), & pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
4595+     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
45704596    ggml_vk_sync_buffers(subctx);
45714597    const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
4572-     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data() , { m * n * batch, 1, 1 });
4598+     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
45734599}
45744600
45754601static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
@@ -4617,7 +4643,7 @@ static void ggml_vk_matmul_id(
46174643    ggml_vk_sync_buffers(subctx);
46184644    const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
46194645                                              nei0, nei1, nbi1, ne11, padded_n };
4620-     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), & pc, { m, nei1, n_as });
4646+     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as });
46214647}
46224648
46234649static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
@@ -4738,7 +4764,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
47384764    };
47394765    init_pushconst_fastdiv(pc);
47404766    ggml_vk_sync_buffers(subctx);
4741-     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), & pc, elements);
4767+     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
47424768}
47434769
47444770static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
@@ -4757,7 +4783,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
47574783    vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
47584784
47594785    ggml_vk_sync_buffers(subctx);
4760-     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof( uint32_t), &ne , { ne, 1, 1 });
4786+     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array< uint32_t, 1>{ne} , { ne, 1, 1 });
47614787}
47624788
47634789static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -4957,7 +4983,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
49574983    } else if (qx_needs_dequant) {
49584984        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
49594985        ggml_vk_sync_buffers(subctx);
4960-         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data() , { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
4986+         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
49614987    }
49624988    if (y_non_contig) {
49634989        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5173,7 +5199,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
51735199    ggml_vk_sync_buffers(subctx);
51745200    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
51755201                              { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
5176-                               sizeof(vk_mat_vec_push_constants), & pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
5202+                               pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
51775203}
51785204
51795205static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5261,7 +5287,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
52615287    }
52625288
52635289    ggml_vk_sync_buffers(subctx);
5264-     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), & pc, { 1, (uint32_t)ne01, workgroups_z });
5290+     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z });
52655291}
52665292
52675293static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5344,7 +5370,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
53445370    const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
53455371    ggml_vk_sync_buffers(subctx);
53465372    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
5347-         { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), & pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
5373+         { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
53485374}
53495375
53505376static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5560,7 +5586,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
55605586        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
55615587        ggml_vk_sync_buffers(subctx);
55625588        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
5563-             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data() , { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
5589+             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
55645590    }
55655591    if (y_non_contig) {
55665592        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5780,7 +5806,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
57805806    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
57815807        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
57825808        vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
5783-         sizeof(vk_mat_vec_id_push_constants), & pc, { groups_x, (uint32_t)nei0, groups_z });
5809+         pc, { groups_x, (uint32_t)nei0, groups_z });
57845810}
57855811
57865812static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
@@ -6130,7 +6156,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
61306156                                    // there's no more than one tile of rows (i.e. workgroups_x would have been
61316157                                    // one). We reuse workgroups_x to mean the number of splits, so we need to
61326158                                    // cancel out the divide by wg_denoms[0].
6133-                                     sizeof(vk_flash_attn_push_constants), & pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
6159+                                     pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
61346160
61356161        ggml_vk_sync_buffers(subctx);
61366162        const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
@@ -6139,7 +6165,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
61396165                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
61406166                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
61416167                                    },
6142-                                     pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data() , { (uint32_t)ne1, 1, 1 });
6168+                                     pc2, { (uint32_t)ne1, 1, 1 });
61436169    } else {
61446170        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
61456171                                    {
@@ -6149,7 +6175,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
61496175                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
61506176                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
61516177                                    },
6152-                                     sizeof(vk_flash_attn_push_constants), & pc, { workgroups_x, workgroups_y, workgroups_z });
6178+                                     pc, { workgroups_x, workgroups_y, workgroups_z });
61536179    }
61546180}
61556181
@@ -6827,7 +6853,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
68276853        }
68286854
68296855        ggml_vk_sync_buffers(subctx);
6830-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), & pc, elements);
6856+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
68316857    } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
68326858        // Empty src2 is possible in rope, but the shader needs a buffer
68336859        vk_subbuffer subbuf_z;
@@ -6838,26 +6864,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
68386864        }
68396865
68406866        ggml_vk_sync_buffers(subctx);
6841-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), & pc, elements);
6867+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
68426868    } else if (op == GGML_OP_IM2COL) {
68436869        // im2col uses only src1 and dst buffers
68446870        ggml_vk_sync_buffers(subctx);
6845-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), & pc, elements);
6871+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
68466872    } else if (op == GGML_OP_COUNT_EQUAL) {
68476873        ggml_vk_sync_buffers(subctx);
68486874        // count_equal assumes that destination buffer is initialized with zeroes
68496875        ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
68506876        ggml_vk_sync_buffers(subctx);
6851-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), & pc, elements);
6877+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
68526878    } else if (use_src2) {
68536879        ggml_vk_sync_buffers(subctx);
6854-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), & pc, elements);
6880+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
68556881    } else if (use_src1) {
68566882        ggml_vk_sync_buffers(subctx);
6857-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), & pc, elements);
6883+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
68586884    } else {
68596885        ggml_vk_sync_buffers(subctx);
6860-         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), & pc, elements);
6886+         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
68616887    }
68626888}
68636889
@@ -7026,7 +7052,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
70267052            vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
70277053            vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
70287054            vk_subbuffer{ d_D, dst_offset, dst_size }
7029-         }, sizeof(vk_op_rwkv_wkv6_push_constants), & pc, elements);
7055+         }, pc, elements);
70307056    } else if (version == 7) {
70317057        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
70327058            vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
@@ -7037,7 +7063,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
70377063            vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
70387064            vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
70397065            vk_subbuffer{ d_D, dst_offset, dst_size }
7040-         }, sizeof(vk_op_rwkv_wkv7_push_constants), & pc, elements);
7066+         }, pc, elements);
70417067    } else {
70427068        // shouldn't happen
70437069        GGML_ASSERT(false);
@@ -7174,7 +7200,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
71747200        vk_subbuffer{ d_GM, gm_offset, gm_size },
71757201        vk_subbuffer{ d_GV, gv_offset, gv_size },
71767202        vk_subbuffer{ d_P, p_offset, p_size },
7177-     }, sizeof(vk_op_push_constants), & pc, elements);
7203+     }, pc, elements);
71787204}
71797205
71807206static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
@@ -8063,7 +8089,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
80638089    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
80648090    ggml_vk_ctx_begin(ctx->device, subctx);
80658091    const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
8066-     ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data() , { (uint32_t)ne, 1, 1});
8092+     ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
80678093    ggml_vk_ctx_end(subctx);
80688094
80698095    auto begin = std::chrono::high_resolution_clock::now();
0 commit comments