@@ -2885,7 +2885,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
28852885            ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len,              mul_mat_vec_p021_f16_f32_data,              "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
28862886        }
28872887    }
2888-     ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9  * sizeof(uint32_t), {1, 1, 1}, {}, 1);
2888+     ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12  * sizeof(uint32_t), {1, 1, 1}, {}, 1);
28892889
28902890    ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
28912891    ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@@ -5821,7 +5821,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
58215821    const uint64_t ne00 = src0->ne[0];
58225822    const uint64_t ne01 = src0->ne[1];
58235823    const uint64_t ne02 = src0->ne[2];
5824-     //  const uint64_t ne03 = src0->ne[3];
5824+     const uint64_t ne03 = src0->ne[3];
58255825
58265826    const uint64_t nb01 = src0->nb[1];
58275827    const uint64_t nb02 = src0->nb[2];
@@ -5833,7 +5833,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
58335833    const uint64_t ne12 = src1->ne[2];
58345834    // const uint64_t ne13 = src1->ne[3];
58355835
5836+     const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t));
5837+     const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float));
5838+     const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float));
5839+ 
58365840    GGML_ASSERT(ne11 == 1);
5841+     GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op
58375842
58385843    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
58395844    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
@@ -5849,7 +5854,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
58495854        src1_uma = d_Qy != nullptr;
58505855    }
58515856
5852-     const uint64_t d_ne = ne01 * ne11 * ne12;
5857+     const uint64_t d_ne = ne01 * ne11 * ne12 * ne03 ;
58535858
58545859    const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
58555860    const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
@@ -5884,10 +5889,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
58845889    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
58855890
58865891    // compute
5887-     const std::array<uint32_t, 9 > pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
5892+     const std::array<uint32_t, 12 > pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23  };
58885893    ggml_vk_sync_buffers(subctx);
58895894    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
5890-         { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1 , (uint32_t)ne01, (uint32_t)ne12 });
5895+         { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03 , (uint32_t)ne01, (uint32_t)ne12 });
58915896}
58925897
58935898static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
0 commit comments