75 changes: 58 additions & 17 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1193,6 +1193,10 @@ struct ggml_backend_vk_context {
vk::Fence fence, almost_ready_fence;
bool almost_ready_fence_pending {};

// Cache the most recent tensor that was converted into prealloc_y, and the pipeline that was used to convert it.
vk_pipeline_struct * prealloc_y_last_pipeline_used {};
const ggml_tensor * prealloc_y_last_tensor_used {};

vk_buffer buffer_pool[MAX_VK_BUFFERS];

vk_context_ref compute_ctx;
@@ -5651,10 +5655,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
ctx->prealloc_y_last_tensor_used != src1) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
}
if (quantize_y) {
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
ctx->prealloc_y_last_tensor_used != src1) {
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
}

uint32_t stride_batch_x = ne00*ne01;
@@ -5829,7 +5843,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
}
if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
ctx->prealloc_y_last_tensor_used != src1) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
}

// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
@@ -6259,7 +6278,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
ctx->prealloc_y_last_tensor_used != src1) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
}

uint32_t stride_batch_x = ne00*ne01;
@@ -6447,7 +6471,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
}
if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
ctx->prealloc_y_last_tensor_used != src1) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
}

uint32_t stride_batch_y = ne10*ne11;
@@ -6491,22 +6520,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
GGML_ASSERT(nei0 <= 4096);
const uint32_t split_size = std::min(nei1, 4096u / nei0);

ggml_tensor src1_copy = *src1;
ggml_tensor src2_copy = *src2;
ggml_tensor dst_copy = *dst;
if (split_size == nei1) {
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
} else {
ggml_tensor src1_copy = *src1;
ggml_tensor src2_copy = *src2;
ggml_tensor dst_copy = *dst;

for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
const uint32_t n_tokens = std::min(split_size, nei1 - token_start);

src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];

src1_copy.ne[2] = n_tokens;
src2_copy.ne[1] = n_tokens;
dst_copy.ne[2] = n_tokens;
src1_copy.ne[2] = n_tokens;
src2_copy.ne[1] = n_tokens;
dst_copy.ne[2] = n_tokens;

ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
// invalidate the cached prealloc_y; the cache can't key off these local copies of the ggml_tensor
ctx->prealloc_y_last_pipeline_used = {};
ctx->prealloc_y_last_tensor_used = nullptr;
}
}
}
}
@@ -10311,6 +10347,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
ggml_vk_pool_free(ctx, buffer);
}
ctx->gc.temp_buffers.clear();
ctx->prealloc_y_last_pipeline_used = {};

ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
@@ -10346,6 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ggml_vk_destroy_buffer(ctx->prealloc_x);
ggml_vk_destroy_buffer(ctx->prealloc_y);
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
ctx->prealloc_y_last_pipeline_used = nullptr;

for (auto& buffer : ctx->buffer_pool) {
ggml_vk_destroy_buffer(buffer);
@@ -10894,6 +10932,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
}

ctx->prealloc_y_last_pipeline_used = nullptr;
ctx->prealloc_y_last_tensor_used = nullptr;

// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
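Every conversion site in the hunks above repeats the same guard, and the cache is cleared wherever prealloc_y may go stale. Below is a minimal standalone sketch of that pattern; `Pipeline`, `Tensor`, `Context`, `convert_into_prealloc_y`, and the function names are simplified stand-ins for the real `vk_pipeline`, `ggml_tensor`, `ggml_backend_vk_context`, and conversion dispatch, not the backend's actual API.

```cpp
// Hypothetical, simplified stand-ins for the real Vulkan-backend types.
struct Pipeline {};
struct Tensor {};

struct Context {
    // Which tensor currently sits converted in prealloc_y, and via which pipeline.
    const Pipeline * prealloc_y_last_pipeline_used = nullptr;
    const Tensor   * prealloc_y_last_tensor_used   = nullptr;
};

// Stand-in for ggml_vk_cpy_to_contiguous / ggml_vk_quantize_q8_1: writes the
// converted copy of src into the shared prealloc_y buffer.
void convert_into_prealloc_y(Context & /*ctx*/, const Pipeline & /*pipe*/, const Tensor & /*src*/) {
    // ... record and dispatch the conversion shader ...
}

// Re-run the conversion only when the (pipeline, tensor) pair differs from the
// one whose result is already in prealloc_y.
void convert_if_needed(Context & ctx, const Pipeline & pipe, const Tensor & src) {
    if (ctx.prealloc_y_last_pipeline_used != &pipe ||
        ctx.prealloc_y_last_tensor_used   != &src) {
        convert_into_prealloc_y(ctx, pipe, src);
        ctx.prealloc_y_last_pipeline_used = &pipe;
        ctx.prealloc_y_last_tensor_used   = &src;
    }
}

// Drop the cache whenever prealloc_y may no longer hold that data: at graph
// start, on cleanup, or after dispatching with a temporary tensor copy.
void invalidate_prealloc_y_cache(Context & ctx) {
    ctx.prealloc_y_last_pipeline_used = nullptr;
    ctx.prealloc_y_last_tensor_used   = nullptr;
}
```

Because the cache keys on raw pointers, the split path in ggml_vk_mul_mat_id has to invalidate it after each call that dispatches on a stack copy of the tensor, which is exactly what the invalidation added in that loop does.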
42 changes: 36 additions & 6 deletions tests/test-backend-ops.cpp
@@ -3098,9 +3098,10 @@ struct test_mul_mat : public test_case {
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
const std::array<int64_t, 4> per; // permutation of dimensions
const bool v; // whether a and b are non-contiguous views
const uint32_t o; // number of outputs

std::string vars() override {
return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
}

double max_nmse_err() override {
@@ -3121,8 +3122,8 @@ struct test_mul_mat : public test_case {
std::array<int64_t, 2> bs = {10, 10},
std::array<int64_t, 2> nr = {2, 2},
std::array<int64_t, 4> per = {0, 1, 2, 3},
bool v = false)
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
bool v = false, uint32_t o = 1)
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -3186,9 +3187,21 @@ struct test_mul_mat : public test_case {

ggml_tensor * out = ggml_mul_mat(ctx, a, b);
ggml_set_name(out, "out");
for (uint32_t i = 1; i < o; ++i) {
ggml_tensor * out2 = ggml_mul_mat(ctx, a, b);
ggml_set_name(out2, "out2");
out = ggml_add(ctx, out, out2);
}

return out;
}

bool run_whole_graph() override { return o > 1; }

std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t);
return ggml_op_name(GGML_OP_MUL_MAT);
}
};

// GGML_OP_MUL_MAT_ID
@@ -3201,9 +3214,10 @@ struct test_mul_mat_id : public test_case {
const int64_t m;
const int64_t n;
const int64_t k;
const uint32_t o; // number of outputs

std::string vars() override {
return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o);
}

double max_nmse_err() override {
@@ -3217,9 +3231,9 @@

test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
int n_mats = 8, int n_used = 2, bool b = false,
int64_t m = 32, int64_t n = 32, int64_t k = 32)
int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1)
: type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
m(m), n(n), k(k) {
m(m), n(n), k(k), o(o) {
GGML_ASSERT(n_used <= n_mats);
}

@@ -3241,6 +3255,13 @@ struct test_mul_mat_id : public test_case {
ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
ggml_set_name(out, "out");

for (uint32_t i = 1; i < o; ++i) {
ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
ggml_set_name(out2, "out2");
out = ggml_add(ctx, out, out2);
}

return out;
}

@@ -3264,6 +3285,13 @@
}
}
}

bool run_whole_graph() override { return o > 1; }

std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t);
return ggml_op_name(GGML_OP_MUL_MAT_ID);
}
};

// GGML_OP_OUT_PROD
@@ -5798,6 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3));

for (auto bs2 : {1,3}) {
for (auto bs : {1,2,4,8}) {
@@ -5826,6 +5855,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}

test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));

for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
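The new `o` parameter is what makes these cases exercise the cache within a single graph run: the extra mat-muls reuse the same activations, so a backend that remembers the converted src1 should only convert it once. A hedged sketch of the graph shape the `o > 1` test_mul_mat cases build (`build_repeated_mul_mat` is a made-up helper; it assumes ggml.h is included and that `ctx`, `a`, `b` already exist, as in test-backend-ops.cpp):

```cpp
// Sketch only (not the test harness itself): o mat-muls that share the same
// a and b, accumulated with ggml_add into a single output tensor.
static ggml_tensor * build_repeated_mul_mat(ggml_context * ctx,
                                            ggml_tensor * a,
                                            ggml_tensor * b,
                                            uint32_t o) {
    ggml_tensor * out = ggml_mul_mat(ctx, a, b);           // first output
    for (uint32_t i = 1; i < o; ++i) {
        out = ggml_add(ctx, out, ggml_mul_mat(ctx, a, b)); // repeated outputs, same src1
    }
    return out;
}
```

The test_mul_mat_id variant differs only in that it creates a fresh expert tensor per extra output while reusing `b` and `ids`. Since run_whole_graph() returns true for `o > 1`, the whole chain executes in one graph submission, which is the scenario the Vulkan prealloc_y cache above is meant to accelerate.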