Skip to content

Commit ecab78c

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents b7efb58 + 96452a3 commit ecab78c

File tree

3 files changed

+102
-31
lines changed

3 files changed

+102
-31
lines changed

examples/model-conversion/README.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ The motivation for having this is that the conversion process can often be an
66
iterative process, where the original model is inspected, converted, updates
77
made to llama.cpp, converted again, etc. Once the model has been converted it
88
needs to be verified against the original model, and then optionally quantized,
9-
and is some cases perplexity checked of the quantized model. And finally the
9+
and in some cases perplexity checked of the quantized model. And finally the
1010
model/models need to be uploaded to ggml-org on Hugging Face. This tool/example tries to
1111
help with this process.
1212

@@ -62,7 +62,7 @@ Command line arguments take precedence over environment variables when both are
6262

6363
In cases where the transformer implementation for the model has not been released
6464
yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
65-
will the cause the transformer implementation to be loaded explicitely and not
65+
will then cause the transformer implementation to be loaded explicitly and not
6666
use AutoModelForCausalLM:
6767
```
6868
export UNRELEASED_MODEL_NAME=SomeNewModel
@@ -87,7 +87,7 @@ from the converted model.
8787
# Or using command line argument
8888
(venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
8989
```
90-
This command will save two file to the `data` directory, one is a binary file
90+
This command will save two files to the `data` directory, one is a binary file
9191
containing logits which will be used for comparison with the converted model
9292
later, and the other is a text file which allows for manual visual inspection.
9393

@@ -128,11 +128,11 @@ Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
128128
Export the quantized model path to QUANTIZED_MODEL variable in your environment
129129
```
130130
This will show the path to the quantized model in the terminal, which can then
131-
be used set the `QUANTIZED_MODEL` environment variable:
131+
be used to set the `QUANTIZED_MODEL` environment variable:
132132
```console
133133
export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
134134
```
135-
The the quantized model can be run using the following command:
135+
Then the quantized model can be run using the following command:
136136
```console
137137
(venv) $ make causal-run-quantized-model
138138
```
@@ -229,11 +229,11 @@ Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
229229
Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment
230230
```
231231
This will show the path to the quantized model in the terminal, which can then
232-
be used set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
232+
be used to set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
233233
```console
234234
export QUANTIZED_EMBEDDING_MODEL=/path/to/quantized/model-Q8_0.gguf
235235
```
236-
The the quantized model can be run using the following command:
236+
Then the quantized model can be run using the following command:
237237
```console
238238
(venv) $ make embedding-run-quantized-model
239239
```
@@ -246,7 +246,7 @@ token/logits file:
246246
```console
247247
(venv) $ make perplexity-run QUANTIZED_MODEL=~/path/to/quantized/model.gguf
248248
```
249-
This will use the wikitext dataset to run the perplexity evaluation and and
249+
This will use the wikitext dataset to run the perplexity evaluation and
250250
output the perplexity score to the terminal. This value can then be compared
251251
with the perplexity score of the unquantized model.
252252

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 58 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,6 +1193,10 @@ struct ggml_backend_vk_context {
11931193
vk::Fence fence, almost_ready_fence;
11941194
bool almost_ready_fence_pending {};
11951195

1196+
// Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
1197+
vk_pipeline_struct * prealloc_y_last_pipeline_used {};
1198+
const ggml_tensor * prealloc_y_last_tensor_used {};
1199+
11961200
vk_buffer buffer_pool[MAX_VK_BUFFERS];
11971201

11981202
vk_context_ref compute_ctx;
@@ -5651,10 +5655,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
56515655
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
56525656
}
56535657
if (y_non_contig) {
5654-
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5658+
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
5659+
ctx->prealloc_y_last_tensor_used != src1) {
5660+
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5661+
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
5662+
ctx->prealloc_y_last_tensor_used = src1;
5663+
}
56555664
}
56565665
if (quantize_y) {
5657-
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
5666+
if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
5667+
ctx->prealloc_y_last_tensor_used != src1) {
5668+
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
5669+
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
5670+
ctx->prealloc_y_last_tensor_used = src1;
5671+
}
56585672
}
56595673

56605674
uint32_t stride_batch_x = ne00*ne01;
@@ -5829,7 +5843,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
58295843
}
58305844
if (y_non_contig) {
58315845
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
5832-
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5846+
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
5847+
ctx->prealloc_y_last_tensor_used != src1) {
5848+
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
5849+
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
5850+
ctx->prealloc_y_last_tensor_used = src1;
5851+
}
58335852
}
58345853

58355854
// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
@@ -6259,7 +6278,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
62596278
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
62606279
}
62616280
if (y_non_contig) {
6262-
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6281+
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
6282+
ctx->prealloc_y_last_tensor_used != src1) {
6283+
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6284+
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
6285+
ctx->prealloc_y_last_tensor_used = src1;
6286+
}
62636287
}
62646288

62656289
uint32_t stride_batch_x = ne00*ne01;
@@ -6447,7 +6471,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
64476471
}
64486472
if (y_non_contig) {
64496473
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
6450-
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6474+
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
6475+
ctx->prealloc_y_last_tensor_used != src1) {
6476+
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
6477+
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
6478+
ctx->prealloc_y_last_tensor_used = src1;
6479+
}
64516480
}
64526481

64536482
uint32_t stride_batch_y = ne10*ne11;
@@ -6491,22 +6520,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
64916520
GGML_ASSERT(nei0 <= 4096);
64926521
const uint32_t split_size = std::min(nei1, 4096u / nei0);
64936522

6494-
ggml_tensor src1_copy = *src1;
6495-
ggml_tensor src2_copy = *src2;
6496-
ggml_tensor dst_copy = *dst;
6523+
if (split_size == nei1) {
6524+
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
6525+
} else {
6526+
ggml_tensor src1_copy = *src1;
6527+
ggml_tensor src2_copy = *src2;
6528+
ggml_tensor dst_copy = *dst;
64976529

6498-
for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
6499-
const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
6530+
for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
6531+
const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
65006532

6501-
src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
6502-
src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
6503-
dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
6533+
src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
6534+
src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
6535+
dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
65046536

6505-
src1_copy.ne[2] = n_tokens;
6506-
src2_copy.ne[1] = n_tokens;
6507-
dst_copy.ne[2] = n_tokens;
6537+
src1_copy.ne[2] = n_tokens;
6538+
src2_copy.ne[1] = n_tokens;
6539+
dst_copy.ne[2] = n_tokens;
65086540

6509-
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
6541+
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
6542+
// invalidate cached prealloc_y, can't cache based on the copy of the ggml_tensor
6543+
ctx->prealloc_y_last_pipeline_used = {};
6544+
ctx->prealloc_y_last_tensor_used = nullptr;
6545+
}
65106546
}
65116547
}
65126548
}
@@ -10311,6 +10347,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
1031110347
ggml_vk_pool_free(ctx, buffer);
1031210348
}
1031310349
ctx->gc.temp_buffers.clear();
10350+
ctx->prealloc_y_last_pipeline_used = {};
1031410351

1031510352
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
1031610353
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
@@ -10346,6 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
1034610383
ggml_vk_destroy_buffer(ctx->prealloc_x);
1034710384
ggml_vk_destroy_buffer(ctx->prealloc_y);
1034810385
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
10386+
ctx->prealloc_y_last_pipeline_used = nullptr;
1034910387

1035010388
for (auto& buffer : ctx->buffer_pool) {
1035110389
ggml_vk_destroy_buffer(buffer);
@@ -10894,6 +10932,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1089410932
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
1089510933
}
1089610934

10935+
ctx->prealloc_y_last_pipeline_used = nullptr;
10936+
ctx->prealloc_y_last_tensor_used = nullptr;
10937+
1089710938
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
1089810939
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
1089910940
// (and scaled down based on model size, so smaller models submit earlier).

tests/test-backend-ops.cpp

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3098,9 +3098,10 @@ struct test_mul_mat : public test_case {
30983098
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
30993099
const std::array<int64_t, 4> per; // permutation of dimensions
31003100
const bool v; // whether a and b are non-contiguous views
3101+
const uint32_t o; // number of outputs
31013102

31023103
std::string vars() override {
3103-
return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
3104+
return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
31043105
}
31053106

31063107
double max_nmse_err() override {
@@ -3121,8 +3122,8 @@ struct test_mul_mat : public test_case {
31213122
std::array<int64_t, 2> bs = {10, 10},
31223123
std::array<int64_t, 2> nr = {2, 2},
31233124
std::array<int64_t, 4> per = {0, 1, 2, 3},
3124-
bool v = false)
3125-
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
3125+
bool v = false, uint32_t o = 1)
3126+
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}
31263127

31273128
ggml_tensor * build_graph(ggml_context * ctx) override {
31283129
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -3186,9 +3187,21 @@ struct test_mul_mat : public test_case {
31863187

31873188
ggml_tensor * out = ggml_mul_mat(ctx, a, b);
31883189
ggml_set_name(out, "out");
3190+
for (uint32_t i = 1; i < o; ++i) {
3191+
ggml_tensor * out2 = ggml_mul_mat(ctx, a, b);
3192+
ggml_set_name(out2, "out2");
3193+
out = ggml_add(ctx, out, out2);
3194+
}
31893195

31903196
return out;
31913197
}
3198+
3199+
bool run_whole_graph() override { return o > 1; }
3200+
3201+
std::string op_desc(ggml_tensor * t) override {
3202+
GGML_UNUSED(t);
3203+
return ggml_op_name(GGML_OP_MUL_MAT);
3204+
}
31923205
};
31933206

31943207
// GGML_OP_MUL_MAT_ID
@@ -3201,9 +3214,10 @@ struct test_mul_mat_id : public test_case {
32013214
const int64_t m;
32023215
const int64_t n;
32033216
const int64_t k;
3217+
const uint32_t o; // number of outputs
32043218

32053219
std::string vars() override {
3206-
return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
3220+
return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o);
32073221
}
32083222

32093223
double max_nmse_err() override {
@@ -3217,9 +3231,9 @@ struct test_mul_mat_id : public test_case {
32173231

32183232
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
32193233
int n_mats = 8, int n_used = 2, bool b = false,
3220-
int64_t m = 32, int64_t n = 32, int64_t k = 32)
3234+
int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1)
32213235
: type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
3222-
m(m), n(n), k(k) {
3236+
m(m), n(n), k(k), o(o) {
32233237
GGML_ASSERT(n_used <= n_mats);
32243238
}
32253239

@@ -3241,6 +3255,13 @@ struct test_mul_mat_id : public test_case {
32413255
ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
32423256
ggml_set_name(out, "out");
32433257

3258+
for (uint32_t i = 1; i < o; ++i) {
3259+
ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
3260+
ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
3261+
ggml_set_name(out2, "out2");
3262+
out = ggml_add(ctx, out, out2);
3263+
}
3264+
32443265
return out;
32453266
}
32463267

@@ -3264,6 +3285,13 @@ struct test_mul_mat_id : public test_case {
32643285
}
32653286
}
32663287
}
3288+
3289+
bool run_whole_graph() override { return o > 1; }
3290+
3291+
std::string op_desc(ggml_tensor * t) override {
3292+
GGML_UNUSED(t);
3293+
return ggml_op_name(GGML_OP_MUL_MAT_ID);
3294+
}
32673295
};
32683296

32693297
// GGML_OP_OUT_PROD
@@ -5798,6 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
57985826
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
57995827
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
58005828
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
5829+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3));
58015830

58025831
for (auto bs2 : {1,3}) {
58035832
for (auto bs : {1,2,4,8}) {
@@ -5826,6 +5855,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
58265855
}
58275856

58285857
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
5858+
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));
58295859

58305860
for (ggml_type type_a : base_types) {
58315861
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {

0 commit comments

Comments
 (0)