Merge branch 'ggml-org:master' into master

Thireus · web-flow · commit ecab78cf8866 · 2025-08-21T15:58:14.000+01:00
diff --git a/examples/model-conversion/README.md b/examples/model-conversion/README.md
@@ -6,7 +6,7 @@ The motivation for having this is that the conversion process can often be an
 iterative process, where the original model is inspected, converted, updates
 made to llama.cpp, converted again, etc. Once the model has been converted it
 needs to be verified against the original model, and then optionally quantified,
-and is some cases perplexity checked of the quantized model. And finally the
+and in some cases perplexity checked of the quantized model. And finally the
 model/models need to the ggml-org on Hugging Face. This tool/example tries to
 help with this process.
 
@@ -62,7 +62,7 @@ Command line arguments take precedence over environment variables when both are
 
 In cases where the transformer implementation for the model has not been released
 yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
-will the cause the transformer implementation to be loaded explicitely and not
+will then cause the transformer implementation to be loaded explicitely and not
 use AutoModelForCausalLM:
 ```
 export UNRELEASED_MODEL_NAME=SomeNewModel
@@ -87,7 +87,7 @@ from the converted model.
 # Or using command line argument
 (venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
 ```
-This command will save two file to the `data` directory, one is a binary file
+This command will save two files to the `data` directory, one is a binary file
 containing logits which will be used for comparison with the converted model
 later, and the other is a text file which allows for manual visual inspection.
 
@@ -128,11 +128,11 @@ Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
 Export the quantized model path to QUANTIZED_MODEL variable in your environment
 ```
 This will show the path to the quantized model in the terminal, which can then
-be used set the `QUANTIZED_MODEL` environment variable:
+be used to set the `QUANTIZED_MODEL` environment variable:
 ```console
 export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
 ```
-The the quantized model can be run using the following command:
+Then the quantized model can be run using the following command:
 ```console
 (venv) $ make causal-run-quantized-model
 ```
@@ -229,11 +229,11 @@ Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
 Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment
 ```
 This will show the path to the quantized model in the terminal, which can then
-be used set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
+be used to set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
 ```console
 export QUANTIZED_EMBEDDING_MODEL=/path/to/quantized/model-Q8_0.gguf
 ```
-The the quantized model can be run using the following command:
+Then the quantized model can be run using the following command:
 ```console
 (venv) $ make embedding-run-quantized-model
 ```
@@ -246,7 +246,7 @@ token/logits file:
 ```console
 (venv) $ make perplexity-run QUANTIZED_MODEL=~/path/to/quantized/model.gguf
 ```
-This will use the wikitext dataset to run the perplexity evaluation and and
+This will use the wikitext dataset to run the perplexity evaluation and
 output the perplexity score to the terminal. This value can then be compared
 with the perplexity score of the unquantized model.
 
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1193,6 +1193,10 @@ struct ggml_backend_vk_context {
     vk::Fence fence, almost_ready_fence;
     bool almost_ready_fence_pending {};
 
+    // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
+    vk_pipeline_struct * prealloc_y_last_pipeline_used {};
+    const ggml_tensor * prealloc_y_last_tensor_used {};
+
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
     vk_context_ref compute_ctx;
@@ -5651,10 +5655,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
     if (quantize_y) {
-        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+        if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+            ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     uint32_t stride_batch_x = ne00*ne01;
@@ -5829,7 +5843,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
@@ -6259,7 +6278,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     uint32_t stride_batch_x = ne00*ne01;
@@ -6447,7 +6471,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     uint32_t stride_batch_y = ne10*ne11;
@@ -6491,22 +6520,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
         GGML_ASSERT(nei0 <= 4096);
         const uint32_t split_size = std::min(nei1, 4096u / nei0);
 
-        ggml_tensor src1_copy = *src1;
-        ggml_tensor src2_copy = *src2;
-        ggml_tensor dst_copy = *dst;
+        if (split_size == nei1) {
+            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+        } else {
+            ggml_tensor src1_copy = *src1;
+            ggml_tensor src2_copy = *src2;
+            ggml_tensor dst_copy = *dst;
 
-        for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
-            const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
+            for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
+                const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
 
-            src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
-            src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
-            dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
+                src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
+                src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
+                dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
 
-            src1_copy.ne[2] = n_tokens;
-            src2_copy.ne[1] = n_tokens;
-            dst_copy.ne[2] = n_tokens;
+                src1_copy.ne[2] = n_tokens;
+                src2_copy.ne[1] = n_tokens;
+                dst_copy.ne[2] = n_tokens;
 
-            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+                ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+                // invalidate cached prealloc_y, can't cache based on the copy of the ggml_tensor
+                ctx->prealloc_y_last_pipeline_used = {};
+                ctx->prealloc_y_last_tensor_used = nullptr;
+            }
         }
     }
 }
@@ -10311,6 +10347,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_pool_free(ctx, buffer);
     }
     ctx->gc.temp_buffers.clear();
+    ctx->prealloc_y_last_pipeline_used = {};
 
     ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
     ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
@@ -10346,6 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+    ctx->prealloc_y_last_pipeline_used = nullptr;
 
     for (auto& buffer : ctx->buffer_pool) {
         ggml_vk_destroy_buffer(buffer);
@@ -10894,6 +10932,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
     }
 
+    ctx->prealloc_y_last_pipeline_used = nullptr;
+    ctx->prealloc_y_last_tensor_used = nullptr;
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -3098,9 +3098,10 @@ struct test_mul_mat : public test_case {
     const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4
     const std::array<int64_t, 4> per; // permutation of dimensions
     const bool v; // whether a and b are non-contiguous views
+    const uint32_t o; // number of outputs
 
     std::string vars() override {
-        return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
+        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
     }
 
     double max_nmse_err() override {
@@ -3121,8 +3122,8 @@ struct test_mul_mat : public test_case {
             std::array<int64_t, 2> bs = {10, 10},
             std::array<int64_t, 2> nr = {2, 2},
             std::array<int64_t, 4> per = {0, 1, 2, 3},
-            bool v = false)
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
+            bool v = false, uint32_t o = 1)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -3186,9 +3187,21 @@ struct test_mul_mat : public test_case {
 
         ggml_tensor * out = ggml_mul_mat(ctx, a, b);
         ggml_set_name(out, "out");
+        for (uint32_t i = 1; i < o; ++i) {
+            ggml_tensor * out2 = ggml_mul_mat(ctx, a, b);
+            ggml_set_name(out2, "out2");
+            out = ggml_add(ctx, out, out2);
+        }
 
         return out;
     }
+
+    bool run_whole_graph() override { return o > 1; }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return ggml_op_name(GGML_OP_MUL_MAT);
+    }
 };
 
 // GGML_OP_MUL_MAT_ID
@@ -3201,9 +3214,10 @@ struct test_mul_mat_id : public test_case {
     const int64_t m;
     const int64_t n;
     const int64_t k;
+    const uint32_t o; // number of outputs
 
     std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
+        return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o);
     }
 
     double max_nmse_err() override {
@@ -3217,9 +3231,9 @@ struct test_mul_mat_id : public test_case {
 
     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
             int n_mats = 8, int n_used = 2, bool b = false,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32)
+            int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1)
         : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
-            m(m), n(n), k(k) {
+            m(m), n(n), k(k), o(o) {
             GGML_ASSERT(n_used <= n_mats);
         }
 
@@ -3241,6 +3255,13 @@ struct test_mul_mat_id : public test_case {
         ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
         ggml_set_name(out, "out");
 
+        for (uint32_t i = 1; i < o; ++i) {
+            ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
+            ggml_set_name(out2, "out2");
+            out = ggml_add(ctx, out, out2);
+        }
+
         return out;
     }
 
@@ -3264,6 +3285,13 @@ struct test_mul_mat_id : public test_case {
             }
         }
     }
+
+    bool run_whole_graph() override { return o > 1; }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return ggml_op_name(GGML_OP_MUL_MAT_ID);
+    }
 };
 
 // GGML_OP_OUT_PROD
@@ -5798,6 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45,  64, { 8,  1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1,  1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67,  {1,  1}, {4, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1,  1}, {1, 1}, {0, 1, 2, 3}, true, 3));
 
     for (auto bs2 : {1,3}) {
         for (auto bs : {1,2,4,8}) {
@@ -5826,6 +5855,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     }
 
     test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
+    test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));
 
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {