From ae532eac2c1df1d8edc3d2719145895b966de1bf Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 18 Aug 2025 00:56:29 -0500 Subject: [PATCH 01/10] vulkan: disable spirv-opt for bfloat16 shaders (#15352) --- ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 6cc474b90a377..75c572d6fc904 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -223,7 +223,8 @@ void string_to_spv_func(const std::string& _name, const std::string& in_fname, c std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734 - std::string opt_level = coopmat ? "" : "-O"; + // disable spirv-opt for bf16 shaders for https://github.com/ggml-org/llama.cpp/issues/15344 + std::string opt_level = (coopmat || name.find("bf16") != std::string::npos) ? "" : "-O"; #ifdef _WIN32 std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; From f44f7931729022c57319a0124931120a169e0da9 Mon Sep 17 00:00:00 2001 From: compilade Date: Mon, 18 Aug 2025 03:23:56 -0400 Subject: [PATCH 02/10] ggml-quants : fix make_qp_quants NANs and IQ1 assertion errors (#15379) * ggml-quants : fix make_qp_quants NANs and IQ1 assertion errors * ggml-quants : avoid division by zero in make_q3_quants --- ggml/src/ggml-quants.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 94f6405ca1e05..727932123e41b 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -566,7 +566,7 @@ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8 for (int i = 0; i < n; ++i) { L[i] += nmax; } - return sumlx / suml2; + return suml2 > 0.0f ? sumlx / suml2 : 0.0f; } for (int i = 0; i < n; ++i) { int l = nearest_int(iscale * x[i]); @@ -901,7 +901,7 @@ static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint for (int i = 0; i < n; ++i) { max = MAX(max, x[i]); } - if (!max) { // all zero + if (max < GROUP_MAX_EPS) { // all zero for (int i = 0; i < n; ++i) { L[i] = 0; } return 0.f; } @@ -966,7 +966,7 @@ static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint break; } } - return sumlx/suml2; + return suml2 > 0.0f ? 
sumlx / suml2 : 0.0f; } static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) { @@ -4266,7 +4266,7 @@ static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_R sumw[j+1] = sumw[j] + weight[i]; } } - float best_score = -FLT_MIN, scale = max; + float best_score = -FLT_MAX, scale = max; int besti1 = -1, besti2 = -1, best_shift = 0; for (int i1 = 0; i1 <= block_size; ++i1) { for (int i2 = i1; i2 <= block_size; ++i2) { @@ -4442,7 +4442,7 @@ static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_R idx[2*j] = j; } qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); - float best_score = -FLT_MIN, scale = max; + float best_score = -FLT_MAX, scale = max; int besti1 = -1, besti2 = -1, best_k = -1; // 0: +, + // 1: +, - From 618575c5825d7d4f170e686e772178d2aae148ae Mon Sep 17 00:00:00 2001 From: Dobri Danchev <12420863+danchev@users.noreply.github.com> Date: Mon, 18 Aug 2025 05:50:48 -0500 Subject: [PATCH 03/10] Fix broken build: require updated pip to support --break-system-packages (#15357) * Revert "devops : fix compile bug when the BASE_CUDA_DEV_CONTAINER is based on Ubuntu 24.04 (#15005)" This reverts commit e4e915912cfd2ee15c5a4a0074813232134892f6. * devops: Allow pip to modify externally-managed python environment (system installation) - Updated pip install commands to include the --break-system-packages flag, ensuring compatibility when working with system-managed Python environments (PEP 668). - Note: The --break-system-packages option was introduced in 2023. Ensure pip is updated to a recent version before using this flag. fixes [#15004](https://github.com/danchev/llama.cpp/issues/15004) --- .devops/cuda.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index f9dc7f3163b69..4b708ae278ddf 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -60,6 +60,7 @@ RUN apt-get update \ git \ python3 \ python3-pip \ + && pip install --upgrade pip setuptools wheel \ && pip install --break-system-packages -r requirements.txt \ && apt autoremove -y \ && apt clean -y \ From d1d82416006e7ff41780cb0e9b5f28d30a267497 Mon Sep 17 00:00:00 2001 From: davidef Date: Mon, 18 Aug 2025 16:51:42 +0200 Subject: [PATCH 04/10] server : fix incoming tasks not process in order (#15395) --- tools/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 24bfae78919a4..ab88f3d26c544 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1729,7 +1729,7 @@ struct server_queue { void pop_deferred_task() { std::unique_lock lock(mutex_tasks); if (!queue_tasks_deferred.empty()) { - queue_tasks.emplace_back(std::move(queue_tasks_deferred.front())); + queue_tasks.emplace_front(std::move(queue_tasks_deferred.front())); queue_tasks_deferred.pop_front(); } condition_tasks.notify_one(); From 3007baf201e7ffcda17dbdb0335997fa50a6595b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Aug 2025 18:11:44 +0300 Subject: [PATCH 05/10] readme : update hot topics (#15397) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 11d92907862ad..844675638427b 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ LLM inference in C/C++ ## Hot topics +- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)** - **[[FEEDBACK] Better packaging 
for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)** - Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095) - Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen) From baa9255a45105d2d3b4ec432af13b7a6eda3ff35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 18 Aug 2025 19:30:17 +0200 Subject: [PATCH 06/10] llama : merge conts and reshapes and remove unnecessary cont (#15380) * remove unnecessary conts and merge reshapes * restore necessary conts * merge more conts and reshapes * merge even more conts and reshapes --- src/llama-model.cpp | 143 +++++++++++++++++++++++--------------------- tools/mtmd/clip.cpp | 45 +++++++------- 2 files changed, 96 insertions(+), 92 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 23a26f0c64ea6..0c149cd478e9c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6743,9 +6743,9 @@ struct llm_build_falcon : public llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode Qcur = ggml_rope_ext( @@ -7023,9 +7023,9 @@ struct llm_build_dbrx : public llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -7145,13 +7145,13 @@ struct llm_build_starcoder : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, 
cur->nb[1], 1*sizeof(float)*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -7367,13 +7367,15 @@ struct llm_build_bert : public llm_graph_context { cb(cur, "bqkv", il); } - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } if (model.layers[il].attn_q_norm) { @@ -7381,6 +7383,10 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + } else { + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); } if (model.layers[il].attn_k_norm) { @@ -7388,11 +7394,11 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il); - } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + } else { + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + } // RoPE if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { @@ -7537,9 +7543,9 @@ struct llm_build_neo_bert : public llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // RoPE Qcur = ggml_rope_ext( @@ -7646,13 +7652,13 @@ struct llm_build_bloom : public 
llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -7770,7 +7776,7 @@ struct llm_build_mpt : public llm_graph_context { ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -7789,17 +7795,18 @@ struct llm_build_mpt : public llm_graph_context { model.layers[il].attn_k_norm_b, LLM_NORM, il); cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { - Qcur = ggml_cont(ctx0, Qcur); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - Kcur = ggml_cont(ctx0, Kcur); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); cb(Kcur, "Kcur", il); } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -8051,9 +8058,9 @@ struct llm_build_qwen : public llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode Qcur = ggml_rope_ext( @@ -9026,21 +9033,21 @@ struct llm_build_phi2 : public 
llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -9164,21 +9171,21 @@ struct llm_build_phi3 : public llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -9428,17 +9435,17 @@ struct llm_build_gpt2 : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, 
n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, @@ -9534,9 +9541,9 @@ struct llm_build_codeshell : public llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -10864,8 +10871,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens] all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0 cb(all_coefs, "all_coefs", il); - all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup] - all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup] + all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup] + all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup] innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1); ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup] @@ -12278,9 +12285,9 @@ struct llm_build_gptneox : public llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -13413,17 +13420,17 @@ struct llm_build_jais : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, 
cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, @@ -13526,6 +13533,7 @@ struct llm_build_chatglm : public llm_graph_context { } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -13535,11 +13543,10 @@ struct llm_build_chatglm : public llm_graph_context { } Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -13660,6 +13667,7 @@ struct llm_build_glm4 : public llm_graph_context { } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -13669,11 +13677,10 @@ struct llm_build_glm4 : public llm_graph_context { } Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -16840,13 +16847,13 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * 
sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv)); ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens); Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); @@ -16913,15 +16920,13 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { cb(zx, "mamba_in_proj", il); // {8192, 5, 1, 1} -> {8192, 1, 5, 1} zx = ggml_permute(ctx0, zx, 0, 2, 1, 3); - zx = ggml_cont(ctx0, zx); - zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs); + zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs); cb(zx, "mamba_in_proj_out", il); // split into z and x // => {head_dim * n_heads, n_seq_tokens, n_seqs} ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx)); - x = ggml_cont(ctx0, x); - x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs); + x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs); // x = ggml_permute(ctx0, x, 0, 2, 1, 3); cb(x, "mamba_x_split", il); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index c27f8ebbd9912..099e28237072f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -508,8 +508,8 @@ struct clip_graph { const int patches_per_image = n_patches_x; const int kernel_size = hparams.proj_scale_factor; - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); // doing a pool2d to reduce the number of output tokens cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); @@ -537,13 +537,13 @@ struct clip_graph { GGML_ASSERT(scale_factor != 0); cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, seq / (scale_factor * scale_factor), bsz); @@ -570,13 +570,13 @@ struct clip_graph { // unshuffle h cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // unshuffle w - cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / 
scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); // projection cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm @@ -715,15 +715,15 @@ struct clip_graph { auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_reshape_4d( + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); inp = ggml_reshape_4d( ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); - inp = ggml_reshape_3d( + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size); } @@ -988,14 +988,14 @@ struct clip_graph { GGML_ASSERT(scale_factor > 0); cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // flatten to 2D - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]); } @@ -1081,14 +1081,14 @@ struct clip_graph { n_patches_y, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, n_patches_x / scale_factor, n_patches_y / scale_factor, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // flatten to 2D - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, n_patches / scale_factor / scale_factor); cb(cur, "pixel_shuffle", -1); @@ -1321,8 +1321,8 @@ struct clip_graph { ggml_tensor * block_1 = nullptr; { // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); - mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); @@ -1427,9 +1427,9 @@ struct clip_graph { mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); // mlp_2 ne = [2048, 576, 1, 1] // // AVG Pool Layer 2*2, strides = 2 - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); // mlp_2 ne [24, 24, 2048, 1] mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] @@ -1449,8 +1449,8 @@ struct clip_graph { // glm projector else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = 
ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); @@ -2005,7 +2005,6 @@ struct clip_graph { ggml_row_size(cur->type, n_dim), ggml_row_size(cur->type, n_dim*n_head), n_dim/2 * ggml_element_size(cur)); - second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors second = ggml_rope_ext( ctx0, second, From f0c541d315e97b297b3421c52ebde53340ee66b3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Aug 2025 20:35:47 +0300 Subject: [PATCH 07/10] scripts : update sync scripts --- scripts/sync-ggml-am.sh | 47 +++-------------------------------------- scripts/sync-ggml.sh | 16 +------------- 2 files changed, 4 insertions(+), 59 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 29d30e0a188a1..826c560cd883a 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -74,21 +74,7 @@ while read c; do cmake/common.cmake \ cmake/ggml-config.cmake.in \ src/ggml-cpu/cmake/FindSIMD.cmake \ - src/ggml*.h \ - src/ggml*.c \ - src/ggml*.cpp \ - src/gguf*.cpp \ - src/ggml-blas/* \ - src/ggml-cann/* \ - src/ggml-cpu/* \ - src/ggml-cuda/* \ - src/ggml-hip/* \ - src/ggml-metal/* \ - src/ggml-musa/* \ - src/ggml-opencl/* \ - src/ggml-rpc/* \ - src/ggml-sycl/* \ - src/ggml-vulkan/* \ + src/ggml* \ include/ggml*.h \ include/gguf*.h \ tests/test-opt.cpp \ @@ -131,21 +117,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # cmake/ggml-config.cmake.in -> ggml/cmake/ggml-config.cmake.in # src/ggml-cpu/cmake/FindSIMD.cmake -> ggml/src/ggml-cpu/cmake/FindSIMD.cmake # - # src/ggml*.c -> ggml/src/ggml*.c - # src/ggml*.cpp -> ggml/src/ggml*.cpp - # src/ggml*.h -> ggml/src/ggml*.h - # src/gguf*.cpp -> ggml/src/gguf*.cpp - # src/ggml-blas/* -> ggml/src/ggml-blas/* - # src/ggml-cann/* -> ggml/src/ggml-cann/* - # src/ggml-cpu/* -> ggml/src/ggml-cpu/* - # src/ggml-cuda/* -> ggml/src/ggml-cuda/* - # src/ggml-hip/* -> ggml/src/ggml-hip/* - # src/ggml-metal/* -> ggml/src/ggml-metal/* - # src/ggml-musa/* -> ggml/src/ggml-musa/* - # src/ggml-opencl/* -> ggml/src/ggml-opencl/* - # src/ggml-rpc/* -> ggml/src/ggml-rpc/* - # src/ggml-sycl/* -> ggml/src/ggml-sycl/* - # src/ggml-vulkan/* -> ggml/src/ggml-vulkan/* + # src/ggml* -> ggml/src/ggml* # # include/ggml*.h -> ggml/include/ggml*.h # include/gguf*.h -> ggml/include/gguf*.h @@ -163,20 +135,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]| [ab]\/)cmake\/common.cmake/\1ggml\/cmake\/common.cmake/g' \ -e 's/([[:space:]]| [ab]\/)cmake\/ggml-config.cmake.in/\1ggml\/cmake\/ggml-config.cmake.in/g' \ -e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\/cmake\/FindSIMD.cmake/\1ggml\/src\/ggml-cpu\/cmake\/FindSIMD.cmake/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \ - -e 's/([[:space:]]| [ab]\/)src\/gguf(.*)\.cpp/\1ggml\/src\/gguf\2.cpp/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \ - -e 
's/([[:space:]]| [ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-opencl\//\1ggml\/src\/ggml-opencl\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \ + -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)/\1ggml\/src\/ggml\2/g' \ -e 's/([[:space:]]| [ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \ -e 's/([[:space:]]| [ab]\/)include\/gguf(.*)\.h/\1ggml\/include\/gguf\2.h/g' \ -e 's/([[:space:]]| [ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 9b98329e09cb6..2da9b57891582 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -6,21 +6,7 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt cp -rpv ../ggml/cmake/* ./ggml/cmake/ cp -rpv ../ggml/src/ggml-cpu/cmake/* ./ggml/src/ggml-cpu/cmake/ -cp -rpv ../ggml/src/ggml*.c ./ggml/src/ -cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/ -cp -rpv ../ggml/src/ggml*.h ./ggml/src/ -cp -rpv ../ggml/src/gguf*.cpp ./ggml/src/ -cp -rpv ../ggml/src/ggml-blas/* ./ggml/src/ggml-blas/ -cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ -cp -rpv ../ggml/src/ggml-cpu/* ./ggml/src/ggml-cpu/ -cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ -cp -rpv ../ggml/src/ggml-hip/* ./ggml/src/ggml-hip/ -cp -rpv ../ggml/src/ggml-metal/* ./ggml/src/ggml-metal/ -cp -rpv ../ggml/src/ggml-musa/* ./ggml/src/ggml-musa/ -cp -rpv ../ggml/src/ggml-opencl/* ./ggml/src/ggml-opencl/ -cp -rpv ../ggml/src/ggml-rpc/* ./ggml/src/ggml-rpc/ -cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ -cp -rpv ../ggml/src/ggml-vulkan/* ./ggml/src/ggml-vulkan/ +cp -rpv ../ggml/src/ggml* ./ggml/src/ cp -rpv ../ggml/include/ggml*.h ./ggml/include/ cp -rpv ../ggml/include/gguf*.h ./ggml/include/ From 60212f1ead2dce9bf1ac69633a7069258ae604d8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Aug 2025 22:02:11 +0300 Subject: [PATCH 08/10] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 7e2875f870f78..22348782d1911 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -b141fc226b68e4af383101c39da90b54ede98850 +323951f1bdcdfbd5b5ff3a9a7c3770e63b1a560e From 6d7f1117e3e3285d0c5c11b5ebb0439e27920082 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Aug 2025 22:02:50 +0300 Subject: [PATCH 09/10] codeowners : remove mmv.* --- CODEOWNERS | 1 - 1 file changed, 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index b698b1f4cbaa1..18564a08b1d6c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -5,7 +5,6 @@ /tools/server/ @ngxson /ggml/src/ggml-cuda/fattn* @JohannesGaessler /ggml/src/ggml-cuda/mmq.* @JohannesGaessler -/ggml/src/ggml-cuda/mmv.* @JohannesGaessler /ggml/src/ggml-cuda/mmvq.* @JohannesGaessler /ggml/src/ggml-opt.cpp @JohannesGaessler /ggml/src/gguf.cpp @JohannesGaessler From f08c4c0d8d0cb6caaf8b7ad316039232b9fa059c Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 18 Aug 2025 22:53:52 +0200 Subject: [PATCH 10/10] mtmd : clean up clip_n_output_tokens 
(#15391) --- tools/mtmd/clip.cpp | 60 +++++++++++++++++++-------------------------- tools/mtmd/clip.h | 5 ---- tools/mtmd/tests.sh | 1 + 3 files changed, 26 insertions(+), 40 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 099e28237072f..a32fe84fa7112 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3648,8 +3648,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->model.hparams; - // only for models using fixed size square images - int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + // for models with fixed size image, the input image is already pre-processed and resized to square + int patch_size = params.patch_size; + int n_patches = (img->nx / patch_size) * (img->ny / patch_size); projector_type proj = ctx->proj_type(); @@ -3663,27 +3664,27 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_LDPV2: case PROJECTOR_TYPE_GLM_EDGE: { - n_patches_sq /= 4; + n_patches /= 4; if (ctx->model.mm_glm_tok_boi) { - n_patches_sq += 2; // for BOI and EOI token embeddings + n_patches += 2; // for BOI and EOI token embeddings } } break; case PROJECTOR_TYPE_MINICPMV: { // Use actual config value if available, otherwise fall back to hardcoded values if (params.minicpmv_query_num > 0) { - n_patches_sq = params.minicpmv_query_num; + n_patches = params.minicpmv_query_num; } else { // Fallback to hardcoded values for legacy models if (params.minicpmv_version == 2) { - n_patches_sq = 96; + n_patches = 96; } else if (params.minicpmv_version == 3) { - n_patches_sq = 64; + n_patches = 64; } else if (params.minicpmv_version == 4) { - n_patches_sq = 64; + n_patches = 64; } else if (params.minicpmv_version == 5) { // MiniCPM-V 4.0 - n_patches_sq = 64; + n_patches = 64; } else { GGML_ABORT("Unknown minicpmv version"); } @@ -3692,67 +3693,56 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { - // dynamic size + // dynamic size (2 conv, so double patch size) int patch_size = params.patch_size * 2; int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); - n_patches_sq = x_patch * y_patch; + n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_GEMMA3: - { - int n_per_side = params.image_size / params.patch_size; - int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; - n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool; - } break; case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_LLAMA4: + case PROJECTOR_TYPE_LFM2: { // both W and H are divided by proj_scale_factor - n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor); + int scale_factor = ctx->model.hparams.proj_scale_factor; + n_patches /= (scale_factor * scale_factor); } break; case PROJECTOR_TYPE_PIXTRAL: { // dynamic size int n_merge = params.spatial_merge_size; - int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); - int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? 
n_merge : 1); - n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row - } break; - case PROJECTOR_TYPE_LLAMA4: - { - int scale_factor = ctx->model.hparams.proj_scale_factor; - n_patches_sq /= (scale_factor * scale_factor); + int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); + n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } break; case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_QWEN2A: { - n_patches_sq = img->nx; + n_patches = img->nx; const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; if (ctx->model.audio_has_stack_frames()) { GGML_ASSERT(proj_stack_factor > 0); - const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor); - n_patches_sq = n_len / proj_stack_factor; + const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor); + n_patches = n_len / proj_stack_factor; } // whisper downscales input token by half after conv1d - n_patches_sq /= 2; + n_patches /= 2; if (ctx->model.audio_has_avgpool()) { // divide by 2 because of nn.AvgPool1d(2, stride=2) - n_patches_sq /= 2; + n_patches /= 2; } } break; - case PROJECTOR_TYPE_LFM2: - { - n_patches_sq = (img->nx / (params.patch_size * params.proj_scale_factor)) * (img->ny / (params.patch_size * params.proj_scale_factor)); - } break; default: GGML_ABORT("unsupported projector type"); } - return n_patches_sq; + return n_patches; } static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 08f3efb7b1daf..3387cdbd36955 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -82,11 +82,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch */ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); -bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); - -/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ -bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); - /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index e73cf96af2941..6f8a5f86ac5b2 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -68,6 +68,7 @@ add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0" add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0" add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M" +add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0" add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0" add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
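
As a quick sanity check of the simplified token counting in PATCH 10/10, here is a minimal arithmetic sketch (the image size, patch size and scale factor below are assumed example values, not taken from any model config): for a 512x512 preprocessed image with patch_size = 16 and proj_scale_factor = 2, the scale-factor branch (IDEFICS3 / INTERNVL / LLAMA4 / LFM2) gives (512/16) * (512/16) = 1024 patches, divided by 2*2 = 4, i.e. 256 output tokens.

    // standalone sketch of the dynamic n_patches computation (assumed example values)
    int nx = 512, ny = 512;      // preprocessed image size (assumed)
    int patch_size = 16;         // assumed patch size
    int scale_factor = 2;        // assumed proj_scale_factor
    int n_patches = (nx / patch_size) * (ny / patch_size);   // 32 * 32 = 1024
    n_patches /= scale_factor * scale_factor;                 // 1024 / 4 = 256 output tokens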