From b85cddb88438d7a11d3b572e8b026886025de614 Mon Sep 17 00:00:00 2001
From: 65a <10104049+65a@users.noreply.github.com>
Date: Sat, 1 Nov 2025 14:18:47 -0700
Subject: [PATCH 1/3] Remove n_embd hack from llama-models.cpp

---
 src/llama-model.cpp | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 04239181c7765..6da5841bc6139 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
@@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {
@@ -3332,10 +3326,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3VL:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
+                int64_t n_embd = hparams.n_embd;
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                 // output
@@ -3371,10 +3362,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
+                int64_t n_embd = hparams.n_embd;
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                 // output

From 981d578bc3b26a01bb9029d83d85bcf38ec84401 Mon Sep 17 00:00:00 2001
From: 65a <10104049+65a@users.noreply.github.com>
Date: Sat, 1 Nov 2025 14:21:09 -0700
Subject: [PATCH 2/3] Remove embd hack from qwen3vl.cpp

---
 src/models/qwen3vl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp
index 10b36c1f65e91..15a6c66db3e81 100644
--- a/src/models/qwen3vl.cpp
+++ b/src/models/qwen3vl.cpp
@@ -2,9 +2,9 @@
 
 
 llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+    const int64_t n_embd_full = hparams.n_embd * (n_deepstack_layers + 1); // main embd + deepstack embds
+    const int64_t n_embd = hparams.n_embd;
     const int64_t n_embd_head = hparams.n_embd_head_v;
 
 
@@ -23,9 +23,9 @@
 
     if (ubatch.embd) {
         // Image input: split main embd and deepstack embds
-        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
+        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], 0);
         for (size_t i = 0; i < n_deepstack_layers; i++) {
-            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
+            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
         }
         inpL = inpL_main;
     }

From af5f50f16375c7e80c44c333c292263da46e7908 Mon Sep 17 00:00:00 2001
From: 65a <10104049+65a@users.noreply.github.com>
Date: Sat, 1 Nov 2025 14:22:28 -0700
Subject: [PATCH 3/3] Remove embd hack from qwen3vl-moe.cpp

---
 src/models/qwen3vl-moe.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp
index c48643c0cd140..74dcf95b37785 100644
--- a/src/models/qwen3vl-moe.cpp
+++ b/src/models/qwen3vl-moe.cpp
@@ -1,9 +1,9 @@
 #include "models.h"
 
 llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+    const int64_t n_embd_full = hparams.n_embd * (n_deepstack_layers + 1); // main embd + deepstack embds
+    const int64_t n_embd = hparams.n_embd;
     const int64_t n_embd_head = hparams.n_embd_head_v;
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -21,9 +21,9 @@
 
     if (ubatch.embd) {
         // Image input: split main embd and deepstack embds
-        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
+        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], 0);
         for (size_t i = 0; i < n_deepstack_layers; i++) {
-            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
+            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
         }
         inpL = inpL_main;
     }
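Note for reviewers: below is a minimal standalone sketch of the memory layout the deepstack split relies on. It is plain C++ with toy values, uses no ggml, and is not part of the patch; all names and numbers in it are illustrative. With the hparams hack removed, hparams.n_embd is the plain text-model width, while the vision encoder still stacks the deepstack features along the feature dimension, so each image-token row carries n_embd * (n_deepstack_layers + 1) floats, and the slice for deepstack layer i begins at byte offset (i + 1) * n_embd * sizeof(float), which is the offset passed to ggml_view_2d in the hunks above.

// Illustrative sketch only (not llama.cpp code): one row per token laid out as
// [ main embd | deepstack 0 | deepstack 1 | ... ], row stride = n_embd_full floats.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_embd             = 4; // per-layer text embedding width (toy value)
    const size_t  n_deepstack_layers = 2; // number of stacked deepstack blocks (toy value)
    const int64_t n_embd_full        = n_embd * (n_deepstack_layers + 1);
    const int64_t n_tokens           = 3;

    // Fill the stacked buffer so that element j of token t holds 100*t + j.
    std::vector<float> buf(n_embd_full * n_tokens);
    for (int64_t t = 0; t < n_tokens; ++t) {
        for (int64_t j = 0; j < n_embd_full; ++j) {
            buf[t * n_embd_full + j] = 100.0f * t + j;
        }
    }

    // Byte offsets mirror the ones passed to ggml_view_2d in the patch:
    // main part at offset 0, deepstack slice i at (i + 1) * n_embd * sizeof(float);
    // the row stride in bytes plays the role of inpL->nb[1].
    const size_t row_stride = n_embd_full * sizeof(float);
    for (int64_t t = 0; t < n_tokens; ++t) {
        const char  * row       = reinterpret_cast<const char *>(buf.data()) + t * row_stride;
        const float * main_part = reinterpret_cast<const float *>(row);
        std::printf("token %lld: main starts at %.0f\n", (long long) t, main_part[0]);
        for (size_t i = 0; i < n_deepstack_layers; ++i) {
            const float * ds = reinterpret_cast<const float *>(row + (i + 1) * n_embd * sizeof(float));
            std::printf("  deepstack %zu starts at %.0f\n", i, ds[0]);
        }
    }
    return 0;
}

Compiling and running the sketch prints the first element of the main part and of each deepstack slice per token, which makes it easy to check the offset arithmetic against the views in qwen3vl.cpp and qwen3vl-moe.cpp.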