From b85cddb88438d7a11d3b572e8b026886025de614 Mon Sep 17 00:00:00 2001
From: 65a <10104049+65a@users.noreply.github.com>
Date: Sat, 1 Nov 2025 14:18:47 -0700
Subject: [PATCH 1/3] Remove n_embd hack from llama-models.cpp

---
 src/llama-model.cpp | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 04239181c7765..6da5841bc6139 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
@@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {
@@ -3332,10 +3326,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3VL:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
+                int64_t n_embd = hparams.n_embd;
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                 // output
@@ -3371,10 +3362,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
+                int64_t n_embd = hparams.n_embd;
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                 // output

From 981d578bc3b26a01bb9029d83d85bcf38ec84401 Mon Sep 17 00:00:00 2001
From: 65a <10104049+65a@users.noreply.github.com>
Date: Sat, 1 Nov 2025 14:21:09 -0700
Subject: [PATCH 2/3] Remove embd hack from qwen3vl.cpp

---
 src/models/qwen3vl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp
index 10b36c1f65e91..15a6c66db3e81 100644
--- a/src/models/qwen3vl.cpp
+++ b/src/models/qwen3vl.cpp
@@ -2,9 +2,9 @@
 
 
 llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+    const int64_t n_embd_full = hparams.n_embd * (n_deepstack_layers + 1); // main embd + deepstack embds
+    const int64_t n_embd = hparams.n_embd;
     const int64_t n_embd_head = hparams.n_embd_head_v;
 
 
@@ -23,9 +23,9 @@
 
     if (ubatch.embd) {
         // Image input: split main embd and deepstack embds
-        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
+        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], 0);
         for (size_t i = 0; i < n_deepstack_layers; i++) {
-            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
+            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
         }
         inpL = inpL_main;
     }

From af5f50f16375c7e80c44c333c292263da46e7908 Mon Sep 17 00:00:00 2001
From: 65a <10104049+65a@users.noreply.github.com>
Date: Sat, 1 Nov 2025 14:22:28 -0700
Subject: [PATCH 3/3] Remove embd hack from qwen3vl-moe.cpp

---
 src/models/qwen3vl-moe.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp
index c48643c0cd140..74dcf95b37785 100644
--- a/src/models/qwen3vl-moe.cpp
+++ b/src/models/qwen3vl-moe.cpp
@@ -1,9 +1,9 @@
 #include "models.h"
 
 llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+    const int64_t n_embd_full = hparams.n_embd * (n_deepstack_layers + 1); // main embd + deepstack embds
+    const int64_t n_embd = hparams.n_embd;
     const int64_t n_embd_head = hparams.n_embd_head_v;
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -21,9 +21,9 @@
 
     if (ubatch.embd) {
         // Image input: split main embd and deepstack embds
-        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
+        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], 0);
         for (size_t i = 0; i < n_deepstack_layers; i++) {
-            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
+            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
         }
         inpL = inpL_main;
     }
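Note for reviewers: below is a minimal standalone sketch of the memory layout the deepstack split relies on. It is plain C++ with toy values, uses no ggml, and is not part of the patch; all names and numbers in it are illustrative. With the hparams hack removed, hparams.n_embd is the plain text-model width, while the vision encoder still stacks the deepstack features along the feature dimension, so each image-token row carries n_embd * (n_deepstack_layers + 1) floats, and the slice for deepstack layer i begins at byte offset (i + 1) * n_embd * sizeof(float), which is the offset passed to ggml_view_2d in the hunks above.

// Illustrative sketch only (not llama.cpp code): one row per token laid out as
// [ main embd | deepstack 0 | deepstack 1 | ... ], row stride = n_embd_full floats.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_embd             = 4; // per-layer text embedding width (toy value)
    const size_t  n_deepstack_layers = 2; // number of stacked deepstack blocks (toy value)
    const int64_t n_embd_full        = n_embd * (n_deepstack_layers + 1);
    const int64_t n_tokens           = 3;

    // Fill the stacked buffer so that element j of token t holds 100*t + j.
    std::vector<float> buf(n_embd_full * n_tokens);
    for (int64_t t = 0; t < n_tokens; ++t) {
        for (int64_t j = 0; j < n_embd_full; ++j) {
            buf[t * n_embd_full + j] = 100.0f * t + j;
        }
    }

    // Byte offsets mirror the ones passed to ggml_view_2d in the patch:
    // main part at offset 0, deepstack slice i at (i + 1) * n_embd * sizeof(float);
    // the row stride in bytes plays the role of inpL->nb[1].
    const size_t row_stride = n_embd_full * sizeof(float);
    for (int64_t t = 0; t < n_tokens; ++t) {
        const char  * row       = reinterpret_cast<const char *>(buf.data()) + t * row_stride;
        const float * main_part = reinterpret_cast<const float *>(row);
        std::printf("token %lld: main starts at %.0f\n", (long long) t, main_part[0]);
        for (size_t i = 0; i < n_deepstack_layers; ++i) {
            const float * ds = reinterpret_cast<const float *>(row + (i + 1) * n_embd * sizeof(float));
            std::printf("  deepstack %zu starts at %.0f\n", i, ds[0]);
        }
    }
    return 0;
}

Compiling and running the sketch prints the first element of the main part and of each deepstack slice per token, which makes it easy to check the offset arithmetic against the views in qwen3vl.cpp and qwen3vl-moe.cpp.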