diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 04239181c7765..6da5841bc6139 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } - // since vision model stacks deepstack features along feature dim - // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd *= hparams.n_deepstack_layers + 1; } break; case LLM_ARCH_QWEN3MOE: { @@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } - // since vision model stacks deepstack features along feature dim - // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd *= hparams.n_deepstack_layers + 1; } break; case LLM_ARCH_PHI2: { @@ -3332,10 +3326,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3VL: { - // for model loading, the weights only have the main embd - // so we need to divide by the number of deepstack layers + 1 - // n_embd is const int so we declare a new variable - int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); + int64_t n_embd = hparams.n_embd; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -3371,10 +3362,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: { - // for model loading, the weights only have the main embd - // so we need to divide by the number of deepstack layers + 1 - // n_embd is const int so we declare a new variable - int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); + int64_t n_embd = hparams.n_embd; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp index c48643c0cd140..74dcf95b37785 100644 --- a/src/models/qwen3vl-moe.cpp +++ b/src/models/qwen3vl-moe.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds const size_t n_deepstack_layers = hparams.n_deepstack_layers; - const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); + const int64_t n_embd_full = hparams.n_embd * (n_deepstack_layers + 1); // main embd + deepstack embds + const int64_t n_embd = hparams.n_embd; const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -21,9 +21,9 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_ if (ubatch.embd) { // Image input: split main embd and deepstack embds - ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0); + ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], 0); for (size_t i = 0; i < n_deepstack_layers; i++) { - deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); + deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); } inpL = inpL_main; } diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 10b36c1f65e91..15a6c66db3e81 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -2,9 +2,9 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds const size_t n_deepstack_layers = hparams.n_deepstack_layers; - const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); + const int64_t n_embd_full = hparams.n_embd * (n_deepstack_layers + 1); // main embd + deepstack embds + const int64_t n_embd = hparams.n_embd; const int64_t n_embd_head = hparams.n_embd_head_v; @@ -23,9 +23,9 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_ if (ubatch.embd) { // Image input: split main embd and deepstack embds - ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0); + ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], 0); for (size_t i = 0; i < n_deepstack_layers; i++) { - deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); + deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd_full, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); } inpL = inpL_main; }