@@ -1027,7 +1027,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3VL:
             {
-                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, 0);
+                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
                 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
@@ -1036,8 +1036,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // for deepstack patch, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                hparams.n_embd = hparams.n_embd * (hparams.n_deepstack_layers + 1);
+                // since vision model stacks deepstack features along feature dim
+                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
+                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
@@ -1052,17 +1053,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3VLMOE:
             {
-                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, 0);
+                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
                 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 48: type = LLM_TYPE_30B_A3B; break;
                     case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // for deepstack patch, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                hparams.n_embd = hparams.n_embd * (hparams.n_deepstack_layers + 1);
+                // since vision model stacks deepstack features along feature dim
+                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
+                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {
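
For context, here is a minimal standalone sketch (not part of this patch; the 2048 and 3 figures are hypothetical) of the bookkeeping the two new comments describe: load_hparams() inflates hparams.n_embd so a single embedding row can hold the main embedding plus the stacked deepstack embeddings.

// Sketch only, assuming a hypothetical hidden size of 2048 and 3 deepstack layers.
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t n_embd             = 2048; // true hidden size of the text model (hypothetical)
    uint32_t n_deepstack_layers = 3;    // number of deepstack layers (hypothetical)

    // same arithmetic as the patch: [main_embd, deepstack_embd_1, ..., deepstack_embd_N]
    // packed along the feature dimension
    n_embd *= n_deepstack_layers + 1;

    printf("n_embd exposed to the text model: %u\n", n_embd); // 8192
    return 0;
}
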
@@ -3307,11 +3309,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3VL:
             {
-                int64_t n_embd = hparams.n_embd;
-                // for deepstack features, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                if (arch == LLM_ARCH_QWEN3VL) {
-                    n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
-                }
+                // for model loading, the weights only have the main embd
+                // so we need to divide by the number of deepstack layers + 1
+                // n_embd is const int so we declare a new variable
+                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output
@@ -3347,11 +3348,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
             {
-                // for deepstack features, we consider the embd to be [main_embd, deepstack_embd_1, deepstack_embd_2, ...]
-                int64_t n_embd = hparams.n_embd;
-                if (arch == LLM_ARCH_QWEN3VLMOE) {
-                    n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
-                }
+                // for model loading, the weights only have the main embd
+                // so we need to divide by the number of deepstack layers + 1
+                // n_embd is const int so we declare a new variable
+                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output
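
And the inverse step from load_tensors(): the checkpoint tensors (e.g. token_embd.weight) only carry the main embedding width, so the inflated value is divided back down before the tensor shapes are declared. A minimal sketch with the same hypothetical figures as above:

// Sketch only; 8192 and 3 are the hypothetical figures from the previous example.
#include <cstdint>
#include <cassert>
#include <cstdio>

int main() {
    uint32_t n_embd_inflated    = 8192; // value stored by load_hparams() (hypothetical)
    uint32_t n_deepstack_layers = 3;    // hypothetical

    // weights on disk only have the main embd, so divide by (deepstack layers + 1)
    int64_t n_embd = n_embd_inflated / (n_deepstack_layers + 1);
    assert(n_embd * (n_deepstack_layers + 1) == n_embd_inflated);

    printf("token embedding width used for loading: %lld\n", (long long) n_embd); // 2048
    return 0;
}
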