@@ -276,7 +276,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_IM2COL:
             {
-                const int n_embd = hparams.n_embd;
+                const int n_embd = hparams.n_embd_full;
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
@@ -505,6 +505,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
     ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
     ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
+    hparams.n_embd_full = hparams.n_embd;
 
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -1041,7 +1042,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 // since vision model stacks deepstack features along feature dim
                 // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
+                hparams.n_embd_full *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
@@ -1067,7 +1068,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 // since vision model stacks deepstack features along feature dim
                 // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
+                hparams.n_embd_full *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {
@@ -3332,10 +3333,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_QWEN3:
             case LLM_ARCH_QWEN3VL:
                 {
-                    // for model loading, the weights only have the main embd
-                    // so we need to divide by the number of deepstack layers + 1
-                    // n_embd is const int so we declare a new variable
-                    int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
@@ -3371,10 +3368,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_QWEN3MOE:
             case LLM_ARCH_QWEN3VLMOE:
                 {
-                    // for model loading, the weights only have the main embd
-                    // so we need to divide by the number of deepstack layers + 1
-                    // n_embd is const int so we declare a new variable
-                    int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
@@ -6681,8 +6674,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
     return ::select_buft(
             *pimpl->dev_layer.at(il).buft_list,
             [&](ggml_context * ctx) {
-                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_full);
+                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_full);
                 return ggml_add(ctx, cur, layer_dir);
             });
 }
@@ -7329,6 +7322,10 @@ int32_t llama_model_n_embd(const llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int32_t llama_model_n_embd_full(const llama_model * model) {
+    return model->hparams.n_embd_full;
+}
+
 int32_t llama_model_n_layer(const llama_model * model) {
     return model->hparams.n_layer;
 }
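A minimal usage sketch for the new accessor, assuming a llama_model * obtained through the normal loading API; the alloc_embd helper and the idea of sizing an output buffer with n_embd_full are illustrative assumptions, not something shown in this change:

    #include "llama.h"

    #include <vector>

    // Size a per-token embedding buffer for models whose text-side embeddings
    // are wider than n_embd; for all other models n_embd_full equals n_embd.
    static std::vector<float> alloc_embd(const llama_model * model, int32_t n_tokens) {
        const int32_t n_embd_full = llama_model_n_embd_full(model);
        return std::vector<float>((size_t) n_tokens * (size_t) n_embd_full);
    }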