@@ -4386,13 +4386,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                     }
 
-                    // NextN/MTP tensors (preserved but unused) - treated as output tensors
-                    create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED);
-                    create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-                    create_tensor(tn(LLM_TENSOR_NEXTN_ENORM), { n_embd }, TENSOR_NOT_REQUIRED);
-                    create_tensor(tn(LLM_TENSOR_NEXTN_HNORM), { n_embd }, TENSOR_NOT_REQUIRED);
-                    create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-                    create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM), { n_embd }, TENSOR_NOT_REQUIRED);
+                    // NextN/MTP tensors (preserved but unused) - in final layer (dynamic layer number)
+                    const int final_layer = n_layer - 1; // NextN tensors are in the last layer
+                    create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", final_layer), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED);
+                    create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", final_layer), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+                    create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", final_layer), { n_embd }, TENSOR_NOT_REQUIRED);
+                    create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", final_layer), { n_embd }, TENSOR_NOT_REQUIRED);
+                    create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", final_layer), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+                    create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", final_layer), { n_embd }, TENSOR_NOT_REQUIRED);
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
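The change swaps the layer-less NextN/MTP tensor names for names that carry the index of the final layer, where these tensors are stored in the GGUF. The snippet below is a minimal, self-contained sketch of that naming effect only, not llama.cpp code: it assumes a printf-style pattern such as "blk.%d.nextn.eh_proj" to which tn() appends the "weight" suffix, and uses a hypothetical layer count; check the actual tensor-name mapping in llama-arch.cpp.

```cpp
// Illustrative sketch only (assumed name pattern, hypothetical layer count).
#include <cstdio>
#include <string>

// Expand a printf-style base pattern with the layer index, then append the suffix,
// mimicking how a per-layer tensor name is resolved.
static std::string nextn_tensor_name(const char * base, const char * suffix, int layer) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), base, layer);
    return std::string(buf) + "." + suffix;
}

int main() {
    const int n_layer     = 47;          // hypothetical layer count
    const int final_layer = n_layer - 1; // NextN tensors are in the last layer

    // Before the change the name carried no layer index; with the dynamic index it
    // resolves to e.g. "blk.46.nextn.eh_proj.weight", matching the stored tensor.
    std::printf("%s\n", nextn_tensor_name("blk.%d.nextn.eh_proj", "weight", final_layer).c_str());
    return 0;
}
```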