@@ -4510,7 +4510,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
             if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                 // skip all tensors in the NextN layers
-                flags |= TENSOR_SKIP;
+                // flags |= TENSOR_SKIP;
             }
 
             auto & layer = layers[i];
@@ -4574,12 +4574,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
             // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
             if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+
+                // the input/output layer sanity check prevents us from loading the eh_proj tensor:
+                // eh_proj is labelled with a layer number in existing GGUFs, so bid must be set to the
+                // last layer number to load these tensors, while the i/o layer sanity check requires bid == -1.
+                // this lambda is a hack that creates the NextN tensors as LLM_TENSOR_LAYER_REPEATING instead.
+                /* auto create_tensor_override_io_sanity_check =
+                    [&](llm_tensor type_enum, const char * suffix, int bid, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
+
+                        auto tn_orig = tn(type_enum, suffix, bid);
+                        llm_tensor_info info_override = *tn_orig.info;
+                        info_override.layer = LLM_TENSOR_LAYER_REPEATING;
+
+                        auto tn_override = tn_orig;
+                        tn_override.info = &info_override;
+
+                        return create_tensor(tn_override, ne, flags);
+                    }; */
+
                 layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                 layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
                 layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                 layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
                 layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
                 layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+
+                // layer.nextn.eh_proj          = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i, { 2 * n_embd, n_embd }, flags);
+                // layer.nextn.embed_tokens     = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i, { n_embd, n_vocab }, flags);
+                // layer.nextn.enorm            = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_ENORM, "weight", i, { n_embd }, flags);
+                // layer.nextn.hnorm            = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_HNORM, "weight", i, { n_embd }, flags);
+                // layer.nextn.shared_head_head = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i, { n_embd, n_vocab }, flags);
+                // layer.nextn.shared_head_norm = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i, { n_embd }, flags);
             }
         }
     }
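
For context, the sketch below shows roughly the kind of input/output-layer sanity check the comment in the hunk above is working around, and how cloning the tensor info and flipping its layer classification sidesteps it. This is a simplified, self-contained illustration, not the actual create_tensor implementation in llama.cpp: llm_tensor_info, the LLM_TENSOR_LAYER_* values and the bid convention come from the diff, while tn_sketch, abort_load and check_tensor_layer are hypothetical stand-ins.

#include <cstdio>
#include <cstdlib>

// Hypothetical stand-ins for the pieces referenced by the diff. Only llm_tensor_info,
// the LLM_TENSOR_LAYER_* values and the bid convention are taken from the patch above;
// tn_sketch, abort_load and check_tensor_layer are illustrative, not llama.cpp API.
enum llm_tensor_layer {
    LLM_TENSOR_LAYER_INPUT,
    LLM_TENSOR_LAYER_REPEATING,
    LLM_TENSOR_LAYER_OUTPUT,
};

struct llm_tensor_info {
    llm_tensor_layer layer;
};

struct tn_sketch {
    const llm_tensor_info * info;
    int bid; // block/layer index; -1 means the tensor is not tied to a specific layer
};

static void abort_load(const char * msg) {
    fprintf(stderr, "load_tensors: %s\n", msg);
    abort();
}

// The shape of the check: input/output tensors must not carry a layer number,
// repeating (per-layer) tensors must carry one.
static void check_tensor_layer(const tn_sketch & tn) {
    const bool is_io = tn.info->layer == LLM_TENSOR_LAYER_INPUT ||
                       tn.info->layer == LLM_TENSOR_LAYER_OUTPUT;
    if (is_io && tn.bid != -1) {
        abort_load("input/output tensor used with a layer number");
    }
    if (!is_io && tn.bid == -1) {
        abort_load("repeating tensor used without a layer number");
    }
}

int main() {
    // eh_proj is classified as an input/output tensor (per the comment in the diff) but is
    // stored with a layer suffix in existing GGUFs, so it arrives here with bid != -1:
    llm_tensor_info eh_proj_info = { LLM_TENSOR_LAYER_OUTPUT };
    // check_tensor_layer({ &eh_proj_info, /*bid=*/3 });   // would abort (3 = illustrative last-layer index)

    // The commented-out lambda in the diff clones the tensor info, flips the classification
    // to repeating, and passes the modified copy on, so the check takes the other branch:
    llm_tensor_info info_override = eh_proj_info;
    info_override.layer = LLM_TENSOR_LAYER_REPEATING;
    check_tensor_layer({ &info_override, /*bid=*/3 });     // passes
    return 0;
}

With the override in place, the NextN tensors would be created through the repeating-layer path, which is what the commented-out create_tensor_override_io_sanity_check lambda in the patch attempts.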