
Commit 1f477b3

make nextn weights loadable without a crash

1 parent e434f87

File tree: 3 files changed, +35 −8 lines

src/llama-arch.cpp (7 additions, 6 deletions)

@@ -2240,12 +2240,13 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_OUTPROJ,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // NextN/MTP tensors are currently ignored (reserved for future MTP support)
     // These tensors only exist in the last layer(s) and are treated as output tensors
-    {LLM_TENSOR_NEXTN_EH_PROJ,          {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_NEXTN_EMBED_TOKENS,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_NEXTN_ENORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_NEXTN_HNORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
-    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    // Changed to LLM_TENSOR_LAYER_REPEATING because we saved these under a blk with a non-negative id
+    {LLM_TENSOR_NEXTN_EH_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 };

 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
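The layer class in this table feeds a sanity check on the block id a tensor may be created with, which is why the OUTPUT-to-REPEATING switch matters. A minimal sketch of that relationship, with hypothetical names (`tensor_layer`, `check_bid`) rather than llama.cpp's real internals:

```cpp
// Hypothetical sketch of the block-id sanity check implied by the table change.
// The enum and function names are illustrative, not llama.cpp's actual API.
#include <cassert>
#include <cstdio>

enum tensor_layer { LAYER_INPUT, LAYER_REPEATING, LAYER_OUTPUT };

// Input/output-level tensors are expected with bid == -1 (no "blk.N." prefix),
// while repeating tensors are expected with a non-negative per-layer bid.
static bool check_bid(tensor_layer layer, int bid) {
    switch (layer) {
        case LAYER_INPUT:
        case LAYER_OUTPUT:    return bid == -1;
        case LAYER_REPEATING: return bid >= 0;
    }
    return false;
}

int main() {
    // Existing GGUFs store NextN tensors under a block prefix (e.g. blk.46....),
    // so classifying them as output tensors made the loader's check fail:
    assert(!check_bid(LAYER_OUTPUT, 46));    // old table entry: rejected
    assert( check_bid(LAYER_REPEATING, 46)); // new table entry: loadable
    printf("bid checks behave as expected\n");
}
```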

src/llama-model.cpp (26 additions, 1 deletion)

@@ -4510,7 +4510,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

             if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                 // skip all tensors in the NextN layers
-                flags |= TENSOR_SKIP;
+                // flags |= TENSOR_SKIP;
             }

             auto & layer = layers[i];
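With the skip flag commented out, the loader now materializes the trailing NextN layers instead of marking them skipped. A self-contained restatement of the boundary arithmetic in the guard above (the layer counts are made-up example values, not from any specific model):

```cpp
// Standalone illustration of the NextN layer guard; 47 layers with 1 trailing
// MTP layer is a hypothetical example configuration.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer              = 47; // example: total transformer layers
    const uint32_t nextn_predict_layers = 1;  // example: trailing NextN/MTP layers

    for (int i = 0; i < (int) n_layer; ++i) {
        const bool is_nextn = nextn_predict_layers > 0 &&
                              (uint32_t) i >= n_layer - nextn_predict_layers;
        if (is_nextn) {
            // previously: flags |= TENSOR_SKIP; now these tensors get loaded
            printf("layer %d holds NextN/MTP tensors\n", i);
        }
    }
    // prints: layer 46 holds NextN/MTP tensors
}
```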
@@ -4574,12 +4574,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                 // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+
+                    // our input/output layer sanity check prevents us from loading the eh_proj layer!
+                    // this is because eh_proj is labelled with a layer number in existing GGUFs,
+                    // so we need to set bid == <last layer number> to successfully load the tensors, but our io layer sanity check requires bid == -1.
+                    // this function is a hack that creates the nextn layers as LLM_TENSOR_LAYER_REPEATING instead.
+                    /* auto create_tensor_override_io_sanity_check =
+                        [&](llm_tensor type_enum, const char * suffix, int bid, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
+
+                            auto tn_orig = tn(type_enum, suffix, bid);
+                            llm_tensor_info info_override = *tn_orig.info;
+                            info_override.layer = LLM_TENSOR_LAYER_REPEATING;
+
+                            auto tn_override = tn_orig;
+                            tn_override.info = &info_override;
+
+                            return create_tensor(tn_override, ne, flags);
+                        }; */
+
                     layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                     layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
                     layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                     layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
                     layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
                     layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+
+                    // layer.nextn.eh_proj          = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i, { 2 * n_embd, n_embd }, flags);
+                    // layer.nextn.embed_tokens     = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i, { n_embd, n_vocab }, flags);
+                    // layer.nextn.enorm            = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_ENORM, "weight", i, { n_embd }, flags);
+                    // layer.nextn.hnorm            = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_HNORM, "weight", i, { n_embd }, flags);
+                    // layer.nextn.shared_head_head = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i, { n_embd, n_vocab }, flags);
+                    // layer.nextn.shared_head_norm = create_tensor_override_io_sanity_check(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i, { n_embd }, flags);
                 }
             }
         }
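With the arch-table fix in place, the plain create_tensor(tn(..., i)) calls above resolve the blk-prefixed names that existing GGUFs store, so the commented-out override lambda remains only as a fallback. A rough sketch of that name formatting, where format_tensor_name is a stand-in for the real tn() helper and both the base name and the block id 46 are illustrative:

```cpp
// Hedged sketch of how a block-scoped tensor name is formed. format_tensor_name
// approximates llama.cpp's tn() naming; exact names may differ per architecture.
#include <cstdio>
#include <string>

static std::string format_tensor_name(const char * base, const char * suffix, int bid) {
    char buf[128];
    if (bid >= 0) {
        std::snprintf(buf, sizeof(buf), "blk.%d.%s.%s", bid, base, suffix); // repeating tensor
    } else {
        std::snprintf(buf, sizeof(buf), "%s.%s", base, suffix);             // input/output tensor
    }
    return buf;
}

int main() {
    // A repeating NextN tensor is looked up under its block prefix, which is
    // how existing GGUFs label eh_proj and friends:
    printf("%s\n", format_tensor_name("nextn.eh_proj", "weight", 46).c_str());
    // -> blk.46.nextn.eh_proj.weight
}
```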

tools/server/server.cpp (2 additions, 1 deletion)

@@ -1432,7 +1432,8 @@ struct server_slot {
     }

     bool can_speculate() const {
-        return (ctx_dft || has_mtp) && params.speculative.n_max > 0 && params.cache_prompt;
+        // return (ctx_dft || has_mtp) && params.speculative.n_max > 0 && params.cache_prompt;
+        return (ctx_dft) && params.speculative.n_max > 0 && params.cache_prompt;
     }

     void add_token(const completion_token_output & token) {
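The effect is that a slot speculates only when a real draft-model context exists; loading NextN/MTP weights alone no longer enables the path. A stub restatement of the gate after this change (slot_stub and its fields are illustrative, not the server's actual types):

```cpp
// Minimal stub mirroring the can_speculate() gate above; field names are
// simplified stand-ins for the server_slot members referenced in the diff.
#include <cstdio>

struct slot_stub {
    void * ctx_dft      = nullptr; // draft-model context, set when a draft model loads
    bool   has_mtp      = false;   // MTP/NextN availability, no longer consulted
    int    spec_n_max   = 16;      // stands in for params.speculative.n_max
    bool   cache_prompt = true;    // stands in for params.cache_prompt

    bool can_speculate() const {
        // MTP weights alone no longer qualify; only a real draft context does.
        return ctx_dft && spec_n_max > 0 && cache_prompt;
    }
};

int main() {
    slot_stub s;
    s.has_mtp = true; // even with MTP/NextN weights loaded...
    printf("can_speculate: %s\n", s.can_speculate() ? "yes" : "no"); // ...no ctx_dft, so "no"
}
```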
