Skip to content

Commit db60623

Browse files
committed
added getter for nextn layer count and server slot has_mtp property
1 parent 79c1160 commit db60623

File tree

3 files changed

+17
-1
lines changed

3 files changed

+17
-1
lines changed

include/llama.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,8 @@ extern "C" {
495495

496496
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
497497

498+
// Returns the model's NextN layer count (the `nextn_predict_layers` hyperparameter).
// - A value > 0 is how callers detect that the model carries NextN layers
LLAMA_API int32_t llama_model_n_nextn_layer(const struct llama_model * model);
499+
498500
// Functions to access the model's GGUF metadata scalar values
499501
// - The functions return the length of the string on success, or -1 on failure
500502
// - The output string is always null-terminated and cleared on failure

src/llama-model.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18587,6 +18587,10 @@ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i)
1858718587
return nullptr;
1858818588
}
1858918589

18590+
// Returns the `nextn_predict_layers` hyperparameter stored in the model.
// `model` is dereferenced without a null check, so it must point to a valid model.
// NOTE(review): the declared type of `hparams.nextn_predict_layers` is not visible here;
// confirm it converts to int32_t without narrowing.
int32_t llama_model_n_nextn_layer(const llama_model * model) {
18591+
return model->hparams.nextn_predict_layers;
18592+
}
18593+
1859018594
// deprecated
1859118595
int32_t llama_n_ctx_train(const llama_model * model) {
1859218596
return llama_model_n_ctx_train(model);

tools/server/server.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1294,7 +1294,8 @@ struct server_slot {
12941294
mtmd_context * mctx = nullptr;
12951295

12961296
common_speculative * spec = nullptr;
1297-
1297+
bool has_mtp = false;
1298+
12981299
std::vector<common_adapter_lora_info> lora;
12991300

13001301
// the index relative to completion multi-task request
@@ -2121,6 +2122,15 @@ struct server_context {
21212122
common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
21222123
}
21232124
}
2125+
else if (llama_model_n_nextn_layer(model) > 0) {
2126+
SRV_INF("model has nextn layers = %d\n", llama_model_n_nextn_layer(model));
2127+
slot.has_mtp = true;
2128+
2129+
// assume one speculative token (true of all well-known MTP models so far)
2130+
slot.batch_spec = llama_batch_init(2, 0, 1);
2131+
params_base.speculative.n_min = 0;
2132+
params_base.speculative.n_max = 1;
2133+
}
21242134

21252135
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
21262136

0 commit comments

Comments
 (0)