Add llama_model_n_nextn_layer() getter for the NextN layer count and a server_slot::has_mtp flag, set when the model has NextN layers

F1LM1 · F1LM1 · commit db60623e7926 · 2025-08-10T23:52:54.000-04:00
diff --git a/include/llama.h b/include/llama.h
@@ -495,6 +495,8 @@ extern "C" {
 
     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
 
+    LLAMA_API int32_t llama_model_n_nextn_layer(const struct llama_model * model); // returns the model's nextn_predict_layers hparam
+
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -18587,6 +18587,10 @@ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i)
     return nullptr;
 }
 
+int32_t llama_model_n_nextn_layer(const llama_model * model) { // getter for the model's NextN layer count
+    return model->hparams.nextn_predict_layers; // read directly from hparams; `model` is dereferenced without a null check
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const llama_model * model) {
     return llama_model_n_ctx_train(model);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -1294,7 +1294,8 @@ struct server_slot {
     mtmd_context * mctx = nullptr;
 
     common_speculative * spec = nullptr;
-
+    bool has_mtp = false;    
+    
     std::vector<common_adapter_lora_info> lora;
 
     // the index relative to completion multi-task request
@@ -2121,6 +2122,15 @@ struct server_context {
                     common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
                 }
             }
+            else if (llama_model_n_nextn_layer(model) > 0) {
+              SRV_INF("model has nextn layers = %d\n", llama_model_n_nextn_layer(model));
+              slot.has_mtp = true;
+              
+              // assume one speculative token (true of all well-known MTP models so far)
+              slot.batch_spec = llama_batch_init(2, 0, 1);
+              params_base.speculative.n_min = 0;
+              params_base.speculative.n_max = 1;
+            }
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);