
Commit e8b115b

try add set_mpt_head api
1 parent 2745373 commit e8b115b

File tree

8 files changed, +48 -0 lines changed

include/llama.h
src/llama-arch.cpp
src/llama-arch.h
src/llama-context.cpp
src/llama-context.h
src/llama-cparams.h
src/llama-hparams.h
src/llama-model.cpp

include/llama.h

Lines changed: 9 additions & 0 deletions

@@ -496,6 +496,12 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
 
+    // If model supports multi-token predict, this returns number of tokens ; returns 0 otherwise
+    LLAMA_API int32_t llama_model_n_mtp(const struct llama_model * model);
+
+    // Get the i-th multi-token predict model (used by speculative decoding)
+    LLAMA_API struct llama_model * llama_model_get_mtp(struct llama_model * model, int32_t i);
+
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 
@@ -959,6 +965,9 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
+    // Set whether to use multi-token predict head ; 0 means no MTP
+    LLAMA_API void llama_set_mpt_head(struct llama_context * ctx, int32_t n_mtp);
+
     // Set whether the model is in warmup mode or not
     // If true, all model tensors are activated during llama_decode() to load and cache their weights.
     LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
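Taken together, the two new getters let an application probe a loaded model for MTP support before deciding whether to drive speculative decoding with it. Below is a minimal caller-side sketch, assuming a llama_model * obtained through the usual loading path; the helper name print_mtp_info and the 0-based index passed to llama_model_get_mtp are assumptions, not something this commit specifies.

#include <cstdio>
#include "llama.h"

// Hypothetical helper: report the multi-token-predict support a loaded model
// exposes through the new API.
static void print_mtp_info(llama_model * model) {
    const int32_t n_mtp = llama_model_n_mtp(model); // 0 when the model has no MTP heads

    if (n_mtp == 0) {
        printf("model has no multi-token-predict heads\n");
        return;
    }

    printf("model exposes %d MTP head(s)\n", n_mtp);
    for (int32_t i = 0; i < n_mtp; ++i) {
        // each head is surfaced as its own (draft) llama_model, per the header comment;
        // 0-based indexing is assumed here
        llama_model * draft = llama_model_get_mtp(model, i);
        printf("  head %d: %s\n", i, draft ? "available" : "not loaded");
    }
}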

src/llama-arch.cpp

Lines changed: 1 addition & 0 deletions

@@ -121,6 +121,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
+    { LLM_KV_N_MULTI_TOKEN_PREDICT, "%s.n_multi_token_predict" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
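As with the other entries in LLM_KV_NAMES, the %s placeholder is filled in with the model's architecture name when the GGUF metadata key is resolved. A small sketch of the resulting key string, using "llama" purely as an illustrative architecture prefix:

#include <cstdio>

int main() {
    // format string registered above for LLM_KV_N_MULTI_TOKEN_PREDICT
    const char * fmt  = "%s.n_multi_token_predict";
    const char * arch = "llama"; // illustrative; the real prefix is the model's architecture name

    char key[128];
    snprintf(key, sizeof(key), fmt, arch);
    printf("%s\n", key); // prints: llama.n_multi_token_predict
    return 0;
}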

src/llama-arch.h

Lines changed: 1 addition & 0 deletions

@@ -125,6 +125,7 @@ enum llm_kv {
     LLM_KV_EMBEDDING_SCALE,
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
+    LLM_KV_N_MULTI_TOKEN_PREDICT,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

src/llama-context.cpp

Lines changed: 17 additions & 0 deletions

@@ -625,6 +625,18 @@ void llama_context::set_causal_attn(bool value) {
     cparams.causal_attn = value;
 }
 
+void llama_context::set_causal_attn(bool value) {
+    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+    cparams.causal_attn = value;
+}
+
+void llama_context::set_mpt_head(int32_t value) {
+    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+    cparams.curr_mtp = value;
+}
+
 void llama_context::set_warmup(bool value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
@@ -1981,6 +1993,11 @@ void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
     ctx->set_causal_attn(causal_attn);
 }
 
+void llama_set_mpt_head(llama_context * ctx, int32_t n_mtp) {
+    GGML_ASSERT(n_mtp <= llama_model_n_mtp(llama_get_model(ctx)));
+    ctx->set_mpt_head(n_mtp);
+}
+
 void llama_set_warmup(llama_context * ctx, bool warmup) {
     ctx->set_warmup(warmup);
 }
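Because the public wrapper asserts n_mtp <= llama_model_n_mtp(...), passing an out-of-range head index aborts the process rather than failing gracefully. A caller-side guard one might put in front of it is sketched below; the helper name and the bool return convention are mine, not part of the API.

#include "llama.h"

// Hypothetical guard around llama_set_mpt_head(): returns false instead of
// tripping the GGML_ASSERT inside the library when the requested head is invalid.
// Per the llama_cparams comment, 0 disables MTP and 1..N selects the N-th head.
static bool set_mtp_head_checked(llama_context * ctx, int32_t head) {
    const llama_model * model = llama_get_model(ctx);
    const int32_t n_mtp = llama_model_n_mtp(model);

    if (head < 0 || head > n_mtp) {
        return false; // out of range for this model
    }

    llama_set_mpt_head(ctx, head);
    return true;
}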

src/llama-context.h

Lines changed: 1 addition & 0 deletions

@@ -69,6 +69,7 @@ struct llama_context {
 
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
+    void set_mpt_head(int32_t value);
     void set_warmup(bool value);
 
     void set_adapter_lora(

src/llama-cparams.h

Lines changed: 5 additions & 0 deletions

@@ -31,6 +31,11 @@ struct llama_cparams {
     bool no_perf;
     bool warmup;
 
+    // multi-token predict
+    // 0 means not using MTP
+    // N means using the nth MTP head
+    int32_t curr_mtp = 0;
+
     enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;

src/llama-hparams.h

Lines changed: 3 additions & 0 deletions

@@ -47,6 +47,9 @@ struct llama_hparams {
     uint32_t n_embd_head_k_mla = 0;
     uint32_t n_embd_head_v_mla = 0;
 
+    // for multi-token predict
+    uint32_t n_mtp = 0;
+
     // for WavTokenizer
     struct llama_hparams_posnet posnet;
     struct llama_hparams_convnext convnext;

src/llama-model.cpp

Lines changed: 11 additions & 0 deletions

@@ -455,6 +455,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
+    // multi-token predict
+    ml.get_key(LLM_KV_N_MULTI_TOKEN_PREDICT, hparams.n_mtp, false);
+
     // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
 
@@ -4323,6 +4326,10 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }
 
+    if (hparams.n_mtp) {
+        LLAMA_LOG_INFO("%s: n_mtp = %u\n", __func__, hparams.n_mtp);
+    }
+
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
     if (pimpl->n_elements >= 1e12) {
         LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
 
@@ -13234,6 +13241,10 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
     return model->hparams.n_head_kv();
 }
 
+int32_t llama_model_n_mtp(const llama_model * model) {
+    return model->hparams.n_mtp;
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const llama_model * model) {
     return llama_model_n_ctx_train(model);
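Since ml.get_key(..., /*required =*/ false) treats the key as optional, hparams.n_mtp simply stays 0 for GGUF files that do not carry it. For completeness, here is a sketch of checking a GGUF file for the key directly with ggml's gguf API; the "llama." prefix and the assumption that the value is stored as a u32 are illustrative only.

#include <cstdio>
#include "gguf.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    // read metadata only, do not allocate tensor data
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * gctx = gguf_init_from_file(argv[1], params);
    if (!gctx) {
        fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }

    // illustrative key name; the actual prefix is the model's architecture
    const int64_t key_id = gguf_find_key(gctx, "llama.n_multi_token_predict");
    if (key_id < 0) {
        printf("no multi-token-predict metadata (n_mtp stays 0)\n");
    } else {
        // assumes the value was written as a u32, matching uint32_t n_mtp in llama_hparams
        printf("n_multi_token_predict = %u\n", gguf_get_val_u32(gctx, key_id));
    }

    gguf_free(gctx);
    return 0;
}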

0 commit comments
