
Commit 913af8f

mtp-batch(refactor): Replace MTP boolean flags with an explicit operation enum
1 parent 6f74ba3 commit 913af8f

File tree: 8 files changed (+113, -108 lines)

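The gist of the refactor, summarized as a minimal before/after sketch reconstructed from the hunks below. The batch setup around it is illustrative and not taken from this commit:

    // Before this commit: three coupled booleans on llama_batch had to be kept
    // consistent by hand (the removed fields, shown here only as comments):
    //   batch.update_mtp_kv        = ...;
    //   batch.use_mtp_head         = ...;
    //   batch.is_mtp_prompt_warmup = ...;

    // After this commit: a single explicit operation tag.
    llama_batch batch = llama_batch_init(/*n_tokens_alloc=*/1, /*embd=*/0, /*n_seq_max=*/1);
    batch.mtp_params.op_type = MTP_OP_DRAFT_GEN;   // or MTP_OP_WARMUP, MTP_OP_UPDATE_ACCEPTED, MTP_OP_NONE

    // ... fill the batch and call llama_decode(ctx, batch) as usual ...
    llama_batch_free(batch);

The enum makes the intent of each decode call explicit at the call site and removes the need to reason about which boolean combinations are valid.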

common/speculative.cpp

Lines changed: 16 additions & 10 deletions

@@ -378,17 +378,21 @@ llama_token mtp_speculative_gen_draft(
     const llama_seq_id draft_seq_id = 0;
     common_batch_add(mtp_batch, id_last, n_past, {0}, true);
 
-    mtp_batch.update_mtp_kv = false;
-    mtp_batch.use_mtp_head = true;
+    mtp_batch.mtp_params.op_type = MTP_OP_DRAFT_GEN;
 
-    LOG_INF("[DEBUG-DRAFT-CALL] Calling llama_decode for draft. update_mtp_kv=%s, use_mtp_head=%s\n",
-        mtp_batch.update_mtp_kv ? "true" : "false",
-        mtp_batch.use_mtp_head ? "true" : "false"
-    );
+    // LOG_INF("[DEBUG-DRAFT-CALL] Calling llama_decode for draft. update_mtp_kv=%s, use_mtp_head=%s\n",
+    //     mtp_batch.update_mtp_kv ? "true" : "false",
+    //     mtp_batch.use_mtp_head ? "true" : "false"
+    // );
 
+    // Perform the MTP draft generation decode. This writes the MTP layer's
+    // KV state for the draft token into the cache.
     llama_decode(ctx, mtp_batch);
     llama_batch_free(mtp_batch);
 
+    // CRITICAL: Purge the metadata for the draft token we just wrote.
+    // This makes the physical cell available again for the main model's validation pass,
+    // preventing a cache state corruption where two cells map to the same logical position.
     llama_kv_cache_seq_rm(ctx, draft_seq_id, draft_pos, draft_pos + 1);
 
     const llama_model * model = llama_get_model(ctx);

@@ -398,7 +402,7 @@ llama_token mtp_speculative_gen_draft(
     cur_p->size = n_vocab;
     for (int i = 0; i < n_vocab; ++i) {
         cur_p->data[i].id = i;
-        cur_p->data[i].logit = llama_get_logits_ith(ctx, 0)[i]; // TODO: check if position 0 is the right
+        cur_p->data[i].logit = llama_get_logits_ith(ctx, 0)[i]; // For a single-token batch, logits are always at index 0.
     }
     cur_p->sorted = false;
     common_sampler_apply_chain(smpl, cur_p);

@@ -415,9 +419,11 @@ void mtp_update_kv_cache(struct llama_context * ctx, const llama_batch& batch, b
     LOG_INF("[MTP-UPDATE|%s] Updating %d tokens...\n", is_prompt_warmup ? "PROMPT_WARMUP" : "GEN_ACCEPTED", batch.n_tokens);
 
     llama_batch mtp_batch = batch;
-    mtp_batch.update_mtp_kv = true;
-    mtp_batch.use_mtp_head = true;
-    mtp_batch.is_mtp_prompt_warmup = is_prompt_warmup;
+    if (is_prompt_warmup) {
+        mtp_batch.mtp_params.op_type = MTP_OP_WARMUP;
+    } else {
+        mtp_batch.mtp_params.op_type = MTP_OP_UPDATE_ACCEPTED;
+    }
 
     for (int i = 0; i < mtp_batch.n_tokens; ++i) {
         mtp_batch.logits[i] = false;
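For orientation, here is the draft-generation path above condensed into one self-contained sketch. The function name is hypothetical, the arguments mirror the caller state used by mtp_speculative_gen_draft, sampling is omitted, and only calls that appear in the hunk are used:

    // Condensed sketch of the MTP_OP_DRAFT_GEN path after this change.
    static void mtp_draft_step_sketch(llama_context * ctx, llama_token id_last, llama_pos n_past, llama_pos draft_pos) {
        llama_batch mtp_batch = llama_batch_init(/*n_tokens_alloc=*/1, /*embd=*/0, /*n_seq_max=*/1);
        common_batch_add(mtp_batch, id_last, n_past, {0}, /*logits=*/true);
        mtp_batch.mtp_params.op_type = MTP_OP_DRAFT_GEN;   // run the MTP head on the stored draft hidden state

        llama_decode(ctx, mtp_batch);                      // writes the MTP layer's KV entry for the draft position
        llama_batch_free(mtp_batch);

        // Release the draft cell again so the main model's validation pass can
        // claim the same logical position without two cells aliasing it.
        llama_kv_cache_seq_rm(ctx, /*seq_id=*/0, draft_pos, draft_pos + 1);

        // Single-token batch: the draft head's logits live at output index 0.
        const float * draft_logits = llama_get_logits_ith(ctx, 0);
        (void) draft_logits; // a sampler chain would pick the draft token from these
    }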

include/llama.h

Lines changed: 12 additions & 3 deletions

@@ -221,6 +221,17 @@ extern "C" {
     //  - if not: only the last token is output
     // )
     //
+    typedef enum {
+        MTP_OP_NONE,
+        MTP_OP_WARMUP,
+        MTP_OP_UPDATE_ACCEPTED,
+        MTP_OP_DRAFT_GEN,
+    } llama_mtp_op_type;
+
+    typedef struct llama_mtp_params {
+        llama_mtp_op_type op_type;
+    } llama_mtp_params;
+
     typedef struct llama_batch {
         int32_t n_tokens;
 

@@ -230,9 +241,7 @@ extern "C" {
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
         int8_t       *  logits; // TODO: rename this to "output"
-        bool update_mtp_kv;
-        bool use_mtp_head;
-        bool is_mtp_prompt_warmup;
+        llama_mtp_params mtp_params;
     } llama_batch;
 
     enum llama_model_kv_override_type {
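If the commented-out debug logging in the other files is ever revived, it will want a readable name per operation instead of two booleans. A possible helper along those lines, not part of this commit:

    // Hypothetical helper (not in this commit): printable name for an MTP op,
    // useful for log lines that previously printed update_mtp_kv / use_mtp_head.
    static const char * mtp_op_name(llama_mtp_op_type op) {
        switch (op) {
            case MTP_OP_NONE:            return "NONE";
            case MTP_OP_WARMUP:          return "WARMUP";
            case MTP_OP_UPDATE_ACCEPTED: return "UPDATE_ACCEPTED";
            case MTP_OP_DRAFT_GEN:       return "DRAFT_GEN";
        }
        return "UNKNOWN";
    }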

src/llama-batch.cpp

Lines changed: 1 addition & 3 deletions

@@ -841,9 +841,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_seq_id  =*/ nullptr,
         /*seq_id    =*/ nullptr,
         /*logits    =*/ nullptr,
-        /*.use_mtp_head =*/ false,
-        /*update_mtp_kv =*/ false,
-        /*.is_mtp_prompt_warmup =*/ false,
+        /*.mtp_params =*/ { MTP_OP_NONE },
     };
 
     if (embd) {
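A small property worth noting next to this initializer: MTP_OP_NONE is the first enumerator, so its value is 0, and a zero- or value-initialized llama_batch therefore already means "no MTP operation". A quick sketch of that invariant (the assert is illustrative, not in the commit):

    // MTP_OP_NONE is the first enumerator, so it is 0 by the C/C++ rules;
    // zero-initializing a llama_batch defaults to "no MTP operation".
    static_assert(MTP_OP_NONE == 0, "MTP_OP_NONE must remain the zero value");

    llama_batch b = {};   // value-initialized batch
    // b.mtp_params.op_type == MTP_OP_NONE holds here without further setup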

src/llama-context.cpp

Lines changed: 51 additions & 53 deletions

@@ -750,7 +750,7 @@ static double calculate_vector_sum(const float* vec, size_t size) {
 }
 
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret,
-    bool do_mtp_kv_update, bool use_mtp_head, bool is_mtp_prompt_warmup) {
+    const llama_mtp_params & mtp_params) {
     if (mctx && !mctx->apply()) {
         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
         ret = GGML_STATUS_FAILED;

@@ -762,7 +762,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
 
     // the new graph parameters
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
-    const auto gparams = graph_params(res, ubatch, mctx, gtype, do_mtp_kv_update, use_mtp_head);
+    const auto gparams = graph_params(res, ubatch, mctx, gtype, mtp_params);
 
     if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);

@@ -793,22 +793,22 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
         }
     }
 
-    if (do_mtp_kv_update || (use_mtp_head && !do_mtp_kv_update)) { // If it is any MTP operation
+    if (mtp_params.op_type != MTP_OP_NONE) { // If it is any MTP operation
         const char * target_tensor_name = "result_embd_pooled";
         ggml_tensor* hidden_states_input = ggml_get_tensor(res->get_ctx(), target_tensor_name);
 
         const float * source_hidden_state = nullptr;
-        if (is_mtp_prompt_warmup || (do_mtp_kv_update && !is_mtp_prompt_warmup)) {
+        if (mtp_params.op_type == MTP_OP_WARMUP || mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED) {
             source_hidden_state = this->embd;
         } else {
             source_hidden_state = this->draft_input_hidden_state;
         }
 
         if (source_hidden_state != nullptr && hidden_states_input != nullptr) {
             const size_t n_embd = this->model.hparams.n_embd;
-            const size_t n_tokens_for_sum = (do_mtp_kv_update && ubatch.n_tokens > 2) ? ubatch.n_tokens : 1;
+            const size_t n_tokens_for_sum = (mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED && ubatch.n_tokens > 2) ? ubatch.n_tokens : 1;
             double input_sum = calculate_vector_sum(source_hidden_state, n_tokens_for_sum * n_embd);
-            const char * op_type = (do_mtp_kv_update) ? "MTP_UPDATE" : "DRAFT_GEN";
+            const char * op_type = (mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED) ? "MTP_UPDATE" : "DRAFT_GEN";
 
             LLAMA_LOG_WARN("[MTP-INPUT-CHECK] Operation: %s | Input Checksum: %e\n", op_type, input_sum);
 

@@ -833,20 +833,20 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     const int64_t t_exec_start_us = ggml_time_us();
     const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
     const int64_t t_exec_end_us = ggml_time_us();
-    LLAMA_LOG_INFO(
-        "[PERF] Graph compute time: %.2f ms (ubatch_size: %u, MTP path: %s)\n",
-        (t_exec_end_us - t_exec_start_us) / 1000.0,
-        ubatch.n_tokens,
-        do_mtp_kv_update ? "yes" : "no"
-    );
+    // LLAMA_LOG_INFO(
+    //     "[PERF] Graph compute time: %.2f ms (ubatch_size: %u, MTP path: %s)\n",
+    //     (t_exec_end_us - t_exec_start_us) / 1000.0,
+    //     ubatch.n_tokens,
+    //     do_mtp_kv_update ? "yes" : "no"
+    // );
     if (status != GGML_STATUS_SUCCESS) {
         LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
         ret = status;
         return nullptr;
     }
 
     ret = GGML_STATUS_SUCCESS;
-    if (do_mtp_kv_update || use_mtp_head) {
+    if (mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED) {
         ggml_tensor * sum_tensor = ggml_get_tensor(res->get_ctx(), "mtp_input_sum");
         if (sum_tensor) {
             LLAMA_LOG_WARN("[DEBUG-SUM] MTP input sum node successfully created.\n");

@@ -912,7 +912,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     cparams.causal_attn = false;
 
     ggml_status status;
-    const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status, false, false, false);
+    const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status, { MTP_OP_NONE });
 
     cparams.causal_attn = causal_attn_org;
 

@@ -1027,10 +1027,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
     GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
 
     auto * kvd = static_cast<llama_context_kv_cache_data *>(kv_cache_data);
-    LLAMA_LOG_WARN("[DEBUG-DECODE-ENTRY] Entering llama_decode. update_mtp_kv=%s, use_mtp_head=%s\n",
-        batch_inp.update_mtp_kv ? "true" : "false",
-        batch_inp.use_mtp_head ? "true" : "false"
-    );
+    // LLAMA_LOG_WARN("[DEBUG-DECODE-ENTRY] Entering llama_decode. update_mtp_kv=%s, use_mtp_head=%s\n",
+    //     batch_inp.update_mtp_kv ? "true" : "false",
+    //     batch_inp.use_mtp_head ? "true" : "false"
+    // );
 
     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);

@@ -1101,7 +1101,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     } else {
         mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
 
-        if (!batch_inp.use_mtp_head && !batch_inp.update_mtp_kv) {
+        if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {
             if (mctx && mctx->get_status() == LLAMA_MEMORY_STATUS_SUCCESS) {
                 kvd->last_main_model_sinfos = static_cast<llama_kv_cache_unified_context *>(mctx.get())->get_sinfos();
             } else {

@@ -1158,9 +1158,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };
 
     int64_t n_outputs_prev = 0;
-    const bool do_mtp_kv_update = batch_inp.update_mtp_kv;
-    const bool use_mtp_head = batch_inp.use_mtp_head;
-    const bool is_prompt_warmup = batch_inp.is_mtp_prompt_warmup;
+    // const bool do_mtp_kv_update = batch_inp.update_mtp_kv;
+    // const bool use_mtp_head = batch_inp.use_mtp_head;
+    // const bool is_prompt_warmup = batch_inp.is_mtp_prompt_warmup;
 
     do {
         const auto & ubatch = mctx->get_ubatch();

@@ -1169,13 +1169,13 @@ int llama_context::decode(const llama_batch & batch_inp) {
            for (uint32_t i = 0; i < std::min((uint32_t)5, ubatch.n_tokens); ++i) {
                pos_str += std::to_string(ubatch.pos[i]) + " ";
            }
-           LLAMA_LOG_WARN(
-               "[DEBUG-POS] ubatch_size=%u, update_mtp_kv=%s, use_mtp_head=%s. Positions: %s...\n",
-               ubatch.n_tokens,
-               batch_inp.update_mtp_kv ? "true" : "false",
-               batch_inp.use_mtp_head ? "true" : "false",
-               pos_str.c_str()
-           );
+           // LLAMA_LOG_WARN(
+           //     "[DEBUG-POS] ubatch_size=%u, update_mtp_kv=%s, use_mtp_head=%s. Positions: %s...\n",
+           //     ubatch.n_tokens,
+           //     batch_inp.update_mtp_kv ? "true" : "false",
+           //     batch_inp.use_mtp_head ? "true" : "false",
+           //     pos_str.c_str()
+           // );
        }
 
        // count the outputs in this ubatch

@@ -1193,16 +1193,16 @@ int llama_context::decode(const llama_batch & batch_inp) {
            // needs to happen before the graph is built
            n_outputs = n_outputs_new;
        }
-       if (do_mtp_kv_update) {
-           LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] MTP KV Update ubatch: n_tokens=%d\n", ubatch.n_tokens);
-           std::string positions_str;
-           for (int i = 0; i < std::min((uint32_t)5, ubatch.n_tokens); ++i) {
-               positions_str += std::to_string(ubatch.pos[i]) + " ";
-           }
-           LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] Positions: %s...\n", positions_str.c_str());
-       }
+       // if (do_mtp_kv_update) {
+       //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] MTP KV Update ubatch: n_tokens=%d\n", ubatch.n_tokens);
+       //     std::string positions_str;
+       //     for (int i = 0; i < std::min((uint32_t)5, ubatch.n_tokens); ++i) {
+       //         positions_str += std::to_string(ubatch.pos[i]) + " ";
+       //     }
+       //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] Positions: %s...\n", positions_str.c_str());
+       // }
        ggml_status status;
-       const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status, do_mtp_kv_update, use_mtp_head, is_prompt_warmup);
+       const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status, batch_inp.mtp_params);
        if (!res) {
            // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
            llama_pos pos_min[LLAMA_MAX_SEQ];

@@ -1261,17 +1261,17 @@ int llama_context::decode(const llama_batch & batch_inp) {
            }
        }
 
-       if (use_mtp_head) {
-           if (t_embd != nullptr) {
-               LLAMA_LOG_ERROR("[MTP-GRAPH-BUG] The MTP graph returned an embedding tensor when it shouldn't have! This will cause corruption.\n");
-           } else {
-               LLAMA_LOG_WARN("[MTP-GRAPH-OK] The MTP graph correctly did not return an embedding tensor.\n");
-           }
-       }
+       // if (use_mtp_head) {
+       //     if (t_embd != nullptr) {
+       //         LLAMA_LOG_ERROR("[MTP-GRAPH-BUG] The MTP graph returned an embedding tensor when it shouldn't have! This will cause corruption.\n");
+       //     } else {
+       //         LLAMA_LOG_WARN("[MTP-GRAPH-OK] The MTP graph correctly did not return an embedding tensor.\n");
+       //     }
+       // }
 
        // extract embeddings
        if (t_embd && n_outputs > 0) {
-           if (!use_mtp_head) {
+           if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {
                ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
                GGML_ASSERT(backend_embd != nullptr);
 

@@ -1389,7 +1389,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
        ggml_backend_sched_reset(sched.get());
    }
 
-   if (!use_mtp_head) {
+   if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {
        synchronize();
        const size_t n_embd = this->model.hparams.n_embd;
        double full_buffer_sum = calculate_vector_sum(this->embd, n_outputs_all * n_embd);

@@ -1534,7 +1534,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
 
    auto * res = gf_res_reserve.get();
 
-   const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT, false, false);
+   const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT, { MTP_OP_NONE });
 
    res->reset();
 

@@ -1556,8 +1556,7 @@ llm_graph_params llama_context::graph_params(
        const llama_ubatch & ubatch,
        const llama_memory_context_i * mctx,
        llm_graph_type gtype,
-       bool update_mtp_kv,
-       bool use_mtp_head) const {
+       const llama_mtp_params & mtp_params) const {
    return {
        /*.arch       =*/ model.arch,
        /*.hparams    =*/ model.hparams,

@@ -1570,8 +1569,7 @@ llm_graph_params llama_context::graph_params(
        /*.loras      =*/ &loras,
        /*.mctx       =*/ mctx,
        /*.cross      =*/ &cross,
-       /*.update_mtp_kv =*/ update_mtp_kv,
-       /*.use_mtp_head  =*/ use_mtp_head,
+       /*.mtp_params =*/ mtp_params,
        /*.n_outputs  =*/ n_outputs,
        /*.cb         =*/ graph_get_cb(),
        /*.res        =*/ res,

@@ -2312,7 +2310,7 @@ void llama_context::opt_epoch_iter(
 
    auto * res = gf_res_prev.get();
 
-   const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT, false, false);
+   const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT, { MTP_OP_NONE });
 
    res->reset();
 
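The op_type checks in process_ubatch are still spread over several spots. One way they could be consolidated, sketched here as a possible follow-up rather than part of this commit (embd and draft_input_hidden_state stand for the llama_context members used above):

    // Possible follow-up (not in this commit): a single dispatch that picks the
    // hidden-state source fed to the MTP graph for a given operation.
    static const float * mtp_hidden_state_source(
            llama_mtp_op_type op,
            const float * embd,                       // main-model output embeddings
            const float * draft_input_hidden_state) { // stashed hidden state for drafting
        switch (op) {
            case MTP_OP_WARMUP:
            case MTP_OP_UPDATE_ACCEPTED: return embd;
            case MTP_OP_DRAFT_GEN:       return draft_input_hidden_state;
            case MTP_OP_NONE:            break;
        }
        return nullptr;                               // no MTP work for this batch
    }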

src/llama-context.h

Lines changed: 2 additions & 5 deletions

@@ -111,9 +111,7 @@ struct llama_context {
            llm_graph_type gtype,
            llama_memory_context_i * mctx,
            ggml_status & ret,
-           const bool do_mtp_kv_update,
-           const bool use_mtp_head,
-           bool is_mtp_prompt_warmup);
+           const llama_mtp_params & mtp_params);
 
    int encode(const llama_batch & batch_inp);
    int decode(const llama_batch & batch_inp);

@@ -229,8 +227,7 @@ struct llama_context {
            const llama_ubatch & ubatch,
            const llama_memory_context_i * mctx,
            llm_graph_type gtype,
-           bool update_mtp_kv,
-           bool use_mtp_head) const;
+           const llama_mtp_params & mtp_params) const;
 
    llm_graph_cb graph_get_cb(ggml_backend_sched * sched_override = nullptr) const;
 
src/llama-graph.h

Lines changed: 2 additions & 4 deletions

@@ -417,8 +417,7 @@ struct llm_graph_params {
    const llama_adapter_loras * loras;
    const llama_memory_context_i * mctx;
    const llama_cross * cross;
-   bool update_mtp_kv;
-   bool use_mtp_head;
+   llama_mtp_params mtp_params;
 
    uint32_t n_outputs;
 

@@ -467,8 +466,7 @@ struct llm_graph_params {
            cvec  == other.cvec &&
            loras == other.loras &&
            cross == other.cross &&
-           update_mtp_kv == other.update_mtp_kv &&
-           use_mtp_head == other.use_mtp_head &&
+           mtp_params.op_type == other.mtp_params.op_type &&
            n_outputs == other.n_outputs;
    }
};
