 #include <cstring>
 #include <limits>
 #include <stdexcept>
-#include <numeric>
 
 //
 // llama_context
@@ -738,17 +737,6 @@ bool llama_context::apply_adapter_cvec(
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
-static double calculate_vector_sum(const float * vec, size_t size) {
-    if (!vec) {
-        return 0.0;
-    }
-    double sum = 0.0;
-    for (size_t i = 0; i < size; ++i) {
-        sum += vec[i];
-    }
-    return sum;
-}
-
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret,
         const llama_mtp_params & mtp_params) {
     if (mctx && !mctx->apply()) {
@@ -995,10 +983,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
 
     auto * kvd = static_cast<llama_context_kv_cache_data *>(kv_cache_data);
-    // LLAMA_LOG_WARN("[DEBUG-DECODE-ENTRY] Entering llama_decode. update_mtp_kv=%s, use_mtp_head=%s\n",
-    //     batch_inp.update_mtp_kv ? "true" : "false",
-    //     batch_inp.use_mtp_head ? "true" : "false"
-    // );
 
     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
@@ -1074,10 +1058,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 }
             case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
                 {
-                    // if (use_last_main_model_sinfos) {
-                    //     LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
-                    //     return -1;
-                    // }
+                    if (kvd->forced_sinfos) {
+                        LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
+                        return -1;
+                    }
 
                     if (!did_optimize) {
                         did_optimize = true;
@@ -1106,9 +1090,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };
 
     int64_t n_outputs_prev = 0;
-    // const bool do_mtp_kv_update = batch_inp.update_mtp_kv;
-    // const bool use_mtp_head = batch_inp.use_mtp_head;
-    // const bool is_prompt_warmup = batch_inp.is_mtp_prompt_warmup;
 
     do {
         const auto & ubatch = mctx->get_ubatch();
@@ -1127,14 +1108,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
             // needs to happen before the graph is built
             n_outputs = n_outputs_new;
         }
-        // if (do_mtp_kv_update) {
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] MTP KV Update ubatch: n_tokens=%d\n", ubatch.n_tokens);
-        //     std::string positions_str;
-        //     for (int i = 0; i < std::min((uint32_t)5, ubatch.n_tokens); ++i) {
-        //         positions_str += std::to_string(ubatch.pos[i]) + " ";
-        //     }
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] Positions: %s...\n", positions_str.c_str());
-        // }
         ggml_status status;
         const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status, batch_inp.mtp_params);
         if (!res) {
@@ -1195,14 +1168,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
             }
         }
 
-        // if (use_mtp_head) {
-        //     if (t_embd != nullptr) {
-        //         LLAMA_LOG_ERROR("[MTP-GRAPH-BUG] The MTP graph returned an embedding tensor when it shouldn't have! This will cause corruption.\n");
-        //     } else {
-        //         LLAMA_LOG_WARN("[MTP-GRAPH-OK] The MTP graph correctly did not return an embedding tensor.\n");
-        //     }
-        // }
-
         // extract embeddings
         if (t_embd && n_outputs > 0) {
             if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {