
Commit 0127c6b

mtp-batch(chore): Remove final MTP debug logs and dead code
1 parent 4bcc9e2 commit 0127c6b

3 files changed: +5 −56 lines


src/llama-context.cpp

Lines changed: 4 additions & 39 deletions
@@ -13,7 +13,6 @@
 #include <cstring>
 #include <limits>
 #include <stdexcept>
-#include <numeric>

 //
 // llama_context
@@ -738,17 +737,6 @@ bool llama_context::apply_adapter_cvec(
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }

-static double calculate_vector_sum(const float* vec, size_t size) {
-    if (!vec) {
-        return 0.0;
-    }
-    double sum = 0.0;
-    for (size_t i = 0; i < size; ++i) {
-        sum += vec[i];
-    }
-    return sum;
-}
-
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret,
     const llama_mtp_params & mtp_params) {
     if (mctx && !mctx->apply()) {
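
For context, the deleted calculate_vector_sum was a debug-only helper that checksummed a raw float buffer, which is presumably also why the now-unused <numeric> include is dropped in the first hunk. A minimal standalone sketch of the same computation is shown below; the name debug_vector_sum and the main driver are illustrative and are not part of llama.cpp.

#include <cstddef>  // size_t
#include <cstdio>   // printf
#include <numeric>  // std::accumulate

// Hypothetical stand-in for the removed debug helper: sum a raw float buffer
// into a double, guarding against a null pointer. Illustration only.
static double debug_vector_sum(const float * vec, size_t size) {
    if (!vec) {
        return 0.0;
    }
    return std::accumulate(vec, vec + size, 0.0);
}

int main() {
    const float data[] = { 0.5f, 1.5f, 2.0f };
    std::printf("sum = %f\n", debug_vector_sum(data, 3)); // prints 4.000000
    return 0;
}
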
@@ -995,10 +983,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT

     auto * kvd = static_cast<llama_context_kv_cache_data *>(kv_cache_data);
-    // LLAMA_LOG_WARN("[DEBUG-DECODE-ENTRY] Entering llama_decode. update_mtp_kv=%s, use_mtp_head=%s\n",
-    //     batch_inp.update_mtp_kv ? "true" : "false",
-    //     batch_inp.use_mtp_head ? "true" : "false"
-    // );

     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
@@ -1074,10 +1058,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }
         case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
             {
-                // if (use_last_main_model_sinfos) {
-                //     LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
-                //     return -1;
-                // }
+                if (kvd->forced_sinfos) {
+                    LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
+                    return -1;
+                }

                 if (!did_optimize) {
                     did_optimize = true;
@@ -1106,9 +1090,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };

     int64_t n_outputs_prev = 0;
-    // const bool do_mtp_kv_update = batch_inp.update_mtp_kv;
-    // const bool use_mtp_head = batch_inp.use_mtp_head;
-    // const bool is_prompt_warmup = batch_inp.is_mtp_prompt_warmup;

     do {
         const auto & ubatch = mctx->get_ubatch();
@@ -1127,14 +1108,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
            // needs to happen before the graph is built
            n_outputs = n_outputs_new;
        }
-        // if (do_mtp_kv_update) {
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] MTP KV Update ubatch: n_tokens=%d\n", ubatch.n_tokens);
-        //     std::string positions_str;
-        //     for (int i = 0; i < std::min((uint32_t)5, ubatch.n_tokens); ++i) {
-        //         positions_str += std::to_string(ubatch.pos[i]) + " ";
-        //     }
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] Positions: %s...\n", positions_str.c_str());
-        // }
        ggml_status status;
        const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status, batch_inp.mtp_params);
        if (!res) {
@@ -1195,14 +1168,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
        }
    }

-    // if (use_mtp_head) {
-    //     if (t_embd != nullptr) {
-    //         LLAMA_LOG_ERROR("[MTP-GRAPH-BUG] The MTP graph returned an embedding tensor when it shouldn't have! This will cause corruption.\n");
-    //     } else {
-    //         LLAMA_LOG_WARN("[MTP-GRAPH-OK] The MTP graph correctly did not return an embedding tensor.\n");
-    //     }
-    // }
-
    // extract embeddings
    if (t_embd && n_outputs > 0) {
        if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {

src/llama-model.cpp

Lines changed: 0 additions & 16 deletions
@@ -13829,11 +13829,6 @@ struct llm_build_glm4_moe : public llm_graph_context {
        // Final layer tensors are loaded but not processed in forward pass
        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
        for (int il = 0; il < n_transformer_layers; ++il) {
-            // if (params.use_mtp_head) {
-            //     LLAMA_LOG_ERROR("[DEBUG-KV-ERROR] MTP path is running the main layer %d!\n", il);
-            // } else {
-            //     LLAMA_LOG_WARN("[DEBUG-KV] Main Head Path: Accessing layer %d\n", il);
-            // }
            ggml_tensor * inpSA = inpL;

            // Pre-attention norm
@@ -13976,7 +13971,6 @@ struct llm_build_glm4_moe : public llm_graph_context {
        ggml_tensor * embd_copy = ggml_dup(ctx0, prev_embeddings);

        const int il = hparams.n_layer - 1;
-        // LLAMA_LOG_WARN("[DEBUG-KV] MTP Head Path: Accessing layer %d\n", il);
        ggml_tensor * sum_node = ggml_sum(ctx0, embd_copy);

        ggml_set_name(sum_node, "mtp_input_sum");
@@ -18311,12 +18305,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 }

 ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
-    const int64_t t_start_us = ggml_time_us();

     std::unique_ptr<llm_graph_context> llm;
-
-    const bool build_mtp = params.mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED;
-
     switch (arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -18678,12 +18668,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
        // add on pooling layer
        llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
    }
-    const int64_t t_end_us = ggml_time_us();
-    // LLAMA_LOG_INFO(
-    //     "[PERF] Graph build time: %.2f ms (MTP path: %s)\n",
-    //     (t_end_us - t_start_us) / 1000.0,
-    //     params.use_mtp_head ? "yes" : "no"
-    // );
    return llm->res->get_gf();
 }
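
The instrumentation removed above bracketed graph construction with ggml_time_us() and logged the elapsed time in milliseconds. A self-contained sketch of that bracketing pattern, substituting std::chrono for ggml's timer so it compiles outside the project, might look like the following (the loop stands in for the work being measured).

#include <chrono>
#include <cstdio>

int main() {
    // Read the clock before the measured block, as the removed code did
    // with ggml_time_us() at the top of build_graph().
    const auto t_start = std::chrono::steady_clock::now();

    // Placeholder for the measured work (graph construction in the original).
    volatile long sink = 0;
    for (long i = 0; i < 1000000; ++i) { sink = sink + i; }

    // Read the clock again and report the elapsed time in milliseconds.
    const auto t_end = std::chrono::steady_clock::now();
    const double build_ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
    std::printf("graph build time: %.2f ms\n", build_ms);
    return 0;
}
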

tools/server/server.cpp

Lines changed: 1 addition & 1 deletion
@@ -3520,7 +3520,7 @@ struct server_context {
            // Clean up the forced state to not affect subsequent decodes.
            llama_mtp_cancel_sinfo_update(ctx);
        } else {
-            LOG_ERR("%s: Failed to prepare the MTP symphony for warmup.", __func__);
+            LOG_ERR("%s: Failed to prepare the MTP for warmup.", __func__);
        }
    }