 #include <cstring>
 #include <limits>
 #include <stdexcept>
-#include <numeric>
 
 //
 // llama_context
@@ -738,17 +737,6 @@ bool llama_context::apply_adapter_cvec(
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
-static double calculate_vector_sum(const float * vec, size_t size) {
-    if (!vec) {
-        return 0.0;
-    }
-    double sum = 0.0;
-    for (size_t i = 0; i < size; ++i) {
-        sum += vec[i];
-    }
-    return sum;
-}
-
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret,
         const llama_mtp_params & mtp_params) {
     if (mctx && !mctx->apply()) {
@@ -995,10 +983,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
 
     auto * kvd = static_cast<llama_context_kv_cache_data *>(kv_cache_data);
-    // LLAMA_LOG_WARN("[DEBUG-DECODE-ENTRY] Entering llama_decode. update_mtp_kv=%s, use_mtp_head=%s\n",
-    //     batch_inp.update_mtp_kv ? "true" : "false",
-    //     batch_inp.use_mtp_head ? "true" : "false"
-    // );
 
     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
@@ -1074,10 +1058,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 }
             case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
                 {
-                    // if (use_last_main_model_sinfos) {
-                    //     LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
-                    //     return -1;
-                    // }
+                    if (kvd->forced_sinfos) {
+                        LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
+                        return -1;
+                    }
 
                     if (!did_optimize) {
                         did_optimize = true;
@@ -1106,9 +1090,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };
 
     int64_t n_outputs_prev = 0;
-    // const bool do_mtp_kv_update = batch_inp.update_mtp_kv;
-    // const bool use_mtp_head = batch_inp.use_mtp_head;
-    // const bool is_prompt_warmup = batch_inp.is_mtp_prompt_warmup;
 
     do {
         const auto & ubatch = mctx->get_ubatch();
@@ -1127,14 +1108,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
             // needs to happen before the graph is built
             n_outputs = n_outputs_new;
         }
-        // if (do_mtp_kv_update) {
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] MTP KV Update ubatch: n_tokens=%d\n", ubatch.n_tokens);
-        //     std::string positions_str;
-        //     for (int i = 0; i < std::min((uint32_t)5, ubatch.n_tokens); ++i) {
-        //         positions_str += std::to_string(ubatch.pos[i]) + " ";
-        //     }
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] Positions: %s...\n", positions_str.c_str());
-        // }
         ggml_status status;
         const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status, batch_inp.mtp_params);
         if (!res) {
@@ -1195,14 +1168,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
             }
         }
 
-        // if (use_mtp_head) {
-        //     if (t_embd != nullptr) {
-        //         LLAMA_LOG_ERROR("[MTP-GRAPH-BUG] The MTP graph returned an embedding tensor when it shouldn't have! This will cause corruption.\n");
-        //     } else {
-        //         LLAMA_LOG_WARN("[MTP-GRAPH-OK] The MTP graph correctly did not return an embedding tensor.\n");
-        //     }
-        // }
-
         // extract embeddings
         if (t_embd && n_outputs > 0) {
             if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {