//
// llama_context
//
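+// Per-context bookkeeping for MTP KV-cache reuse: the slot infos recorded from
+// the last main-model decode, and an optional forced override consumed by decode.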
+struct llama_context_kv_cache_data {
+    llama_kv_cache_unified::slot_info_vec_t last_main_model_sinfos;
+    llama_kv_cache_unified::slot_info_vec_t resized_sinfo_for_force;
+    const llama_kv_cache_unified::slot_info_vec_t * forced_sinfos = nullptr;
+};

llama_context::llama_context(
        const llama_model & model,
@@ -106,6 +111,8 @@ llama_context::llama_context(
    cparams.op_offload = params.op_offload;
    cparams.kv_unified = params.kv_unified;

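+    // MTP bookkeeping, allocated here and freed in the destructor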
+    kv_cache_data = new llama_context_kv_cache_data();
+
    {
        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
@@ -371,6 +378,7 @@ llama_context::llama_context(

llama_context::~llama_context() {
    ggml_opt_free(opt_ctx);
+    delete static_cast<llama_context_kv_cache_data *>(kv_cache_data);
}

void llama_context::synchronize() {
@@ -1017,6 +1025,8 @@ int llama_context::encode(const llama_batch & batch_inp) {

int llama_context::decode(const llama_batch & batch_inp) {
    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
+
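+    // fetch the typed MTP bookkeeping set up in the constructor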
+    auto * kvd = static_cast<llama_context_kv_cache_data *>(kv_cache_data);
    LLAMA_LOG_WARN("[DEBUG-DECODE-ENTRY] Entering llama_decode. update_mtp_kv=%s, use_mtp_head=%s\n",
        batch_inp.update_mtp_kv ? "true" : "false",
        batch_inp.use_mtp_head ? "true" : "false"
@@ -1076,10 +1086,31 @@ int llama_context::decode(const llama_batch & batch_inp) {
    // handle any pending defrags/shifts
    kv_self_update(false);

-    llama_memory_context_ptr mctx;
+    std::unique_ptr<llama_memory_context_i> mctx;

    while (true) {
-        mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
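+        // warmup keeps the stock path; otherwise a pending forced_sinfos bypasses
+        // the slot search and pins the ubatch to previously chosen KV cells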
+        if (cparams.warmup) {
+            mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
+        } else {
+            if (kvd->forced_sinfos && !kvd->forced_sinfos->empty()) {
+                LLAMA_LOG_WARN("[DEBUG-CACHE-REUSE] Forcing sinfos, bypassing find_slot.\n");
+
+                mctx = static_cast<llama_kv_cache_unified *>(memory.get())->init_batch_with_sinfos(
+                    *balloc, cparams.n_ubatch, *kvd->forced_sinfos, true
+                );
+            } else {
+                mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
+
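+                // a pass that neither drives the MTP head nor updates the MTP KV
+                // cache is a main-model pass: record its slot infos for later reuse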
+                if (!batch_inp.use_mtp_head && !batch_inp.update_mtp_kv) {
+                    if (mctx && mctx->get_status() == LLAMA_MEMORY_STATUS_SUCCESS) {
+                        kvd->last_main_model_sinfos = static_cast<llama_kv_cache_unified_context *>(mctx.get())->get_sinfos();
+                    } else {
+                        kvd->last_main_model_sinfos.clear();
+                    }
+                }
+            }
+        }
+
        if (!mctx) {
            return -2;
        }
@@ -1091,29 +1122,28 @@ int llama_context::decode(const llama_batch & batch_inp) {
            case LLAMA_MEMORY_STATUS_NO_UPDATE:
                {
                    LLAMA_LOG_ERROR("%s: unexpected memory context status: %d\n", __func__, mctx->get_status());
-
                    return -2;
                }
            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
                {
+                    // if (use_last_main_model_sinfos) {
+                    //     LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
+                    //     return -1;
+                    // }
+
                    if (!did_optimize) {
                        did_optimize = true;
-
                        if (kv_self_update(true)) {
                            LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());
-
                            continue;
                        }
                    }
-
                    LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, balloc->get_n_tokens());
-
                    return 1;
                }
            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
                {
                    LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, balloc->get_n_tokens());
-
                    return -2;
                }
        }
@@ -3073,4 +3103,27 @@ void llama_opt_epoch(

void llama_set_draft_input_hidden_state(struct llama_context * ctx, const float * hidden_state) {
    ctx->draft_input_hidden_state = hidden_state;
+}
+
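+// Prepare a forced sinfo for an MTP KV update: reuse the slot info recorded from
+// the last main-model decode, truncated to the first n_accepted tokens. The
+// override stays armed until llama_mtp_cancel_sinfo_update() clears it.
+//
+// Expected call pattern (illustrative sketch; batch names are placeholders):
+//
+//     llama_decode(ctx, main_batch);                          // records the sinfos
+//     if (llama_mtp_prepare_sinfo_for_update(ctx, n_accepted)) {
+//         llama_decode(ctx, mtp_update_batch);                // reuses the same KV cells
+//         llama_mtp_cancel_sinfo_update(ctx);
+//     }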
+bool llama_mtp_prepare_sinfo_for_update(struct llama_context * ctx, size_t n_accepted) {
+    auto * kvd = static_cast<llama_context_kv_cache_data *>(ctx->kv_cache_data);
+    const auto & last_sinfo = kvd->last_main_model_sinfos;
+
+    if (last_sinfo.empty() || last_sinfo[0].idxs.empty()) {
+        LLAMA_LOG_ERROR("%s: The sinfo for the last main call is not available.\n", __func__);
+        return false;
+    }
+
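+    // copy the recorded sinfos, then keep only the slots of the first n_accepted tokens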
+    kvd->resized_sinfo_for_force = last_sinfo;
+
+    kvd->resized_sinfo_for_force[0].idxs[0].resize(n_accepted);
+
+    kvd->forced_sinfos = &kvd->resized_sinfo_for_force;
+
+    return true;
+}
+
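+// Drop a pending forced sinfo so the next decode resolves KV slots normally.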
+void llama_mtp_cancel_sinfo_update(struct llama_context * ctx) {
+    auto * kvd = static_cast<llama_context_kv_cache_data *>(ctx->kv_cache_data);
+    kvd->forced_sinfos = nullptr;
}