@@ -3728,6 +3728,8 @@ struct server_context {
 
         // reuse chunks from the cached prompt by shifting their KV cache in the new position
         if (params_base.n_cache_reuse > 0) {
+            GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
+
             size_t head_c = n_past; // cache
             size_t head_p = n_past; // current prompt
 
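
Note: cache reuse works by shifting matched KV cells from their old positions to their new ones, which assumes every token occupies exactly one position. mtmd media chunks (e.g. M-RoPE image embeddings) break that 1:1 token/position mapping, hence the new assert. A rough sketch of the reuse loop being guarded here, paraphrased with simplified names rather than quoted from the server code:

    // walk the cached and the current prompt looking for matching chunks
    while (head_c < cache_tokens.size() && head_p < prompt_tokens.size()) {
        size_t n_match = 0;
        while (head_c + n_match < cache_tokens.size() &&
               head_p + n_match < prompt_tokens.size() &&
               cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
            n_match++;
        }
        if (n_match >= (size_t) params_base.n_cache_reuse) {
            // shift the matched cells [head_c, head_c + n_match) into place
            // at head_p instead of recomputing them
            const llama_pos shift = (llama_pos) head_p - (llama_pos) head_c;
            llama_memory_seq_rm (llama_get_memory(ctx), slot.id, head_p, head_c);
            llama_memory_seq_add(llama_get_memory(ctx), slot.id, head_c, head_c + n_match, shift);
            head_c += n_match;
            head_p += n_match;
        } else {
            head_c += 1; // no reusable chunk at this cache offset, keep scanning
        }
    }
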
@@ -3836,6 +3838,9 @@ struct server_context {
         }
 
         if (pos_min > pos_min_thold) {
+            // TODO: support can be added in the future when corresponding vision models get released
+            GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
+
             SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
 
             // search for a context checkpoint
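
Note: this branch handles SWA (sliding-window attention) models whose memory has already evicted positions below pos_min_thold, making partial reuse unsafe; the server then looks for a saved context checkpoint to restore instead of reprocessing the entire prompt. A hedged sketch of that selection step follows; the struct and variable names are illustrative, not the actual server types:

    struct ctx_checkpoint_sketch {
        llama_pos pos_min;         // first position covered by the snapshot
        llama_pos pos_max;         // last  position covered by the snapshot
        std::vector<uint8_t> data; // serialized per-sequence KV state
    };

    // prefer the latest checkpoint that still ends inside the reusable prefix
    const ctx_checkpoint_sketch * best = nullptr;
    for (const auto & cp : checkpoints) {
        if (cp.pos_max <= n_past && (best == nullptr || cp.pos_max > best->pos_max)) {
            best = &cp;
        }
    }
    // if found: restore it and re-evaluate only the tokens after best->pos_max;
    // otherwise: fall back to clearing the sequence and starting from scratch
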
@@ -3908,8 +3913,9 @@ struct server_context {
         }
 
         // truncate any tokens that are beyond n_past for this slot
-        if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1)) {
-            SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", slot.prompt.n_tokens());
+        const llama_pos p0 = slot.prompt.tokens.pos_next();
+        if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
+            SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0);
             llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
 
             // there is no common part left
@@ -3918,10 +3924,7 @@ struct server_context {
             slot.prompt.tokens.clear();
         }
 
-        SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), slot.prompt.n_tokens());
-
-        // remove the non-common part from the cache
-        slot.prompt.tokens.keep_first(slot.prompt.n_tokens());
+        SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
 
         // check if we should process the image
         if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
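
Note: this is the core of the change. llama_memory_seq_rm(mem, seq_id, p0, -1) removes cells with position >= p0, so the boundary must be a position, not a token count. For plain text the two coincide, but with mtmd chunks (e.g. M-RoPE image embeddings) a chunk's token count and its position span differ, so slot.prompt.n_tokens() is no longer a valid position while slot.prompt.tokens.pos_next() is. The dropped keep_first(slot.prompt.n_tokens()) call kept the first n_tokens() entries of a container that presumably already held exactly that many, i.e. a no-op. An illustrative sketch of the distinction, not the actual server_tokens implementation:

    struct server_tokens_sketch {
        int32_t n_text      = 0; // text tokens: one position each
        int32_t n_media     = 0; // tokens belonging to media chunks
        int32_t n_media_pos = 0; // positions those chunks actually occupy

        int32_t   n_tokens() const { return n_text + n_media; }
        llama_pos pos_next() const { return n_text + n_media_pos; }
    };

    // with M-RoPE an image worth 256 tokens might span far fewer positions,
    // so n_tokens() overshoots pos_next(); passing n_tokens() as p0 would
    // place the removal boundary past cells that actually need to go
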