context : simplify kv cache updates

ggerganov · ggerganov · commit 5c8448874e6d · 2025-04-02T12:55:59.000+03:00
ggml-ci
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -1254,6 +1254,9 @@ int llama_context::decode(llama_batch & inp_batch) {
         return -2;
     };
 
+    // handle any pending defrags/shifts
+    kv_self_update();
+
     int64_t n_outputs_prev = 0;
 
     while (sbatch.n_tokens > 0) {
@@ -1293,14 +1296,6 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         {
-            kv_self_update();
-
-            // if we have enough unused cells before the current head ->
-            //   better to start searching from the beginning of the cache, hoping to fill it
-            if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) {
-                kv_self->head = 0;
-            }
-
             if (!kv_self->find_slot(ubatch)) {
                 LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__);
                 return -3;
@@ -1342,16 +1337,6 @@ int llama_context::decode(llama_batch & inp_batch) {
             }
         }
 
-        // update the kv ring buffer
-        {
-            kv_self->head += ubatch.n_tokens;
-
-            // Ensure kv cache head points to a valid index.
-            if (kv_self->head >= kv_self->size) {
-                kv_self->head = 0;
-            }
-        }
-
         // plot the computation graph in dot format (for debugging purposes)
         //if (n_past%100 == 0) {
         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
@@ -492,6 +492,12 @@ bool llama_kv_cache_unified::find_slot(
     const uint32_t n_seqs   = ubatch.n_seqs;
     const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
 
+    // if there are enough unused cells before the current head,
+    //   restart the search from the beginning of the cache, hoping to fill them
+    if (head > used + 2*ubatch.n_tokens) {
+        head = 0;
+    }
+
     if (recurrent) {
         // For recurrent state architectures (like Mamba or RWKV),
         // each cache cell can store the state for a whole sequence.