@@ -161,6 +161,8 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
 
     for (uint32_t i = 0; i < size; ++i) {
         if (cells[i].pos >= p0 && cells[i].pos < p1) {
+            pending.seq_rms.push_back({ seq_id, cells[i].pos, i });
+
             if (seq_id < 0) {
                 cells[i].seq_id.clear();
             } else if (cells[i].has_seq_id(seq_id)) {
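Note on the new pending.seq_rms entries: the restore() hunk below reads them back as seq_rm.seq_id, seq_rm.p and seq_rm.c, but the record itself is declared in the header, which is not part of this diff. A minimal sketch of the assumed shape (field names inferred from the usages in this diff) is:

// assumed shape of one pending seq_rm record (declaration not shown in this diff):
// one entry per cell touched by seq_rm(), so restore() can put the cell back
struct slot_seq_rm {
    llama_seq_id seq_id; // sequence id that was removed from the cell
    llama_pos    p;      // position the cell held before the removal
    uint32_t     c;      // index of the affected cell
};
// presumably collected as pending.seq_rms (std::vector<slot_seq_rm>) and replayed in restore()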
@@ -331,43 +333,58 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
 }
 
 void llama_kv_cache_unified::restore() {
-    if (pending.ranges.empty()) {
+    if (pending.ubatches.empty()) {
         return;
     }
 
-    // TODO: here we assume that all sequences should be removed from the cache which is not always the case
-    // need to start keeping more detailed pending information per-sequence
-
     uint32_t new_head = size;
 
-    for (auto & range : pending.ranges) {
-        for (uint32_t i = range.c0; i < range.c1; ++i) {
-            cells[i].seq_id.clear();
+    for (const auto & ubatch : pending.ubatches) {
+        for (uint32_t i = 0; i < ubatch.data.n_tokens; ++i) {
+            for (int s = 0; s < ubatch.data.n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id = ubatch.data.seq_id[i][s];
 
-            // keep count of the number of used cells
-            if (cells[i].pos >= 0) {
-                used--;
-            }
+                cells[ubatch.head + i].seq_id.erase(seq_id);
+                if (cells[ubatch.head + i].seq_id.empty()) {
+                    used--;
 
-            cells[i].pos = -1;
-        }
+                    new_head = std::min(new_head, ubatch.head + i);
+                }
 
-        new_head = std::min(new_head, range.c0);
+                cells[ubatch.head + i].pos = -1;
+            }
+        }
     }
 
     if (new_head != size && new_head < head) {
         head = new_head;
     }
+
+    for (const auto & seq_rm : pending.seq_rms) {
+        GGML_ASSERT(seq_rm.seq_id >= 0 && "seq_rm.seq_id < 0 during restore - should not happen");
+
+        if (cells[seq_rm.c].seq_id.empty()) {
+            GGML_ASSERT(cells[seq_rm.c].pos == -1 && "cells[seq_rm.c].pos != -1 during restore - should not happen");
+            used++;
+        } else {
+            GGML_ASSERT(cells[seq_rm.c].pos == seq_rm.p && "cells[seq_rm.c].pos != seq_rm.p during restore - should not happen");
+        }
+
+        cells[seq_rm.c].seq_id.insert(seq_rm.seq_id);
+        cells[seq_rm.c].pos = seq_rm.p;
+    }
+
+    pending.clear();
 }
 
 void llama_kv_cache_unified::commit() {
-    if (pending.ranges.empty()) {
+    if (pending.ubatches.empty()) {
         LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
             __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
         return;
     }
 
-    pending.ranges.clear();
+    pending.clear();
 }
 
 bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -430,6 +447,8 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
         do_defrag = false;
     }
 
+    pending.clear();
+
     return need_reserve;
 }
 
@@ -459,7 +478,7 @@ llama_sbatch llama_kv_cache_unified::sbatch_init(
 llama_ubatch llama_kv_cache_unified::ubatch_next(
         llama_sbatch & sbatch,
         uint32_t n_ubatch,
-        bool embd_pooled) const {
+        bool embd_pooled) {
     GGML_UNUSED(embd_pooled);
     return sbatch.split_simple(n_ubatch);
 }
@@ -519,7 +538,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
 
     used += n_tokens;
 
-    pending.ranges.push_back({head, head + n_tokens});
+    pending.ubatches.push_back({ head, ubatch });
 
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
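find_slot() now records the whole ubatch together with the head it was placed at, and restore()/commit()/update() call pending.clear(). The header-side bookkeeping is not part of this diff; a rough sketch of what it presumably looks like (names inferred from the usages above, e.g. ubatch.head and ubatch.data in restore()) is:

// assumed pending bookkeeping (declarations not shown in this diff)
struct slot_ubatch {
    uint32_t     head; // first cell the ubatch was written to
    llama_ubatch data; // copy of the ubatch, so restore() knows which seq_ids/positions to undo
};

struct {
    std::vector<slot_seq_rm> seq_rms;  // cells modified by seq_rm() since the last commit
    std::vector<slot_ubatch> ubatches; // ubatches placed by find_slot() since the last commit

    void clear() {
        seq_rms.clear();
        ubatches.clear();
    }
} pending;

This is only a sketch of the assumed layout, not the actual header change.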
@@ -1568,13 +1587,13 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
         bool offload,
         uint32_t kv_size,
         uint32_t n_seq_max,
-        uint32_t n_batch,
+        uint32_t n_ubatch,
         uint32_t padding) : hparams(model.hparams) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t kv_size_base = kv_size;
-    const uint32_t kv_size_swa  = std::min(kv_size, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+    const uint32_t kv_size_swa  = std::min(kv_size, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, padding));
 
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, kv_size_base);
 
@@ -1629,21 +1648,6 @@ void llama_kv_cache_unified_iswa::restore() {
 }
 
 void llama_kv_cache_unified_iswa::commit() {
-    if (pending.pos_max.empty()) {
-        return;
-    }
-
-    // slide the window, forgetting old tokens
-    for (const auto & [seq_id, pos_max] : pending.pos_max) {
-        if (pos_max <= (llama_pos) hparams.n_swa) {
-            continue;
-        }
-
-        kv_swa->seq_rm(seq_id, -1, pos_max - hparams.n_swa + 1);
-    }
-
-    pending.pos_max.clear();
-
     kv_base->commit();
     kv_swa ->commit();
 }
@@ -1668,21 +1672,34 @@ void llama_kv_cache_unified_iswa::set_full() {
 }
 
 llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
-    // this will be used upon successful decode, during commit, to remove old SWA tokens
-    for (int i = 0; i < batch.n_tokens; ++i) {
-        for (int s = 0; s < batch.n_seq_id[i]; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[i][s];
-            const llama_pos    pos    = batch.pos[i];
+    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+}
 
-            pending.pos_max[seq_id] = std::max(pending.pos_max[seq_id], pos);
+llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) {
+    GGML_UNUSED(embd_pooled);
+    auto res = sbatch.split_simple(n_ubatch);
+
+    for (uint32_t i = 0; i < res.n_tokens; ++i) {
+        for (int s = 0; s < res.n_seq_id[i]; ++s) {
+            const llama_seq_id seq_id = res.seq_id[i][s];
+            const llama_pos    pos    = res.pos[i];
+
+            pos_max_per_seq[seq_id] = std::max(pos_max_per_seq[seq_id], pos);
         }
     }
 
-    return kv_base->sbatch_init(batch, logits_all);
-}
+    // slide the window, forgetting old tokens
+    for (const auto & [seq_id, pos_max] : pos_max_per_seq) {
+        if (pos_max <= (llama_pos) hparams.n_swa) {
+            continue;
+        }
+
+        kv_swa->seq_rm(seq_id, -1, pos_max - hparams.n_swa + 1);
+    }
+
+    pos_max_per_seq.clear();
 
-llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
-    return kv_base->ubatch_next(sbatch, n_ubatch, embd_pooled);
+    return res;
 }
 
 bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
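The SWA pruning that used to happen in commit() now runs as soon as ubatch_next() splits off a ubatch: the highest position seen per sequence is accumulated in pos_max_per_seq and anything older than n_swa is dropped from kv_swa right away, which is why the commit() override above shrank to just forwarding to the two child caches. pos_max_per_seq is a new member whose declaration is not shown here; a plausible shape (an assumption, not the actual header change) is:

// assumed new member of llama_kv_cache_unified_iswa (declaration not in this diff),
// requires <map>: highest position seen per sequence, used to slide the SWA window
std::map<llama_seq_id, llama_pos> pos_max_per_seq;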
@@ -2094,7 +2111,7 @@ llama_sbatch llama_kv_cache_recurrent::sbatch_init(
     return llama_sbatch(batch, hparams.n_embd, false, logits_all);
 }
 
-llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
+llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) {
     if (embd_pooled) {
         // Pooled embeddings cannot be split across ubatches (yet)
         return sbatch.split_seq(n_ubatch);