@@ -333,31 +333,41 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
 }
 
 void llama_kv_cache_unified::restore() {
-    if (pending.ubatches.empty()) {
-        return;
-    }
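+    // the rollback strategy depends on the SWA type: without SWA the pending
+    // ubatches can be undone in place; with SWA, pruned cells cannot be
+    // recreated, so we fall back to the snapshot taken in sbatch_init()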
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            {
+                uint32_t new_head = size;
 
-    uint32_t new_head = size;
+                for (const auto & ubatch : pending.ubatches) {
+                    for (uint32_t i = 0; i < ubatch.data.n_tokens; ++i) {
+                        for (int s = 0; s < ubatch.data.n_seq_id[i]; ++s) {
+                            const llama_seq_id seq_id = ubatch.data.seq_id[i][s];
 
-    for (const auto & ubatch : pending.ubatches) {
-        for (uint32_t i = 0; i < ubatch.data.n_tokens; ++i) {
-            for (int s = 0; s < ubatch.data.n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = ubatch.data.seq_id[i][s];
+                            cells[ubatch.head + i].seq_id.erase(seq_id);
+                            if (cells[ubatch.head + i].seq_id.empty()) {
+                                used--;
 
-                cells[ubatch.head + i].seq_id.erase(seq_id);
-                if (cells[ubatch.head + i].seq_id.empty()) {
-                    used--;
+                                new_head = std::min(new_head, ubatch.head + i);
+                            }
 
-                    new_head = std::min(new_head, ubatch.head + i);
+                            cells[ubatch.head + i].pos = -1;
+                        }
+                    }
                 }
 
-                cells[ubatch.head + i].pos = -1;
-            }
-        }
-    }
+                if (new_head != size && new_head < head) {
+                    head = new_head;
+                }
 
-    if (new_head != size && new_head < head) {
-        head = new_head;
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
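+                // prune_swa() removes cells destructively, so roll back to the
+                // copy of the cells saved in sbatch_init()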
+                if (!pending.cells_org.empty()) {
+                    cells = std::move(pending.cells_org);
+                    used  = pending.used_org;
+                }
+            } break;
     }
 
     pending.clear();
@@ -460,16 +470,23 @@ void llama_kv_cache_unified::set_full() {
     head = 0;
 }
 
-llama_sbatch llama_kv_cache_unified::sbatch_init(
-        const llama_batch & batch,
-        bool logits_all) {
+llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) {
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            {
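+                // nothing to do: restore() can undo the pending ubatches in place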
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
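+                // save the current cell state so that restore() can roll back
+                // the destructive pruning performed by prune_swa()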
+                pending.cells_org = cells;
+                pending.used_org  = used;
+            } break;
+    }
+
     return llama_sbatch(batch, hparams.n_embd, true, logits_all);
 }
 
-llama_ubatch llama_kv_cache_unified::ubatch_next(
-        llama_sbatch & sbatch,
-        uint32_t n_ubatch,
-        bool embd_pooled) const {
+llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) {
     GGML_UNUSED(embd_pooled);
     return sbatch.split_simple(n_ubatch);
 }
@@ -642,6 +659,33 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     return ggml_cpy(ctx, v_cur, v_view);
 }
 
+void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos p1) {
+    GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE);
+
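+    // erase every cell whose position is masked by the sliding window relative
+    // to the newest position p1; seq_id < 0 applies the prune to all sequences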
+    for (uint32_t i = 0; i < size; ++i) {
+        const llama_pos p0 = cells[i].pos;
+
+        if (is_masked_swa(p0, p1)) {
+            if (seq_id < 0) {
+                cells[i].seq_id.clear();
+            } else if (cells[i].has_seq_id(seq_id)) {
+                cells[i].seq_id.erase(seq_id);
+            } else {
+                continue;
+            }
+
+            if (cells[i].is_empty()) {
+                // keep count of the number of used cells
+                if (cells[i].pos >= 0) {
+                    used--;
+                }
+
+                cells[i].pos = -1;
+            }
+        }
+    }
+}
+
 void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     const int64_t n_tokens     = ubatch->n_tokens;
     const int64_t n_seq_tokens = ubatch->n_seq_tokens;
@@ -1589,13 +1633,13 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
         bool offload,
         uint32_t kv_size,
         uint32_t n_seq_max,
-        uint32_t n_batch,
+        uint32_t n_ubatch,
         uint32_t padding) : hparams(model.hparams) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t kv_size_base = kv_size;
-    const uint32_t kv_size_swa  = std::min(kv_size, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
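+    // the SWA cache is now pruned after every ubatch, so per sequence it only
+    // needs to hold the attention window plus one ubatch worth of new tokens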
+    const uint32_t kv_size_swa  = std::min(kv_size, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, padding));
 
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, kv_size_base);
 
@@ -1658,21 +1702,6 @@ void llama_kv_cache_unified_iswa::restore() {
 void llama_kv_cache_unified_iswa::commit() {
     kv_base->commit();
     kv_swa ->commit();
-
-    if (pending.pos_max.empty()) {
-        return;
-    }
-
-    // slide the attention window, forgetting/pruning old tokens that are outside the window
-    for (const auto & [seq_id, pos_max] : pending.pos_max) {
-        if (pos_max <= (llama_pos) hparams.n_swa) {
-            continue;
-        }
-
-        kv_swa->seq_rm(seq_id, -1, pos_max - hparams.n_swa + 1);
-    }
-
-    pending.pos_max.clear();
 }
 
 bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
@@ -1695,21 +1724,30 @@ void llama_kv_cache_unified_iswa::set_full() {
 }
 
 llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
-    for (int i = 0; i < batch.n_tokens; ++i) {
-        for (int s = 0; s < batch.n_seq_id[i]; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[i][s];
-            const llama_pos    pos    = batch.pos[i];
+    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+}
+
+llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) {
+    GGML_UNUSED(embd_pooled);
+    auto res = sbatch.split_simple(n_ubatch);
 
-            pending.pos_max[seq_id] = std::max(pending.pos_max[seq_id], pos);
+    pos_max_per_seq.clear();
+
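+    // record the newest position seen for each sequence in this ubatch; it
+    // marks how far the sequence's attention window has advanced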
+    for (uint32_t i = 0; i < res.n_tokens; ++i) {
+        for (int s = 0; s < res.n_seq_id[i]; ++s) {
+            const llama_seq_id seq_id = res.seq_id[i][s];
+            const llama_pos    pos    = res.pos[i];
+
+            pos_max_per_seq[seq_id] = std::max(pos_max_per_seq[seq_id], pos);
         }
     }
 
-    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
-}
+    // slide the attention window, forgetting/pruning old tokens that are outside the window
+    for (const auto & [seq_id, pos_max] : pos_max_per_seq) {
+        kv_swa->prune_swa(seq_id, pos_max);
+    }
 
-llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
-    GGML_UNUSED(embd_pooled);
-    return sbatch.split_simple(n_ubatch);
+    return res;
 }
 
 bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
@@ -2122,7 +2160,7 @@ llama_sbatch llama_kv_cache_recurrent::sbatch_init(
     return llama_sbatch(batch, hparams.n_embd, false, logits_all);
 }
 
-llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
+llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) {
     if (embd_pooled) {
         // Pooled embeddings cannot be split across ubatches (yet)
         return sbatch.split_seq(n_ubatch);