
Commit c699abc

llama : add param to control SWA cache size
ggml-ci
1 parent 84742ef commit c699abc

11 files changed: +63 −27 lines changed


common/arg.cpp

Lines changed: 8 additions & 0 deletions

@@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
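The new flag is a plain store-true option: passing it flips `common_params::swa_full`, which `common_context_params_to_llama()` then copies into the context params (see common/common.cpp below). A minimal sketch of how a tool built on `common` would pick it up; the `common_params_parse()` entry point and its exact signature are assumptions here, not part of this commit:

```cpp
#include "arg.h"     // common_params_parse (assumed entry point from common/arg.h)
#include "common.h"  // common_params, common_context_params_to_llama
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    common_params params; // swa_full defaults to false (see common/common.h below)

    // e.g. invoked as: ./demo -m model.gguf --swa-full
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    printf("swa_full = %s\n", params.swa_full ? "true" : "false");

    // the boolean is forwarded to the library via common_context_params_to_llama()
    const llama_context_params cparams = common_context_params_to_llama(params);
    return cparams.swa_full == params.swa_full ? 0 : 1;
}
```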

common/common.cpp

Lines changed: 1 addition & 0 deletions

@@ -1133,6 +1133,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;
 
     if (params.reranking) {
         cparams.embeddings = true;

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -323,6 +323,7 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads

include/llama.h

Lines changed: 5 additions & 4 deletions

@@ -361,10 +361,11 @@ extern "C" {
 
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-        bool op_offload;  // whether to offload host tensor operations to device
+        bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // use flash attention [EXPERIMENTAL]
+        bool no_perf;     // measure performance timings
+        bool op_offload;  // offload host tensor operations to device
+        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     };
 
     // model quantization parameters
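For programs using the C API directly, the new field is just another boolean on `llama_context_params`. A hedged sketch of opting out of the full-size SWA cache when creating a context; the model path is hypothetical, and `llama_model_load_from_file` / `llama_init_from_model` are assumed to be the current loader and context-creation entry points:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // hypothetical path
    if (!model) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx    = 32768;
    cparams.swa_full = false; // size the SWA cache to the attention window and enable pruning
                              // (the library default after this commit is true, i.e. full-size)

    llama_context * ctx = llama_init_from_model(model, cparams);
    if (!ctx) {
        fprintf(stderr, "failed to create context\n");
        llama_model_free(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}
```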

src/llama-context.cpp

Lines changed: 4 additions & 2 deletions

@@ -177,8 +177,9 @@ llama_context::llama_context(
     // init the memory module
     if (!hparams.vocab_only) {
         llama_memory_params params_mem = {
-            /*.type_k =*/ params.type_k,
-            /*.type_v =*/ params.type_v,
+            /*.type_k   =*/ params.type_k,
+            /*.type_v   =*/ params.type_v,
+            /*.swa_full =*/ params.swa_full,
         };
 
         memory.reset(model.create_memory(params_mem, cparams));
@@ -2092,6 +2093,7 @@ llama_context_params llama_context_default_params() {
         /*.flash_attn =*/ false,
         /*.no_perf =*/ true,
         /*.op_offload =*/ true,
+        /*.swa_full =*/ true,
     };
 
     return result;
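Note the asymmetry in defaults introduced here: `llama_context_default_params()` enables `swa_full` (full-size cache, preserving prior behavior for direct API users), while `common_params` defaults it to `false`, so the CLI tools get the window-sized cache unless `--swa-full` is passed, and llama-bench pins it to `false` outright. A tiny check against the library default (a sketch, assuming only llama.h):

```cpp
#include "llama.h"

int main() {
    // library-level default added by this commit: full-size SWA cache
    const llama_context_params cparams = llama_context_default_params();
    return cparams.swa_full ? 0 : 1; // expected to return 0 with this commit applied
}
```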

src/llama-kv-cache.cpp

Lines changed: 34 additions & 18 deletions

@@ -1656,27 +1656,38 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
         bool v_trans,
         bool offload,
         uint32_t kv_size,
+        bool swa_full,
         uint32_t n_seq_max,
         uint32_t n_batch,
         uint32_t padding) : hparams(model.hparams) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
-    const uint32_t kv_size_base = kv_size;
-    const uint32_t kv_size_swa  = std::min(kv_size, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+    const uint32_t size_base = kv_size;
 
-    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, kv_size_base);
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+
+    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
+    if (swa_full) {
+        LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+        size_swa = size_base;
+        do_prune = false;
+    }
+
+    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
 
     kv_base = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, kv_size_base, padding,
+            v_trans, offload, size_base, padding,
             0, LLAMA_SWA_TYPE_NONE);
 
-    LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, kv_size_swa);
+    LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, kv_size_swa, padding,
+            v_trans, offload, size_swa, padding,
             hparams.n_swa, hparams.swa_type);
 }
 
@@ -1733,8 +1744,11 @@ void llama_kv_cache_unified_iswa::commit() {
     kv_swa ->commit();
 
     // slide the attention window, forgetting/pruning old tokens that are outside the window
-    for (const auto & [seq_id, entry] : pending.pos) {
-        kv_swa->prune_swa(seq_id, entry.pmin, entry.pmax);
+    if (do_prune) {
+        for (const auto & [seq_id, entry] : pending.pos) {
+            kv_swa->prune_swa(seq_id, entry.pmin, entry.pmax);
+        }
+
     }
 
     pending.clear();
@@ -1762,17 +1776,19 @@ void llama_kv_cache_unified_iswa::set_full() {
 llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
     pending.clear();
 
-    for (int i = 0; i < batch.n_tokens; ++i) {
-        for (int s = 0; s < batch.n_seq_id[i]; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[i][s];
-            const llama_pos pos = batch.pos[i];
+    if (do_prune) {
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id = batch.seq_id[i][s];
+                const llama_pos pos = batch.pos[i];
 
-            if (pending.pos.find(seq_id) == pending.pos.end()) {
-                pending.pos[seq_id].pmin = pos;
-                pending.pos[seq_id].pmax = pos;
-            } else {
-                pending.pos[seq_id].pmin = std::min(pending.pos[seq_id].pmin, pos);
-                pending.pos[seq_id].pmax = std::max(pending.pos[seq_id].pmax, pos);
+                if (pending.pos.find(seq_id) == pending.pos.end()) {
+                    pending.pos[seq_id].pmin = pos;
+                    pending.pos[seq_id].pmax = pos;
+                } else {
+                    pending.pos[seq_id].pmin = std::min(pending.pos[seq_id].pmin, pos);
+                    pending.pos[seq_id].pmax = std::max(pending.pos[seq_id].pmax, pos);
+                }
             }
         }
     }
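The sizing rule above is the core of the change: unless `swa_full` is set, the SWA cache only needs to cover the attention window for each sequence plus one batch, rounded up to the cache padding. A standalone sketch of that computation with hypothetical numbers; the `pad()` helper mirrors ggml's `GGML_PAD` macro (round `x` up to a multiple of `n`), and the hparams values are illustrative, not taken from any particular model:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// round x up to a multiple of n (same result as GGML_PAD for the power-of-two paddings used here)
static uint32_t pad(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const uint32_t kv_size   = 32768; // base (non-SWA) cache size, i.e. n_ctx
    const uint32_t n_swa     = 1024;  // sliding-window width from the model hparams (hypothetical)
    const uint32_t n_seq_max = 1;
    const uint32_t n_batch   = 2048;
    const uint32_t padding   = 256;

    const uint32_t size_base = kv_size;
    uint32_t       size_swa  = std::min(size_base, pad(n_swa*n_seq_max + n_batch, padding));

    // with swa_full == true the SWA cache keeps the full size and pruning is disabled
    const bool swa_full = false;
    if (swa_full) {
        size_swa = size_base;
    }

    printf("non-SWA cells: %u, SWA cells: %u\n", size_base, size_swa); // 32768, 3072
    return 0;
}
```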

src/llama-kv-cache.h

Lines changed: 4 additions & 0 deletions

@@ -318,6 +318,7 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {
             bool v_trans,
             bool offload,
             uint32_t kv_size,
+            bool swa_full,
             uint32_t n_seq_max,
             uint32_t n_batch,
             uint32_t padding);
@@ -380,6 +381,8 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {
 private:
     const llama_hparams & hparams;
 
+    bool do_prune = true;
+
     struct {
         struct entry {
             llama_pos pmin;
@@ -390,6 +393,7 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {
             pos.clear();
         }
 
+        // used to perform SWA pruning of old tokens
         std::unordered_map<llama_seq_id, entry> pos;
     } pending;
 
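The `pending` bookkeeping that `do_prune` now gates is just a per-sequence min/max of the positions seen in the current batch; `commit()` later hands those bounds to `prune_swa()`. A simplified standalone sketch of that accumulation, with plain typedefs standing in for `llama_seq_id`/`llama_pos`:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <unordered_map>

// stand-ins for llama_seq_id / llama_pos
using seq_id_t = int32_t;
using pos_t    = int32_t;

struct entry {
    pos_t pmin;
    pos_t pmax;
};

// record a (seq_id, pos) pair, widening the per-sequence [pmin, pmax] range
static void track(std::unordered_map<seq_id_t, entry> & pos, seq_id_t seq_id, pos_t p) {
    auto it = pos.find(seq_id);
    if (it == pos.end()) {
        pos[seq_id] = { p, p };
    } else {
        it->second.pmin = std::min(it->second.pmin, p);
        it->second.pmax = std::max(it->second.pmax, p);
    }
}

int main() {
    std::unordered_map<seq_id_t, entry> pending_pos;

    // hypothetical batch: sequence 0 covers positions 100..102, sequence 1 covers position 7
    track(pending_pos, 0, 100);
    track(pending_pos, 0, 101);
    track(pending_pos, 0, 102);
    track(pending_pos, 1, 7);

    for (const auto & [seq_id, e] : pending_pos) {
        printf("seq %d: pmin = %d, pmax = %d\n", seq_id, e.pmin, e.pmax);
        // the real cache would call kv_swa->prune_swa(seq_id, e.pmin, e.pmax) here
    }
    return 0;
}
```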
src/llama-memory.h

Lines changed: 2 additions & 2 deletions

@@ -7,8 +7,8 @@ struct llama_memory_params {
     ggml_type type_k;
     ggml_type type_v;
 
-    // parameters for other types of memory
-    // ...
+    // use full-size SWA cache
+    bool swa_full;
 };
 
 // general concept of LLM memory

src/llama-model.cpp

Lines changed: 1 addition & 0 deletions

@@ -13227,6 +13227,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                        !cparams.flash_attn,
                         cparams.offload_kqv,
                         cparams.n_ctx,
+                        params.swa_full,
                         cparams.n_seq_max,
                         cparams.n_batch,
                         padding);

tools/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 0 deletions

@@ -991,6 +991,7 @@ struct cmd_params_instance {
         cparams.flash_attn = flash_attn;
         cparams.embeddings = embeddings;
         cparams.op_offload = !no_op_offload;
+        cparams.swa_full = false;
 
         return cparams;
     }
