
Commit 2e14338

additional padding for the swa kv cache itself

1 parent: ff2cabc

2 files changed: +8 −1 lines changed

gpttype_adapter.cpp
5 additions & 1 deletion

@@ -45,7 +45,7 @@
 #include "common/common.h"
 
 //const
-const int extra_context_handle_fragmentation = 120;
+const int extra_context_handle_fragmentation = 128;
 const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
 const int LLAVA_TOKEN_IDENTIFIER_B = -999;

@@ -2174,6 +2174,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
     }
+    else
+    {
+        llama_ctx_params.n_ctx += (extra_context_handle_fragmentation/2);
+    }
 
     llama_ctx_params.offload_kqv = !inputs.low_vram;
     model_params.use_mmap = inputs.use_mmap;
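
For context, here is a minimal standalone C++ sketch of the padding arithmetic this hunk implements. The condition guarding the first branch sits outside the hunk, so needs_full_padding below is a hypothetical stand-in, and requested_ctx is only an example value; this is not the actual KoboldCpp loader code.

// Sketch of the n_ctx padding added around context allocation.
// `needs_full_padding` and `requested_ctx` are illustrative stand-ins.
#include <cstdint>
#include <cstdio>

const int extra_context_handle_fragmentation = 128;

uint32_t padded_n_ctx(uint32_t requested_ctx, bool needs_full_padding) {
    if (needs_full_padding) {
        // original branch: full fragmentation headroom
        return requested_ctx + extra_context_handle_fragmentation;
    } else {
        // branch added by this commit: half the headroom for the remaining cases
        return requested_ctx + (extra_context_handle_fragmentation / 2);
    }
}

int main() {
    std::printf("%u\n", padded_n_ctx(4096, true));   // 4224
    std::printf("%u\n", padded_n_ctx(4096, false));  // 4160
    return 0;
}

Within this if/else, contexts that previously received no extra headroom now get half of it, on top of the bump of the constant itself from 120 to 128.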

src/llama-kv-cache-unified-iswa.cpp
3 additions & 0 deletions

@@ -29,6 +29,9 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
 
+    //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
+    size_swa += 32;
+
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
         LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
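
Below is a minimal sketch of the SWA cache sizing after this change, using example values for the parameters; round_up_to_multiple is a local stand-in for GGML_PAD (round x up to a multiple of n), and the extra 32 cells mirror the padding added in the hunk.

// Sketch of the SWA KV-cache sizing with the commit's extra padding.
// All numeric values are examples, not defaults from the library.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t round_up_to_multiple(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    const uint32_t size_base = 4224; // base (non-SWA) cache size, example value
    const uint32_t n_swa     = 1024; // sliding-window size, example value
    const uint32_t n_seq_max = 1;    // max parallel sequences, example value
    const uint32_t n_ubatch  = 512;  // micro-batch size, example value
    const uint32_t n_pad     = 256;  // padding granularity, example value

    uint32_t size_swa = std::min(size_base,
                                 round_up_to_multiple(n_swa * n_seq_max + n_ubatch, n_pad));

    // kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
    size_swa += 32;

    std::printf("size_swa = %u\n", size_swa); // 1536 + 32 = 1568 with these values
    return 0;
}

Note that the +32 is applied after the std::min clamp, so size_swa can end up slightly larger than size_base; the point of the change is to give the sliding-window cache its own headroom.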
