
Commit 2e14338

additional padding for the swa kv cache itself

1 parent: ff2cabc

2 files changed: +8 −1 lines changed

gpttype_adapter.cpp
5 additions & 1 deletion

@@ -45,7 +45,7 @@
 #include "common/common.h"
 
 //const
-const int extra_context_handle_fragmentation = 120;
+const int extra_context_handle_fragmentation = 128;
 const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
 const int LLAVA_TOKEN_IDENTIFIER_B = -999;

@@ -2174,6 +2174,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
     }
+    else
+    {
+        llama_ctx_params.n_ctx += (extra_context_handle_fragmentation/2);
+    }
 
     llama_ctx_params.offload_kqv = !inputs.low_vram;
     model_params.use_mmap = inputs.use_mmap;
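
For context, here is a minimal standalone C++ sketch of the padding arithmetic this hunk implements. The condition guarding the first branch sits outside the hunk, so needs_full_padding below is a hypothetical stand-in, and requested_ctx is only an example value; this is not the actual KoboldCpp loader code.

// Sketch of the n_ctx padding added around context allocation.
// `needs_full_padding` and `requested_ctx` are illustrative stand-ins.
#include <cstdint>
#include <cstdio>

const int extra_context_handle_fragmentation = 128;

uint32_t padded_n_ctx(uint32_t requested_ctx, bool needs_full_padding) {
    if (needs_full_padding) {
        // original branch: full fragmentation headroom
        return requested_ctx + extra_context_handle_fragmentation;
    } else {
        // branch added by this commit: half the headroom for the remaining cases
        return requested_ctx + (extra_context_handle_fragmentation / 2);
    }
}

int main() {
    std::printf("%u\n", padded_n_ctx(4096, true));   // 4224
    std::printf("%u\n", padded_n_ctx(4096, false));  // 4160
    return 0;
}

Within this if/else, contexts that previously received no extra headroom now get half of it, on top of the bump of the constant itself from 120 to 128.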

src/llama-kv-cache-unified-iswa.cpp
3 additions & 0 deletions

@@ -29,6 +29,9 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
 
+    //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
+    size_swa += 32;
+
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
         LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
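
Below is a minimal sketch of the SWA cache sizing after this change, using example values for the parameters; round_up_to_multiple is a local stand-in for GGML_PAD (round x up to a multiple of n), and the extra 32 cells mirror the padding added in the hunk.

// Sketch of the SWA KV-cache sizing with the commit's extra padding.
// All numeric values are examples, not defaults from the library.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t round_up_to_multiple(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    const uint32_t size_base = 4224; // base (non-SWA) cache size, example value
    const uint32_t n_swa     = 1024; // sliding-window size, example value
    const uint32_t n_seq_max = 1;    // max parallel sequences, example value
    const uint32_t n_ubatch  = 512;  // micro-batch size, example value
    const uint32_t n_pad     = 256;  // padding granularity, example value

    uint32_t size_swa = std::min(size_base,
                                 round_up_to_multiple(n_swa * n_seq_max + n_ubatch, n_pad));

    // kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
    size_swa += 32;

    std::printf("size_swa = %u\n", size_swa); // 1536 + 32 = 1568 with these values
    return 0;
}

Note that the +32 is applied after the std::min clamp, so size_swa can end up slightly larger than size_base; the point of the change is to give the sliding-window cache its own headroom.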
