Skip to content

Commit 8450a9a

Browse files
committed
Revert "llama : remove KV cache defragmentation logic (ggml-org#15473)"
1 parent 1b54391 commit 8450a9a

File tree

11 files changed

+408
-29
lines changed

11 files changed

+408
-29
lines changed

common/arg.cpp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2256,11 +2256,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
22562256
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
22572257
add_opt(common_arg(
22582258
{"-dt", "--defrag-thold"}, "N",
2259-
string_format("KV cache defragmentation threshold (DEPRECATED)"),
2259+
string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
22602260
[](common_params & params, const std::string & value) {
2261-
GGML_UNUSED(params);
2262-
GGML_UNUSED(value);
2263-
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
2261+
params.defrag_thold = std::stof(value);
22642262
}
22652263
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
22662264
add_opt(common_arg(

common/common.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1160,6 +1160,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
11601160
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
11611161
cparams.pooling_type = params.pooling_type;
11621162
cparams.attention_type = params.attention_type;
1163+
cparams.defrag_thold = params.defrag_thold;
11631164
cparams.cb_eval = params.cb_eval;
11641165
cparams.cb_eval_user_data = params.cb_eval_user_data;
11651166
cparams.offload_kqv = !params.no_kv_offload;

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -284,6 +284,7 @@ struct common_params {
284284
float yarn_beta_fast = 32.0f; // YaRN low correction dim
285285
float yarn_beta_slow = 1.0f; // YaRN high correction dim
286286
int32_t yarn_orig_ctx = 0; // YaRN original context length
287+
float defrag_thold = 0.1f; // KV cache defragmentation threshold
287288

288289
// offload params
289290
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

include/llama.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -315,7 +315,7 @@ extern "C" {
315315
float yarn_beta_fast; // YaRN low correction dim
316316
float yarn_beta_slow; // YaRN high correction dim
317317
uint32_t yarn_orig_ctx; // YaRN original context size
318-
float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
318+
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
319319

320320
ggml_backend_sched_eval_callback cb_eval;
321321
void * cb_eval_user_data;

src/llama-context.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ llama_context::llama_context(
3939
cparams.yarn_attn_factor = params.yarn_attn_factor;
4040
cparams.yarn_beta_fast = params.yarn_beta_fast;
4141
cparams.yarn_beta_slow = params.yarn_beta_slow;
42+
cparams.defrag_thold = params.defrag_thold;
4243
cparams.embeddings = params.embeddings;
4344
cparams.offload_kqv = params.offload_kqv;
4445
cparams.flash_attn = params.flash_attn;
@@ -977,7 +978,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
977978

978979
bool did_optimize = false;
979980

980-
// handle any pending shifts/copies
981+
// handle any pending defrags/shifts
981982
memory_update(false);
982983

983984
llama_memory_context_ptr mctx;

src/llama-cparams.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ struct llama_cparams {
2424
float yarn_attn_factor;
2525
float yarn_beta_fast;
2626
float yarn_beta_slow;
27+
float defrag_thold;
2728

2829
bool embeddings;
2930
bool causal_attn;

0 commit comments

Comments
 (0)