Commit 9ebebef

llama : remove KV cache defragmentation logic (ggml-org#15473)
ggml-ci
1 parent ad5c975 commit 9ebebef

File tree

16 files changed: 32 additions, 440 deletions


common/arg.cpp

Lines changed: 4 additions & 2 deletions

@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params.defrag_thold = std::stof(value);
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
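
Usage note: -dt/--defrag-thold (and the LLAMA_ARG_DEFRAG_THOLD environment variable) is still accepted so that existing command lines keep working, but the value is now discarded and the only effect is the deprecation warning shown above.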

common/common.cpp

Lines changed: 0 additions & 1 deletion

@@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
-    cparams.defrag_thold      = params.defrag_thold;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;

common/common.h

Lines changed: 0 additions & 1 deletion

@@ -288,7 +288,6 @@ struct common_params {
     float   yarn_beta_fast = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;    // YaRN original context length
-    float   defrag_thold   = 0.1f; // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
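
Note that common_params drops the field outright, so in-tree code that still sets params.defrag_thold stops compiling, whereas the public C API below keeps the struct member for compatibility.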

examples/llama.vim

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
 "
 "   --batch-size [512, model max context]
 "

include/llama.h

Lines changed: 1 addition & 1 deletion

@@ -312,7 +312,7 @@ extern "C" {
         float    yarn_beta_fast;   // YaRN low correction dim
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float    defrag_thold;     // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
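
Since only the comment changes here, C API callers stay source- and ABI-compatible: the struct member remains, but nothing reads it anymore. A minimal sketch of caller code that still compiles after this commit (model and context creation elided, as they are unchanged):

#include "llama.h"

int main(void) {
    llama_context_params cparams = llama_context_default_params();
    // [DEPRECATED] assigning the field still compiles, but no code path
    // reads the value anymore, so no defragmentation pass is scheduled
    cparams.defrag_thold = 0.1f;
    // ... load a model and create the context as usual ...
    return 0;
}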

scripts/compare-llama-bench.py

Lines changed: 0 additions & 1 deletion

@@ -28,7 +28,6 @@
     "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
     "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
     "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
-    "defrag_thold",
     "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth",
     "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
 ]

src/llama-context.cpp

Lines changed: 1 addition & 2 deletions

@@ -39,7 +39,6 @@ llama_context::llama_context(
     cparams.yarn_attn_factor = params.yarn_attn_factor;
     cparams.yarn_beta_fast   = params.yarn_beta_fast;
     cparams.yarn_beta_slow   = params.yarn_beta_slow;
-    cparams.defrag_thold     = params.defrag_thold;
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.flash_attn       = params.flash_attn;
@@ -978,7 +977,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     bool did_optimize = false;
 
-    // handle any pending defrags/shifts
+    // handle any pending shifts/copies
     memory_update(false);
 
     llama_memory_context_ptr mctx;
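
With the defragmentation pass gone, the memory_update(false) call at the top of decode() only has pending shifts and copies left to apply, which is what the updated comment reflects.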

src/llama-cparams.h

Lines changed: 0 additions & 1 deletion

@@ -24,7 +24,6 @@ struct llama_cparams {
     float yarn_attn_factor;
     float yarn_beta_fast;
     float yarn_beta_slow;
-    float defrag_thold;
 
     bool embeddings;
     bool causal_attn;
