Commit 318c4f8

common : rename kv_split -> kv_unified
ggml-ci
1 parent fb8150d commit 318c4f8

File tree

5 files changed (+20, -11 lines)

common/arg.cpp
common/common.cpp
common/common.h
src/llama-context.cpp
src/llama-kv-cache-unified.cpp

common/arg.cpp

Lines changed: 4 additions & 4 deletions
@@ -1465,11 +1465,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
-        {"--kv-split", "-kvs"},
-        string_format("use multiple streams when computing the attention (default: %s)\n"
-            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_split ? "true" : "false"),
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
         [](common_params & params) {
-            params.kv_split = true;
+            params.kv_unified = true;
         }
     ).set_env("LLAMA_ARG_KV_SPLIT"));
     add_opt(common_arg(

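For readers following along, here is a rough sketch of the caller-side effect of the rename. It is not part of the commit: common_params_parse and LLAMA_EXAMPLE_COMMON are the existing helpers declared in common/common.h, and everything else is illustrative.

// Hypothetical caller (illustrative only): after parsing, --kv-unified / -kvu
// now sets common_params::kv_unified; the old --kv-split / -kvs flag is gone.
#include "common.h"

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
    // params.kv_unified is true only when the new flag was given
    return 0;
}
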
common/common.cpp

Lines changed: 1 addition & 1 deletion
@@ -1157,7 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf    = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full   = params.swa_full;
-    cparams.kv_unified = !params.kv_split;
+    cparams.kv_unified = params.kv_unified;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

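A minimal sketch of what the conversion above now does, assuming only the declarations from common/common.h touched by this diff; the helper name below is made up. The point is that kv_unified is passed through unchanged instead of being derived as !kv_split.

// Sketch (illustrative, not from the commit): building context parameters
// without the CLI, via the same conversion shown in the diff above.
#include "common.h"

static llama_context_params make_ctx_params_with_unified_kv() {
    common_params params;
    params.kv_unified = true; // same field the new --kv-unified / -kvu flag sets

    // Before this commit: cparams.kv_unified = !params.kv_split;
    // After this commit:  cparams.kv_unified =  params.kv_unified;
    return common_context_params_to_llama(params);
}
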
common/common.h

Lines changed: 1 addition & 1 deletion
@@ -330,7 +330,7 @@ struct common_params {
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
-    bool kv_split = false; // disable unified KV cache
+    bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads

src/llama-context.cpp

Lines changed: 11 additions & 2 deletions
@@ -98,12 +98,21 @@ llama_context::llama_context(
         LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
         cparams.n_batch = GGML_KQ_MASK_PAD;
     }
-
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
 
+    {
+        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
+        const bool supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
+
+        if (!supports_set_rows && !cparams.kv_unified) {
+            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
+            cparams.kv_unified = true;
+        }
+    }
+
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
@@ -2195,7 +2204,7 @@ llama_context_params llama_context_default_params() {
         /*.no_perf     =*/ true,
         /*.op_offload  =*/ true,
         /*.swa_full    =*/ true,
-        /*.kv_unified  =*/ true,
+        /*.kv_unified  =*/ false,
     };
 
     return result;

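The new block in the llama_context constructor gates non-unified caches on the LLAMA_SET_ROWS environment variable, and llama_context_default_params() now defaults kv_unified to false, so per-sequence KV streams stay opt-in. A standalone sketch of that gate, with a hypothetical helper name, for readers who want the behavior in isolation:

// Illustrative restatement of the new constructor check (the helper name is
// made up): with LLAMA_SET_ROWS unset or "0", a request for a non-unified KV
// cache is downgraded to a unified one with a warning instead of honored.
#include <cstdio>
#include <cstdlib>

static bool resolve_kv_unified(bool requested_kv_unified) {
    const char * env = std::getenv("LLAMA_SET_ROWS");
    const bool supports_set_rows = env ? std::atoi(env) != 0 : false;

    if (!supports_set_rows && !requested_kv_unified) {
        std::fprintf(stderr, "non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n");
        return true;
    }
    return requested_kv_unified;
}
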
src/llama-kv-cache-unified.cpp

Lines changed: 3 additions & 3 deletions
@@ -195,9 +195,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
     supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
 
-    if (!supports_set_rows && !unified) {
-        LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing LLAMA_SET_ROWS=1\n", __func__);
-        supports_set_rows = 1;
+    if (!supports_set_rows) {
+        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+        GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
     }
 
     if (!supports_set_rows) {

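Finally, a hypothetical restatement of the tightened precondition in the cache constructor: requesting per-sequence KV streams without ggml_set_rows() support now trips an assertion rather than being silently patched over. None of the names below come from the commit; the real code uses GGML_ASSERT with the quoted message.

#include <cassert>

// Illustrative only: the constructor-level gate added in llama-context.cpp
// normally forces kv_unified back to true first, so this assertion acts as a
// backstop for code paths that construct the cache directly.
static void check_kv_cache_precondition(bool unified, bool supports_set_rows) {
    if (!supports_set_rows) {
        assert(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
    }
}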