
Commit d00cbea

ggerganov and ngxson authored
server : host-memory prompt caching (ggml-org#16391)
* minor : code style
* server : fix prompt similarity calculation
* server : initial host-memory prompt caching
* cont
* server : refactor
* cont
* cont : make the server task of the slot const
* cont : minor [no ci]
* server : cache prompts and checkpoints only for completion tasks
* server : improve prompt caching logic
* cont : fix check for number of cached prompts [no ci]
* server : improve caching logic, add -cram CLI arg
* server : print prompt mismatch info
* cont : better naming [no ci]
* server : improve prompt cache loading logic
* server : add option to debug the slot contents (ggml-org#16482)
* server : add option to debug the slot contents
* Update tools/server/server.cpp

---------

Co-authored-by: Xuan-Son Nguyen <[email protected]>

* server : add option to disable prompt cache

---------

Co-authored-by: Xuan-Son Nguyen <[email protected]>
1 parent 8328fd4 commit d00cbea

File tree: 10 files changed (+809 lines, -467 lines)


common/arg.cpp

Lines changed: 8 additions & 0 deletions

@@ -1935,6 +1935,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx_checkpoints = value;
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--cache-ram", "-cram"}, "N",
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
+        [](common_params & params, int value) {
+            params.cache_ram_mib = value;
+        }
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"

common/chat.h

Lines changed: 3 additions & 3 deletions

@@ -33,8 +33,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
     std::string role;
     std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
+    std::vector<common_chat_msg_content_part> content_parts;
+    std::vector<common_chat_tool_call> tool_calls;
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
@@ -44,7 +44,7 @@ struct common_chat_msg {
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
     }
-    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
         for (auto i = 0u; i < tool_calls.size(); i++) {
             if (ids_cache.size() <= i) {
                 auto id = tool_calls[i].id;
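
The method keeps its signature; only the name changes from ensure_tool_call_ids_set to set_tool_call_ids. Below is a minimal stand-in that illustrates the id-backfilling pattern visible in the hunk above. demo_chat_msg and demo_tool_call are invented for the example, and the behaviour past the truncated body (generating an id when the model did not supply one and memoizing it in ids_cache) is an assumption, not a quote of the real code.

// Minimal stand-in for the id-backfilling pattern; not the real common_chat_msg.
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

struct demo_tool_call {
    std::string name;
    std::string id;
};

struct demo_chat_msg {
    std::vector<demo_tool_call> tool_calls;

    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
                if (id.empty()) {
                    id = gen_tool_call_id(); // assumed: generate when no id was supplied
                }
                ids_cache.push_back(id);
            }
            tool_calls[i].id = ids_cache[i]; // assumed: reuse the memoized id on later passes
        }
    }
};

int main() {
    demo_chat_msg msg;
    msg.tool_calls = {{"get_weather", ""}, {"get_time", "call_7"}};

    std::vector<std::string> ids_cache;
    int counter = 0;
    msg.set_tool_call_ids(ids_cache, [&]() { return "gen_" + std::to_string(counter++); });

    for (const auto & tc : msg.tool_calls) {
        std::printf("%s -> %s\n", tc.name.c_str(), tc.id.c_str());
    }
    return 0;
}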

common/common.h

Lines changed: 3 additions & 2 deletions

@@ -378,7 +378,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false; // context shift on infinite text generation
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache

@@ -425,7 +425,8 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
-    int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
+    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
+    int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
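
Two server-related defaults change here: n_ctx_checkpoints goes from 3 to 8 checkpoints per slot, and the new cache_ram_mib field (8192 MiB by default) backs the --cache-ram / -cram flag added in common/arg.cpp above. Note that the inline comment ("0 = no limit, 1 = 1 MiB, etc.") differs from the CLI help string ("-1 - no limit, 0 - disable"), and that the ctx_shift line in the first hunk is a whitespace-only alignment change. A small compile sketch of setting these fields programmatically, assuming it is built inside the llama.cpp tree where "common.h" provides common_params; the values are examples, not recommendations.

// Compile sketch only, under the assumption above.
#include "common.h"

int main() {
    common_params params;

    params.n_ctx_checkpoints = 8;    // new default: up to 8 context checkpoints per slot
    params.cache_ram_mib     = 2048; // cap the host-memory prompt cache at 2 GiB
                                     // (equivalent to --cache-ram 2048 or LLAMA_ARG_CACHE_RAM=2048)
    return 0;
}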

src/llama-kv-cache.cpp

Lines changed: 2 additions & 5 deletions

@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }

-        ggml_tensor * k;
-        ggml_tensor * v;
-
-        k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
-        v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);

         ggml_format_name(k, "cache_k_l%d", il);
         ggml_format_name(v, "cache_v_l%d", il);
