From 57ece5ba2cadbe2fe8fa85bb31146ad3cb874211 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 12:22:32 +0300 Subject: [PATCH 01/14] server : support unified context across slots --- tools/server/server.cpp | 57 ++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 92d30664e41f4..b907c9372f957 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2407,7 +2407,7 @@ struct server_context { params_dft.devices = params_base.speculative.devices; params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; params_dft.cache_type_k = params_base.speculative.cache_type_k; @@ -2495,7 +2495,7 @@ struct server_context { } void init() { - const int32_t n_ctx_slot = n_ctx / params_base.n_parallel; + const int32_t n_ctx_slot = params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); @@ -2699,6 +2699,36 @@ struct server_context { return ret; } + // return true if at least one slot has been purged + // TODO: improve logic + // - smarter decision which slot to purge + // - move slot to level 2 cache instead of removing? + // - instead of purging, try to store and resume later? + bool try_purge_idle_slots() { + bool res = false; + + if (!params_base.kv_unified) { + return res; + } + + for (auto & slot : slots) { + if (slot.is_processing()) { + continue; + } + + if (slot.prompt.n_tokens() > 0) { + SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size()); + + llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); + slot.prompt.tokens.clear(); + + res = true; + } + } + + return res; + } + bool launch_slot_with_task(server_slot & slot, server_task && task) { slot.reset(); @@ -3635,9 +3665,10 @@ struct server_context { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - // next, batch any pending prompts without exceeding n_batch - float alora_scale = -1.0f; + float alora_scale = -1.0f; size_t alora_disabled_id = 0; + + // next, batch any pending prompts without exceeding n_batch if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one @@ -4126,6 +4157,8 @@ struct server_context { std::string err; if (n_batch == 1 && ret == 1) { + // TODO: try to terminate only the largest active slot and continue + // need to remove the tokens from the current batch too err = "Context size has been exceeded."; } @@ -4141,17 +4174,23 @@ struct server_context { // TODO: handle ret == 2 (abort) when we start aborting if (!err.empty()) { - SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + for (auto & slot : slots) { - send_error(slot, err); - slot.release(); + if (slot.is_processing()) { + send_error(slot, err); + slot.release(); + } } + break; } } // retry with half the batch size to try to find a free slot in the KV cache - n_batch /= 2; + if (!try_purge_idle_slots()) { + n_batch /= 2; + } SRV_WRN("failed to find free space in the KV cache, 
retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); @@ -4944,7 +4983,7 @@ int main(int argc, char ** argv) { // Everything else, including multimodal completions. inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); } - const size_t n_ctx_slot = ctx_server.n_ctx / ctx_server.params_base.n_parallel; + const size_t n_ctx_slot = ctx_server.slots.front().n_ctx; tasks.reserve(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { auto n_prompt_tokens = inputs[i].size(); From a42fb77147715b124249915396eabac027625a0d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 14:33:53 +0300 Subject: [PATCH 02/14] cont : fix speculative decoding initialization --- tools/server/server.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b907c9372f957..95dc61105ad3f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2379,6 +2379,10 @@ struct server_context { llama_batch_free(batch); } + int32_t n_ctx_slot() const { + return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; + } + bool load_model(const common_params & params) { SRV_INF("loading model '%s'\n", params.model.path.c_str()); @@ -2407,7 +2411,7 @@ struct server_context { params_dft.devices = params_base.speculative.devices; params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; params_dft.cache_type_k = params_base.speculative.cache_type_k; @@ -2495,8 +2499,6 @@ struct server_context { } void init() { - const int32_t n_ctx_slot = params_base.kv_unified ? 
n_ctx : n_ctx / params_base.n_parallel; - SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); for (int i = 0; i < params_base.n_parallel; i++) { @@ -2504,7 +2506,7 @@ struct server_context { slot.id = i; slot.ctx = ctx; - slot.n_ctx = n_ctx_slot; + slot.n_ctx = n_ctx_slot(); slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; @@ -2527,7 +2529,7 @@ struct server_context { } } - SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); + SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); From 492f628c586ebc1932a30fb9823cc0f788f6eefc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 14:51:26 +0300 Subject: [PATCH 03/14] context : fix n_ctx_per_seq computation --- src/llama-context.cpp | 14 ++++++-------- src/llama-model.cpp | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f6192a36e0ee5..949d157c86ee5 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -112,11 +112,9 @@ llama_context::llama_context( } } - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq()); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); @@ -125,14 +123,14 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - if (n_ctx_per_seq < hparams.n_ctx_train) { + if (n_ctx_per_seq() < hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); + __func__, n_ctx_per_seq(), hparams.n_ctx_train); } - if (n_ctx_per_seq > hparams.n_ctx_train) { + if (n_ctx_per_seq() > hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); + __func__, n_ctx_per_seq(), hparams.n_ctx_train); } if (!hparams.vocab_only) { @@ -454,7 +452,7 @@ uint32_t llama_context::n_ctx() const { } uint32_t llama_context::n_ctx_per_seq() const { - return cparams.n_ctx / cparams.n_seq_max; + return cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; } uint32_t llama_context::n_batch() const { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 04239181c7765..36dcdd33eda69 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6712,7 +6712,7 @@ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) co } ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const { - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + const uint32_t n_ctx_per_seq = cparams.kv_unified ? 
cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; // choose long/short freq factors based on the context size if (layers[il].rope_freqs != nullptr) { From 8222e9c29cab42f73bb638e3160a6b1e68ef4bf1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 14:58:07 +0300 Subject: [PATCH 04/14] server : purge slots one by one --- tools/server/server.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 95dc61105ad3f..76edc3df81170 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2725,6 +2725,9 @@ struct server_context { slot.prompt.tokens.clear(); res = true; + + // purge slots one by one + break; } } From 2179175031b61830e79d86d1b44e98678f2fe7f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 16:24:57 +0300 Subject: [PATCH 05/14] tests : add unified cache server tests --- tools/server/tests/unit/test_completion.py | 31 ++++++++++++++++++++++ tools/server/tests/utils.py | 3 +++ 2 files changed, 34 insertions(+) diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index 00ba78cf67c09..acb893d495899 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -368,6 +368,37 @@ def check_slots_status(): # assert match_regex(re_content, res.body["content"]) +@pytest.mark.parametrize( + "n_ctx,n_slots,n_predict_vals,expected_success", + [ + (256, 4, [80, 40, 80, 80], [True, True, True, True]), + (256, 4, [70, 70, 70, 70], [False, False, False, False]), + (256, 4, [90, 90, 40, 90], [False, False, True, False]), + (256, 4, [90, 90, 40, 80], [True, True, True, True]), + ], +) +def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success): + global server + server.n_slots = n_slots + server.kv_unified = True + server.n_ctx = n_ctx + server.start() + prompt = "A" + tasks = [] + for n_predict in n_predict_vals: + tasks.append((server.make_request, ("POST", "/completion", {"prompt": prompt, "n_predict": n_predict}))) + results = parallel_function_calls(tasks) + for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success): + if expect_ok: + assert res.status_code == 200 + assert "content" in res.body + if "timings" in res.body: + assert res.body["timings"]["predicted_n"] == n_predict + else: + assert res.status_code == 500 + assert "content" not in res.body + + @pytest.mark.parametrize( "prompt,n_predict,response_fields", [ diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 4ba3d43c33044..da703c4c51a15 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -78,6 +78,7 @@ class ServerProcess: server_embeddings: bool | None = False server_reranking: bool | None = False server_metrics: bool | None = False + kv_unified: bool | None = False server_slots: bool | None = False pooling: str | None = None draft: int | None = None @@ -159,6 +160,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.append("--reranking") if self.server_metrics: server_args.append("--metrics") + if self.kv_unified: + server_args.append("--kv-unified") if self.server_slots: server_args.append("--slots") else: From f0f105ff4b6f7baed62c2f12da7cd17c88d9c98c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 17:54:53 +0300 Subject: [PATCH 06/14] llama : update per-seq context computation --- include/llama.h | 1 + src/llama-context.cpp | 30 +++++++++++++------ src/llama-context.h | 10 +++---- 
src/llama-cparams.h | 1 + src/llama-model.cpp | 14 +++------ tools/server/server.cpp | 8 ++--- .../server/tests/unit/test_chat_completion.py | 8 ++--- tools/server/tests/unit/test_infill.py | 4 +-- 8 files changed, 40 insertions(+), 36 deletions(-) diff --git a/include/llama.h b/include/llama.h index 08e2ffa34c9f6..532023557df8d 100644 --- a/include/llama.h +++ b/include/llama.h @@ -462,6 +462,7 @@ extern "C" { LLAMA_API bool llama_supports_rpc (void); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 949d157c86ee5..0190475458822 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -112,9 +112,17 @@ llama_context::llama_context( } } + cparams.n_ctx_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; + + if (cparams.n_ctx_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train); + + cparams.n_ctx_seq = hparams.n_ctx_train; + } + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq()); + LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); @@ -123,14 +131,14 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - if (n_ctx_per_seq() < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq(), hparams.n_ctx_train); + if (cparams.n_ctx_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, cparams.n_ctx_seq, hparams.n_ctx_train); } - if (n_ctx_per_seq() > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq(), hparams.n_ctx_train); + if (cparams.n_ctx_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, cparams.n_ctx_seq, hparams.n_ctx_train); } if (!hparams.vocab_only) { @@ -451,8 +459,8 @@ uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } -uint32_t llama_context::n_ctx_per_seq() const { - return cparams.kv_unified ? 
cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; +uint32_t llama_context::n_ctx_seq() const { + return cparams.n_ctx_seq; } uint32_t llama_context::n_batch() const { @@ -2381,6 +2389,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) { return ctx->n_ctx(); } +uint32_t llama_n_ctx_seq(const llama_context * ctx) { + return ctx->n_ctx_seq(); +} + uint32_t llama_n_batch(const llama_context * ctx) { return ctx->n_batch(); } diff --git a/src/llama-context.h b/src/llama-context.h index ed6d82cb396f9..20cbd78955412 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -43,11 +43,11 @@ struct llama_context { ggml_backend_sched_t get_sched() const; - uint32_t n_ctx() const; - uint32_t n_ctx_per_seq() const; - uint32_t n_batch() const; - uint32_t n_ubatch() const; - uint32_t n_seq_max() const; + uint32_t n_ctx() const; + uint32_t n_ctx_seq() const; + uint32_t n_batch() const; + uint32_t n_ubatch() const; + uint32_t n_seq_max() const; uint32_t n_threads() const; uint32_t n_threads_batch() const; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index eae7b839f4857..fcef8fa976038 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -8,6 +8,7 @@ struct llama_cparams { uint32_t n_ctx; // context size used during inference + uint32_t n_ctx_seq; // context for a single sequence uint32_t n_batch; uint32_t n_ubatch; uint32_t n_seq_max; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 36dcdd33eda69..896725466ce24 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6712,14 +6712,14 @@ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) co } ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const { - const uint32_t n_ctx_per_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; + const uint32_t n_ctx_seq = cparams.n_ctx_seq; // choose long/short freq factors based on the context size if (layers[il].rope_freqs != nullptr) { return layers[il].rope_freqs; } - if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { + if (n_ctx_seq > hparams.n_ctx_orig_yarn) { return layers[il].rope_long; } @@ -6795,12 +6795,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* filter_attn */ std::move(filter_attn), /* filter_recr */ std::move(filter_recr)); } else { - uint32_t n_ctx_per_stream = cparams.n_ctx; - - if (!cparams.kv_unified) { - n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max; - } - llama_memory_i::layer_reuse_cb reuse = nullptr; if (arch == LLM_ARCH_GEMMA3N) { @@ -6824,7 +6818,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.offload_kqv, params.swa_full, cparams.kv_unified, - n_ctx_per_stream, + cparams.n_ctx_seq, cparams.n_seq_max, cparams.n_ubatch, 1, @@ -6840,7 +6834,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, !cparams.flash_attn, cparams.offload_kqv, cparams.kv_unified, - n_ctx_per_stream, + cparams.n_ctx_seq, cparams.n_seq_max, 1, hparams.n_swa, diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 76edc3df81170..973ef662fa05c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2379,10 +2379,6 @@ struct server_context { llama_batch_free(batch); } - int32_t n_ctx_slot() const { - return params_base.kv_unified ? 
n_ctx : n_ctx / params_base.n_parallel; - } - bool load_model(const common_params & params) { SRV_INF("loading model '%s'\n", params.model.path.c_str()); @@ -2411,7 +2407,7 @@ struct server_context { params_dft.devices = params_base.speculative.devices; params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; params_dft.cache_type_k = params_base.speculative.cache_type_k; @@ -2506,7 +2502,7 @@ struct server_context { slot.id = i; slot.ctx = ctx; - slot.n_ctx = n_ctx_slot(); + slot.n_ctx = llama_n_ctx_seq(ctx); slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index d56d3d5f178b8..392e0efecdbbd 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -433,21 +433,21 @@ def test_context_size_exceeded_stream(): @pytest.mark.parametrize( "n_batch,batch_count,reuse_cache", [ - (64, 15, False), + (64, 3, False), (64, 1, True), ] ) -def test_return_progresssss(n_batch, batch_count, reuse_cache): +def test_return_progress(n_batch, batch_count, reuse_cache): global server server.n_batch = n_batch - server.n_ctx = 2048 + server.n_ctx = 256 server.n_slots = 1 server.start() def make_cmpl_request(): return server.make_stream_request("POST", "/chat/completions", data={ "max_tokens": 10, "messages": [ - {"role": "user", "content": "This is a test" * 100}, + {"role": "user", "content": "This is a test" * 10}, ], "stream": True, "return_progress": True, diff --git a/tools/server/tests/unit/test_infill.py b/tools/server/tests/unit/test_infill.py index 73dacdae812b8..cd1a391b4adbc 100644 --- a/tools/server/tests/unit/test_infill.py +++ b/tools/server/tests/unit/test_infill.py @@ -18,7 +18,7 @@ def test_infill_without_input_extra(): "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"]) + assert match_regex("(Ann|small|shiny|Daddy|Jimmy)+", res.body["content"]) def test_infill_with_input_extra(): @@ -34,7 +34,7 @@ def test_infill_with_input_extra(): "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(Dad|excited|park)+", res.body["content"]) + assert match_regex("(Dad|excited|park|Jimmy)+", res.body["content"]) @pytest.mark.parametrize("input_extra", [ From e7b7cbfb34c391a7aa7835e5fef2e7cf1669ccdf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 28 Oct 2025 12:49:10 +0200 Subject: [PATCH 07/14] test-thread-safety : handle tiny training context of the input model --- tests/test-thread-safety.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp index e5158fb5062f0..bcb86c35e6652 100644 --- a/tests/test-thread-safety.cpp +++ b/tests/test-thread-safety.cpp @@ -131,7 +131,14 @@ int main(int argc, char ** argv) { } batch = llama_batch_get_one(&token, 1); - if (llama_decode(ctx.get(), batch)) { + + int ret = llama_decode(ctx.get(), batch); + if (ret == 1 && i > 0) { + LOG_INF("Context full, stopping generation.\n"); + break; + } + + if (ret != 0) { LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts); 
failed.store(true); return; From 290f6a9f08ddc9aa112e27e1acdb70d1b0b6d779 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:15:13 +0200 Subject: [PATCH 08/14] server : fix server_tokens clear() --- tools/server/server.cpp | 7 ++++--- tools/server/utils.hpp | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 973ef662fa05c..38276d7f51c50 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3946,8 +3946,11 @@ struct server_context { // truncate any tokens that are beyond n_past for this slot const llama_pos p0 = slot.prompt.tokens.pos_next(); + + SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); + if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { - SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0); + SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); // there is no common part left @@ -3956,8 +3959,6 @@ struct server_context { slot.prompt.tokens.clear(); } - SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); - // check if we should process the image if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { // process the image diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index b6198edfc487c..2bce2f4a47af9 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1212,7 +1212,7 @@ struct server_tokens { for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) { auto * chunk = tokens.map_idx_to_media[it->first].get(); mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); - map_idx_to_media[start_idx+it->first] = std::move(new_chunk); + map_idx_to_media[start_idx + it->first] = std::move(new_chunk); } } } @@ -1244,6 +1244,7 @@ struct server_tokens { } void clear() { + map_idx_to_media.clear(); tokens.clear(); } From 23323cd1c427e388202216456c58737ef2a2b4cc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:15:34 +0200 Subject: [PATCH 09/14] server : use 4 slots + unified KV by default --- tools/server/server.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 38276d7f51c50..117451ee9ad93 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4432,6 +4432,13 @@ int main(int argc, char ** argv) { return 1; } + if (params.n_parallel == 1 && params.kv_unified == false) { + LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__); + + params.n_parallel = 4; + params.kv_unified = true; + } + common_init(); // struct that contains llama context and inference From f2cca0245b813df0cdbde5245b2a1cadfb78fb68 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:16:20 +0200 Subject: [PATCH 10/14] llama : add note about context size queries --- include/llama.h | 4 +++- src/llama-context.cpp | 12 +++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/llama.h b/include/llama.h index 532023557df8d..caca361516aaa 100644 --- a/include/llama.h +++ b/include/llama.h @@ -461,6 +461,8 @@ extern "C" { LLAMA_API bool llama_supports_gpu_offload(void); LLAMA_API bool llama_supports_rpc (void); + // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions + // In some 
cases the requested values via llama_context_params may differ from the actual values used by the context LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @@ -586,7 +588,7 @@ extern "C" { LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); // Manually free a LoRA adapter - // Note: loaded adapters will be free when the associated model is deleted + // NOTE: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); // Get the invocation tokens if the current lora is an alora diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0190475458822..e949afab2142f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -112,7 +112,11 @@ llama_context::llama_context( } } - cparams.n_ctx_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; + if (cparams.kv_unified) { + cparams.n_ctx_seq = cparams.n_ctx; + } else { + cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max; + } if (cparams.n_ctx_seq > hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train); @@ -120,6 +124,12 @@ llama_context::llama_context( cparams.n_ctx_seq = hparams.n_ctx_train; } + if (cparams.kv_unified) { + cparams.n_ctx = cparams.n_ctx_seq; + } else { + cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max; + } + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq); From ff684363fa53f6f50946b9143c3c2fc0b719796f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:39:26 +0200 Subject: [PATCH 11/14] cont : update todos [no ci] --- tools/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 117451ee9ad93..29ba95ded553b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2699,7 +2699,7 @@ struct server_context { // return true if at least one slot has been purged // TODO: improve logic - // - smarter decision which slot to purge + // - smarter decision which slot to purge (LRU or longest prompt?) // - move slot to level 2 cache instead of removing? // - instead of purging, try to store and resume later? 
bool try_purge_idle_slots() { @@ -4159,7 +4159,7 @@ struct server_context { std::string err; if (n_batch == 1 && ret == 1) { - // TODO: try to terminate only the largest active slot and continue + // TODO: try to terminate only the largest active slot/sequence and continue with the rest // need to remove the tokens from the current batch too err = "Context size has been exceeded."; } From c08d0d148a77447e28843fa918851bef01e88e7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 1 Nov 2025 17:44:52 +0200 Subject: [PATCH 12/14] context : do not cap the size of the context --- src/llama-context.cpp | 15 ++------------- tools/server/server.cpp | 10 +++++++++- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e949afab2142f..8641586eebca5 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -115,19 +115,8 @@ llama_context::llama_context( if (cparams.kv_unified) { cparams.n_ctx_seq = cparams.n_ctx; } else { - cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max; - } - - if (cparams.n_ctx_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train); - - cparams.n_ctx_seq = hparams.n_ctx_train; - } - - if (cparams.kv_unified) { - cparams.n_ctx = cparams.n_ctx_seq; - } else { - cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max; + cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max; + cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max; } LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 29ba95ded553b..90f49d4aa6006 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2497,12 +2497,20 @@ struct server_context { void init() { SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); + const int n_ctx_train = llama_model_n_ctx_train(model); + + int n_ctx_slot = llama_n_ctx_seq(ctx); + if (n_ctx_slot > n_ctx_train) { + SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); + n_ctx_slot = n_ctx_train; + } + for (int i = 0; i < params_base.n_parallel; i++) { server_slot slot; slot.id = i; slot.ctx = ctx; - slot.n_ctx = llama_n_ctx_seq(ctx); + slot.n_ctx = n_ctx_slot; slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; From 356dc08b983ee46332913c3762c9218bc365a51e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 1 Nov 2025 18:24:07 +0200 Subject: [PATCH 13/14] tests : adjust parameters to be CI friendlier --- tools/server/tests/unit/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index acb893d495899..3c0ce98973f4b 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -374,7 +374,7 @@ def check_slots_status(): (256, 4, [80, 40, 80, 80], [True, True, True, True]), (256, 4, [70, 70, 70, 70], [False, False, False, False]), (256, 4, [90, 90, 40, 90], [False, False, True, False]), - (256, 4, [90, 90, 40, 80], [True, True, True, True]), + (256, 4, [90, 90, 40, 75], [True, True, True, True]), ], ) def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success): From 56fceee2cbfba351f3e59cbcdf7fb18123dfd0e6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 2 Nov 2025 12:11:06 +0200 Subject: [PATCH 14/14] context : add warning --- 
 src/llama-context.cpp   | 12 ++++++++++--
 tools/server/server.cpp |  2 ++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8641586eebca5..2b39366271ff9 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -115,8 +115,16 @@ llama_context::llama_context(
     if (cparams.kv_unified) {
         cparams.n_ctx_seq = cparams.n_ctx;
     } else {
-        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
-        cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (cparams.n_ctx_seq == 0) {
+            throw std::runtime_error("n_ctx_seq == 0");
+        }
+
+        if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+            cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+            LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+        }
     }
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 90f49d4aa6006..aa4981585200a 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4440,6 +4440,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // TODO: should we have a separate n_parallel parameter for the server?
+    // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
     if (params.n_parallel == 1 && params.kv_unified == false) {
         LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__);