From 57ece5ba2cadbe2fe8fa85bb31146ad3cb874211 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 12:22:32 +0300 Subject: [PATCH 01/14] server : support unified context across slots --- tools/server/server.cpp | 57 ++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 92d30664e41f4..b907c9372f957 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2407,7 +2407,7 @@ struct server_context { params_dft.devices = params_base.speculative.devices; params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; params_dft.cache_type_k = params_base.speculative.cache_type_k; @@ -2495,7 +2495,7 @@ struct server_context { } void init() { - const int32_t n_ctx_slot = n_ctx / params_base.n_parallel; + const int32_t n_ctx_slot = params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); @@ -2699,6 +2699,36 @@ struct server_context { return ret; } + // return true if at least one slot has been purged + // TODO: improve logic + // - smarter decision which slot to purge + // - move slot to level 2 cache instead of removing? + // - instead of purging, try to store and resume later? + bool try_purge_idle_slots() { + bool res = false; + + if (!params_base.kv_unified) { + return res; + } + + for (auto & slot : slots) { + if (slot.is_processing()) { + continue; + } + + if (slot.prompt.n_tokens() > 0) { + SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size()); + + llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); + slot.prompt.tokens.clear(); + + res = true; + } + } + + return res; + } + bool launch_slot_with_task(server_slot & slot, server_task && task) { slot.reset(); @@ -3635,9 +3665,10 @@ struct server_context { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - // next, batch any pending prompts without exceeding n_batch - float alora_scale = -1.0f; + float alora_scale = -1.0f; size_t alora_disabled_id = 0; + + // next, batch any pending prompts without exceeding n_batch if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one @@ -4126,6 +4157,8 @@ struct server_context { std::string err; if (n_batch == 1 && ret == 1) { + // TODO: try to terminate only the largest active slot and continue + // need to remove the tokens from the current batch too err = "Context size has been exceeded."; } @@ -4141,17 +4174,23 @@ struct server_context { // TODO: handle ret == 2 (abort) when we start aborting if (!err.empty()) { - SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + for (auto & slot : slots) { - send_error(slot, err); - slot.release(); + if (slot.is_processing()) { + send_error(slot, err); + slot.release(); + } } + break; } } // retry with half the batch size to try to find a free slot in the KV cache - n_batch /= 2; + if (!try_purge_idle_slots()) { + n_batch /= 2; + } SRV_WRN("failed to find free space in the KV cache, 
retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); @@ -4944,7 +4983,7 @@ int main(int argc, char ** argv) { // Everything else, including multimodal completions. inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); } - const size_t n_ctx_slot = ctx_server.n_ctx / ctx_server.params_base.n_parallel; + const size_t n_ctx_slot = ctx_server.slots.front().n_ctx; tasks.reserve(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { auto n_prompt_tokens = inputs[i].size(); From a42fb77147715b124249915396eabac027625a0d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 14:33:53 +0300 Subject: [PATCH 02/14] cont : fix speculative decoding initialization --- tools/server/server.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b907c9372f957..95dc61105ad3f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2379,6 +2379,10 @@ struct server_context { llama_batch_free(batch); } + int32_t n_ctx_slot() const { + return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; + } + bool load_model(const common_params & params) { SRV_INF("loading model '%s'\n", params.model.path.c_str()); @@ -2407,7 +2411,7 @@ struct server_context { params_dft.devices = params_base.speculative.devices; params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; params_dft.cache_type_k = params_base.speculative.cache_type_k; @@ -2495,8 +2499,6 @@ struct server_context { } void init() { - const int32_t n_ctx_slot = params_base.kv_unified ? 
n_ctx : n_ctx / params_base.n_parallel; - SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); for (int i = 0; i < params_base.n_parallel; i++) { @@ -2504,7 +2506,7 @@ struct server_context { slot.id = i; slot.ctx = ctx; - slot.n_ctx = n_ctx_slot; + slot.n_ctx = n_ctx_slot(); slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; @@ -2527,7 +2529,7 @@ struct server_context { } } - SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); + SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); From 492f628c586ebc1932a30fb9823cc0f788f6eefc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 14:51:26 +0300 Subject: [PATCH 03/14] context : fix n_ctx_per_seq computation --- src/llama-context.cpp | 14 ++++++-------- src/llama-model.cpp | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f6192a36e0ee5..949d157c86ee5 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -112,11 +112,9 @@ llama_context::llama_context( } } - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq()); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); @@ -125,14 +123,14 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - if (n_ctx_per_seq < hparams.n_ctx_train) { + if (n_ctx_per_seq() < hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); + __func__, n_ctx_per_seq(), hparams.n_ctx_train); } - if (n_ctx_per_seq > hparams.n_ctx_train) { + if (n_ctx_per_seq() > hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); + __func__, n_ctx_per_seq(), hparams.n_ctx_train); } if (!hparams.vocab_only) { @@ -454,7 +452,7 @@ uint32_t llama_context::n_ctx() const { } uint32_t llama_context::n_ctx_per_seq() const { - return cparams.n_ctx / cparams.n_seq_max; + return cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; } uint32_t llama_context::n_batch() const { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 04239181c7765..36dcdd33eda69 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6712,7 +6712,7 @@ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) co } ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const { - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + const uint32_t n_ctx_per_seq = cparams.kv_unified ? 
cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; // choose long/short freq factors based on the context size if (layers[il].rope_freqs != nullptr) { From 8222e9c29cab42f73bb638e3160a6b1e68ef4bf1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 14:58:07 +0300 Subject: [PATCH 04/14] server : purge slots one by one --- tools/server/server.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 95dc61105ad3f..76edc3df81170 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2725,6 +2725,9 @@ struct server_context { slot.prompt.tokens.clear(); res = true; + + // purge slots one by one + break; } } From 2179175031b61830e79d86d1b44e98678f2fe7f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 16:24:57 +0300 Subject: [PATCH 05/14] tests : add unified cache server tests --- tools/server/tests/unit/test_completion.py | 31 ++++++++++++++++++++++ tools/server/tests/utils.py | 3 +++ 2 files changed, 34 insertions(+) diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index 00ba78cf67c09..acb893d495899 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -368,6 +368,37 @@ def check_slots_status(): # assert match_regex(re_content, res.body["content"]) +@pytest.mark.parametrize( + "n_ctx,n_slots,n_predict_vals,expected_success", + [ + (256, 4, [80, 40, 80, 80], [True, True, True, True]), + (256, 4, [70, 70, 70, 70], [False, False, False, False]), + (256, 4, [90, 90, 40, 90], [False, False, True, False]), + (256, 4, [90, 90, 40, 80], [True, True, True, True]), + ], +) +def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success): + global server + server.n_slots = n_slots + server.kv_unified = True + server.n_ctx = n_ctx + server.start() + prompt = "A" + tasks = [] + for n_predict in n_predict_vals: + tasks.append((server.make_request, ("POST", "/completion", {"prompt": prompt, "n_predict": n_predict}))) + results = parallel_function_calls(tasks) + for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success): + if expect_ok: + assert res.status_code == 200 + assert "content" in res.body + if "timings" in res.body: + assert res.body["timings"]["predicted_n"] == n_predict + else: + assert res.status_code == 500 + assert "content" not in res.body + + @pytest.mark.parametrize( "prompt,n_predict,response_fields", [ diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 4ba3d43c33044..da703c4c51a15 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -78,6 +78,7 @@ class ServerProcess: server_embeddings: bool | None = False server_reranking: bool | None = False server_metrics: bool | None = False + kv_unified: bool | None = False server_slots: bool | None = False pooling: str | None = None draft: int | None = None @@ -159,6 +160,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.append("--reranking") if self.server_metrics: server_args.append("--metrics") + if self.kv_unified: + server_args.append("--kv-unified") if self.server_slots: server_args.append("--slots") else: From f0f105ff4b6f7baed62c2f12da7cd17c88d9c98c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Oct 2025 17:54:53 +0300 Subject: [PATCH 06/14] llama : update per-seq context computation --- include/llama.h | 1 + src/llama-context.cpp | 30 +++++++++++++------ src/llama-context.h | 10 +++---- 
src/llama-cparams.h | 1 + src/llama-model.cpp | 14 +++------ tools/server/server.cpp | 8 ++--- .../server/tests/unit/test_chat_completion.py | 8 ++--- tools/server/tests/unit/test_infill.py | 4 +-- 8 files changed, 40 insertions(+), 36 deletions(-) diff --git a/include/llama.h b/include/llama.h index 08e2ffa34c9f6..532023557df8d 100644 --- a/include/llama.h +++ b/include/llama.h @@ -462,6 +462,7 @@ extern "C" { LLAMA_API bool llama_supports_rpc (void); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 949d157c86ee5..0190475458822 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -112,9 +112,17 @@ llama_context::llama_context( } } + cparams.n_ctx_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; + + if (cparams.n_ctx_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train); + + cparams.n_ctx_seq = hparams.n_ctx_train; + } + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq()); + LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); @@ -123,14 +131,14 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - if (n_ctx_per_seq() < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq(), hparams.n_ctx_train); + if (cparams.n_ctx_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, cparams.n_ctx_seq, hparams.n_ctx_train); } - if (n_ctx_per_seq() > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq(), hparams.n_ctx_train); + if (cparams.n_ctx_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, cparams.n_ctx_seq, hparams.n_ctx_train); } if (!hparams.vocab_only) { @@ -451,8 +459,8 @@ uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } -uint32_t llama_context::n_ctx_per_seq() const { - return cparams.kv_unified ? 
cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; +uint32_t llama_context::n_ctx_seq() const { + return cparams.n_ctx_seq; } uint32_t llama_context::n_batch() const { @@ -2381,6 +2389,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) { return ctx->n_ctx(); } +uint32_t llama_n_ctx_seq(const llama_context * ctx) { + return ctx->n_ctx_seq(); +} + uint32_t llama_n_batch(const llama_context * ctx) { return ctx->n_batch(); } diff --git a/src/llama-context.h b/src/llama-context.h index ed6d82cb396f9..20cbd78955412 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -43,11 +43,11 @@ struct llama_context { ggml_backend_sched_t get_sched() const; - uint32_t n_ctx() const; - uint32_t n_ctx_per_seq() const; - uint32_t n_batch() const; - uint32_t n_ubatch() const; - uint32_t n_seq_max() const; + uint32_t n_ctx() const; + uint32_t n_ctx_seq() const; + uint32_t n_batch() const; + uint32_t n_ubatch() const; + uint32_t n_seq_max() const; uint32_t n_threads() const; uint32_t n_threads_batch() const; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index eae7b839f4857..fcef8fa976038 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -8,6 +8,7 @@ struct llama_cparams { uint32_t n_ctx; // context size used during inference + uint32_t n_ctx_seq; // context for a single sequence uint32_t n_batch; uint32_t n_ubatch; uint32_t n_seq_max; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 36dcdd33eda69..896725466ce24 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6712,14 +6712,14 @@ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) co } ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const { - const uint32_t n_ctx_per_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; + const uint32_t n_ctx_seq = cparams.n_ctx_seq; // choose long/short freq factors based on the context size if (layers[il].rope_freqs != nullptr) { return layers[il].rope_freqs; } - if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { + if (n_ctx_seq > hparams.n_ctx_orig_yarn) { return layers[il].rope_long; } @@ -6795,12 +6795,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* filter_attn */ std::move(filter_attn), /* filter_recr */ std::move(filter_recr)); } else { - uint32_t n_ctx_per_stream = cparams.n_ctx; - - if (!cparams.kv_unified) { - n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max; - } - llama_memory_i::layer_reuse_cb reuse = nullptr; if (arch == LLM_ARCH_GEMMA3N) { @@ -6824,7 +6818,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.offload_kqv, params.swa_full, cparams.kv_unified, - n_ctx_per_stream, + cparams.n_ctx_seq, cparams.n_seq_max, cparams.n_ubatch, 1, @@ -6840,7 +6834,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, !cparams.flash_attn, cparams.offload_kqv, cparams.kv_unified, - n_ctx_per_stream, + cparams.n_ctx_seq, cparams.n_seq_max, 1, hparams.n_swa, diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 76edc3df81170..973ef662fa05c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2379,10 +2379,6 @@ struct server_context { llama_batch_free(batch); } - int32_t n_ctx_slot() const { - return params_base.kv_unified ? 
n_ctx : n_ctx / params_base.n_parallel; - } - bool load_model(const common_params & params) { SRV_INF("loading model '%s'\n", params.model.path.c_str()); @@ -2411,7 +2407,7 @@ struct server_context { params_dft.devices = params_base.speculative.devices; params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; params_dft.cache_type_k = params_base.speculative.cache_type_k; @@ -2506,7 +2502,7 @@ struct server_context { slot.id = i; slot.ctx = ctx; - slot.n_ctx = n_ctx_slot(); + slot.n_ctx = llama_n_ctx_seq(ctx); slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index d56d3d5f178b8..392e0efecdbbd 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -433,21 +433,21 @@ def test_context_size_exceeded_stream(): @pytest.mark.parametrize( "n_batch,batch_count,reuse_cache", [ - (64, 15, False), + (64, 3, False), (64, 1, True), ] ) -def test_return_progresssss(n_batch, batch_count, reuse_cache): +def test_return_progress(n_batch, batch_count, reuse_cache): global server server.n_batch = n_batch - server.n_ctx = 2048 + server.n_ctx = 256 server.n_slots = 1 server.start() def make_cmpl_request(): return server.make_stream_request("POST", "/chat/completions", data={ "max_tokens": 10, "messages": [ - {"role": "user", "content": "This is a test" * 100}, + {"role": "user", "content": "This is a test" * 10}, ], "stream": True, "return_progress": True, diff --git a/tools/server/tests/unit/test_infill.py b/tools/server/tests/unit/test_infill.py index 73dacdae812b8..cd1a391b4adbc 100644 --- a/tools/server/tests/unit/test_infill.py +++ b/tools/server/tests/unit/test_infill.py @@ -18,7 +18,7 @@ def test_infill_without_input_extra(): "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"]) + assert match_regex("(Ann|small|shiny|Daddy|Jimmy)+", res.body["content"]) def test_infill_with_input_extra(): @@ -34,7 +34,7 @@ def test_infill_with_input_extra(): "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(Dad|excited|park)+", res.body["content"]) + assert match_regex("(Dad|excited|park|Jimmy)+", res.body["content"]) @pytest.mark.parametrize("input_extra", [ From e7b7cbfb34c391a7aa7835e5fef2e7cf1669ccdf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 28 Oct 2025 12:49:10 +0200 Subject: [PATCH 07/14] test-thread-safety : handle tiny training context of the input model --- tests/test-thread-safety.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp index e5158fb5062f0..bcb86c35e6652 100644 --- a/tests/test-thread-safety.cpp +++ b/tests/test-thread-safety.cpp @@ -131,7 +131,14 @@ int main(int argc, char ** argv) { } batch = llama_batch_get_one(&token, 1); - if (llama_decode(ctx.get(), batch)) { + + int ret = llama_decode(ctx.get(), batch); + if (ret == 1 && i > 0) { + LOG_INF("Context full, stopping generation.\n"); + break; + } + + if (ret != 0) { LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts); 
failed.store(true); return; From 290f6a9f08ddc9aa112e27e1acdb70d1b0b6d779 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:15:13 +0200 Subject: [PATCH 08/14] server : fix server_tokens clear() --- tools/server/server.cpp | 7 ++++--- tools/server/utils.hpp | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 973ef662fa05c..38276d7f51c50 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3946,8 +3946,11 @@ struct server_context { // truncate any tokens that are beyond n_past for this slot const llama_pos p0 = slot.prompt.tokens.pos_next(); + + SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); + if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { - SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0); + SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); // there is no common part left @@ -3956,8 +3959,6 @@ struct server_context { slot.prompt.tokens.clear(); } - SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); - // check if we should process the image if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { // process the image diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index b6198edfc487c..2bce2f4a47af9 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1212,7 +1212,7 @@ struct server_tokens { for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) { auto * chunk = tokens.map_idx_to_media[it->first].get(); mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); - map_idx_to_media[start_idx+it->first] = std::move(new_chunk); + map_idx_to_media[start_idx + it->first] = std::move(new_chunk); } } } @@ -1244,6 +1244,7 @@ struct server_tokens { } void clear() { + map_idx_to_media.clear(); tokens.clear(); } From 23323cd1c427e388202216456c58737ef2a2b4cc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:15:34 +0200 Subject: [PATCH 09/14] server : use 4 slots + unified KV by default --- tools/server/server.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 38276d7f51c50..117451ee9ad93 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4432,6 +4432,13 @@ int main(int argc, char ** argv) { return 1; } + if (params.n_parallel == 1 && params.kv_unified == false) { + LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__); + + params.n_parallel = 4; + params.kv_unified = true; + } + common_init(); // struct that contains llama context and inference From f2cca0245b813df0cdbde5245b2a1cadfb78fb68 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:16:20 +0200 Subject: [PATCH 10/14] llama : add note about context size queries --- include/llama.h | 4 +++- src/llama-context.cpp | 12 +++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/llama.h b/include/llama.h index 532023557df8d..caca361516aaa 100644 --- a/include/llama.h +++ b/include/llama.h @@ -461,6 +461,8 @@ extern "C" { LLAMA_API bool llama_supports_gpu_offload(void); LLAMA_API bool llama_supports_rpc (void); + // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions + // In some 
cases the requested values via llama_context_params may differ from the actual values used by the context LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @@ -586,7 +588,7 @@ extern "C" { LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); // Manually free a LoRA adapter - // Note: loaded adapters will be free when the associated model is deleted + // NOTE: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); // Get the invocation tokens if the current lora is an alora diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0190475458822..e949afab2142f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -112,7 +112,11 @@ llama_context::llama_context( } } - cparams.n_ctx_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max; + if (cparams.kv_unified) { + cparams.n_ctx_seq = cparams.n_ctx; + } else { + cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max; + } if (cparams.n_ctx_seq > hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train); @@ -120,6 +124,12 @@ llama_context::llama_context( cparams.n_ctx_seq = hparams.n_ctx_train; } + if (cparams.kv_unified) { + cparams.n_ctx = cparams.n_ctx_seq; + } else { + cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max; + } + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq); From ff684363fa53f6f50946b9143c3c2fc0b719796f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 30 Oct 2025 20:39:26 +0200 Subject: [PATCH 11/14] cont : update todos [no ci] --- tools/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 117451ee9ad93..29ba95ded553b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2699,7 +2699,7 @@ struct server_context { // return true if at least one slot has been purged // TODO: improve logic - // - smarter decision which slot to purge + // - smarter decision which slot to purge (LRU or longest prompt?) // - move slot to level 2 cache instead of removing? // - instead of purging, try to store and resume later? 
bool try_purge_idle_slots() { @@ -4159,7 +4159,7 @@ struct server_context { std::string err; if (n_batch == 1 && ret == 1) { - // TODO: try to terminate only the largest active slot and continue + // TODO: try to terminate only the largest active slot/sequence and continue with the rest // need to remove the tokens from the current batch too err = "Context size has been exceeded."; } From c08d0d148a77447e28843fa918851bef01e88e7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 1 Nov 2025 17:44:52 +0200 Subject: [PATCH 12/14] context : do not cap the size of the context --- src/llama-context.cpp | 15 ++------------- tools/server/server.cpp | 10 +++++++++- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e949afab2142f..8641586eebca5 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -115,19 +115,8 @@ llama_context::llama_context( if (cparams.kv_unified) { cparams.n_ctx_seq = cparams.n_ctx; } else { - cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max; - } - - if (cparams.n_ctx_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train); - - cparams.n_ctx_seq = hparams.n_ctx_train; - } - - if (cparams.kv_unified) { - cparams.n_ctx = cparams.n_ctx_seq; - } else { - cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max; + cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max; + cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max; } LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 29ba95ded553b..90f49d4aa6006 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2497,12 +2497,20 @@ struct server_context { void init() { SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); + const int n_ctx_train = llama_model_n_ctx_train(model); + + int n_ctx_slot = llama_n_ctx_seq(ctx); + if (n_ctx_slot > n_ctx_train) { + SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); + n_ctx_slot = n_ctx_train; + } + for (int i = 0; i < params_base.n_parallel; i++) { server_slot slot; slot.id = i; slot.ctx = ctx; - slot.n_ctx = llama_n_ctx_seq(ctx); + slot.n_ctx = n_ctx_slot; slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; From 356dc08b983ee46332913c3762c9218bc365a51e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 1 Nov 2025 18:24:07 +0200 Subject: [PATCH 13/14] tests : adjust parameters to be CI friendlier --- tools/server/tests/unit/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index acb893d495899..3c0ce98973f4b 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -374,7 +374,7 @@ def check_slots_status(): (256, 4, [80, 40, 80, 80], [True, True, True, True]), (256, 4, [70, 70, 70, 70], [False, False, False, False]), (256, 4, [90, 90, 40, 90], [False, False, True, False]), - (256, 4, [90, 90, 40, 80], [True, True, True, True]), + (256, 4, [90, 90, 40, 75], [True, True, True, True]), ], ) def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success): From 56fceee2cbfba351f3e59cbcdf7fb18123dfd0e6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 2 Nov 2025 12:11:06 +0200 Subject: [PATCH 14/14] context : add warning --- 
 src/llama-context.cpp   | 12 ++++++++++--
 tools/server/server.cpp |  2 ++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8641586eebca5..2b39366271ff9 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -115,8 +115,16 @@ llama_context::llama_context(
     if (cparams.kv_unified) {
         cparams.n_ctx_seq = cparams.n_ctx;
     } else {
-        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
-        cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (cparams.n_ctx_seq == 0) {
+            throw std::runtime_error("n_ctx_seq == 0");
+        }
+
+        if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+            cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+            LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+        }
     }
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 90f49d4aa6006..aa4981585200a 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4440,6 +4440,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // TODO: should we have a separate n_parallel parameter for the server?
+    // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
     if (params.n_parallel == 1 && params.kv_unified == false) {
         LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__);