context : add warning when n_ctx is not divisible by n_seq_max and is rounded down

ggerganov · ggerganov · commit 56fceee2cbfb · 2025-11-02T12:11:06.000+02:00
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -115,8 +115,16 @@ llama_context::llama_context(
     if (cparams.kv_unified) {
         cparams.n_ctx_seq = cparams.n_ctx;
     } else {
-        cparams.n_ctx_seq = cparams.n_ctx     / cparams.n_seq_max;
-        cparams.n_ctx     = cparams.n_ctx_seq * cparams.n_seq_max;
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (cparams.n_ctx_seq == 0) {
+            throw std::runtime_error("n_ctx_seq == 0");
+        }
+
+        if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+            cparams.n_ctx =  cparams.n_ctx_seq * cparams.n_seq_max;
+            LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+        }
     }
 
     LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -4440,6 +4440,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // TODO: should we have a separate n_parallel parameter for the server?
+    //       https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
     if (params.n_parallel == 1 && params.kv_unified == false) {
         LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__);
 

Original file line number	Diff line number	Diff line change
`@@ -4440,6 +4440,8 @@ int main(int argc, char ** argv) {`
`4440`	`4440`	`return 1;`
`4441`	`4441`	`}`
`4442`	`4442`
	`4443`	`+ // TODO: should we have a separate n_parallel parameter for the server?`
	`4444`	`+ // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177`
`4443`	`4445`	`if (params.n_parallel == 1 && params.kv_unified == false) {`
`4444`	`4446`	`LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__);`
`4445`	`4447`