Commit 2648c5b

context : fix n_ctx_per_seq computation
1 parent cd2330d

2 files changed: +7 -9 lines

src/llama-context.cpp

Lines changed: 6 additions & 8 deletions
@@ -112,11 +112,9 @@ llama_context::llama_context(
         }
     }
 
-    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
-
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq());
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
@@ -125,14 +123,14 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
 
-    if (n_ctx_per_seq < hparams.n_ctx_train) {
+    if (n_ctx_per_seq() < hparams.n_ctx_train) {
         LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
-                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+                __func__, n_ctx_per_seq(), hparams.n_ctx_train);
     }
 
-    if (n_ctx_per_seq > hparams.n_ctx_train) {
+    if (n_ctx_per_seq() > hparams.n_ctx_train) {
         LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
-                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+                __func__, n_ctx_per_seq(), hparams.n_ctx_train);
     }
 
     if (!hparams.vocab_only) {
@@ -454,7 +452,7 @@ uint32_t llama_context::n_ctx() const {
 }
 
 uint32_t llama_context::n_ctx_per_seq() const {
-    return cparams.n_ctx / cparams.n_seq_max;
+    return cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max;
 }
 
 uint32_t llama_context::n_batch() const {
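
For reference, a minimal standalone sketch (not the actual llama.cpp API; the helper name and the numbers below are illustrative) of the computation this commit changes: with a unified KV cache all sequences share one buffer, so each sequence can address the full n_ctx, while a split cache divides n_ctx evenly across n_seq_max sequences.

    // ctx_per_seq_sketch.cpp -- illustrative only, mirrors the ternary introduced here
    #include <cstdint>
    #include <cstdio>

    static uint32_t ctx_per_seq(uint32_t n_ctx, uint32_t n_seq_max, bool kv_unified) {
        // unified KV cache: every sequence can use the whole context window
        // split KV cache:   the window is divided evenly across the sequences
        return kv_unified ? n_ctx : n_ctx / n_seq_max;
    }

    int main() {
        std::printf("split  : %u\n", ctx_per_seq(8192, 4, false)); // 2048
        std::printf("unified: %u\n", ctx_per_seq(8192, 4, true));  // 8192
        return 0;
    }

Before this change the division was applied unconditionally, so the n_ctx_per_seq log line and the training-context warnings above could understate the usable per-sequence context when the KV cache is unified.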

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion
@@ -6581,7 +6581,7 @@ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) co
 }
 
 ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
-    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+    const uint32_t n_ctx_per_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max;
 
     // choose long/short freq factors based on the context size
     if (layers[il].rope_freqs != nullptr) {
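
The per-sequence context computed here feeds the long/short RoPE frequency-factor selection referenced in the comment above. A hedged sketch of how such a selection might look, under assumed names (rope_freqs, rope_long, rope_short, n_ctx_orig are placeholders for illustration, not verified llama.cpp fields):

    // rope_factors_sketch.cpp -- illustrative selection of RoPE frequency factors
    #include <cstdint>

    struct ggml_tensor; // opaque placeholder type, used only through pointers

    struct layer_rope_factors {
        ggml_tensor * rope_freqs; // explicit per-layer factors, if the model provides them
        ggml_tensor * rope_long;  // factors tuned for extended contexts
        ggml_tensor * rope_short; // factors tuned for the original training context
    };

    static ggml_tensor * pick_rope_factors(const layer_rope_factors & l,
                                           uint32_t n_ctx_per_seq,
                                           uint32_t n_ctx_orig) {
        if (l.rope_freqs != nullptr) {
            return l.rope_freqs; // explicit factors take precedence
        }
        // otherwise choose based on how the per-sequence context compares
        // to the model's original context size
        return n_ctx_per_seq > n_ctx_orig ? l.rope_long : l.rope_short;
    }

With the unconditional division removed for unified KV caches, n_ctx_per_seq no longer shrinks with n_seq_max in that configuration, so the choice reflects the context a sequence can actually use.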
