@@ -2262,12 +2262,14 @@ struct server_context {
22622262 slot.has_next_token = true ;
22632263 }
22642264
2265- // if context shifting is disabled, make sure that we don't run out of context
2266- if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx ) {
2265+ // if context shift is disabled, we stop when it reaches the context limit
2266+ if (!params_base.ctx_shift && slot.n_cache_tokens () + 1 >= slot.n_ctx ) {
2267+ slot.truncated = true ;
22672268 slot.stop = STOP_TYPE_LIMIT;
22682269 slot.has_next_token = false ;
22692270
2270- SLT_DBG (slot, " stopped due to running out of context, n_past = %d, n_ctx = %d\n " , slot.n_past , slot.n_ctx );
2271+ SLT_DBG (slot, " stopped due to running out of context capacity, n_cache_tokens = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n " ,
2272+ slot.n_cache_tokens (), slot.n_prompt_tokens (), slot.n_decoded , slot.n_ctx );
22712273 }
22722274
22732275 // check the limits
@@ -2327,16 +2329,6 @@ struct server_context {
23272329 }
23282330 }
23292331
2330- // if context shift is disabled, we stop when it reaches the context limit
2331- if (!params_base.ctx_shift && slot.n_cache_tokens () >= slot.n_ctx ) {
2332- slot.truncated = true ;
2333- slot.stop = STOP_TYPE_LIMIT;
2334- slot.has_next_token = false ;
2335-
2336- SLT_DBG (slot, " stopped due to running out of context capacity, n_cache_tokens = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n " ,
2337- slot.n_cache_tokens (), slot.n_prompt_tokens (), slot.n_decoded , slot.n_ctx );
2338- }
2339-
23402332 if (llama_vocab_is_eog (vocab, result.tok )) {
23412333 slot.stop = STOP_TYPE_EOS;
23422334 slot.has_next_token = false ;
0 commit comments