@@ -2866,10 +2866,12 @@ struct server_context {
 
         // if context shifting is disabled, make sure that we don't run out of context
         if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.truncated      = true;
             slot.stop           = STOP_TYPE_LIMIT;
             slot.has_next_token = false;
 
-            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_past, slot.n_prompt_tokens(), slot.n_decoded, slot.n_ctx);
         }
 
         // check the limits
@@ -2929,36 +2931,13 @@ struct server_context {
             }
         }
 
-        // if context shift is disabled, we stop when it reaches the context limit
-        if (slot.n_past >= slot.n_ctx) {
-            slot.truncated      = true;
-            slot.stop           = STOP_TYPE_LIMIT;
-            slot.has_next_token = false;
-
-            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
-                    slot.n_decoded, slot.n_prompt_tokens(), slot.n_past, slot.n_ctx);
-        }
-
         if (llama_vocab_is_eog(vocab, result.tok)) {
             slot.stop           = STOP_TYPE_EOS;
             slot.has_next_token = false;
 
             SLT_DBG(slot, "%s", "stopped by EOS\n");
         }
 
-        const auto n_ctx_train = llama_model_n_ctx_train(model);
-
-        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= n_ctx_train) {
-            slot.truncated      = true;
-            slot.stop           = STOP_TYPE_LIMIT;
-            slot.has_next_token = false; // stop prediction
-
-            SLT_WRN(slot,
-                    "n_predict (%d) is set for infinite generation. "
-                    "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
-                    slot.task->params.n_predict, n_ctx_train);
-        }
-
         SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
 
         return slot.has_next_token; // continue
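
For context, here is a minimal sketch of how the consolidated early-exit check behaves after this change: with context shifting disabled, the slot is flagged as truncated and stops before the next token would overflow the window, instead of being caught by the (now removed) post-hoc check. The `Slot` struct, `StopType` enum, and `check_context_limit` helper are simplified stand-ins for the server types, not the actual implementation.

#include <cstdio>

// Illustrative stand-ins for the server's stop-type and slot state.
enum class StopType { NONE, LIMIT };

struct Slot {
    int      n_past          = 0;     // tokens already in the KV cache
    int      n_ctx           = 0;     // per-slot context capacity
    int      n_decoded       = 0;     // tokens generated so far
    int      n_prompt_tokens = 0;     // prompt length
    bool     truncated       = false;
    bool     has_next_token  = true;
    StopType stop            = StopType::NONE;
};

// Returns false when generation must stop: with context shifting disabled,
// accepting one more token would run past the slot's context window.
bool check_context_limit(Slot & slot, bool ctx_shift) {
    if (!ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
        slot.truncated      = true;
        slot.stop           = StopType::LIMIT;
        slot.has_next_token = false;

        std::printf("stopped due to running out of context capacity, "
                    "n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
                    slot.n_past, slot.n_prompt_tokens, slot.n_decoded, slot.n_ctx);
    }
    return slot.has_next_token;
}

int main() {
    Slot slot;
    slot.n_ctx           = 8;
    slot.n_prompt_tokens = 5;
    slot.n_past          = 7;    // one more token would hit n_ctx
    slot.n_decoded       = 2;

    // With ctx_shift disabled the slot stops and is flagged as truncated.
    check_context_limit(slot, /*ctx_shift=*/false);
    return 0;
}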