@@ -2866,10 +2866,12 @@ struct server_context {
28662866
28672867        //  if context shifting is disabled, make sure that we don't run out of context
28682868        if  (!params_base.ctx_shift  && slot.n_past  + 1  >= slot.n_ctx ) {
2869+             slot.truncated       = true ;
28692870            slot.stop            = STOP_TYPE_LIMIT;
28702871            slot.has_next_token  = false ;
28712872
2872-             SLT_DBG (slot, " stopped due to running out of context, n_past = %d, n_ctx = %d\n "  , slot.n_past , slot.n_ctx );
2873+             SLT_DBG (slot, " stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n "  ,
2874+                     slot.n_past , slot.n_prompt_tokens (), slot.n_decoded , slot.n_ctx ); //  arguments listed in the same order as the labels in the format string
28732875        }
28742876
28752877        //  check the limits
@@ -2929,36 +2931,13 @@ struct server_context {
29292931            }
29302932        }
29312933
2932-         //  if context shift is disabled, we stop when it reaches the context limit
2933-         if  (slot.n_past  >= slot.n_ctx ) {
2934-             slot.truncated       = true ;
2935-             slot.stop            = STOP_TYPE_LIMIT;
2936-             slot.has_next_token  = false ;
2937- 
2938-             SLT_DBG (slot, " stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n "  ,
2939-                     slot.n_decoded , slot.n_prompt_tokens (), slot.n_past , slot.n_ctx );
2940-         }
2941- 
29422934        if  (llama_vocab_is_eog (vocab, result.tok )) {
29432935            slot.stop            = STOP_TYPE_EOS;
29442936            slot.has_next_token  = false ;
29452937
29462938            SLT_DBG (slot, " %s"  , " stopped by EOS\n "  );
29472939        }
29482940
2949-         const  auto  n_ctx_train = llama_model_n_ctx_train (model);
2950- 
2951-         if  (slot.task ->params .n_predict  < 1  && slot.n_prompt_tokens () + slot.n_decoded  >= n_ctx_train) {
2952-             slot.truncated       = true ;
2953-             slot.stop            = STOP_TYPE_LIMIT;
2954-             slot.has_next_token  = false ; //  stop prediction
2955- 
2956-             SLT_WRN (slot,
2957-                     " n_predict (%d) is set for infinite generation. " 
2958-                     " Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n "  ,
2959-                     slot.task ->params .n_predict , n_ctx_train);
2960-         }
2961- 
29622941        SLT_DBG (slot, " n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n "  , slot.n_decoded , slot.n_remaining , result.tok , token_str.c_str ());
29632942
29642943        return  slot.has_next_token ; //  continue
0 commit comments