@@ -1317,7 +1317,7 @@ struct server_slot {
13171317 return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
13181318 }
13191319
// can_batch_with: compatibility check between this slot and other_slot
// (per its name, used to decide whether the two slots can share a batch).
// Returns true only when both slots report the same is_non_causal() value
// AND are_lora_equal(lora, other_slot.lora) is true.
// The removed (-) and added (+) signature lines differ only by the trailing
// `const` qualifier on the member function; the body lines are unchanged context.
1320- bool can_batch_with (server_slot & other_slot) {
1320+ bool can_batch_with (server_slot & other_slot) const {
13211321 return is_non_causal () == other_slot.is_non_causal ()
13221322 && are_lora_equal (lora, other_slot.lora );
13231323 }
@@ -2162,14 +2162,6 @@ struct server_context {
21622162 }
21632163
21642164 if (slot.has_new_line ) {
2165- // if we have already seen a new line, we stop after a certain time limit
2166- if (slot.params .t_max_predict_ms > 0 && (ggml_time_us () - slot.t_start_generation > 1000 .0f *slot.params .t_max_predict_ms )) {
2167- slot.stop = STOP_TYPE_LIMIT;
2168- slot.has_next_token = false ;
2169-
2170- SLT_DBG (slot, " stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n " , slot.n_decoded , (int ) slot.params .t_max_predict_ms );
2171- }
2172-
21732165 // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
21742166 if (slot.params .n_indent > 0 ) {
21752167 // check the current indentation
@@ -2208,6 +2200,14 @@ struct server_context {
22082200 // check if there is a new line in the generated text
22092201 if (result.text_to_send .find (' \n ' ) != std::string::npos) {
22102202 slot.has_new_line = true ;
2203+
2204+ // if we have seen a new line, we stop after a certain time limit, but only upon another new line
2205+ if (slot.params .t_max_predict_ms > 0 && (ggml_time_us () - slot.t_start_generation > 1000 .0f *slot.params .t_max_predict_ms )) {
2206+ slot.stop = STOP_TYPE_LIMIT;
2207+ slot.has_next_token = false ;
2208+
2209+ SLT_DBG (slot, " stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n " , slot.n_decoded , (int ) slot.params .t_max_predict_ms );
2210+ }
22112211 }
22122212
22132213 // if context shift is disabled, we stop when it reaches the context limit
0 commit comments