@@ -2208,7 +2208,7 @@ struct server_context {
22082208 bool full_stop_reached = false ;
22092209 bool partial_stop_reached = false ;
22102210
2211- // search start strings
2211+ // search the start strings
22122212 if (start_string_missing && !incomplete && slot.has_next_token ) {
22132213 size_t max_start_string_size = slot.params .start_string_max_len ;
22142214 size_t search_len = max_start_string_size + token_str.size ();
@@ -2230,17 +2230,11 @@ struct server_context {
22302230 }
22312231 }
22322232
2233+ // search the stop strings
22332234 if (!incomplete) {
22342235 size_t pos = std::min (slot.n_sent_text , slot.generated_text .size ());
22352236
22362237 const std::string str_test = slot.generated_text .substr (pos);
2237- bool send_text = true ;
2238-
2239- // Handle the start strings
2240- if (start_string_missing)
2241- {
2242- send_text = false ;
2243- }
22442238
22452239 // search stop word and delete it
22462240 size_t stop_pos = slot.find_stopping_strings (str_test, token_str.size (), true );
@@ -2249,33 +2243,44 @@ struct server_context {
22492243 slot.generated_text .begin () + pos + stop_pos,
22502244 slot.generated_text .end ());
22512245 pos = std::min (slot.n_sent_text , slot.generated_text .size ());
2246+ full_stop_reached = true ;
22522247 } else if (slot.has_next_token ) {
22532248 stop_pos = slot.find_stopping_strings (str_test, token_str.size (), false );
2254- send_text = send_text && stop_pos == std::string::npos;
2249+ partial_stop_reached = ( stop_pos != std::string::npos) ;
22552250 }
2251+ }
22562252
2257- // check if there is any token to predict
2258- if (send_text) {
2259- // no send the stop word in the response
2260- result.text_to_send = slot.generated_text .substr (pos, std::string::npos);
2261- slot.n_sent_text += result.text_to_send .size ();
2262- // add the token to slot queue and cache
2263- } else {
2264- result.text_to_send = " " ;
2265- }
2253+ if (full_stop_reached)
2254+ {
2255+ slot.stop = STOP_TYPE_WORD;
2256+ slot.has_next_token = false ;
2257+ SLT_DBG (slot, " stopped by word, n_decoded = %d, n_predict = %d\n " , slot.n_decoded , slot.params .n_predict );
2258+ }
22662259
2267- slot.add_token (result);
2268- if (slot.params .stream ) {
2269- send_partial_response (slot, result);
2270- }
2260+ if (partial_stop_reached || start_string_missing)
2261+ {
2262+ result.text_to_send = " " ;
2263+ }
2264+ else
2265+ {
2266+ size_t valid_generated_len = validate_utf8 (slot.generated_text );
2267+ size_t available_data = valid_generated_len - slot.n_sent_text ;
2268+ result.text_to_send = slot.generated_text .substr (slot.n_sent_text , available_data);
2269+ slot.n_sent_text += result.text_to_send .size ();
2270+ }
2271+
2272+ slot.add_token (result);
2273+
2274+ if (slot.params .stream && !result.text_to_send .empty ()) {
2275+ send_partial_response (slot, result);
22712276 }
22722277
22732278 if (incomplete) {
22742279 slot.has_next_token = true ;
22752280 }
22762281
22772282 // check the limits
2278- if (slot.n_decoded > 0 && slot. has_next_token && !slot. has_budget (params_base) ) {
2283+ if (slot.has_next_token && token_budget_exhausted ) {
22792284 slot.stop = STOP_TYPE_LIMIT;
22802285 slot.has_next_token = false ;
22812286
0 commit comments