@@ -788,7 +788,7 @@ struct server_slot {
 
                 pos = text.find(word, from_pos);
             } else {
-                pos = string_find_partial_stop(word, text);
+                pos = string_find_partial_stop(text, word);
             }
 
             if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
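
The only functional change in this hunk swaps the arguments passed to `string_find_partial_stop`: the generated text is scanned for a partial occurrence of the stop word, not the other way around. As a rough illustration of the semantics the call site implies (the offset at which the tail of `text` matches a prefix of `word`, or `npos` if it cannot), here is a minimal standalone sketch; the function name and body below are assumptions for illustration, not the repository's actual implementation.

```cpp
#include <algorithm>
#include <cstddef>
#include <string>

// Illustrative sketch only (hypothetical name): return the offset in `text`
// where a prefix of `stop` starts at the very end of `text`, or npos when the
// tail of `text` cannot be the beginning of `stop`.
static size_t find_partial_stop_sketch(const std::string & text, const std::string & stop) {
    const size_t max_len = std::min(text.size(), stop.size());
    // try the longest possible partial match first
    for (size_t len = max_len; len > 0; --len) {
        if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
            return text.size() - len;
        }
    }
    return std::string::npos;
}
```

For example, with `text = "Hello, Us"` and the stop word `"User:"`, the sketch returns the index of `'U'`, which lets the caller hold back the trailing `"Us"` until later tokens show whether the stop string actually completes.
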
@@ -1960,31 +1960,28 @@ struct server_context {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
 
             const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
+            bool send_text = true;
 
             size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true);
             if (stop_pos != std::string::npos) {
-                is_stop_full = true;
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
-                // Update n_sent_text to not exceed the new generated_text size
-                slot.n_sent_text = std::min(slot.n_sent_text, slot.generated_text.size());
-                pos = slot.n_sent_text;
-            } else {
-                is_stop_full = false;
-                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
+                pos = std::min(slot.n_sent_text, slot.generated_text.size());
+            }
+            else if (slot.has_next_token && !llama_token_is_eog(model, result.tok)) {
+                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
+                send_text = stop_pos == std::string::npos;
             }
 
             // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
+            if (send_text) {
                 // no send the stop word in the response
                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
                 slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
-            } else if (stop_pos != std::string::npos) {
-                // Handle partial stop - update n_sent_text to the end of the current text
-                slot.n_sent_text = slot.generated_text.size();
+            } else {
+                result.text_to_send = "";
             }
 
             slot.add_token_string(result);
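
The rewrite above collapses the old `is_stop_full` bookkeeping into a single `send_text` flag: a full stop word truncates `generated_text` and still flushes the remainder, a partial stop (only checked while more tokens are expected and the current token is not end-of-generation) holds the text back by sending an empty string, and otherwise the new text streams out as before. Below is a hedged, self-contained sketch of that decision flow; the struct, field, and function names are hypothetical stand-ins, and the partial-stop search is simplified compared to the server's `find_stopping_strings`.

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical stand-in for the server slot; names mirror the hunk above only loosely.
struct slot_sketch {
    std::vector<std::string> stop_words;
    std::string              generated_text;
    size_t                   n_sent_text    = 0;
    bool                     has_next_token = true;

    // stand-in for slot.find_stopping_strings(): earliest full match, or, when
    // full == false, the point where the text ends in a prefix of a stop word
    size_t find_stop(const std::string & str, bool full) const {
        size_t best = std::string::npos;
        for (const std::string & w : stop_words) {
            size_t p = std::string::npos;
            if (full) {
                p = str.find(w);
            } else {
                for (size_t len = std::min(str.size(), w.size()); len > 0 && p == std::string::npos; --len) {
                    if (str.compare(str.size() - len, len, w, 0, len) == 0) {
                        p = str.size() - len;
                    }
                }
            }
            if (p != std::string::npos && (best == std::string::npos || p < best)) {
                best = p;
            }
        }
        return best;
    }
};

// Decide what to stream after a newly decoded token has been appended to
// slot.generated_text; returns the text to send (possibly empty).
std::string process_token_sketch(slot_sketch & slot, bool token_is_eog) {
    size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
    const std::string str_test = slot.generated_text.substr(pos);

    bool send_text = true;

    size_t stop_pos = slot.find_stop(str_test, /*full=*/true);
    if (stop_pos != std::string::npos) {
        // a full stop word appeared: cut it (and anything after it) out of the output
        slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos,
                                  slot.generated_text.end());
        pos = std::min(slot.n_sent_text, slot.generated_text.size());
    } else if (slot.has_next_token && !token_is_eog) {
        // only a partial stop can still complete later, so hold the text back for now
        stop_pos = slot.find_stop(str_test, /*full=*/false);
        send_text = stop_pos == std::string::npos;
    }

    std::string text_to_send;
    if (send_text) {
        text_to_send = slot.generated_text.substr(pos);
        slot.n_sent_text += text_to_send.size();
    }
    return text_to_send;
}
```

Keeping `send_text` separate from `stop_pos` makes the intent explicit: text is withheld only while a partial stop might still complete, instead of re-deriving that condition from the combination of `stop_pos`, `has_next_token`, and `is_stop_full`.
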