@@ -376,7 +376,6 @@ struct llama_client_slot
376376
377377 int32_t num_prompt_tokens = 0 ;
378378 int32_t num_prompt_tokens_processed = 0 ;
379- int32_t multibyte_pending = 0 ;
380379
381380 json prompt;
382381 std::string generated_text;
@@ -425,7 +424,6 @@ struct llama_client_slot
425424 stopped_word = false ;
426425 stopped_limit = false ;
427426 stopping_word = " " ;
428- multibyte_pending = 0 ;
429427 n_past = 0 ;
430428 sent_count = 0 ;
431429 sent_token_probs_index = 0 ;
@@ -992,35 +990,36 @@ struct llama_server_context
992990 slot.generated_text += token_str;
993991 slot.has_next_token = true ;
994992
995- if (slot.multibyte_pending > 0 )
993+ // check if there is an incomplete UTF-8 character at the end
994+ bool incomplete = false ;
995+ for (unsigned i = 1 ; i < 5 && i <= slot.generated_text .size (); ++i)
996996 {
997- slot. multibyte_pending -= token_str. size ();
998- }
999- else if (token_str. size () == 1 )
1000- {
1001- const char c = token_str[ 0 ] ;
1002- // 2-byte characters: 110xxxxx 10xxxxxx
997+ unsigned char c = slot. generated_text [slot. generated_text . size () - i] ;
998+ if ((c & 0xC0 ) == 0x80 )
999+ {
1000+ // continuation byte: 10xxxxxx
1001+ continue ;
1002+ }
10031003 if ((c & 0xE0 ) == 0xC0 )
10041004 {
1005- slot. multibyte_pending = 1 ;
1006- // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
1005+ // 2-byte character: 110xxxxx ...
1006+ incomplete = i < 2 ;
10071007 }
10081008 else if ((c & 0xF0 ) == 0xE0 )
10091009 {
1010- slot. multibyte_pending = 2 ;
1011- // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1010+ // 3-byte character: 1110xxxx ...
1011+ incomplete = i < 3 ;
10121012 }
10131013 else if ((c & 0xF8 ) == 0xF0 )
10141014 {
1015- slot.multibyte_pending = 3 ;
1016- }
1017- else
1018- {
1019- slot.multibyte_pending = 0 ;
1015+ // 4-byte character: 11110xxx ...
1016+ incomplete = i < 4 ;
10201017 }
1018+ // else 1-byte character or invalid byte
1019+ break ;
10211020 }
10221021
1023- if (slot. multibyte_pending == 0 )
1022+ if (!incomplete )
10241023 {
10251024 size_t pos = std::min (slot.sent_count , slot.generated_text .size ());
10261025 const std::string str_test = slot.generated_text .substr (pos);
@@ -1055,7 +1054,7 @@ struct llama_server_context
10551054 }
10561055 }
10571056
1058- if (slot. multibyte_pending > 0 && !slot. has_next_token )
1057+ if (incomplete )
10591058 {
10601059 slot.has_next_token = true ;
10611060 }
0 commit comments