@@ -33,6 +33,45 @@ jclass la_int_var;
3333jmethodID la_int_var_value;
3434jmethodID la_int_var_inc;
3535
// Bytes of generated token pieces that have been produced but not yet
// returned to Java as a string. completion_init clears it; completion_loop
// appends each new piece and only converts/clears it once the accumulated
// bytes pass is_valid_utf8, so a multi-byte character split across tokens is
// held back until it is complete.
36+ std::string cached_token_chars;
37+
/// Checks whether a NUL-terminated byte string is well-formed UTF-8.
///
/// Used to decide whether accumulated token bytes can be converted to a Java
/// string yet: a multi-byte character may be split across several tokens, in
/// which case the partial sequence must be reported as invalid.
///
/// Beyond the lead/continuation bit patterns, this rejects byte sequences
/// that are structurally plausible but ill-formed:
///   - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 0x80..0x9F, 0xF0 0x80..0x8F)
///   - UTF-16 surrogates U+D800..U+DFFF (0xED 0xA0..0xBF)
///   - code points above U+10FFFF (0xF4 0x90.., lead bytes 0xF5 and up)
///
/// @param string NUL-terminated bytes to check; may be null.
/// @return true if `string` is null, empty, or entirely well-formed UTF-8;
///         false on the first ill-formed or truncated sequence.
bool is_valid_utf8(const char * string) {
    if (!string) {
        return true;
    }

    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(string);

    while (*bytes != 0x00) {
        const unsigned char lead = *bytes;

        // Total length of the sequence introduced by `lead`.
        int num = 0;
        // Allowed range for the FIRST continuation byte. It is narrower than
        // 0x80..0xBF for a few lead bytes; later continuation bytes always
        // use the full 0x80..0xBF range.
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;

        if (lead <= 0x7F) {
            // U+0000 to U+007F
            num = 1;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // U+0080 to U+07FF (0xC0/0xC1 would be overlong)
            num = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            // U+0800 to U+FFFF
            num = 3;
            if (lead == 0xE0) {
                lo = 0xA0;  // below this is an overlong encoding
            } else if (lead == 0xED) {
                hi = 0x9F;  // above this is a UTF-16 surrogate
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // U+10000 to U+10FFFF
            num = 4;
            if (lead == 0xF0) {
                lo = 0x90;  // below this is an overlong encoding
            } else if (lead == 0xF4) {
                hi = 0x8F;  // above this exceeds U+10FFFF
            }
        } else {
            // Stray continuation byte or a lead byte that can never occur.
            return false;
        }

        bytes += 1;
        for (int i = 1; i < num; ++i) {
            // A NUL terminator (truncated sequence) is below `lo`, so this
            // check also stops us before reading past the end of the string.
            if (*bytes < lo || *bytes > hi) {
                return false;
            }
            lo = 0x80;
            hi = 0xBF;
            bytes += 1;
        }
    }

    return true;
}
74+
3675static void log_callback (ggml_log_level level, const char * fmt, void * data) {
3776 if (level == GGML_LOG_LEVEL_ERROR) __android_log_print (ANDROID_LOG_ERROR, TAG, fmt, data);
3877 else if (level == GGML_LOG_LEVEL_INFO) __android_log_print (ANDROID_LOG_INFO, TAG, fmt, data);
@@ -295,6 +334,8 @@ Java_com_example_llama_Llm_completion_1init(
295334 jint n_len
296335 ) {
297336
337+ cached_token_chars.clear ();
338+
298339 const auto text = env->GetStringUTFChars (jtext, 0 );
299340 const auto context = reinterpret_cast <llama_context *>(context_pointer);
300341 const auto batch = reinterpret_cast <llama_batch *>(batch_pointer);
@@ -372,8 +413,16 @@ Java_com_example_llama_Llm_completion_1loop(
372413 }
373414
374415 auto new_token_chars = llama_token_to_piece (context, new_token_id);
375- LOGi (" new_token_chars: `%s`" , new_token_chars.c_str ());
376- auto new_token = env->NewStringUTF (new_token_chars.c_str ());
416+ cached_token_chars += new_token_chars;
417+
418+ jstring new_token = nullptr ;
419+ if (is_valid_utf8 (cached_token_chars.c_str ())) {
420+ new_token = env->NewStringUTF (cached_token_chars.c_str ());
421+ LOGi (" cached: %s, new_token_chars: `%s`, id: %d" , cached_token_chars.c_str (), new_token_chars.c_str (), new_token_id);
422+ cached_token_chars.clear ();
423+ } else {
424+ new_token = env->NewStringUTF (" " );
425+ }
377426
378427 llama_batch_clear (*batch);
379428 llama_batch_add (*batch, new_token_id, n_cur, { 0 }, true );
0 commit comments