Avoid streaming incomplete UTF-8 characters

corebonts · corebonts · commit 8ff1f5086b57 · 2025-03-21T22:30:35.000+01:00
Some characters, such as the Chinese character fù, are sometimes returned as two tokens ("\u00e8\u00b5" and "\u008b" in this case). Whether this happens also depends on the model, but when it does, for example with DeepSeek R1, we have to wait until the character is complete and only then send it. This resolves #722 and #646.
diff --git a/llamafile/server/utf.cpp b/llamafile/server/utf.cpp
@@ -0,0 +1,49 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "utils.h"
+#include <string>
+
+namespace lf {
+namespace server {
+
+bool ends_with_incomplete_utf8(const std::string& str) { // true iff str ends with a truncated multi-byte UTF-8 sequence
+    for (unsigned i = 1; i <= 4 && i <= str.size(); ++i) { // scan backwards over at most the last 4 bytes; i = distance from the end
+        unsigned char c = str[str.size() - i];
+        if ((c & 0xC0) == 0x80) {
+            // continuation byte (10xxxxxx): keep walking back to find the lead byte
+            continue;
+        }
+        if ((c & 0xE0) == 0xC0) {
+            // lead byte of a 2-byte sequence (110xxxxx): incomplete unless 2 bytes are present
+            return i < 2;
+        } else if ((c & 0xF0) == 0xE0) {
+            // lead byte of a 3-byte sequence (1110xxxx): incomplete unless 3 bytes are present
+            return i < 3;
+        } else if ((c & 0xF8) == 0xF0) {
+            // lead byte of a 4-byte sequence (11110xxx): incomplete unless 4 bytes are present
+            return i < 4;
+        }
+        // otherwise c is a 1-byte (ASCII) character or an invalid byte
+        break; // nothing is pending in either case, so stop scanning
+    }
+
+    return false; // empty string, ASCII/invalid tail, or only continuation bytes seen
+}
+
+} // namespace server
+} // namespace lf
diff --git a/llamafile/server/utils.h b/llamafile/server/utils.h
@@ -50,5 +50,8 @@ remove_old_image_atoms(const std::vector<Atom>&);
 int
 count_tokens(const std::vector<Atom>&);
 
+bool
+ends_with_incomplete_utf8(const std::string& str); // true iff str ends with a truncated multi-byte UTF-8 sequence
+
 } // namespace server
 } // namespace lf
diff --git a/llamafile/server/v1_chat_completions.cpp b/llamafile/server/v1_chat_completions.cpp
@@ -78,7 +78,7 @@ struct V1ChatCompletionState
 {
     std::string prompt;
     std::vector<Atom> atoms;
-    std::string piece;
+    std::string piece = "";
 };
 
 struct V1ChatCompletionResponse
@@ -658,19 +658,23 @@ Client::v1_chat_completions()
             finish_reason = "stop";
             break;
         }
-        state->piece =
+        state->piece +=
           llamafile_token_to_piece(slot_->ctx_, id, DONT_RENDER_SPECIAL_TOKENS);
         if (!state->piece.empty()) {
             if (params->stream) {
-                char* p = append_http_response_message(obuf_.p, 200);
-                choice["delta"]["content"] = state->piece;
-                response->json["created"] = timespec_real().tv_sec;
-                response->content = make_event(response->json);
-                choice.getObject().erase("delta");
-                if (!send_response_chunk(response->content))
-                    return false;
+                if (!ends_with_incomplete_utf8(state->piece)) {
+                    char* p = append_http_response_message(obuf_.p, 200);
+                    choice["delta"]["content"] = state->piece;
+                    response->json["created"] = timespec_real().tv_sec;
+                    response->content = make_event(response->json);
+                    choice.getObject().erase("delta");
+                    if (!send_response_chunk(response->content))
+                        return false;
+                    state->piece.clear();
+                }
             } else {
                 response->content += state->piece;
+                state->piece.clear();
             }
         }
     }
diff --git a/llamafile/server/v1_completions.cpp b/llamafile/server/v1_completions.cpp
@@ -76,7 +76,7 @@ struct V1CompletionParams
 struct V1CompletionState
 {
     std::vector<Atom> atoms;
-    std::string piece;
+    std::string piece = "";
 };
 
 struct V1CompletionResponse
@@ -495,18 +495,22 @@ Client::v1_completions()
             finish_reason = "stop";
             break;
         }
-        state->piece =
+        state->piece +=
           llamafile_token_to_piece(slot_->ctx_, id, DONT_RENDER_SPECIAL_TOKENS);
         if (!state->piece.empty()) {
             if (params->stream) {
-                char* p = append_http_response_message(obuf_.p, 200);
-                choice["text"] = state->piece;
-                response->json["created"] = timespec_real().tv_sec;
-                response->content = make_event(response->json);
-                if (!send_response_chunk(response->content))
-                    return false;
+                if (!ends_with_incomplete_utf8(state->piece)) {
+                    char* p = append_http_response_message(obuf_.p, 200);
+                    choice["text"] = state->piece;
+                    response->json["created"] = timespec_real().tv_sec;
+                    response->content = make_event(response->json);
+                    if (!send_response_chunk(response->content))
+                        return false;
+                    state->piece.clear();
+                }
             } else {
                 response->content += state->piece;
+                state->piece.clear();
             }
         }
     }