Commit a9658c7

Merge pull request #727 from corebonts/incomplete-utf

Avoid streaming incomplete UTF-8 characters

2 parents fb7ce5a + 8ff1f50 · commit a9658c7

4 files changed: +77, -17 lines

llamafile/server/utf.cpp

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "utils.h"
#include <string>

namespace lf {
namespace server {

bool ends_with_incomplete_utf8(const std::string& str) {
    for (unsigned i = 1; i <= 4 && i <= str.size(); ++i) {
        unsigned char c = str[str.size() - i];
        if ((c & 0xC0) == 0x80) {
            // continuation byte: 10xxxxxx
            continue;
        }
        if ((c & 0xE0) == 0xC0) {
            // 2-byte character: 110xxxxx ...
            return i < 2;
        } else if ((c & 0xF0) == 0xE0) {
            // 3-byte character: 1110xxxx ...
            return i < 3;
        } else if ((c & 0xF8) == 0xF0) {
            // 4-byte character: 11110xxx ...
            return i < 4;
        }
        // else 1-byte character or invalid byte
        break; // Found a valid starting byte, no need to check further.
    }

    return false; // Did not find an incomplete character
}

} // namespace server
} // namespace lf
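
The helper scans at most the last four bytes of the buffer: it skips trailing continuation bytes (10xxxxxx), then checks whether the first lead byte it finds promises more bytes than are actually present. Below is a minimal usage sketch, assuming it is compiled and linked together with utf.cpp from this commit; the byte strings are illustrative inputs, not part of the patch.

#include <cassert>
#include <string>

// Declaration of the helper added in utf.cpp (normally pulled in via utils.h);
// link this sketch against utf.cpp.
namespace lf {
namespace server {
bool ends_with_incomplete_utf8(const std::string& str);
} // namespace server
} // namespace lf

int main() {
    using lf::server::ends_with_incomplete_utf8;
    assert(!ends_with_incomplete_utf8("abc"));           // plain ASCII: nothing pending
    assert(!ends_with_incomplete_utf8("caf\xC3\xA9"));   // "café": 2-byte sequence complete
    assert(ends_with_incomplete_utf8("caf\xC3"));        // lead byte 0xC3 missing its continuation
    assert(ends_with_incomplete_utf8("\xE2\x82"));       // first 2 bytes of the 3-byte "€"
    assert(!ends_with_incomplete_utf8("\xE2\x82\xAC"));  // full 3-byte "€"
    return 0;
}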

llamafile/server/utils.h

Lines changed: 3 additions & 0 deletions
@@ -50,5 +50,8 @@ remove_old_image_atoms(const std::vector<Atom>&);
 int
 count_tokens(const std::vector<Atom>&);
 
+bool
+ends_with_incomplete_utf8(const std::string& str);
+
 } // namespace server
 } // namespace lf

llamafile/server/v1_chat_completions.cpp

Lines changed: 13 additions & 9 deletions
@@ -78,7 +78,7 @@ struct V1ChatCompletionState
 {
     std::string prompt;
     std::vector<Atom> atoms;
-    std::string piece;
+    std::string piece = "";
 };
 
 struct V1ChatCompletionResponse
@@ -658,19 +658,23 @@ Client::v1_chat_completions()
             finish_reason = "stop";
             break;
         }
-        state->piece =
+        state->piece +=
             llamafile_token_to_piece(slot_->ctx_, id, DONT_RENDER_SPECIAL_TOKENS);
         if (!state->piece.empty()) {
             if (params->stream) {
-                char* p = append_http_response_message(obuf_.p, 200);
-                choice["delta"]["content"] = state->piece;
-                response->json["created"] = timespec_real().tv_sec;
-                response->content = make_event(response->json);
-                choice.getObject().erase("delta");
-                if (!send_response_chunk(response->content))
-                    return false;
+                if (!ends_with_incomplete_utf8(state->piece)) {
+                    char* p = append_http_response_message(obuf_.p, 200);
+                    choice["delta"]["content"] = state->piece;
+                    response->json["created"] = timespec_real().tv_sec;
+                    response->content = make_event(response->json);
+                    choice.getObject().erase("delta");
+                    if (!send_response_chunk(response->content))
+                        return false;
+                    state->piece.clear();
+                }
             } else {
                 response->content += state->piece;
+                state->piece.clear();
             }
         }
     }
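
The streaming change is the same in both endpoints: token pieces are accumulated into state->piece instead of overwriting it, and a chunk is only sent (and the buffer cleared) once the buffer no longer ends in a partial multi-byte character. The sketch below isolates that accumulate-then-flush loop; printing to stdout stands in for send_response_chunk(), the split of "é" across pieces is a hypothetical tokenizer output, and the helper is assumed to be linked in from utf.cpp.

#include <iostream>
#include <string>

namespace lf {
namespace server {
bool ends_with_incomplete_utf8(const std::string& str); // from utf.cpp in this commit
} // namespace server
} // namespace lf

int main() {
    // "café!" with "é" (0xC3 0xA9) split across two pieces, as a tokenizer may do.
    const std::string pieces[] = { "caf", "\xC3", "\xA9", "!" };
    std::string buffer; // plays the role of state->piece
    for (const std::string& piece : pieces) {
        buffer += piece; // accumulate; the patch changes '=' to '+='
        if (buffer.empty())
            continue;
        if (!lf::server::ends_with_incomplete_utf8(buffer)) {
            std::cout << "send chunk: " << buffer << "\n"; // stand-in for send_response_chunk()
            buffer.clear(); // flushed a whole number of characters
        }
        // otherwise keep buffering until the multi-byte character completes
    }
    return 0;
}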

llamafile/server/v1_completions.cpp

Lines changed: 12 additions & 8 deletions
@@ -76,7 +76,7 @@ struct V1CompletionParams
 struct V1CompletionState
 {
     std::vector<Atom> atoms;
-    std::string piece;
+    std::string piece = "";
 };
 
 struct V1CompletionResponse
@@ -495,18 +495,22 @@ Client::v1_completions()
             finish_reason = "stop";
             break;
         }
-        state->piece =
+        state->piece +=
             llamafile_token_to_piece(slot_->ctx_, id, DONT_RENDER_SPECIAL_TOKENS);
         if (!state->piece.empty()) {
             if (params->stream) {
-                char* p = append_http_response_message(obuf_.p, 200);
-                choice["text"] = state->piece;
-                response->json["created"] = timespec_real().tv_sec;
-                response->content = make_event(response->json);
-                if (!send_response_chunk(response->content))
-                    return false;
+                if (!ends_with_incomplete_utf8(state->piece)) {
+                    char* p = append_http_response_message(obuf_.p, 200);
+                    choice["text"] = state->piece;
+                    response->json["created"] = timespec_real().tv_sec;
+                    response->content = make_event(response->json);
+                    if (!send_response_chunk(response->content))
+                        return false;
+                    state->piece.clear();
+                }
             } else {
                 response->content += state->piece;
+                state->piece.clear();
             }
         }
     }
