Skip to content

Commit 8ff1f50

Browse files
committed
Avoid streaming incomplete UTF-8 characters
Some characters, like the Chinese character fù, are sometimes returned as two tokens: "\u00e8\u00b5" and "\u008b" in this case (the first two bytes and the last byte of its three-byte UTF-8 encoding). This also depends on the model, but when it happens, for example with DeepSeek R1, we have to wait until the character is complete and only then send it. This resolves #722 and #646.
1 parent 33f561d commit 8ff1f50

File tree

4 files changed

+77
-17
lines changed

4 files changed

+77
-17
lines changed

llamafile/server/utf.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
2+
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
3+
//
4+
// Copyright 2024 Mozilla Foundation
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
18+
#include "utils.h"
19+
#include <string>
20+
21+
namespace lf {
22+
namespace server {
23+
24+
// Reports whether `str` ends in the middle of a multi-byte UTF-8 sequence.
//
// Walks backwards over at most the last four bytes of `str`, skipping
// continuation bytes until a lead byte is found. Returns true when that lead
// byte announces a longer sequence than the number of bytes present from the
// lead byte to the end of the string. Returns false for an empty string, for a
// tail ending in an ASCII or unrecognized byte, and for a tail made up only of
// continuation bytes with no lead byte within the last four bytes.
bool ends_with_incomplete_utf8(const std::string& str) {
25+
for (unsigned i = 1; i <= 4 && i <= str.size(); ++i) { // i = distance from the end, 1-based
26+
unsigned char c = str[str.size() - i];
27+
if ((c & 0xC0) == 0x80) {
28+
// continuation byte (10xxxxxx): keep walking back towards the lead byte
29+
continue;
30+
}
31+
if ((c & 0xE0) == 0xC0) {
32+
// lead byte of a 2-byte character (110xxxxx): incomplete if it is the last byte
33+
return i < 2;
34+
} else if ((c & 0xF0) == 0xE0) {
35+
// lead byte of a 3-byte character (1110xxxx): incomplete if fewer than 3 bytes are present
36+
return i < 3;
37+
} else if ((c & 0xF8) == 0xF0) {
38+
// lead byte of a 4-byte character (11110xxx): incomplete if fewer than 4 bytes are present
39+
return i < 4;
40+
}
41+
// otherwise: a 1-byte (ASCII) character or a byte matching none of the patterns above
42+
break; // Nothing is pending at the tail, so stop scanning.
43+
}
44+
45+
return false; // No truncated multi-byte sequence at the end of the string
46+
}
47+
48+
} // namespace server
49+
} // namespace lf

llamafile/server/utils.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,8 @@ remove_old_image_atoms(const std::vector<Atom>&);
5050
int
5151
count_tokens(const std::vector<Atom>&);
5252

53+
// Returns true when `str` ends partway through a multi-byte UTF-8 sequence.
bool
54+
ends_with_incomplete_utf8(const std::string& str);
55+
5356
} // namespace server
5457
} // namespace lf

llamafile/server/v1_chat_completions.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ struct V1ChatCompletionState
7878
{
7979
std::string prompt;
8080
std::vector<Atom> atoms;
81-
std::string piece;
81+
std::string piece = "";
8282
};
8383

8484
struct V1ChatCompletionResponse
@@ -658,19 +658,23 @@ Client::v1_chat_completions()
658658
finish_reason = "stop";
659659
break;
660660
}
661-
state->piece =
661+
state->piece +=
662662
llamafile_token_to_piece(slot_->ctx_, id, DONT_RENDER_SPECIAL_TOKENS);
663663
if (!state->piece.empty()) {
664664
if (params->stream) {
665-
char* p = append_http_response_message(obuf_.p, 200);
666-
choice["delta"]["content"] = state->piece;
667-
response->json["created"] = timespec_real().tv_sec;
668-
response->content = make_event(response->json);
669-
choice.getObject().erase("delta");
670-
if (!send_response_chunk(response->content))
671-
return false;
665+
if (!ends_with_incomplete_utf8(state->piece)) {
666+
char* p = append_http_response_message(obuf_.p, 200);
667+
choice["delta"]["content"] = state->piece;
668+
response->json["created"] = timespec_real().tv_sec;
669+
response->content = make_event(response->json);
670+
choice.getObject().erase("delta");
671+
if (!send_response_chunk(response->content))
672+
return false;
673+
state->piece.clear();
674+
}
672675
} else {
673676
response->content += state->piece;
677+
state->piece.clear();
674678
}
675679
}
676680
}

llamafile/server/v1_completions.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ struct V1CompletionParams
7676
struct V1CompletionState
7777
{
7878
std::vector<Atom> atoms;
79-
std::string piece;
79+
std::string piece = "";
8080
};
8181

8282
struct V1CompletionResponse
@@ -495,18 +495,22 @@ Client::v1_completions()
495495
finish_reason = "stop";
496496
break;
497497
}
498-
state->piece =
498+
state->piece +=
499499
llamafile_token_to_piece(slot_->ctx_, id, DONT_RENDER_SPECIAL_TOKENS);
500500
if (!state->piece.empty()) {
501501
if (params->stream) {
502-
char* p = append_http_response_message(obuf_.p, 200);
503-
choice["text"] = state->piece;
504-
response->json["created"] = timespec_real().tv_sec;
505-
response->content = make_event(response->json);
506-
if (!send_response_chunk(response->content))
507-
return false;
502+
if (!ends_with_incomplete_utf8(state->piece)) {
503+
char* p = append_http_response_message(obuf_.p, 200);
504+
choice["text"] = state->piece;
505+
response->json["created"] = timespec_real().tv_sec;
506+
response->content = make_event(response->json);
507+
if (!send_response_chunk(response->content))
508+
return false;
509+
state->piece.clear();
510+
}
508511
} else {
509512
response->content += state->piece;
513+
state->piece.clear();
510514
}
511515
}
512516
}

0 commit comments

Comments
 (0)