Skip to content

Commit 4a293ba

Browse files
committed
bug-fix: handle broken UTF-8 sequences in common_chat_parse()
1 parent 0a5036b commit 4a293ba

File tree

3 files changed

+27
-0
lines changed

3 files changed

+27
-0
lines changed

common/chat.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1944,6 +1944,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
19441944
}
19451945
}
19461946
auto msg = builder.result();
1947+
msg.content = truncate_incomplete_utf8(msg.content);
19471948
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
19481949
return msg;
19491950
}

common/json-partial.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,27 @@ bool common_json_parse(
254254
it = end;
255255
return true;
256256
}
257+
258+
// Returns a copy of `str` whose tail does not end in a broken UTF-8 sequence.
//
// Only the last sequence of the string is examined: the scan walks backwards
// over trailing continuation bytes (10xxxxxx) until it reaches the byte that
// starts the final sequence, then decides how much of that sequence to keep.
//
//   - complete final sequence            -> kept (surplus continuation bytes
//                                           after it are dropped)
//   - final sequence shorter than needed -> dropped entirely
//   - lead byte that can never appear in
//     UTF-8 (0xC0, 0xC1, 0xF5..0xFF)     -> dropped together with what follows
//   - only continuation bytes / empty    -> empty string
//
// Bytes before the final sequence are not validated.
std::string truncate_incomplete_utf8(const std::string & str) {
    const size_t len = str.length();

    size_t pos = len;
    while (pos > 0) {
        --pos;
        const unsigned char byte = static_cast<unsigned char>(str[pos]);

        // Continuation byte: keep looking for the byte that starts the sequence.
        if ((byte & 0xC0) == 0x80) {
            continue;
        }

        // Expected total length of the sequence introduced by this lead byte.
        // Ranges are used instead of bit masks so that 0xC0/0xC1 (always
        // overlong) and 0xF5..0xF7 (beyond U+10FFFF) are rejected rather than
        // being mistaken for valid 2- and 4-byte lead bytes.
        size_t explen = 0;
        if (byte < 0x80) {
            explen = 1; // ASCII (0xxxxxxx)
        } else if (byte >= 0xC2 && byte <= 0xDF) {
            explen = 2; // 2-byte sequence
        } else if (byte >= 0xE0 && byte <= 0xEF) {
            explen = 3; // 3-byte sequence
        } else if (byte >= 0xF0 && byte <= 0xF4) {
            explen = 4; // 4-byte sequence
        } else {
            return str.substr(0, pos); // invalid UTF-8 lead byte
        }

        // Keep the final sequence only when all of its bytes are present.
        const bool complete = pos + explen <= len;
        return str.substr(0, complete ? pos + explen : pos);
    }

    // Empty input, or nothing but continuation bytes.
    return "";
}

common/json-partial.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,5 @@ bool common_json_parse(
3636
const std::string::const_iterator & end,
3737
const std::string & healing_marker,
3838
common_json & out);
39+
40+
// Returns a copy of `str` cut at the end so that it does not finish in the
// middle of a multi-byte UTF-8 sequence. Only the final sequence of the string
// is inspected; earlier bytes are returned unchanged and are not validated.
// An empty string is returned when `str` is empty or holds only continuation bytes.
std::string truncate_incomplete_utf8(const std::string & str);

0 commit comments

Comments
 (0)