55#include < nlohmann/json.hpp>
66
77#include < string>
8+ #include < regex>
89
910using json = nlohmann::ordered_json;
1011
@@ -168,6 +169,47 @@ bool common_json_parse(
168169 }
169170 }
170171
172+ // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
173+ static const std::regex partial_unicode_regex (R"( \\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)" );
174+
175+ auto is_high_surrogate = [&](const std::string & s) {
176+ // True when s (at least 4 chars) starts with a \u escape whose first two hex digits are D8, D9, DA or DB (case-insensitive), i.e. a possibly partial high surrogate (U+D800-U+DBFF)
177+ return s.length () >= 4 &&
178+ s[0 ] == ' \\ ' && s[1 ] == ' u' &&
179+ std::tolower (s[2 ]) == ' d' &&
180+ (s[3 ] == ' 8' || s[3 ] == ' 9' || std::tolower (s[3 ]) == ' a' || std::tolower (s[3 ]) == ' b' );
181+ };
182+
183+ // Initialize the unicode marker to a low surrogate to handle the edge case
184+ // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
185+ // backslash (\)
186+ std::string unicode_marker_padding = " udc00" ;
187+ std::smatch last_unicode_seq;
188+
189+ if (std::regex_search (str, last_unicode_seq, partial_unicode_regex)) {
190+ std::smatch second_last_seq;
191+ std::string prelude = str.substr (0 , last_unicode_seq.position ());
192+
193+ // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
194+ unicode_marker_padding = std::string (6 - last_unicode_seq.length (), ' 0' );
195+
196+ if (is_high_surrogate (last_unicode_seq.str ())) {
197+ // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+DFFF)
198+ unicode_marker_padding += " \\ udc00" ;
199+ } else if (std::regex_search (prelude, second_last_seq, partial_unicode_regex)) {
200+ if (is_high_surrogate (second_last_seq.str ())) {
201+ // If this follows a high surrogate, pad it to be a low surrogate
202+ if (last_unicode_seq.length () == 2 ) {
203+ unicode_marker_padding = " dc00" ;
204+ } else if (last_unicode_seq.length () == 3 ) {
205+ unicode_marker_padding = " c00" ;
206+ } else {
207+ // The original unicode_marker_padding is already padded with 0s
208+ }
209+ }
210+ }
211+ }
212+
171213 const auto & magic_seed = out.healing_marker .marker = healing_marker;// "$llama.cpp.json$";
172214
173215 if (err_loc.stack .back ().type == COMMON_JSON_STACK_ELEMENT_KEY) {
@@ -186,6 +228,9 @@ bool common_json_parse(
186228 } else if (str[str.length () - 1 ] == ' \\ ' && can_parse (str + " \\\" " + closing)) {
187229 // Was inside an object value string after an escape
188230 str += (out.healing_marker .json_dump_marker = " \\ " + magic_seed) + " \" " + closing;
231+ } else if (can_parse (str + unicode_marker_padding + " \" " + closing)) {
232+ // Was inside an object value string after a partial unicode escape
233+ str += (out.healing_marker .json_dump_marker = unicode_marker_padding + magic_seed) + " \" " + closing;
189234 } else {
190235 // find last :
191236 auto last_pos = str.find_last_of (' :' );
@@ -205,6 +250,9 @@ bool common_json_parse(
205250 } else if (str[str.length () - 1 ] == ' \\ ' && can_parse (str + " \\\" " + closing)) {
206251 // Was inside an array value string after an escape
207252 str += (out.healing_marker .json_dump_marker = " \\ " + magic_seed) + " \" " + closing;
253+ } else if (can_parse (str + unicode_marker_padding + " \" " + closing)) {
254+ // Was inside an array value string after a partial unicode escape
255+ str += (out.healing_marker .json_dump_marker = unicode_marker_padding + magic_seed) + " \" " + closing;
208256 } else if (!was_maybe_number () && can_parse (str + " , 1" + closing)) {
209257 // Had just finished a value
210258 str += (out.healing_marker .json_dump_marker = " ,\" " + magic_seed) + " \" " + closing;
@@ -230,6 +278,9 @@ bool common_json_parse(
230278 } else if (str[str.length () - 1 ] == ' \\ ' && can_parse (str + " \\\" : 1" + closing)) {
231279 // Was inside an object key string after an escape
232280 str += (out.healing_marker .json_dump_marker = " \\ " + magic_seed) + " \" : 1" + closing;
281+ } else if (can_parse (str + unicode_marker_padding + " \" : 1" + closing)) {
282+ // Was inside an object key string after a partial unicode escape
283+ str += (out.healing_marker .json_dump_marker = unicode_marker_padding + magic_seed) + " \" : 1" + closing;
233284 } else {
234285 auto last_pos = str.find_last_of (' :' );
235286 if (last_pos == std::string::npos) {
0 commit comments