Skip to content

Commit 5a885dd

Browse files
Tymolc authored and BillyONeal committed
Handle multi-byte unicode characters in json parsing (#1023)
* handle multi-byte unicode characters in json parsing
* Properly check high surrogate start and end

Co-Authored-By: Tymolc <[email protected]>
1 parent 8f0393d commit 5a885dd

File tree

2 files changed

+76
-21
lines changed

2 files changed

+76
-21
lines changed

Release/src/json/json_parsing.cpp

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ class JSON_Parser
139139

140140
virtual bool CompleteComment(Token& token);
141141
virtual bool CompleteStringLiteral(Token& token);
142+
int convert_unicode_to_code_point();
142143
bool handle_unescape_char(Token& token);
143144

144145
private:
@@ -652,7 +653,15 @@ bool JSON_StringParser<CharType>::CompleteComment(typename JSON_Parser<CharType>
652653
return true;
653654
}
654655

655-
void convert_append_unicode_code_unit(JSON_Parser<wchar_t>::Token& token, utf16char value)
656+
// Appends an already-assembled UTF-16 sequence to the token's string value
// as-is; no transcoding is performed for a utf16char-based token.
void convert_append_unicode_code_unit(JSON_Parser<utf16char>::Token& token, utf16string value)
{
    token.string_val += value;
}
660+
// Appends a UTF-16 sequence to a char-based token, converting it with
// utf16_to_utf8 before it is added to the token's string value.
void convert_append_unicode_code_unit(JSON_Parser<char>::Token& token, utf16string value)
{
    const auto utf8 = ::utility::conversions::utf16_to_utf8(value);
    token.string_val += utf8;
}
664+
// Appends a single UTF-16 code unit to a utf16char-based token unchanged.
void convert_append_unicode_code_unit(JSON_Parser<utf16char>::Token& token, utf16char value)
{
    token.string_val.append(1, value);
}
@@ -662,6 +671,37 @@ void convert_append_unicode_code_unit(JSON_Parser<char>::Token& token, utf16char
662671
token.string_val.append(::utility::conversions::utf16_to_utf8(utf16));
663672
}
664673

674+
template<typename CharType>
675+
int JSON_Parser<CharType>::convert_unicode_to_code_point()
676+
{
677+
// A four-hexdigit Unicode character.
678+
// Transform into a 16 bit code point.
679+
int decoded = 0;
680+
for (int i = 0; i < 4; ++i)
681+
{
682+
auto ch = NextCharacter();
683+
int ch_int = static_cast<int>(ch);
684+
if (ch_int < 0 || ch_int > 127) return -1;
685+
#ifdef _WIN32
686+
const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
687+
#else
688+
const int isxdigitResult = isxdigit(ch_int);
689+
#endif
690+
if (!isxdigitResult) return -1;
691+
692+
int val = _hexval[static_cast<size_t>(ch_int)];
693+
694+
_ASSERTE(val != -1);
695+
696+
// Add the input char to the decoded number
697+
decoded |= (val << (4 * (3 - i)));
698+
}
699+
return decoded;
700+
}
701+
702+
// First code unit value of the UTF-16 high (leading) surrogate range.
#define H_SURROGATE_START 0xD800
703+
// Last code unit value of the UTF-16 high (leading) surrogate range.
#define H_SURROGATE_END 0xDBFF
704+
665705
template<typename CharType>
666706
inline bool JSON_Parser<CharType>::handle_unescape_char(Token& token)
667707
{
@@ -682,26 +722,31 @@ inline bool JSON_Parser<CharType>::handle_unescape_char(Token& token)
682722
case 't': token.string_val.push_back('\t'); return true;
683723
case 'u':
684724
{
685-
// A four-hexdigit Unicode character.
686-
// Transform into a 16 bit code point.
687-
int decoded = 0;
688-
for (int i = 0; i < 4; ++i)
725+
int decoded = convert_unicode_to_code_point();
726+
if (decoded == -1)
689727
{
690-
ch = NextCharacter();
691-
int ch_int = static_cast<int>(ch);
692-
if (ch_int < 0 || ch_int > 127) return false;
693-
#ifdef _WIN32
694-
const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
695-
#else
696-
const int isxdigitResult = isxdigit(ch_int);
697-
#endif
698-
if (!isxdigitResult) return false;
728+
return false;
729+
}
730+
731+
// handle multi-block characters that start with a high-surrogate
732+
if (decoded >= H_SURROGATE_START && decoded <= H_SURROGATE_END)
733+
{
734+
// skip escape character '\u'
735+
if (NextCharacter() != '\\' || NextCharacter() != 'u')
736+
{
737+
return false;
738+
}
739+
int decoded2 = convert_unicode_to_code_point();
740+
741+
if (decoded2 == -1)
742+
{
743+
return false;
744+
}
699745

700-
int val = _hexval[static_cast<size_t>(ch_int)];
701-
_ASSERTE(val != -1);
746+
utf16string compoundUTF16 = {static_cast<utf16char>(decoded), static_cast<utf16char>(decoded2)};
747+
convert_append_unicode_code_unit(token, compoundUTF16);
702748

703-
// Add the input char to the decoded number
704-
decoded |= (val << (4 * (3 - i)));
749+
return true;
705750
}
706751

707752
// Construct the character based on the decoded number
@@ -1015,9 +1060,13 @@ std::unique_ptr<web::json::details::_Value> JSON_Parser<CharType>::_ParseValue(
10151060
{
10161061
switch (tkn.kind)
10171062
{
1018-
case JSON_Parser<CharType>::Token::TKN_OpenBrace: { return _ParseObject(tkn);
1063+
case JSON_Parser<CharType>::Token::TKN_OpenBrace:
1064+
{
1065+
return _ParseObject(tkn);
10191066
}
1020-
case JSON_Parser<CharType>::Token::TKN_OpenBracket: { return _ParseArray(tkn);
1067+
case JSON_Parser<CharType>::Token::TKN_OpenBracket:
1068+
{
1069+
return _ParseArray(tkn);
10211070
}
10221071
case JSON_Parser<CharType>::Token::TKN_StringLiteral:
10231072
{

Release/tests/functional/json/parsing_tests.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ SUITE(parsing_tests)
159159
input.append(2, ch);
160160
json::value val = json::value::parse(input);
161161
VERIFY_IS_TRUE(val.is_object());
162-
VERIFY_ARE_EQUAL(U("2"), val[U("1"]).serialize());
162+
VERIFY_ARE_EQUAL(U("2"), val[U("1")].serialize());
163163
}
164164
}
165165

@@ -213,6 +213,12 @@ SUITE(parsing_tests)
213213
const auto euro = to_string_t("\xE2\x82\xAC");
214214
VERIFY_ARE_EQUAL(euro, str.as_string());
215215

216+
// UTF-16 character with surrogate pair
217+
str = json::value::parse(U("\"\\ud83d\\ude00\""));
218+
// Grinning Face emoji as a hexadecimal UTF-8
219+
const auto emoji = to_string_t("\xF0\x9F\x98\x80");
220+
VERIFY_ARE_EQUAL(emoji, str.as_string());
221+
216222
VERIFY_PARSING_THROW(json::value::parse(U("\"\\u0klB\"")));
217223
}
218224

0 commit comments

Comments
 (0)