Skip to content

Commit 8d4374c

Browse files
committed
Implement well-formed JSON.stringify
The spec change makes unpaired surrogate code units serialize as their \uXXXX escape sequences instead of ill-formed characters. The proposal is now at stage 4. Fixes #5735
1 parent 17cddd6 commit 8d4374c

File tree

7 files changed

+133
-42
lines changed

7 files changed

+133
-42
lines changed

lib/Common/Codex/Utf8Codex.cpp

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,6 @@ namespace utf8
6969
return ((0x5B >> (((prefix ^ 0xF0) >> 3) & 0x1E)) & 0x03) + 1;
7070
}
7171

72-
const char16 WCH_UTF16_HIGH_FIRST = char16(0xd800);
73-
const char16 WCH_UTF16_HIGH_LAST = char16(0xdbff);
74-
const char16 WCH_UTF16_LOW_FIRST = char16(0xdc00);
75-
const char16 WCH_UTF16_LOW_LAST = char16(0xdfff);
76-
7772
char16 GetUnknownCharacter(DecodeOptions options = doDefault)
7873
{
7974
if ((options & doThrowOnInvalidWCHARs) != 0)
@@ -83,26 +78,11 @@ namespace utf8
8378
return char16(UNICODE_UNKNOWN_CHAR_MARK);
8479
}
8580

86-
inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax)
87-
{
88-
return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
89-
}
90-
9181
// Returns nonzero when ch is below 0xFDD0, in 0xFDF0-0xFFEF, or in 0xFFF9-0xFFFD.
// The rejected values are therefore 0xFDD0-0xFDEF, 0xFFF0-0xFFF8 and 0xFFFE-0xFFFF.
// Note that surrogate code units (0xD800-0xDFFF) are below 0xFDD0 and so are accepted here.
// NOTE(review): 0xFDD0-0xFDEF matches the Unicode noncharacter block; confirm the
// intent behind the other two rejected ranges.
BOOL IsValidWideChar(char16 ch)
9282
{
9383
return (ch < 0xfdd0) || ((ch > 0xfdef) && (ch <= 0xffef)) || ((ch >= 0xfff9) && (ch <= 0xfffd));
9484
}
9585

96-
inline BOOL IsHighSurrogateChar(char16 ch)
97-
{
98-
return InRange( ch, WCH_UTF16_HIGH_FIRST, WCH_UTF16_HIGH_LAST );
99-
}
100-
101-
inline BOOL IsLowSurrogateChar(char16 ch)
102-
{
103-
return InRange( ch, WCH_UTF16_LOW_FIRST, WCH_UTF16_LOW_LAST );
104-
}
105-
10686
_At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
10787
inline char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence)
10888
{

lib/Common/Codex/Utf8Codex.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,26 @@ namespace utf8
157157

158158
BOOL IsValidWideChar(char16 ch);
159159

160+
// Inclusive bounds of the UTF-16 surrogate code-unit ranges:
// high (leading) surrogates span 0xD800-0xDBFF, low (trailing) surrogates span 0xDC00-0xDFFF.
const char16 WCH_UTF16_HIGH_FIRST = char16(0xd800);
161+
const char16 WCH_UTF16_HIGH_LAST = char16(0xdbff);
162+
const char16 WCH_UTF16_LOW_FIRST = char16(0xdc00);
163+
const char16 WCH_UTF16_LOW_LAST = char16(0xdfff);
164+
165+
// Returns nonzero when ch lies in the inclusive range [chMin, chMax].
// Both bounds are checked with one unsigned comparison: when ch is below chMin the
// difference is negative, and converting it to unsigned yields a large value that
// fails the <= test.
// Assumes chMin <= chMax (true for every call visible in this header).
inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax)
166+
{
167+
return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
168+
}
169+
170+
// Returns nonzero when ch is a high (leading) surrogate code unit, i.e. it lies in
// [WCH_UTF16_HIGH_FIRST, WCH_UTF16_HIGH_LAST] = 0xD800-0xDBFF.
inline BOOL IsHighSurrogateChar(char16 ch)
171+
{
172+
return InRange(ch, WCH_UTF16_HIGH_FIRST, WCH_UTF16_HIGH_LAST);
173+
}
174+
175+
// Returns nonzero when ch is a low (trailing) surrogate code unit, i.e. it lies in
// [WCH_UTF16_LOW_FIRST, WCH_UTF16_LOW_LAST] = 0xDC00-0xDFFF.
inline BOOL IsLowSurrogateChar(char16 ch)
176+
{
177+
return InRange(ch, WCH_UTF16_LOW_FIRST, WCH_UTF16_LOW_LAST);
178+
}
179+
160180
// Decode the trail bytes after the UTF-8 lead byte c1, returning 0xFFFD if trail bytes are expected past end.
161181
_At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
162182
char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence = nullptr);

lib/Runtime/Library/JSONStringBuilder.cpp

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,34 @@ JSONStringBuilder::AppendString(_In_ JavascriptString* str)
3030
AppendBuffer(str->GetString(), str->GetLength());
3131
}
3232

33+
// Appends the JSON escape sequence for a single code unit: a backslash, 'u', and the
// value of `character` in hexadecimal, left-padded with '0' to four digits.
// The digits come from _ltow_s with radix 16; the tests accompanying this change
// expect lowercase output (e.g. \ud834).
void
34+
JSONStringBuilder::AppendEscapeSequence(_In_ const char16 character)
35+
{
36+
// Emit the "\u" prefix, then the 4 digit hex code (e.g. \u0010)
37+
this->AppendCharacter(_u('\\'));
38+
this->AppendCharacter(_u('u'));
39+
{
40+
// Room for up to four hex digits plus the terminating null
41+
// (presumes char16 is a 16-bit code unit, so the value never exceeds 0xFFFF).
char16 buf[5];
41+
// Get hex value (no leading zeros)
42+
_ltow_s(character, buf, _countof(buf), 16);
43+
44+
// Append leading zeros if necessary before the hex value
45+
charcount_t count = static_cast<charcount_t>(wcslen(buf));
46+
switch (count)
47+
{
48+
// The cases below fall through on purpose: a 1-digit value passes through all
// three pads, a 2-digit value through two, a 3-digit value through one.
case 1:
49+
this->AppendCharacter(_u('0'));
50+
case 2:
51+
this->AppendCharacter(_u('0'));
52+
case 3:
53+
this->AppendCharacter(_u('0'));
54+
default:
55+
// Finally append the significant hex digits themselves.
this->AppendBuffer(buf, count);
56+
break;
57+
}
58+
}
59+
}
60+
3361
void
3462
JSONStringBuilder::EscapeAndAppendString(_In_ JavascriptString* str)
3563
{
@@ -70,30 +98,25 @@ JSONStringBuilder::EscapeAndAppendString(_In_ JavascriptString* str)
7098
this->AppendCharacter(_u('t'));
7199
break;
72100
default:
73-
if (currentCharacter < _u(' '))
101+
if (currentCharacter < _u(' ') || utf8::IsLowSurrogateChar(currentCharacter))
102+
{
103+
this->AppendEscapeSequence(currentCharacter);
104+
}
105+
else if (utf8::IsHighSurrogateChar(currentCharacter))
74106
{
75-
// If character is less than SPACE, it is converted into a 4 digit hex code (e.g. \u0010)
76-
this->AppendCharacter(_u('\\'));
77-
this->AppendCharacter(_u('u'));
107+
if (index + 1 < bufferStart + strLength && utf8::IsLowSurrogateChar(*(index + 1)))
108+
{
109+
// Append surrogate pair normally
110+
this->AppendCharacter(currentCharacter);
111+
this->AppendCharacter(*(index + 1));
112+
113+
// Skip the trailing-surrogate code unit
114+
index++;
115+
}
116+
else
78117
{
79-
char16 buf[5];
80-
// Get hex value
81-
_ltow_s(currentCharacter, buf, _countof(buf), 16);
82-
83-
// Append leading zeros if necessary before the hex value
84-
charcount_t count = static_cast<charcount_t>(wcslen(buf));
85-
switch (count)
86-
{
87-
case 1:
88-
this->AppendCharacter(_u('0'));
89-
case 2:
90-
this->AppendCharacter(_u('0'));
91-
case 3:
92-
this->AppendCharacter(_u('0'));
93-
default:
94-
this->AppendBuffer(buf, count);
95-
break;
96-
}
118+
// High-surrogate code unit not followed by a trailing-surrogate code unit should be escaped.
119+
this->AppendEscapeSequence(currentCharacter);
97120
}
98121
}
99122
else

lib/Runtime/Library/JSONStringBuilder.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class JSONStringBuilder
2323
void AppendCharacter(char16 character);
2424
void AppendBuffer(_In_ const char16* buffer, charcount_t length);
2525
void AppendString(_In_ JavascriptString* str);
26+
void AppendEscapeSequence(_In_ const char16 character);
2627
void EscapeAndAppendString(_In_ JavascriptString* str);
2728
void AppendObjectString(_In_ JSONObject* valueList);
2829
void AppendArrayString(_In_ JSONArray* valueArray);

lib/Runtime/Library/JSONStringifier.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,25 @@ JSONStringifier::CalculateStringElementLength(_In_ JavascriptString* str)
690690
{
691691
escapedStrLength += LazyJSONString::escapeMapCount[currentCharacter];
692692
}
693+
else if (utf8::IsLowSurrogateChar(currentCharacter))
694+
{
695+
// Lone trailing-surrogate code units should be escaped.
696+
// They will always need 5 extra characters for the escape sequence, e.g. \udfff
697+
escapedStrLength += 5;
698+
}
699+
else if (utf8::IsHighSurrogateChar(currentCharacter))
700+
{
701+
if (index + 1 < bufferStart + strLength && utf8::IsLowSurrogateChar(*(index + 1)))
702+
{
703+
// Regular surrogate pairs are handled normally - skip the trailing-surrogate code unit.
704+
index++;
705+
}
706+
else
707+
{
708+
// High-surrogate code unit not followed by a trailing-surrogate code unit should be escaped.
709+
escapedStrLength += 5;
710+
}
711+
}
693712
}
694713
if (escapedStrLength > UINT32_MAX)
695714
{

test/es7/rlexe.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,4 +125,10 @@
125125
<compile-flags>-ES2018AsyncIteration -args summary -endargs</compile-flags>
126126
</default>
127127
</test>
128+
<test>
129+
<default>
130+
<files>wellformedJSON.js</files>
131+
<compile-flags>-args summary -endargs</compile-flags>
132+
</default>
133+
</test>
128134
</regress-exe>

test/es7/wellformedJSON.js

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
//-------------------------------------------------------------------------------------------------------
2+
// Copyright (C) Microsoft. All rights reserved.
3+
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
4+
//-------------------------------------------------------------------------------------------------------
5+
6+
WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
7+
8+
// Test cases for well-formed JSON.stringify: unpaired surrogate code units must be
// emitted as lowercase \uXXXX escapes, while well-formed surrogate pairs are emitted verbatim.
var tests = [
9+
{
10+
name: "Broken surrogate pairs should be escaped during JSON.stringify",
11+
body: function () {
12+
// A lone high (leading) surrogate is escaped.
assert.areEqual(JSON.stringify("\uD834"), '"\\ud834"',
13+
'JSON.stringify("\\uD834")');
14+
// A lone low (trailing) surrogate is escaped.
assert.areEqual(JSON.stringify("\uDF06"), '"\\udf06"',
15+
'JSON.stringify("\\uDF06")');
16+
17+
// A well-formed high+low pair is passed through unescaped.
assert.areEqual(JSON.stringify("\uD834\uDF06"), '"𝌆"',
18+
'JSON.stringify("\\uD834\\uDF06")');
19+
// A well-formed pair surrounded by lone surrogates: only the lone units are escaped.
assert.areEqual(JSON.stringify("\uD834\uD834\uDF06\uD834"), '"\\ud834𝌆\\ud834"',
20+
'JSON.stringify("\\uD834\\uD834\\uDF06\\uD834")');
21+
assert.areEqual(JSON.stringify("\uD834\uD834\uDF06\uDF06"), '"\\ud834𝌆\\udf06"',
22+
'JSON.stringify("\\uD834\\uD834\\uDF06\\uDF06")');
23+
assert.areEqual(JSON.stringify("\uDF06\uD834\uDF06\uD834"), '"\\udf06𝌆\\ud834"',
24+
'JSON.stringify("\\uDF06\\uD834\\uDF06\\uD834")');
25+
assert.areEqual(JSON.stringify("\uDF06\uD834\uDF06\uDF06"), '"\\udf06𝌆\\udf06"',
26+
'JSON.stringify("\\uDF06\\uD834\\uDF06\\uDF06")');
27+
28+
// Low followed by high is not a pair (wrong order): both units are escaped.
assert.areEqual(JSON.stringify("\uDF06\uD834"), '"\\udf06\\ud834"',
29+
'JSON.stringify("\\uDF06\\uD834")');
30+
// Remaining orderings of pairs and lone surrogates at the start or end of the string.
assert.areEqual(JSON.stringify("\uD834\uDF06\uD834\uD834"), '"𝌆\\ud834\\ud834"',
31+
'JSON.stringify("\\uD834\\uDF06\\uD834\\uD834")');
32+
assert.areEqual(JSON.stringify("\uD834\uDF06\uD834\uDF06"), '"𝌆𝌆"',
33+
'JSON.stringify("\\uD834\\uDF06\\uD834\\uDF06")');
34+
assert.areEqual(JSON.stringify("\uDF06\uDF06\uD834\uD834"), '"\\udf06\\udf06\\ud834\\ud834"',
35+
'JSON.stringify("\\uDF06\\uDF06\\uD834\\uD834")');
36+
assert.areEqual(JSON.stringify("\uDF06\uDF06\uD834\uDF06"), '"\\udf06\\udf06𝌆"',
37+
'JSON.stringify("\\uDF06\\uDF06\\uD834\\uDF06")');
38+
}
39+
},
40+
];
41+
42+
testRunner.runTests(tests, { verbose: WScript.Arguments[0] != "summary" });

0 commit comments

Comments
 (0)