Skip to content

Commit ab6288d

Browse files
committed
RegExp Unicode JIT treats escaped surrogate followed by literal surrogate as surrogate pair
https://bugs.webkit.org/show_bug.cgi?id=290567 rdar://148548273 Reviewed by Yusuke Suzuki. This is due to the multi-character optimization in JSC::Yarr::generatePatternCharacterOnce() where we match multiple characters by loading the number of characters we are matching together and then checking that what was loaded matches the concatenation of characters. This bypasses the normal reading of a valid surrogate pair. The fix is to process dangling surrogates as individual characters rather than grouping them together with surrounding characters. * JSTests/stress/regexp-unicode-mix-escaped-and-literal-surrogates.js: Added. (arrayToString): (objectToString): (dumpValue): (compareArray): (compareGroups): (testRegExp): (testRegExpSyntaxError): (printErrors): * Source/JavaScriptCore/yarr/YarrJIT.cpp: Canonical link: https://commits.webkit.org/294066@main
1 parent 5e26c1c commit ab6288d

File tree

2 files changed

+203
-3
lines changed

2 files changed

+203
-3
lines changed
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
// With verbose set to false, this test is successful if there is no output. Set verbose to true to see expected matches.
2+
let verbose = false;
3+
4+
let errors = 0;
5+
6+
// Renders the elements of arr as a comma separated list. String elements are
// wrapped in double quotes; every other element is appended using the normal
// string concatenation conversion. Elements are visited with forEach(), so
// holes in a sparse array are skipped.
function arrayToString(arr)
{
    const lastIndex = arr.length - 1;
    let rendered = '';

    arr.forEach((element, position) => {
        rendered += (typeof element === "string") ? "\"" + element + "\"" : element;

        // Separate entries with a comma, but leave nothing trailing after the final slot.
        if (position !== lastIndex)
            rendered += ',';
    });

    return rendered;
}
20+
21+
// Renders the own enumerable properties of obj as "{ key: value, ... }", with
// each value formatted by dumpValue(). An object with no such properties
// renders as "{  }".
//
// The entries are joined with ", " instead of tracking a "first entry" flag.
// The flag used previously was assigned without a declaration, which created
// an implicit global in sloppy mode and throws a ReferenceError in strict code.
function objectToString(obj)
{
    const renderedEntries = Object.entries(obj).map(([key, value]) => key + ": " + dumpValue(value));

    return "{ " + renderedEntries.join(", ") + " }";
}
38+
39+
// Produces a printable description of v for test diagnostics:
//   null       -> "<null>"
//   undefined  -> "<undefined>"
//   a string   -> the string wrapped in double quotes
//   otherwise  -> "[ ... ]" holding the elements (when v.length is truthy,
//                 via arrayToString()) followed by the contents of v.groups
//                 (when truthy, via objectToString()).
function dumpValue(v)
{
    if (v === null)
        return "<null>";

    if (v === undefined)
        return "<undefined>";

    if (typeof v == "string")
        return "\"" + v + "\"";

    let str = "";

    if (v.length)
        str += arrayToString(v);

    if (v.groups) {
        // Declared locally; assigning to an undeclared name here would leak a
        // global in sloppy mode and throw a ReferenceError in strict code.
        const groupStr = objectToString(v.groups);

        if (str.length) {
            if (groupStr.length)
                str += ", " + groupStr;
        } else
            str = groupStr;
    }

    return "[ " + str + " ]";
}
67+
68+
// Compares an expected match array against the actual one.
// Returns true when both are null, or when both have the same length and
// every element is strictly equal. On the first difference a "###" diagnostic
// is printed and false is returned.
function compareArray(expected, actual)
{
    if (expected === null || actual === null) {
        // Both null counts as equal; exactly one null is a mismatch.
        if (expected === actual)
            return true;

        print(expected === null ? "### expected is null, actual is not null" : "### expected is not null, actual is null");
        return false;
    }

    const expectedLength = expected.length;

    if (expectedLength !== actual.length) {
        print("### expected.length: " + expectedLength + ", actual.length: " + actual.length);
        return false;
    }

    for (let index = 0; index < expectedLength; index++) {
        if (expected[index] === actual[index])
            continue;

        print("### expected[" + index + "]: \"" + expected[index] + "\" !== actual[" + index + "]: \"" + actual[index] + "\"");
        return false;
    }

    return true;
}
97+
98+
// Compares the expected named groups against the actual ones.
// Returns true when both are null, or when every enumerable key of expected
// has a strictly equal value in actual. Keys that exist only in actual are
// not examined. On the first difference a "###" diagnostic is printed and
// false is returned.
function compareGroups(expected, actual)
{
    if (expected === null || actual === null) {
        // Both null counts as equal; exactly one null is a mismatch.
        if (expected === actual)
            return true;

        print(expected === null ? "### expected group is null, actual group is not null" : "### expected group is not null, actual group is null");
        return false;
    }

    for (const name in expected) {
        if (expected[name] === actual[name])
            continue;

        print("### expected." + name + ": " + dumpValue(expected[name]) + " !== actual." + name + ": " + dumpValue(actual[name]));
        return false;
    }

    return true;
}
122+
123+
let testNumber = 0;
124+
125+
// Runs re.exec(str) and checks the result against exp.
//
//   re      - the RegExp under test.
//   str     - the subject string.
//   exp     - the expected match array, or null when no match is expected.
//   groups  - optional expected named groups; when given they are attached
//             to exp as exp.groups before comparing.
//
// A failure is printed and counted in `errors`; a pass is printed only when
// `verbose` is set. Every call advances `testNumber`.
function testRegExp(re, str, exp, groups)
{
    testNumber++;

    // A null expectation means "no match expected", so there is no array to
    // hang the groups on. Guarding here avoids a TypeError on null.
    if (groups && exp)
        exp.groups = groups;

    const actual = re.exec(str);

    let result = compareArray(exp, actual);

    // Groups can only be read from an actual match. When exec() returned null
    // with a non-null expectation, compareArray() has already failed the test,
    // so skipping here avoids dereferencing null and aborting the whole run.
    if (exp && exp.groups && actual !== null) {
        // exec() leaves groups undefined when the pattern has no named groups;
        // normalize to null so compareGroups() reports a mismatch rather than throwing.
        const actualGroups = actual.groups === undefined ? null : actual.groups;

        if (!compareGroups(exp.groups, actualGroups))
            result = false;
    }

    if (result) {
        if (verbose)
            print(re.toString() + ".exec(" + dumpValue(str) + "), passed ", dumpValue(exp));
    } else {
        print(re.toString() + ".exec(" + dumpValue(str) + "), FAILED test #" + testNumber + ", Expected ", dumpValue(exp), " got ", dumpValue(actual));
        errors++;
    }
}
149+
150+
// Checks that constructing new RegExp(reString, flags) throws expError.
//
//   reString  - the pattern source.
//   flags     - the flags string.
//   expError  - the expected error, compared against the thrown value with
//               loose equality so that a thrown Error object is converted to
//               its string form when expError is a string.
//
// Both failure modes (nothing thrown, or the wrong error thrown) are printed
// and counted in `errors`. A pass is printed only when `verbose` is set.
// Every call advances `testNumber`.
function testRegExpSyntaxError(reString, flags, expError)
{
    testNumber++;

    try {
        // Only the construction side effect (a possible throw) matters here.
        new RegExp(reString, flags);
        print("FAILED test #" + testNumber + ", Expected /" + reString + "/" + flags + " to throw \"" + expError + "\", but it didn't");
        errors++;
    } catch (e) {
        if (e != expError) {
            print("FAILED test #" + testNumber + ", Expected /" + reString + "/" + flags + " to throw \"" + expError + "\" got \"" + e + "\"");
            // Count the mismatch so it shows up in the error total, not just in the output.
            errors++;
        } else if (verbose)
            print("/" + reString + "/" + flags + " passed, it threw \"" + expError + "\" as expected");
    }
}
166+
167+
// Throws a summary string when any test has failed; does nothing when the
// error count is zero.
function printErrors()
{
    if (!errors)
        return;

    throw "Test had " + errors + " errors";
}
172+
173+
// Test an escaped surrogate paired with a literal surrogate.
174+
testRegExp(new RegExp("\\ud800\udc00+", "u"), "\u{10000}\u{10000}", null);
175+
testRegExp(new RegExp("^\\ud800\udc00+", "u"), "\u{10000}\u{10000}", null);
176+
testRegExp(new RegExp("\\ud800\udc00+$", "u"), "\u{10000}\u{10000}", null);
177+
testRegExp(new RegExp("^\\ud800\udc00+$", "u"), "\u{10000}\u{10000}", null);
178+
testRegExp(new RegExp("\\uD83D\uDC38", "u"), "\u{1F438}", null);
179+
testRegExp(new RegExp("\ud800\\udc00", "u"), "\u{10000}\u{10000}", null);
180+
testRegExp(new RegExp("\uD83D\\uDC38", "u"), "\u{1F438}", null);
181+
testRegExp(new RegExp("abcdefg\\ud800\udc001234567", "u"), "abcdefg\u{10000}1234567", null);
182+
testRegExp(new RegExp("1234567\uD83D\\uDC38abcdefg", "u"), "1234567\u{1F438}abcdefg", null);
183+
184+
// Test well formed pairs of escaped surrogates
185+
testRegExp(new RegExp("\\ud800\\udc00+", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
186+
testRegExp(new RegExp("^\\ud800\\udc00+", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
187+
testRegExp(new RegExp("\\ud800\\udc00+$", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
188+
testRegExp(new RegExp("^\\ud800\\udc00+$", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
189+
testRegExp(new RegExp("\\uD83D\\uDC38", "u"), "\u{1F438}", ["\u{1F438}"]);
190+
testRegExp(new RegExp("abcdefg\\ud800\\udc001234567", "u"), "abcdefg\u{10000}1234567", ["abcdefg\u{10000}1234567"]);
191+
testRegExp(new RegExp("1234567\\uD83D\\uDC38abcdefg", "u"), "1234567\u{1F438}abcdefg", ["1234567\u{1F438}abcdefg"]);
192+
193+
// Test well formed pairs of literal surrogates
194+
testRegExp(new RegExp("\ud800\udc00+", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
195+
testRegExp(new RegExp("^\ud800\udc00+", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
196+
testRegExp(new RegExp("\ud800\udc00+$", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
197+
testRegExp(new RegExp("^\ud800\udc00+$", "u"), "\u{10000}\u{10000}", ["\u{10000}\u{10000}"]);
198+
testRegExp(new RegExp("\uD83D\uDC38", "u"), "\u{1F438}", ["\u{1F438}"]);
199+
testRegExp(new RegExp("abcdefg\ud800\udc001234567", "u"), "abcdefg\u{10000}1234567", ["abcdefg\u{10000}1234567"]);
200+
testRegExp(new RegExp("1234567\uD83D\uDC38abcdefg", "u"), "1234567\u{1F438}abcdefg", ["1234567\u{1F438}abcdefg"]);

Source/JavaScriptCore/yarr/YarrJIT.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2103,8 +2103,8 @@ class YarrGenerator final : public YarrJITInfo {
21032103
// upper & lower case representations are converted to a character class.
21042104
ASSERT(!op.m_term->ignoreCase() || isASCIIAlpha(firstChar) || isCanonicallyUnique(firstChar, m_canonicalMode));
21052105

2106-
if (have16BitCharacter && m_charSize == CharSize::Char16 && !U_IS_BMP(firstChar)) {
2107-
// The first term we are considering is a non-BMP char in a 16 bit pattern. Just try matching it and be done.
2106+
if (m_decodeSurrogatePairs && (!U_IS_BMP(firstChar) || U16_IS_SURROGATE(firstChar))) {
2107+
// The first term we are considering is a non-BMP or dangling surrogate char in unicode pattern. Just try matching it and be done.
21082108
uint64_t charToMatch = firstChar;
21092109

21102110
auto offset = op.m_checkedOffset - op.m_term->inputPosition;
@@ -2133,7 +2133,7 @@ class YarrGenerator final : public YarrJITInfo {
21332133
|| (currTerm->type != PatternTerm::Type::PatternCharacter
21342134
&& currTerm->type != PatternTerm::Type::CharacterClass)
21352135
|| (m_decodeSurrogatePairs
2136-
&& ((currTerm->type == PatternTerm::Type::PatternCharacter && !U_IS_BMP(currTerm->patternCharacter))
2136+
&& ((currTerm->type == PatternTerm::Type::PatternCharacter && (!U_IS_BMP(currTerm->patternCharacter) || U16_IS_SURROGATE(currTerm->patternCharacter)))
21372137
|| (currTerm->type == PatternTerm::Type::CharacterClass && (currTerm->characterClass->hasNonBMPCharacters()
21382138
|| currTerm->invert())))))
21392139
break;

0 commit comments

Comments
 (0)