Skip to content

Commit 4374e44

Browse files
author
Kenji Fukuda
committed
[MERGE #5450 @kfukuda2] Change parsing of \c to reflect spec Fixes #2973
Merge pull request #5450 from kfukuda2:ControlCharFix Changing the treatment of the escaped c, `\cX`, according to spec. If the Unicode flag is not present and the sequence is inside a character class:    If `X` is a non-word character ([^A-Za-z0-9_]), treat `\` as a standalone literal and resume parsing from `c`.    If `X` is a word character, take the lowest 5 bits of the Unicode code point of `X` as the character to be matched. If the Unicode flag is present and the sequence is inside a character class:    If `X` is a non-letter ([^A-Za-z]), throw a SyntaxError.    If `X` is a letter, take the lowest 5 bits of the Unicode code point of `X` as the character to be matched. If the Unicode flag is not present and the sequence is not inside a character class:    If `X` is a non-letter ([^A-Za-z]), treat `\` as a standalone literal and resume parsing from `c`.    If `X` is a letter, take the lowest 5 bits of the Unicode code point of `X` as the character to be matched. If the Unicode flag is present and the sequence is not inside a character class:    If `X` is a non-letter ([^A-Za-z]), throw a SyntaxError.    If `X` is a letter, take the lowest 5 bits of the Unicode code point of `X` as the character to be matched.
2 parents 88b1be4 + 4c26c3c commit 4374e44

File tree

3 files changed

+218
-23
lines changed

3 files changed

+218
-23
lines changed

lib/Parser/RegexParser.cpp

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1529,7 +1529,13 @@ namespace UnifiedRegex
15291529
else if (ECLookahead() == 'c')
15301530
{
15311531
if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
1532+
{
15321533
ECConsume(2);
1534+
}
1535+
else
1536+
{
1537+
DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
1538+
}
15331539
return false;
15341540
}
15351541
else
@@ -2161,7 +2167,7 @@ namespace UnifiedRegex
21612167
{
21622168
if (unicodeFlagPresent)
21632169
{
2164-
//We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
2170+
//The range contains a character class and the unicode flag is present, so we end up having to throw a "Syntax" error here
21652171
//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
21662172
Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
21672173
}
@@ -2206,7 +2212,7 @@ namespace UnifiedRegex
22062212
{
22072213
if (prevprevWasACharSetAndPartOfRange)
22082214
{
2209-
//We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
2215+
//The range contains a character class and the unicode flag is present, so we end up having to throw a "Syntax" error here
22102216
//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
22112217
if (unicodeFlagPresent)
22122218
{
@@ -2494,6 +2500,8 @@ namespace UnifiedRegex
24942500
}
24952501
else
24962502
{
2503+
DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar spec #prod-annexB-Term
2504+
24972505
if (!IsEOF())
24982506
{
24992507
EncodedChar ecLookahead = ECLookahead();
@@ -2625,33 +2633,19 @@ namespace UnifiedRegex
26252633
standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
26262634
return deferredSetNode;
26272635
case 'c':
2628-
if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
2636+
if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word character
26292637
{
26302638
c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
26312639
ECConsume();
26322640
// fall-through for identity escape
26332641
}
26342642
else
26352643
{
2636-
// SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].
2637-
// However, '-', ']', and EOF make the \c just a 'c'.
2638-
if (!IsEOF())
2639-
{
2640-
EncodedChar ec = ECLookahead();
2641-
switch (ec)
2642-
{
2643-
case '-':
2644-
case ']':
2645-
// fall-through for identity escape with 'c'
2646-
break;
2647-
default:
2648-
c = UTC(Chars<EncodedChar>::CTU(ec) % 32);
2649-
ECConsume();
2650-
// fall-through for identity escape
2651-
break;
2652-
}
2653-
}
2654-
// else: fall-through for identity escape with 'c'
2644+
// If the lookahead is neither alphanumeric nor an underscore ('_'), treat '\' and 'c' as two separate characters.
2645+
//#sec-regular-expression-patterns-semantics
2646+
ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.
2647+
deferredCharNode->cs[0] = '\\';
2648+
return deferredCharNode;
26552649
}
26562650
break;
26572651
case 'x':
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
//-------------------------------------------------------------------------------------------------------
2+
// Copyright (C) Microsoft. All rights reserved.
3+
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
4+
//-------------------------------------------------------------------------------------------------------
5+
6+
WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
7+
8+
function matchRegExp(str, regexpLiteral, expectedResult)
9+
{
10+
matchResultLiteral = str.match(regexpLiteral);
11+
errorMsgBase = "Expected result of match between string: '" + str + "' and regular expression: " + regexpLiteral + " to be " +
12+
expectedResult + " but was "
13+
14+
actualResultLiteral = matchResultLiteral == null ? null : matchResultLiteral[0];
15+
assert.areEqual(expectedResult, actualResultLiteral, errorMsgBase + actualResultLiteral);
16+
17+
regexpConstructor = new RegExp(regexpLiteral);
18+
matchResultConstructor = str.match(regexpConstructor);
19+
20+
actualResultConstructor = matchResultConstructor == null ? null : matchResultConstructor[0];
21+
assert.areEqual(expectedResult, actualResultConstructor, errorMsgBase + actualResultConstructor);
22+
}
23+
24+
var tests = [
25+
{
26+
name : "Control characters followed by a word character ([A-Za-z0-9_])",
27+
body : function ()
28+
{
29+
re = /[\c6]+/; //'6' = ascii x36, parsed as [\x16]+
30+
matchRegExp("6", re, null);
31+
matchRegExp("\\", re, null);
32+
matchRegExp("\\c6", re, null);
33+
matchRegExp("c", re, null);
34+
matchRegExp("\x16", re, "\x16");
35+
36+
re = /\c6/; //'6' = ascii x36, parsed as "\\c6"
37+
matchRegExp("\\c6", re, "\\c6");
38+
matchRegExp("\\", re, null);
39+
matchRegExp("6", re, null);
40+
matchRegExp("c", re, null);
41+
matchRegExp("\x16", re, null);
42+
43+
re = /\c6[\c6]+/; //'6' = ascii x36, parsed as "\\c6"[\x16]+
44+
matchRegExp("\\c6\x16", re, "\\c6\x16");
45+
matchRegExp("\\", re, null);
46+
matchRegExp("c", re, null);
47+
matchRegExp("\x16", re, null);
48+
49+
re = /[\ca]+/; //'a' = ascii x61, parsed as [\x01]+
50+
matchRegExp("a", re, null);
51+
matchRegExp("\\", re, null);
52+
matchRegExp("c", re, null);
53+
matchRegExp("00xyzabc123\x01qrst", re, "\x01");
54+
55+
re = /[\c_]+/; //'_' = ascii 0x5F, parsed as [\x1F]+
56+
matchRegExp("\x1F\x1F\x05", re, "\x1F\x1F");
57+
matchRegExp("\\\\\\", re, null);
58+
matchRegExp("////", re, null);
59+
matchRegExp("ccc_", re, null);
60+
61+
re = /[\cG]*/; //'G' = ascii x47, parsed as [\x07]*
62+
matchRegExp("\x07\x06\x05", re, "\x07");
63+
matchRegExp("\\\\", re, "");
64+
matchRegExp("////", re, "");
65+
matchRegExp("cccG", re, "");
66+
67+
re = /[\cG\c6\cf]+/; //'G' = ascii x47, '6' = ascii x36, 'f' = ascii x66, parsed as [\x07\x16\x06]+
68+
matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06\x16\x07");
69+
matchRegExp("\\\\", re, null);
70+
matchRegExp("////", re, null);
71+
matchRegExp("cfG6", re, null);
72+
73+
re = /\cG\cf/; //'G' = ascii x47, 'f' = ascii x66, parsed as "\x07\x06"
74+
matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06");
75+
matchRegExp("\\", re, null);
76+
matchRegExp("/", re, null);
77+
matchRegExp("\\cG\\c6\\cf", re, null);
78+
79+
re = /[\cz\cZ]+/; //'z' = ascii x7A, 'Z' = ascii x5A, have the same lowest 5 bits, parsed as [\x1A]+
80+
matchRegExp("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
81+
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", re, "\x1a");
82+
matchRegExp("\\\\", re, null);
83+
matchRegExp("////", re, null);
84+
matchRegExp("ccczZ", re, null);
85+
}
86+
},
87+
{
88+
name : "Control characters followed by a non-word character ([^A-Za-z0-9_])",
89+
body : function ()
90+
{
91+
re = /[\c*]+/; //'*' = ascii 42, parsed as [\\c*]+
92+
matchRegExp("\x0a\x09\x08", re, null);
93+
matchRegExp("a*c*b*d*", re, "*c*");
94+
matchRegExp("\\\\", re, "\\\\");
95+
matchRegExp("////", re, null);
96+
matchRegExp("ccc", re, "ccc");
97+
98+
re = /[\c}]*/; //'}' = ascii 125, parsed as [\\c}]*
99+
matchRegExp("\x1d\x7d\x3d", re, "");
100+
matchRegExp("}c}}cd*c*b*d*", re, "}c}}c");
101+
matchRegExp("\\\\", re, "\\\\");
102+
matchRegExp("////", re, "");
103+
matchRegExp("ccc", re, "ccc");
104+
105+
re = /[\c;]+/; //';' = ascii 59, parsed as [\\c;]+
106+
matchRegExp("\x1b\x1c", re, null);
107+
matchRegExp("d;c;d;*", re, ";c;");
108+
matchRegExp("\\\\", re, "\\\\");
109+
matchRegExp("////", re, null);
110+
matchRegExp("ccc", re, "ccc");
111+
112+
re = /\c%/; //'%' = ascii x25, parsed as \\c%
113+
matchRegExp("\\", re, null);
114+
matchRegExp("\\", re, null);
115+
matchRegExp("\\c%", re, "\\c%");
116+
matchRegExp("\x05", re, null);
117+
}
118+
},
119+
{
120+
name : "Control Character tests with unicode flag present",
121+
body : function ()
122+
{
123+
re = /[\cAg]+/u; //'A' = ascii x41, parsed as [g\x01]+
124+
matchRegExp("abcdefghi", re, "g");
125+
matchRegExp("\\\\", re, null);
126+
matchRegExp("////", re, null);
127+
matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01\x01gg");
128+
129+
re = /[\czA]+/u; //'z' = ascii x7A, parsed as [\x1AA]+
130+
matchRegExp("abcdefghi", re, null);
131+
matchRegExp("\\\\", re, null);
132+
matchRegExp("////", re, null);
133+
matchRegExp("YZA\x1aABC", re, "A\x1aA");
134+
135+
assert.throws(() => eval("\"\".match(/[\\c]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by no character here.",
136+
"Invalid regular expression: invalid escape in unicode pattern");
137+
assert.throws(() => eval("\"\".match(/[\\c-d]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a dash, '-', here.",
138+
"Invalid regular expression: invalid escape in unicode pattern");
139+
assert.throws(() => eval("\"\".match(/[ab\\c_$]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by an underscore, '_', here.",
140+
"Invalid regular expression: invalid escape in unicode pattern");
141+
assert.throws(() => eval("\"\".match(/[ab\\c\\d]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a backslash, '\\', here.",
142+
"Invalid regular expression: invalid escape in unicode pattern");
143+
assert.throws(() => eval("\"\".match(/[ab\\c3]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a number, '3', here.",
144+
"Invalid regular expression: invalid escape in unicode pattern");
145+
146+
re = /\cAg/u; //'A' = ascii x41, parsed as "\x01g"
147+
matchRegExp("abcdefghi", re, null);
148+
matchRegExp("\\\\", re, null);
149+
matchRegExp("////", re, null);
150+
matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01g");
151+
152+
re = /\czA/u; //'z' = ascii x7A, parsed as "\x1aA"
153+
matchRegExp("abcdefghi", re, null);
154+
matchRegExp("\\\\", re, null);
155+
matchRegExp("////", re, null);
156+
matchRegExp("YZA\x1aABC", re, "\x1aA");
157+
158+
assert.throws(() => eval("\"\".match(/\\c/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by no character here.",
159+
"Invalid regular expression: invalid escape in unicode pattern");
160+
assert.throws(() => eval("\"\".match(/\\c-d/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a dash, '-', here.",
161+
"Invalid regular expression: invalid escape in unicode pattern");
162+
assert.throws(() => eval("\"\".match(/ab\\c_$/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by an underscore, '_', here.",
163+
"Invalid regular expression: invalid escape in unicode pattern");
164+
assert.throws(() => eval("\"\".match(/ab\\c\\d/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a backslash, '\\', here.",
165+
"Invalid regular expression: invalid escape in unicode pattern");
166+
assert.throws(() => eval("\"\".match(/ab\\c3/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a number, '3', here.",
167+
"Invalid regular expression: invalid escape in unicode pattern");
168+
}
169+
},
170+
{
171+
name : "Control character edge cases",
172+
body : function ()
173+
{
174+
re = /[\c-g]+/; //'-' = ascii x2D, parsed as [\\c-g]+
175+
matchRegExp("abcdefghi", re, "cdefg");
176+
matchRegExp("\\\\", re, "\\\\");
177+
matchRegExp("////", re, null);
178+
matchRegExp("\x0d", re, null);
179+
matchRegExp("aobd\\f\\d", re, "d\\f\\d");
180+
181+
re = /[\c-]+/; //'-' = ascii x2D, parsed as [\\c-]+
182+
matchRegExp("abcdefghi", re, "c");
183+
matchRegExp("\x0dc--c", re, "c--c");
184+
matchRegExp("\\\\", re, "\\\\");
185+
matchRegExp("////", re, null);
186+
matchRegExp("aobd\\f\\d", re, "\\");
187+
188+
assert.throws(() => eval("\"\".match(/[\\c-a]/)"), SyntaxError, "Expected an error due to 'c-a' being an invalid range.", "Invalid range in character set");
189+
}
190+
}
191+
];
192+
193+
testRunner.runTests(tests, {
194+
verbose : WScript.Arguments[0] != "summary"
195+
});

test/Regex/rlexe.xml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,16 @@
229229
<compile-flags>-args summary -endargs</compile-flags>
230230
</default>
231231
</test>
232-
<test>
232+
<test>
233233
<default>
234234
<files>characterclass_with_range.js</files>
235235
<compile-flags>-args summary -endargs</compile-flags>
236236
</default>
237237
</test>
238+
<test>
239+
<default>
240+
<files>control_character_escapes.js</files>
241+
<compile-flags>-args summary -endargs</compile-flags>
242+
</default>
243+
</test>
238244
</regress-exe>

0 commit comments

Comments
 (0)