Skip to content

Commit 3daa80d

Browse files
author
Kenji Fukuda
committed
Adding failure for character classes in range if unicode flag set.
Adding test cases for RegExp Fixing bug where backtick '\' added to set if set ends with escaped character class
1 parent eb33f9a commit 3daa80d

File tree

5 files changed

+112
-156
lines changed

5 files changed

+112
-156
lines changed

lib/Parser/RegexParser.cpp

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2126,8 +2126,8 @@ namespace UnifiedRegex
21262126
EncodedChar nextChar = ECLookahead();
21272127
bool previousWasASurrogate = false;
21282128
bool currIsACharSet = false;
2129-
bool prevWasACharSet = false;
2130-
bool prevprevWasACharSet = false;
2129+
bool prevWasACharSetAndPartOfRange = false;
2130+
bool prevprevWasACharSetAndPartOfRange = false;
21312131

21322132
while(nextChar != ']')
21332133
{
@@ -2138,6 +2138,7 @@ namespace UnifiedRegex
21382138
{
21392139
ECConsume();
21402140
}
2141+
21412142
// These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
21422143
if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
21432144
{
@@ -2156,7 +2157,12 @@ namespace UnifiedRegex
21562157

21572158
if (returnedNode->tag == Node::MatchSet)
21582159
{
2159-
pendingCodePoint = nextChar;
2160+
if (unicodeFlagPresent && pendingRangeStart != INVALID_CODEPOINT)
2161+
{
2162+
Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
2163+
}
2164+
2165+
pendingCodePoint = INVALID_CODEPOINT;
21602166
if (pendingRangeStart != INVALID_CODEPOINT)
21612167
{
21622168
codePointSet.Set(ctAllocator, '-');
@@ -2173,9 +2179,9 @@ namespace UnifiedRegex
21732179
}
21742180
else if (nextChar == '-')
21752181
{
2176-
if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT)) || ECLookahead(1) == ']')
2182+
if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
21772183
{
2178-
// - is just a char, or end of a range. If the previous char of the RegExp was a charset we want to treat it as the beginning of a range.
2184+
// - is just a char, or end of a range.
21792185
codePointToSet = pendingCodePoint;
21802186
pendingCodePoint = '-';
21812187
ECConsume();
@@ -2193,26 +2199,33 @@ namespace UnifiedRegex
21932199
pendingCodePoint = NextChar();
21942200
}
21952201

2196-
if (codePointToSet != INVALID_CODEPOINT)
2202+
if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
21972203
{
2198-
if (pendingRangeStart != INVALID_CODEPOINT)
2204+
if (unicodeFlagPresent && prevprevWasACharSetAndPartOfRange)
2205+
{
2206+
Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
2207+
}
2208+
else if (prevprevWasACharSetAndPartOfRange)
2209+
{
2210+
if (pendingCodePoint != INVALID_CODEPOINT)
2211+
{
2212+
codePointSet.Set(ctAllocator, pendingCodePoint);
2213+
}
2214+
2215+
codePointSet.Set(ctAllocator, '-');
2216+
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
2217+
}
2218+
else if (pendingRangeStart != INVALID_CODEPOINT)
21992219
{
2200-
if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet)
2220+
if (pendingRangeStart > pendingCodePoint)
22012221
{
22022222
//We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
22032223
//This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
22042224
Assert(!unicodeFlagPresent);
22052225
Fail(JSERR_RegExpBadRange);
22062226
}
2207-
if (prevprevWasACharSet)
2208-
{
2209-
codePointSet.Set(ctAllocator, '-');
2210-
codePointSet.Set(ctAllocator, pendingCodePoint);
2211-
}
2212-
else
2213-
{
2214-
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
2215-
}
2227+
2228+
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
22162229
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
22172230
}
22182231
else
@@ -2222,8 +2235,8 @@ namespace UnifiedRegex
22222235
}
22232236

22242237
nextChar = ECLookahead();
2225-
prevprevWasACharSet = prevWasACharSet;
2226-
prevWasACharSet = currIsACharSet;
2238+
prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
2239+
prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
22272240
currIsACharSet = false;
22282241
}
22292242

lib/Parser/rterrors.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ RT_ERROR_MSG(JSERR_NoAccessors, 5673, "Invalid property descriptor: accessors no
366366
RT_ERROR_MSG(JSERR_RegExpInvalidEscape, 5674, "", "Invalid regular expression: invalid escape in unicode pattern", kjstSyntaxError, 0)
367367
RT_ERROR_MSG(JSERR_RegExpTooManyCapturingGroups, 5675, "", "Regular expression cannot have more than 32,767 capturing groups", kjstRangeError, 0)
368368
RT_ERROR_MSG(JSERR_ProxyHandlerReturnedFalse, 5676, "Proxy %s handler returned false", "Proxy handler returned false", kjstTypeError, 0)
369+
RT_ERROR_MSG(JSERR_UnicodeRegExpRangeContainsCharClass, 5677, "", "Character classes not allowed in class ranges", kjstSyntaxError, 0)
369370

370371
//Host errors
371372
RT_ERROR_MSG(JSERR_HostMaybeMissingPromiseContinuationCallback, 5700, "", "Host may not have set any promise continuation callback. Promises may not be executed.", kjstTypeError, 0)

test/Regex/characterclass_with_range.js

Lines changed: 79 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -5,159 +5,128 @@
55

66
WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
77

8-
let re = /^[\s-a-z]$/;
9-
let reIgnoreCase = /^[\s-a-z]$/i;
10-
let reUnicode = /^[\s-z]$/u;
11-
let reNoCharClass = /^[a-c-z]$/;
8+
function testRegExp(str, regexp, expectedResult)
9+
{
10+
actualResult = regexp.test(str);
11+
errorMsg = "Expected result of test for match between string: '" + str + "' and regular expression: " + regexp + " to be " +
12+
expectedResult + " but was " + actualResult;
13+
assert.areEqual(expectedResult, actualResult, errorMsg);
14+
}
1215

1316
var tests = [
14-
/*No Flag RegExp Tests begin*/
15-
{
16-
name : "Ensure 'a-z' not counted as range",
17-
body : function ()
18-
{
19-
assert.isFalse(re.test("b"));
20-
}
21-
},
22-
{
23-
name : "Ensure 'a' included in set",
24-
body : function ()
25-
{
26-
assert.isTrue(re.test("a"));
27-
}
28-
},
29-
{
30-
name : "Ensure ' ' included in set",
31-
body : function ()
32-
{
33-
assert.isTrue(re.test(" "));
34-
}
35-
},
36-
{
37-
name : "Ensure 'z' included in set",
38-
body : function ()
39-
{
40-
assert.isTrue(re.test("z"));
41-
}
42-
},
43-
{
44-
name : "Ensure '\t' included in set",
45-
body : function ()
46-
{
47-
assert.isTrue(re.test("\t"));
48-
}
49-
},
50-
{
51-
name : "Ensure 'a-z' not counted as range",
52-
body : function ()
53-
{
54-
assert.isFalse(re.test("q"));
55-
}
56-
},
57-
{
58-
name : "Ensure '\' not counted in set",
59-
body : function ()
60-
{
61-
assert.isFalse(re.test("\\"));
62-
}
63-
},
64-
/*No Flag RegExp Tests End*/
65-
/*IgnoreCase Flag RegExp Tests Begin*/
6617
{
67-
name : "Ensure 'O' not included in set",
18+
name : "RegExp tests with no flags",
6819
body : function ()
6920
{
70-
assert.isFalse(reIgnoreCase.test("O"));
21+
let re = /[\s-a-z]/;
22+
testRegExp("b", re, false);
23+
testRegExp("a", re, true);
24+
testRegExp(" ", re, true);
25+
testRegExp("z", re, true);
26+
testRegExp("\t", re, true);
27+
testRegExp("q", re, false);
28+
testRegExp("\\", re, false);
29+
testRegExp("\u2028", re, true);
30+
testRegExp("\u2009", re, true);
7131
}
7232
},
7333
{
74-
name : "Ensure 'A' included in set",
34+
name : "RegExp tests with IgnoreCase flag set",
7535
body : function ()
7636
{
77-
assert.isTrue(reIgnoreCase.test("A"));
37+
let reIgnoreCase = /^[\s-a-z]$/i;
38+
testRegExp("O", reIgnoreCase, false);
39+
testRegExp("A", reIgnoreCase, true);
40+
testRegExp(" ", reIgnoreCase, true);
41+
testRegExp("z", reIgnoreCase, true);
42+
testRegExp("\t", reIgnoreCase, true);
43+
testRegExp("\u2028", reIgnoreCase, true);
44+
testRegExp("\u2009", reIgnoreCase, true);
7845
}
7946
},
8047
{
81-
name : "Ensure ' ' included in set",
48+
name : "RegExp tests with Unicode flag set",
8249
body : function ()
8350
{
84-
assert.isTrue(reIgnoreCase.test(" "));
51+
let reUnicode = /^[a-d]$/u;
52+
testRegExp("a", reUnicode, true);
53+
testRegExp("c", reUnicode, true);
54+
testRegExp("d", reUnicode, true);
55+
testRegExp("C", reUnicode, false);
56+
testRegExp("g", reUnicode, false);
57+
testRegExp("\u2028", reUnicode, false);
58+
testRegExp("\u2009", reUnicode, false);
59+
assert.throws(() => eval("/^[\\s-z]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in class ranges");
8560
}
8661
},
8762
{
88-
name : "Ensure 'z' included in set",
63+
name : "Non-character class tests",
8964
body : function ()
9065
{
91-
assert.isTrue(reIgnoreCase.test("z"));
66+
let reNoCharClass = /^[a-c-z]$/;
67+
testRegExp("b", reNoCharClass, true);
68+
testRegExp("-", reNoCharClass, true);
69+
testRegExp("z", reNoCharClass, true);
70+
testRegExp("y", reNoCharClass, false);
9271
}
9372
},
9473
{
95-
name : "Ensure '\t' included in set",
74+
name : "Regression tests from bugFixRegression",
9675
body : function ()
9776
{
98-
assert.isTrue(reIgnoreCase.test("\t"));
77+
assert.areEqual(" -a", /[\s-a-c]*/.exec(" -abc")[0]);
78+
assert.areEqual(" -abc", /[\s\-a-c]*/.exec(" -abc")[0]);
79+
assert.areEqual(" -ab", /[a-\s-b]*/.exec(" -ab")[0]);
80+
assert.areEqual(" -ab", /[a\-\s\-b]*/.exec(" -ab")[0]);
9981
}
10082
},
101-
/*IgnoreCase Flag RegExp Tests End*/
102-
/*Unicode Flag RegExp Tests Begin*/
10383
{
104-
name : "'-' included in set since \s-z treated as union of three types, not range",
84+
name : "Special character tests",
10585
body : function ()
10686
{
107-
assert.isTrue(reUnicode.test("-"));
87+
let re = /^[\s][a\sb][\s--c-f]$/;
88+
testRegExp(' \\', re, false);
89+
testRegExp(' \\ ', re, false);
90+
testRegExp('\\ ', re, false);
91+
re = /[-][\d\-]/;
92+
testRegExp('--', re, true);
93+
testRegExp('-9', re, true);
94+
testRegExp(' ', re, false);
10895
}
10996
},
11097
{
111-
name : "' ' in set from \s character set",
98+
name : "Negation character set tests",
11299
body : function ()
113100
{
114-
assert.isTrue(reUnicode.test(" "));
101+
let reNegationCharSet = /[\D-\s]+/;
102+
testRegExp('555686', reNegationCharSet, false);
103+
testRegExp('555-686', reNegationCharSet, true);
104+
testRegExp('alphabet-123', reNegationCharSet, true);
115105
}
116106
},
117107
{
118-
name : "b not included in '\s-z'",
108+
name : "Non-range tests",
119109
body : function ()
120110
{
121-
assert.isFalse(reUnicode.test("b"));
111+
let reNonRange = /[-\w]/
112+
testRegExp('-', reNonRange, true);
113+
testRegExp('g', reNonRange, true);
114+
testRegExp('5', reNonRange, true);
115+
testRegExp(' ', reNonRange, false);
116+
testRegExp('\t', reNonRange, false);
117+
testRegExp('\u2028', reNonRange, false);
118+
119+
reNonRange = /[\w-]/
120+
testRegExp('-', reNonRange, true);
121+
testRegExp('g', reNonRange, true);
122+
testRegExp('5', reNonRange, true);
123+
testRegExp(' ', reNonRange, false);
124+
testRegExp('\t', reNonRange, false);
125+
testRegExp('\u2028', reNonRange, false);
122126
}
123-
},
124-
/*Unicode Flag RegExp Tests End*/
125-
/*Non-character class tests Begin*/
126-
{
127-
name : "First range is used",
128-
body : function ()
129-
{
130-
assert.isTrue(reNoCharClass.test("b"));
131-
}
132-
},
133-
{
134-
name : "'-' included in set from 2nd dash",
135-
body : function ()
136-
{
137-
assert.isTrue(reNoCharClass.test("-"));
138-
}
139-
},
140-
{
141-
name : "z included in set",
142-
body : function ()
143-
{
144-
assert.isTrue(reNoCharClass.test("z"));
145-
}
146-
},
147-
{
148-
name : "'c-z' not viewed as range",
149-
body : function ()
150-
{
151-
assert.isFalse(reNoCharClass.test("y"));
152-
}
153-
}
154-
/*Non-character class tests End*/
127+
}
155128
];
156129

157-
if (typeof modifyTests !== "undefined") {
158-
tests = modifyTests(tests);
159-
}
160-
161130
testRunner.runTests(tests, {
162131
verbose : WScript.Arguments[0] != "summary"
163132
});

test/UnifiedRegex/bugFixRegression.baseline

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -632,26 +632,6 @@ exec(/(?:a||b)?/ /*lastIndex=0*/ , "b");
632632
["b"] /*input="b", index=0*/
633633
r.lastIndex=0
634634
RegExp.${_,1,...,9}=["b","","","","","","","","",""]
635-
exec(/[\s-a-c]*/ /*lastIndex=0*/ , " -abc");
636-
[" -abc"] /*input=" -abc", index=0*/
637-
r.lastIndex=0
638-
RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
639-
exec(/[\s\-a-c]*/ /*lastIndex=0*/ , " -abc");
640-
[" -abc"] /*input=" -abc", index=0*/
641-
r.lastIndex=0
642-
RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
643-
exec(/[a-\s-b]*/ /*lastIndex=0*/ , " -ab");
644-
[" -ab"] /*input=" -ab", index=0*/
645-
r.lastIndex=0
646-
RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
647-
exec(/[a\-\s\-b]*/ /*lastIndex=0*/ , " -ab");
648-
[" -ab"] /*input=" -ab", index=0*/
649-
r.lastIndex=0
650-
RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
651-
exec(/[\s--c-!]*/ /*lastIndex=0*/ , " -./0Abc!");
652-
[" -./0Abc!"] /*input=" -./0Abc!", index=0*/
653-
r.lastIndex=0
654-
RegExp.${_,1,...,9}=[" -./0Abc!","","","","","","","","",""]
655635
EXCEPTION
656636
exec(/x*(?:(?=x(y*)+)y|\1x)/ /*lastIndex=0*/ , "xxy");
657637
["xx",undefined] /*input="xxy", index=0*/

test/UnifiedRegex/bugFixRegression.js

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -501,13 +501,6 @@ exec(/(?:a*)?/, "");
501501
exec(/(?:a+)?/, "");
502502
exec(/(?:a||b)?/, "b");
503503

504-
// WOOB1145588
505-
exec(/[\s-a-c]*/, " -abc");
506-
exec(/[\s\-a-c]*/, " -abc");
507-
exec(/[a-\s-b]*/, " -ab");
508-
exec(/[a\-\s\-b]*/, " -ab");
509-
exec(/[\s--c-!]*/, " -./0Abc!");
510-
511504
try {
512505
var r = new RegExp("[\\s-c-a]*", "");
513506
exec(r, " -abc");

0 commit comments

Comments
 (0)