Skip to content

Commit eb33f9a

Browse files
author
Kenji Fukuda
committed
Fixing RegExp parsing for character classes interacting with ranges.
Fixes #258
1 parent a1bdacf commit eb33f9a

File tree

4 files changed

+195
-8
lines changed

4 files changed

+195
-8
lines changed

lib/Parser/DebugWriter.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ namespace UnifiedRegex
7272
CheckForNewline();
7373
if (c > 0xff)
7474
Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
75+
else if (c == '-')
76+
Output::Print(_u("\\x2d"));
7577
else if (c < ' ' || c > '~')
7678
Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
7779
else

lib/Parser/RegexParser.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1931,6 +1931,7 @@ namespace UnifiedRegex
19311931
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
19321932
codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
19331933
bool previousSurrogatePart = false;
1934+
19341935
while(nextChar != ']')
19351936
{
19361937
current = next;
@@ -2034,7 +2035,7 @@ namespace UnifiedRegex
20342035

20352036
lastCodepoint = INVALID_CODEPOINT;
20362037
}
2037-
// If we the next character is the end of range ']', then we can't have a surrogate pair.
2038+
// If the next character is the end of range ']', then we can't have a surrogate pair.
20382039
// The current character is the range end, if we don't already have a candidate.
20392040
else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
20402041
{
@@ -2124,6 +2125,10 @@ namespace UnifiedRegex
21242125
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
21252126
EncodedChar nextChar = ECLookahead();
21262127
bool previousWasASurrogate = false;
2128+
bool currIsACharSet = false;
2129+
bool prevWasACharSet = false;
2130+
bool prevprevWasACharSet = false;
2131+
21272132
while(nextChar != ']')
21282133
{
21292134
codepoint_t codePointToSet = INVALID_CODEPOINT;
@@ -2147,30 +2152,30 @@ namespace UnifiedRegex
21472152
else if (nextChar == '\\')
21482153
{
21492154
Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
2155+
codePointToSet = pendingCodePoint;
21502156

21512157
if (returnedNode->tag == Node::MatchSet)
21522158
{
2153-
codePointToSet = pendingCodePoint;
2154-
pendingCodePoint = INVALID_CODEPOINT;
2159+
pendingCodePoint = nextChar;
21552160
if (pendingRangeStart != INVALID_CODEPOINT)
21562161
{
21572162
codePointSet.Set(ctAllocator, '-');
21582163
}
21592164
pendingRangeStart = INVALID_CODEPOINT;
21602165
codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
2166+
currIsACharSet = true;
21612167
}
21622168
else
21632169
{
21642170
// Just a character
2165-
codePointToSet = pendingCodePoint;
21662171
pendingCodePoint = deferredCharNode.cs[0];
21672172
}
21682173
}
21692174
else if (nextChar == '-')
21702175
{
2171-
if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
2176+
if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT)) || ECLookahead(1) == ']')
21722177
{
2173-
// - is just a char, or end of a range.
2178+
// '-' is just a char, or the end of a range. (If the previous item was a character class escape such as \s, this branch is skipped unless ']' follows, so that the '-' is instead treated as the beginning of a range.)
21742179
codePointToSet = pendingCodePoint;
21752180
pendingCodePoint = '-';
21762181
ECConsume();
@@ -2192,14 +2197,22 @@ namespace UnifiedRegex
21922197
{
21932198
if (pendingRangeStart != INVALID_CODEPOINT)
21942199
{
2195-
if (pendingRangeStart > pendingCodePoint)
2200+
if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet)
21962201
{
21972202
//We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
21982203
//This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
21992204
Assert(!unicodeFlagPresent);
22002205
Fail(JSERR_RegExpBadRange);
22012206
}
2202-
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
2207+
if (prevprevWasACharSet)
2208+
{
2209+
codePointSet.Set(ctAllocator, '-');
2210+
codePointSet.Set(ctAllocator, pendingCodePoint);
2211+
}
2212+
else
2213+
{
2214+
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
2215+
}
22032216
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
22042217
}
22052218
else
@@ -2209,6 +2222,9 @@ namespace UnifiedRegex
22092222
}
22102223

22112224
nextChar = ECLookahead();
2225+
prevprevWasACharSet = prevWasACharSet;
2226+
prevWasACharSet = currIsACharSet;
2227+
currIsACharSet = false;
22122228
}
22132229

22142230
if (pendingCodePoint != INVALID_CODEPOINT)
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
//-------------------------------------------------------------------------------------------------------
2+
// Copyright (C) Microsoft. All rights reserved.
3+
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
4+
//-------------------------------------------------------------------------------------------------------
5+
6+
WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
7+
8+
let re = /^[\s-a-z]$/;
9+
let reIgnoreCase = /^[\s-a-z]$/i;
10+
let reUnicode = /^[\s-z]$/u;
11+
let reNoCharClass = /^[a-c-z]$/;
12+
13+
var tests = [
14+
/*No Flag RegExp Tests begin*/
15+
{
16+
name : "Ensure 'a-z' not counted as range",
17+
body : function ()
18+
{
19+
assert.isFalse(re.test("b"));
20+
}
21+
},
22+
{
23+
name : "Ensure 'a' included in set",
24+
body : function ()
25+
{
26+
assert.isTrue(re.test("a"));
27+
}
28+
},
29+
{
30+
name : "Ensure ' ' included in set",
31+
body : function ()
32+
{
33+
assert.isTrue(re.test(" "));
34+
}
35+
},
36+
{
37+
name : "Ensure 'z' included in set",
38+
body : function ()
39+
{
40+
assert.isTrue(re.test("z"));
41+
}
42+
},
43+
{
44+
name : "Ensure '\t' included in set",
45+
body : function ()
46+
{
47+
assert.isTrue(re.test("\t"));
48+
}
49+
},
50+
{
51+
name : "Ensure 'a-z' not counted as range",
52+
body : function ()
53+
{
54+
assert.isFalse(re.test("q"));
55+
}
56+
},
57+
{
58+
name : "Ensure '\' not counted in set",
59+
body : function ()
60+
{
61+
assert.isFalse(re.test("\\"));
62+
}
63+
},
64+
/*No Flag RegExp Tests End*/
65+
/*IgnoreCase Flag RegExp Tests Begin*/
66+
{
67+
name : "Ensure 'O' not included in set",
68+
body : function ()
69+
{
70+
assert.isFalse(reIgnoreCase.test("O"));
71+
}
72+
},
73+
{
74+
name : "Ensure 'A' included in set",
75+
body : function ()
76+
{
77+
assert.isTrue(reIgnoreCase.test("A"));
78+
}
79+
},
80+
{
81+
name : "Ensure ' ' included in set",
82+
body : function ()
83+
{
84+
assert.isTrue(reIgnoreCase.test(" "));
85+
}
86+
},
87+
{
88+
name : "Ensure 'z' included in set",
89+
body : function ()
90+
{
91+
assert.isTrue(reIgnoreCase.test("z"));
92+
}
93+
},
94+
{
95+
name : "Ensure '\t' included in set",
96+
body : function ()
97+
{
98+
assert.isTrue(reIgnoreCase.test("\t"));
99+
}
100+
},
101+
/*IgnoreCase Flag RegExp Tests End*/
102+
/*Unicode Flag RegExp Tests Begin*/
103+
{
104+
name : "'-' included in set since \s-z treated as union of three types, not range",
105+
body : function ()
106+
{
107+
assert.isTrue(reUnicode.test("-"));
108+
}
109+
},
110+
{
111+
name : "' ' in set from \s character set",
112+
body : function ()
113+
{
114+
assert.isTrue(reUnicode.test(" "));
115+
}
116+
},
117+
{
118+
name : "b not included in '\s-z'",
119+
body : function ()
120+
{
121+
assert.isFalse(reUnicode.test("b"));
122+
}
123+
},
124+
/*Unicode Flag RegExp Tests End*/
125+
/*Non-character class tests Begin*/
126+
{
127+
name : "First range is used",
128+
body : function ()
129+
{
130+
assert.isTrue(reNoCharClass.test("b"));
131+
}
132+
},
133+
{
134+
name : "'-' included in set from 2nd dash",
135+
body : function ()
136+
{
137+
assert.isTrue(reNoCharClass.test("-"));
138+
}
139+
},
140+
{
141+
name : "z included in set",
142+
body : function ()
143+
{
144+
assert.isTrue(reNoCharClass.test("z"));
145+
}
146+
},
147+
{
148+
name : "'c-z' not viewed as range",
149+
body : function ()
150+
{
151+
assert.isFalse(reNoCharClass.test("y"));
152+
}
153+
}
154+
/*Non-character class tests End*/
155+
];
156+
157+
if (typeof modifyTests !== "undefined") {
158+
tests = modifyTests(tests);
159+
}
160+
161+
testRunner.runTests(tests, {
162+
verbose : WScript.Arguments[0] != "summary"
163+
});

test/Regex/rlexe.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,4 +229,10 @@
229229
<compile-flags>-args summary -endargs</compile-flags>
230230
</default>
231231
</test>
232+
<test>
233+
<default>
234+
<files>characterclass_with_range.js</files>
235+
<compile-flags>-args summary -endargs</compile-flags>
236+
</default>
237+
</test>
232238
</regress-exe>

0 commit comments

Comments
 (0)