Fixing RegExp parsing for character classes interacting with ranges.

Kenji Fukuda · Kenji Fukuda · commit eb33f9a451a7 · 2018-07-09T09:35:48.000-07:00
Fixes #258
diff --git a/lib/Parser/DebugWriter.cpp b/lib/Parser/DebugWriter.cpp
@@ -72,6 +72,8 @@ namespace UnifiedRegex
         CheckForNewline();
         if (c > 0xff)
             Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
+        else if (c == '-')
+            Output::Print(_u("\\x2d"));
         else if (c < ' ' || c > '~')
             Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
         else
diff --git a/lib/Parser/RegexParser.cpp b/lib/Parser/RegexParser.cpp
@@ -1931,6 +1931,7 @@ namespace UnifiedRegex
         codepoint_t pendingRangeStart = INVALID_CODEPOINT;
         codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
         bool previousSurrogatePart = false;
+
         while(nextChar != ']')
         {
             current = next;
@@ -2034,7 +2035,7 @@ namespace UnifiedRegex
 
                     lastCodepoint = INVALID_CODEPOINT;
                 }
-                // If we the next character is the end of range ']', then we can't have a surrogate pair.
+                // If the next character is the end of range ']', then we can't have a surrogate pair.
                 // The current character is the range end, if we don't already have a candidate.
                 else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
                 {
@@ -2124,6 +2125,10 @@ namespace UnifiedRegex
         codepoint_t pendingRangeStart = INVALID_CODEPOINT;
         EncodedChar nextChar = ECLookahead();
         bool previousWasASurrogate = false;
+        bool currIsACharSet = false;
+        bool prevWasACharSet = false;
+        bool prevprevWasACharSet = false;
+
         while(nextChar != ']')
         {
             codepoint_t codePointToSet = INVALID_CODEPOINT;
@@ -2147,30 +2152,30 @@ namespace UnifiedRegex
             else if (nextChar == '\\')
             {
                 Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
+                codePointToSet = pendingCodePoint;
 
                 if (returnedNode->tag == Node::MatchSet)
                 {
-                    codePointToSet = pendingCodePoint;
-                    pendingCodePoint = INVALID_CODEPOINT;
+                    pendingCodePoint = nextChar;
                     if (pendingRangeStart != INVALID_CODEPOINT)
                     {
                         codePointSet.Set(ctAllocator, '-');
                     }
                     pendingRangeStart = INVALID_CODEPOINT;
                     codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
+                    currIsACharSet = true;
                 }
                 else
                 {
                     // Just a character
-                    codePointToSet = pendingCodePoint;
                     pendingCodePoint = deferredCharNode.cs[0];
                 }
             }
             else if (nextChar == '-')
             {
-                if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
+                if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT)) ||  ECLookahead(1) == ']')
                 {
-                    // - is just a char, or end of a range.
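+                    // '-' is just a char, or the end of a range. If the previous element of the RegExp class was a charset, we want to treat '-' as the beginning of a range instead, so that case does not take this branch (unless '-' is immediately followed by ']').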
                     codePointToSet = pendingCodePoint;
                     pendingCodePoint = '-';
                     ECConsume();
@@ -2192,14 +2197,22 @@ namespace UnifiedRegex
             {
                 if (pendingRangeStart != INVALID_CODEPOINT)
                 {
-                    if (pendingRangeStart > pendingCodePoint)
+                    if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet)
                     {
                         //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
                         //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
                         Assert(!unicodeFlagPresent);
                         Fail(JSERR_RegExpBadRange);
                     }
-                    codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
+                    if (prevprevWasACharSet)
+                    {
+                        codePointSet.Set(ctAllocator, '-');
+                        codePointSet.Set(ctAllocator, pendingCodePoint);
+                    }
+                    else
+                    {
+                        codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
+                    }
                     pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
                 }
                 else
@@ -2209,6 +2222,9 @@ namespace UnifiedRegex
             }
 
             nextChar = ECLookahead();
+            prevprevWasACharSet = prevWasACharSet;
+            prevWasACharSet = currIsACharSet;
+            currIsACharSet = false;
         }
 
         if (pendingCodePoint != INVALID_CODEPOINT)
diff --git a/test/Regex/characterclass_with_range.js b/test/Regex/characterclass_with_range.js
@@ -0,0 +1,163 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+let re = /^[\s-a-z]$/;
+let reIgnoreCase = /^[\s-a-z]$/i;
+let reUnicode = /^[\s-z]$/u;
+let reNoCharClass = /^[a-c-z]$/;
+
+var tests = [
+    /*No Flag RegExp Tests begin*/
+    {
+        name : "Ensure 'a-z' not counted as range",
+        body : function () 
+        {
+            assert.isFalse(re.test("b"));
+        }
+    },
+    {
+        name : "Ensure 'a' included in set",
+        body : function () 
+        {
+            assert.isTrue(re.test("a"));
+        }
+    },
+    {
+        name : "Ensure ' ' included in set",
+        body : function () 
+        {
+            assert.isTrue(re.test(" "));
+        }
+    },
+    {
+        name : "Ensure 'z' included in set",
+        body : function () 
+        {
+            assert.isTrue(re.test("z"));
+        }
+    },
+    {
+        name : "Ensure '\t' included in set",
+        body : function () 
+        {
+            assert.isTrue(re.test("\t"));
+        }
+    },
+    {
+        name : "Ensure 'a-z' not counted as range",
+        body : function () 
+        {
+            assert.isFalse(re.test("q"));
+        }
+    },
+    {
+        name : "Ensure '\' not counted in set",
+        body : function () 
+        {
+            assert.isFalse(re.test("\\"));
+        }
+    },
+    /*No Flag RegExp Tests End*/
+    /*IgnoreCase Flag RegExp Tests Begin*/
+    {
+        name : "Ensure 'O' not included in set",
+        body : function () 
+        {
+            assert.isFalse(reIgnoreCase.test("O"));
+        }
+    },
+    {
+        name : "Ensure 'A' included in set",
+        body : function () 
+        {
+            assert.isTrue(reIgnoreCase.test("A"));
+        }
+    },
+    {
+        name : "Ensure ' ' included in set",
+        body : function () 
+        {
+            assert.isTrue(reIgnoreCase.test(" "));
+        }
+    },
+    {
+        name : "Ensure 'z' included in set",
+        body : function () 
+        {
+            assert.isTrue(reIgnoreCase.test("z"));
+        }
+    },
+    {
+        name : "Ensure '\t' included in set",
+        body : function () 
+        {
+            assert.isTrue(reIgnoreCase.test("\t"));
+        }
+    },
+    /*IgnoreCase Flag RegExp Tests End*/
+    /*Unicode Flag RegExp Tests Begin*/
+    {
+        name : "'-' included in set since \s-z treated as union of three types, not range",
+        body : function () 
+        {
+            assert.isTrue(reUnicode.test("-"));
+        }
+    },
+    {
+        name : "' ' in set from \s character set",
+        body : function () 
+        {
+            assert.isTrue(reUnicode.test(" "));
+        }
+    },
+    {
+        name : "b not included in '\s-z'",
+        body : function () 
+        {
+            assert.isFalse(reUnicode.test("b"));
+        }
+    },
+    /*Unicode Flag RegExp Tests End*/
+    /*Non-character class tests Begin*/
+    {
+        name : "First range is used",
+        body : function () 
+        {
+            assert.isTrue(reNoCharClass.test("b"));
+        }
+    },
+    {
+        name : "'-' included in set from 2nd dash",
+        body : function () 
+        {
+            assert.isTrue(reNoCharClass.test("-"));
+        }
+    },
+    {
+        name : "z included in set",
+        body : function () 
+        {
+            assert.isTrue(reNoCharClass.test("z"));
+        }
+    },
+    {
+        name : "'c-z' not viewed as range",
+        body : function () 
+        {
+            assert.isFalse(reNoCharClass.test("y"));
+        }
+    }    
+    /*Non-character class tests End*/
+];
+
+if (typeof modifyTests !== "undefined") {
+    tests = modifyTests(tests);
+}
+
+testRunner.runTests(tests, {
+    verbose : WScript.Arguments[0] != "summary"
+});
diff --git a/test/Regex/rlexe.xml b/test/Regex/rlexe.xml
@@ -229,4 +229,10 @@
       <compile-flags>-args summary -endargs</compile-flags>
     </default>
   </test>
+  <test>
+    <default>
+      <files>characterclass_with_range.js</files>
+      <compile-flags>-args summary -endargs</compile-flags>
+    </default>
+  </test>
 </regress-exe>

Original file line number	Diff line number	Diff line change
`@@ -1931,6 +1931,7 @@ namespace UnifiedRegex`
`1931`	`1931`	`codepoint_t pendingRangeStart = INVALID_CODEPOINT;`
`1932`	`1932`	`codepoint_t pendingRangeEnd = INVALID_CODEPOINT;`
`1933`	`1933`	`bool previousSurrogatePart = false;`
	`1934`	`+`
`1934`	`1935`	`while(nextChar != ']')`
`1935`	`1936`	`{`
`1936`	`1937`	`current = next;`
`@@ -2034,7 +2035,7 @@ namespace UnifiedRegex`
`2034`	`2035`
`2035`	`2036`	`lastCodepoint = INVALID_CODEPOINT;`
`2036`	`2037`	`}`
`2037`		`- // If we the next character is the end of range ']', then we can't have a surrogate pair.`
	`2038`	`+ // If the next character is the end of range ']', then we can't have a surrogate pair.`
`2038`	`2039`	`// The current character is the range end, if we don't already have a candidate.`
`2039`	`2040`	`else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)`
`2040`	`2041`	`{`
`@@ -2124,6 +2125,10 @@ namespace UnifiedRegex`
`2124`	`2125`	`codepoint_t pendingRangeStart = INVALID_CODEPOINT;`
`2125`	`2126`	`EncodedChar nextChar = ECLookahead();`
`2126`	`2127`	`bool previousWasASurrogate = false;`
	`2128`	`+ bool currIsACharSet = false;`
	`2129`	`+ bool prevWasACharSet = false;`
	`2130`	`+ bool prevprevWasACharSet = false;`
	`2131`	`+`
`2127`	`2132`	`while(nextChar != ']')`
`2128`	`2133`	`{`
`2129`	`2134`	`codepoint_t codePointToSet = INVALID_CODEPOINT;`
`@@ -2147,30 +2152,30 @@ namespace UnifiedRegex`
`2147`	`2152`	`else if (nextChar == '\\')`
`2148`	`2153`	`{`
`2149`	`2154`	`Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);`
	`2155`	`+ codePointToSet = pendingCodePoint;`
`2150`	`2156`
`2151`	`2157`	`if (returnedNode->tag == Node::MatchSet)`
`2152`	`2158`	`{`
`2153`		`- codePointToSet = pendingCodePoint;`
`2154`		`- pendingCodePoint = INVALID_CODEPOINT;`
	`2159`	`+ pendingCodePoint = nextChar;`
`2155`	`2160`	`if (pendingRangeStart != INVALID_CODEPOINT)`
`2156`	`2161`	`{`
`2157`	`2162`	`codePointSet.Set(ctAllocator, '-');`
`2158`	`2163`	`}`
`2159`	`2164`	`pendingRangeStart = INVALID_CODEPOINT;`
`2160`	`2165`	`codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);`
	`2166`	`+ currIsACharSet = true;`
`2161`	`2167`	`}`
`2162`	`2168`	`else`
`2163`	`2169`	`{`
`2164`	`2170`	`// Just a character`
`2165`		`- codePointToSet = pendingCodePoint;`
`2166`	`2171`	`pendingCodePoint = deferredCharNode.cs[0];`
`2167`	`2172`	`}`
`2168`	`2173`	`}`
`2169`	`2174`	`else if (nextChar == '-')`
`2170`	`2175`	`{`
`2171`		`- if (pendingRangeStart != INVALID_CODEPOINT \|\| pendingCodePoint == INVALID_CODEPOINT \|\| ECLookahead(1) == ']')`
	`2176`	`+ if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT \|\| pendingCodePoint == INVALID_CODEPOINT)) \|\| ECLookahead(1) == ']')`
`2172`	`2177`	`{`
`2173`		`- // - is just a char, or end of a range.`
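	`2178`	`+ // '-' is just a char, or the end of a range. If the previous element of the RegExp class was a charset, we want to treat '-' as the beginning of a range instead, so that case does not take this branch (unless '-' is immediately followed by ']').`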
`2174`	`2179`	`codePointToSet = pendingCodePoint;`
`2175`	`2180`	`pendingCodePoint = '-';`
`2176`	`2181`	`ECConsume();`
`@@ -2192,14 +2197,22 @@ namespace UnifiedRegex`
`2192`	`2197`	`{`
`2193`	`2198`	`if (pendingRangeStart != INVALID_CODEPOINT)`
`2194`	`2199`	`{`
`2195`		`- if (pendingRangeStart > pendingCodePoint)`
	`2200`	`+ if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet)`
`2196`	`2201`	`{`
`2197`	`2202`	`//We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here`
`2198`	`2203`	`//This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option`
`2199`	`2204`	`Assert(!unicodeFlagPresent);`
`2200`	`2205`	`Fail(JSERR_RegExpBadRange);`
`2201`	`2206`	`}`
`2202`		`- codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);`
	`2207`	`+ if (prevprevWasACharSet)`
	`2208`	`+ {`
	`2209`	`+ codePointSet.Set(ctAllocator, '-');`
	`2210`	`+ codePointSet.Set(ctAllocator, pendingCodePoint);`
	`2211`	`+ }`
	`2212`	`+ else`
	`2213`	`+ {`
	`2214`	`+ codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);`
	`2215`	`+ }`
`2203`	`2216`	`pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;`
`2204`	`2217`	`}`
`2205`	`2218`	`else`
`@@ -2209,6 +2222,9 @@ namespace UnifiedRegex`
`2209`	`2222`	`}`
`2210`	`2223`
`2211`	`2224`	`nextChar = ECLookahead();`
	`2225`	`+ prevprevWasACharSet = prevWasACharSet;`
	`2226`	`+ prevWasACharSet = currIsACharSet;`
	`2227`	`+ currIsACharSet = false;`
`2212`	`2228`	`}`
`2213`	`2229`
`2214`	`2230`	`if (pendingCodePoint != INVALID_CODEPOINT)`