[MERGE #5450 @kfukuda2] Change parsing of \c to reflect spec Fixes #2973

Kenji Fukuda · Kenji Fukuda · commit 4374e4407b12 · 2018-07-17T09:59:45.000-07:00
Merge pull request #5450 from kfukuda2:ControlCharFix

Change the treatment of the escaped control character sequence, `\cX`, to follow the spec.

If the Unicode flag is not present and the sequence is inside a character class:
    If `X` is a non-word character ([^A-Za-z0-9_]), treat `\` as a standalone literal and resume parsing from `c`.
    If `X` is a word character ([A-Za-z0-9_]), match the character whose code point is the lowest 5 bits of the Unicode code point of `X`.
If the Unicode flag is present and the sequence is inside a character class:
    If `X` is a non-letter ([^A-Za-z]), throw a SyntaxError.
    If `X` is a letter ([A-Za-z]), match the character whose code point is the lowest 5 bits of the Unicode code point of `X`.
If the Unicode flag is not present and the sequence is not inside a character class:
    If `X` is a non-letter ([^A-Za-z]), treat `\` as a standalone literal and resume parsing from `c`.
    If `X` is a letter ([A-Za-z]), match the character whose code point is the lowest 5 bits of the Unicode code point of `X`.
If the Unicode flag is present and the sequence is not inside a character class:
    If `X` is a non-letter ([^A-Za-z]), throw a SyntaxError.
    If `X` is a letter ([A-Za-z]), match the character whose code point is the lowest 5 bits of the Unicode code point of `X`.
diff --git a/lib/Parser/RegexParser.cpp b/lib/Parser/RegexParser.cpp
@@ -1529,7 +1529,13 @@ namespace UnifiedRegex
         else if (ECLookahead() == 'c')
         {
             if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
+            {
                 ECConsume(2);
+            }
+            else
+            {
+                DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
+            }
             return false;
         }
         else
@@ -2161,7 +2167,7 @@ namespace UnifiedRegex
                     {
                         if (unicodeFlagPresent)
                         {
-                            //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
+                            //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
                             //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
                             Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
                         }
@@ -2206,7 +2212,7 @@ namespace UnifiedRegex
             {
                 if (prevprevWasACharSetAndPartOfRange)
                 {
-                    //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
+                    //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
                     //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
                     if (unicodeFlagPresent)
                     {
@@ -2494,6 +2500,8 @@ namespace UnifiedRegex
                 }
                 else
                 {
+                    DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar spec #prod-annexB-Term 
+
                     if (!IsEOF())
                     {
                         EncodedChar ecLookahead = ECLookahead();
@@ -2625,33 +2633,19 @@ namespace UnifiedRegex
                 standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
                 return deferredSetNode;
             case 'c':
-                if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
+                if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word character
                 {
                     c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
                     ECConsume();
                     // fall-through for identity escape
                 }
                 else
                 {
-                    // SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].
-                    //                 However, '-', ']', and EOF make the \c just a 'c'.
-                    if (!IsEOF())
-                    {
-                        EncodedChar ec = ECLookahead();
-                        switch (ec)
-                        {
-                        case '-':
-                        case ']':
-                            // fall-through for identity escape with 'c'
-                            break;
-                        default:
-                            c = UTC(Chars<EncodedChar>::CTU(ec) % 32);
-                            ECConsume();
-                            // fall-through for identity escape
-                            break;
-                        }
-                    }
-                    // else: fall-through for identity escape with 'c'
+                    // If the lookahead is a non-alphanumeric and not an underscore ('_'), then treat '\' and 'c' separately.
+                    //#sec-regular-expression-patterns-semantics 
+                    ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.
+                    deferredCharNode->cs[0] = '\\';
+                    return deferredCharNode;
                 }
                 break;
             case 'x':
diff --git a/test/Regex/control_character_escapes.js b/test/Regex/control_character_escapes.js
@@ -0,0 +1,195 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+function matchRegExp(str, regexpLiteral, expectedResult)
+{
+    matchResultLiteral = str.match(regexpLiteral);
+    errorMsgBase = "Expected result of match between string: '" + str + "' and regular expression: " + regexpLiteral + " to be " + 
+                    expectedResult + " but was "
+
+    actualResultLiteral = matchResultLiteral == null ? null : matchResultLiteral[0];
+    assert.areEqual(expectedResult, actualResultLiteral, errorMsgBase + actualResultLiteral); 
+    
+    regexpConstructor = new RegExp(regexpLiteral);
+    matchResultConstructor = str.match(regexpConstructor);
+
+    actualResultConstructor = matchResultConstructor == null ? null : matchResultConstructor[0];
+    assert.areEqual(expectedResult, actualResultConstructor, errorMsgBase + actualResultConstructor); 
+}
+
+var tests = [
+    {
+        name : "Control characters followed by a word character ([A-Za-z0-9_])",
+        body : function () 
+        {
+            re = /[\c6]+/; //'6' = ascii x36, parsed as [\x16]+
+            matchRegExp("6", re, null);
+            matchRegExp("\\", re, null);
+            matchRegExp("\\c6", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("\x16", re, "\x16");
+            
+            re = /\c6/; //'6' = ascii x36, parsed as "\\c6"
+            matchRegExp("\\c6", re, "\\c6");
+            matchRegExp("\\", re, null);
+            matchRegExp("6", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("\x16", re, null);
+            
+            re = /\c6[\c6]+/; //'6' = ascii x36, parsed as "\\c6"[\x16]+
+            matchRegExp("\\c6\x16", re, "\\c6\x16");
+            matchRegExp("\\", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("\x16", re, null);
+            
+            re = /[\ca]+/; //'a' = ascii x61, parsed as [\x01]+
+            matchRegExp("a", re, null);
+            matchRegExp("\\", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("00xyzabc123\x01qrst", re, "\x01");
+	    
+            re = /[\c_]+/; //'_' = ascii 0x5F, parsed as [\x1F]+
+            matchRegExp("\x1F\x1F\x05", re, "\x1F\x1F");
+            matchRegExp("\\\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("ccc_", re, null);
+            
+            re = /[\cG]*/; //'G' = ascii x47, parsed as [\x07]*
+            matchRegExp("\x07\x06\x05", re, "\x07");
+            matchRegExp("\\\\", re, "");
+            matchRegExp("////", re, "");
+            matchRegExp("cccG", re, "");
+            
+            re = /[\cG\c6\cf]+/; //'G' = ascii x47, '6' = ascii x36, 'f' = ascii x66, parsed as [\x07\x16\x06]+
+            matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06\x16\x07");
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("cfG6", re, null);
+            
+            re = /\cG\cf/; //'G' = ascii x47, 'f' = ascii x66, parsed as "\x07\x06"
+            matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06");
+            matchRegExp("\\", re, null);
+            matchRegExp("/", re, null);
+            matchRegExp("\\cG\\c6\\cf", re, null);
+            
+            re = /[\cz\cZ]+/; //'z' = ascii x7A, 'Z' = ascii x5A, have the same lowest 5 bits, parsed as [\x1A]+
+            matchRegExp("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + 
+                        "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", re, "\x1a");
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("ccczZ", re, null);
+        }
+    },
+    {
+        name : "Control characters followed by a non-word character ([^A-Za-z0-9_])",
+        body : function () 
+        {
+            re = /[\c*]+/; //'*' = ascii 42, parsed as [\\c*]+ 
+            matchRegExp("\x0a\x09\x08", re, null);
+            matchRegExp("a*c*b*d*", re, "*c*");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("ccc", re, "ccc");
+            
+            re = /[\c}]*/; //'}' = ascii 125, parsed as [\\c}]*
+            matchRegExp("\x1d\x7d\x3d", re, "");
+            matchRegExp("}c}}cd*c*b*d*", re, "}c}}c");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, "");
+            matchRegExp("ccc", re, "ccc");
+            
+            re = /[\c;]+/; //';' = ascii 59, parsed as [\\c;]+
+            matchRegExp("\x1b\x1c", re, null);
+            matchRegExp("d;c;d;*", re, ";c;");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("ccc", re, "ccc");
+            
+            re = /\c%/; //'%' = ascii x25, parsed as \\c%
+            matchRegExp("\\", re, null);
+            matchRegExp("\\", re, null);
+            matchRegExp("\\c%", re, "\\c%");
+            matchRegExp("\x05", re, null);
+        }
+    },
+    {
+        name : "Control Character tests with unicode flag present",
+        body : function () 
+        {
+            re = /[\cAg]+/u; //'A' = ascii x41, parsed as [g\x01]+
+            matchRegExp("abcdefghi", re, "g");
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01\x01gg");            
+            
+            re = /[\czA]+/u;  //'z' = ascii x7A, parsed as [\x1AA]+
+            matchRegExp("abcdefghi", re, null);
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("YZA\x1aABC", re, "A\x1aA");    
+            
+            assert.throws(() => eval("\"\".match(/[\\c]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by no character here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[\\c-d]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a dash, '-', here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[ab\\c_$]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by an underscore, '_', here.",
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[ab\\c\\d]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a backslash, '\\', here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[ab\\c3]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a number, '3', here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+                        
+            re = /\cAg/u;  //'A' = ascii x41, parsed as "\x01g"
+            matchRegExp("abcdefghi", re, null);
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01g");            
+            
+            re = /\czA/u;  //'z' = ascii x7A, parsed as "\x1aA"
+            matchRegExp("abcdefghi", re, null);
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("YZA\x1aABC", re, "\x1aA");   
+            
+            assert.throws(() => eval("\"\".match(/\\c/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by no character here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/\\c-d/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a dash, '-', here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/ab\\c_$/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by an underscore, '_', here.",
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/ab\\c\\d/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a backslash, '\\', here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/ab\\c3/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a number, '3', here.", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+        }
+    },
+    {
+        name : "Control character edge cases",
+        body : function () 
+        {
+            re = /[\c-g]+/; //'-' = ascii x2D, parsed as [\\c-g]+ 
+            matchRegExp("abcdefghi", re, "cdefg");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("\x0d", re, null);
+            matchRegExp("aobd\\f\\d", re, "d\\f\\d");            
+            
+            re = /[\c-]+/; //'-' = ascii x2D, parsed as [\\c-]+
+            matchRegExp("abcdefghi", re, "c");
+            matchRegExp("\x0dc--c", re, "c--c");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("aobd\\f\\d", re, "\\");  
+            
+            assert.throws(() => eval("\"\".match(/[\\c-a]/)"), SyntaxError, "Expected an error due to 'c-a' being an invalid range.", "Invalid range in character set");
+        }
+    }
+];
+
+testRunner.runTests(tests, { // Run every case in the tests array.
+    verbose : WScript.Arguments[0] != "summary" // verbose unless the first script argument is "summary"
+});
diff --git a/test/Regex/rlexe.xml b/test/Regex/rlexe.xml
@@ -229,10 +229,16 @@
       <compile-flags>-args summary -endargs</compile-flags>
     </default>
   </test>
-    <test>
+  <test>
     <default>
       <files>characterclass_with_range.js</files>
       <compile-flags>-args summary -endargs</compile-flags>
     </default>
   </test>
+  <test>
+    <default>
+      <files>control_character_escapes.js</files>
+      <compile-flags>-args summary -endargs</compile-flags>
+    </default>
+  </test>
 </regress-exe>

Original file line number	Diff line number	Diff line change
`@@ -1529,7 +1529,13 @@ namespace UnifiedRegex`
`1529`	`1529`	`else if (ECLookahead() == 'c')`
`1530`	`1530`	`{`
`1531`	`1531`	`if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter`
	`1532`	`+ {`
`1532`	`1533`	`ECConsume(2);`
	`1534`	`+ }`
	`1535`	`+ else`
	`1536`	`+ {`
	`1537`	`+ DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);`
	`1538`	`+ }`
`1533`	`1539`	`return false;`
`1534`	`1540`	`}`
`1535`	`1541`	`else`
`@@ -2161,7 +2167,7 @@ namespace UnifiedRegex`
`2161`	`2167`	`{`
`2162`	`2168`	`if (unicodeFlagPresent)`
`2163`	`2169`	`{`
`2164`		`- //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
	`2170`	`+ //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
`2165`	`2171`	`//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.`
`2166`	`2172`	`Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb`
`2167`	`2173`	`}`
`@@ -2206,7 +2212,7 @@ namespace UnifiedRegex`
`2206`	`2212`	`{`
`2207`	`2213`	`if (prevprevWasACharSetAndPartOfRange)`
`2208`	`2214`	`{`
`2209`		`- //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
	`2215`	`+ //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
`2210`	`2216`	`//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.`
`2211`	`2217`	`if (unicodeFlagPresent)`
`2212`	`2218`	`{`
`@@ -2494,6 +2500,8 @@ namespace UnifiedRegex`
`2494`	`2500`	`}`
`2495`	`2501`	`else`
`2496`	`2502`	`{`
	`2503`	`+ DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar spec #prod-annexB-Term`
	`2504`	`+`
`2497`	`2505`	`if (!IsEOF())`
`2498`	`2506`	`{`
`2499`	`2507`	`EncodedChar ecLookahead = ECLookahead();`
`@@ -2625,33 +2633,19 @@ namespace UnifiedRegex`
`2625`	`2633`	`standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);`
`2626`	`2634`	`return deferredSetNode;`
`2627`	`2635`	`case 'c':`
`2628`		`- if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter`
	`2636`	`+ if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word character`
`2629`	`2637`	`{`
`2630`	`2638`	`c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);`
`2631`	`2639`	`ECConsume();`
`2632`	`2640`	`// fall-through for identity escape`
`2633`	`2641`	`}`
`2634`	`2642`	`else`
`2635`	`2643`	`{`
`2636`		`- // SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].`
`2637`		`- // However, '-', ']', and EOF make the \c just a 'c'.`
`2638`		`- if (!IsEOF())`
`2639`		`- {`
`2640`		`- EncodedChar ec = ECLookahead();`
`2641`		`- switch (ec)`
`2642`		`- {`
`2643`		`- case '-':`
`2644`		`- case ']':`
`2645`		`- // fall-through for identity escape with 'c'`
`2646`		`- break;`
`2647`		`- default:`
`2648`		`- c = UTC(Chars<EncodedChar>::CTU(ec) % 32);`
`2649`		`- ECConsume();`
`2650`		`- // fall-through for identity escape`
`2651`		`- break;`
`2652`		`- }`
`2653`		`- }`
`2654`		`- // else: fall-through for identity escape with 'c'`
	`2644`	`+ // If the lookahead is a non-alphanumeric and not an underscore ('_'), then treat '\' and 'c' separately.`
	`2645`	`+ //#sec-regular-expression-patterns-semantics`
	`2646`	`+ ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.`
	`2647`	`+ deferredCharNode->cs[0] = '\\';`
	`2648`	`+ return deferredCharNode;`
`2655`	`2649`	`}`
`2656`	`2650`	`break;`
`2657`	`2651`	`case 'x':`