Comment fixes in js test and RegexParser, clarity changes

Kenji Fukuda · Kenji Fukuda · commit 4c26c3cfbfdd · 2018-07-16T11:40:49.000-07:00
diff --git a/lib/Parser/RegexParser.cpp b/lib/Parser/RegexParser.cpp
@@ -2167,7 +2167,7 @@ namespace UnifiedRegex
                     {
                         if (unicodeFlagPresent)
                         {
-                            //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
+                            //We have a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
                             //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
                             Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
                         }
@@ -2212,7 +2212,7 @@ namespace UnifiedRegex
             {
                 if (prevprevWasACharSetAndPartOfRange)
                 {
-                    //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
+                    //We have a range containing a character class; if the unicode flag is also present, we end up having to throw a "Syntax" error here
                     //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
                     if (unicodeFlagPresent)
                     {
@@ -2500,7 +2500,7 @@ namespace UnifiedRegex
                 }
                 else
                 {
-                    DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar SPEC #prod-annexB-Term 
+                    DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar spec #prod-annexB-Term 
 
                     if (!IsEOF())
                     {
@@ -2633,15 +2633,15 @@ namespace UnifiedRegex
                 standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
                 return deferredSetNode;
             case 'c':
-                if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word
+                if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word character
                 {
                     c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
                     ECConsume();
                     // fall-through for identity escape
                 }
                 else
                 {
-                    // If the lookahead is a non-alphanumeric and not a dash('-'), then treat '\' and 'c' separately.
+                    // If the lookahead is a non-alphanumeric and not an underscore ('_'), then treat '\' and 'c' separately.
                     //#sec-regular-expression-patterns-semantics 
                     ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.
                     deferredCharNode->cs[0] = '\\';
diff --git a/test/Regex/control_character_escapes.js b/test/Regex/control_character_escapes.js
@@ -26,63 +26,57 @@ var tests = [
         name : "Control characters followed by a word character ([A-Za-z0-9_])",
         body : function () 
         {
-            re = /[\c6]+/; //'6' = ascii x36
+            re = /[\c6]+/; //'6' = ascii x36, parsed as [\x16]+
             matchRegExp("6", re, null);
             matchRegExp("\\", re, null);
             matchRegExp("\\c6", re, null);
             matchRegExp("c", re, null);
             matchRegExp("\x16", re, "\x16");
             
-            re = /\c6/; //'6' = ascii x36
+            re = /\c6/; //'6' = ascii x36, parsed as "\\c6"
             matchRegExp("\\c6", re, "\\c6");
             matchRegExp("\\", re, null);
             matchRegExp("6", re, null);
             matchRegExp("c", re, null);
             matchRegExp("\x16", re, null);
             
-            re = /\c6[\c6]+/; //'6' = ascii x36
+            re = /\c6[\c6]+/; //'6' = ascii x36, parsed as "\\c6"[\x16]+
             matchRegExp("\\c6\x16", re, "\\c6\x16");
             matchRegExp("\\", re, null);
             matchRegExp("c", re, null);
             matchRegExp("\x16", re, null);
             
-            re = /[\ca]+/; //'a' = ascii x61
+            re = /[\ca]+/; //'a' = ascii x61, parsed as [\x01]+
             matchRegExp("a", re, null);
             matchRegExp("\\", re, null);
             matchRegExp("c", re, null);
             matchRegExp("00xyzabc123\x01qrst", re, "\x01");
 	    
-            re = /[\c_]+/; //'_' = ascii 0x5F
+            re = /[\c_]+/; //'_' = ascii 0x5F, parsed as [\x1F]+
             matchRegExp("\x1F\x1F\x05", re, "\x1F\x1F");
             matchRegExp("\\\\\\", re, null);
             matchRegExp("////", re, null);
             matchRegExp("ccc_", re, null);
             
-            re = /[\cG]*/; //'G' = ascii x47
+            re = /[\cG]*/; //'G' = ascii x47, parsed as [\x07]*
             matchRegExp("\x07\x06\x05", re, "\x07");
             matchRegExp("\\\\", re, "");
             matchRegExp("////", re, "");
             matchRegExp("cccG", re, "");
             
-            re = /\cG\cf/; //'G' = ascii x47, 'f' = ascii x66
-            matchRegExp("\x00\x03\x07\x06\x07\x08", re, "\x07\x06");
-            matchRegExp("\\", re, null);
-            matchRegExp("/", re, null);
-            matchRegExp("\\cG\\c6\\cf", re, null);
-            
-            re = /[\cG\c6\cf]+/; //'G' = ascii x47, '6' = ascii x36, 'f' = ascii x66
+            re = /[\cG\c6\cf]+/; //'G' = ascii x47, '6' = ascii x36, 'f' = ascii x66, parsed as [\x07\x16\x06]+
             matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06\x16\x07");
             matchRegExp("\\\\", re, null);
             matchRegExp("////", re, null);
             matchRegExp("cfG6", re, null);
             
-            re = /\cG\cf/; //'G' = ascii x47, 'f' = ascii x66
+            re = /\cG\cf/; //'G' = ascii x47, 'f' = ascii x66, parsed as "\x07\x06"
             matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06");
             matchRegExp("\\", re, null);
             matchRegExp("/", re, null);
             matchRegExp("\\cG\\c6\\cf", re, null);
             
-            re = /[\cz\cZ]+/; //'z' = ascii x7A, 'Z' = ascii x5A, have the same lowest 5 bits
+            re = /[\cz\cZ]+/; //'z' = ascii x7A, 'Z' = ascii x5A, have the same lowest 5 bits, parsed as [\x1A]+
             matchRegExp("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + 
                         "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", re, "\x1a");
             matchRegExp("\\\\", re, null);
@@ -94,28 +88,28 @@ var tests = [
         name : "Control characters followed by a non-word character ([^A-Za-z0-9_])",
         body : function () 
         {
-            re = /[\c*]+/; //'*' = ascii 42
+            re = /[\c*]+/; //'*' = ascii 42, parsed as [\\c*]+ 
             matchRegExp("\x0a\x09\x08", re, null);
             matchRegExp("a*c*b*d*", re, "*c*");
             matchRegExp("\\\\", re, "\\\\");
             matchRegExp("////", re, null);
             matchRegExp("ccc", re, "ccc");
             
-            re = /[\c}]*/; //'}' = ascii 125
+            re = /[\c}]*/; //'}' = ascii 125, parsed as [\\c}]*
             matchRegExp("\x1d\x7d\x3d", re, "");
             matchRegExp("}c}}cd*c*b*d*", re, "}c}}c");
             matchRegExp("\\\\", re, "\\\\");
             matchRegExp("////", re, "");
             matchRegExp("ccc", re, "ccc");
             
-            re = /[\c;]+/; //';' = ascii 59
+            re = /[\c;]+/; //';' = ascii 59, parsed as [\\c;]+
             matchRegExp("\x1b\x1c", re, null);
             matchRegExp("d;c;d;*", re, ";c;");
             matchRegExp("\\\\", re, "\\\\");
             matchRegExp("////", re, null);
             matchRegExp("ccc", re, "ccc");
             
-            re = /\c%/; //'%' = ascii x25
+            re = /\c%/; //'%' = ascii x25, parsed as \\c%
             matchRegExp("\\", re, null);
             matchRegExp("\\", re, null);
             matchRegExp("\\c%", re, "\\c%");
@@ -126,67 +120,67 @@ var tests = [
         name : "Control Character tests with unicode flag present",
         body : function () 
         {
-            re = /[\cAg]+/u; //'A' = ascii x41
+            re = /[\cAg]+/u; //'A' = ascii x41, parsed as [g\x01]+
             matchRegExp("abcdefghi", re, "g");
             matchRegExp("\\\\", re, null);
             matchRegExp("////", re, null);
             matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01\x01gg");            
             
-            re = /[\czA]+/u;  //'z' = ascii x7A
+            re = /[\czA]+/u;  //'z' = ascii x7A, parsed as [\x1AA]+
             matchRegExp("abcdefghi", re, null);
             matchRegExp("\\\\", re, null);
             matchRegExp("////", re, null);
             matchRegExp("YZA\x1aABC", re, "A\x1aA");    
             
-            assert.throws(() => eval("\"\".match(/[\\c]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/[\\c]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by no character here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/[\\c-d]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/[\\c-d]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a dash, '-', here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/[ab\\c_$]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/[ab\\c_$]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by an underscore, '_', here.",
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/[ab\\c\\d]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/[ab\\c\\d]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a backslash, '\\', here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/[ab\\c3]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/[ab\\c3]/u)"), SyntaxError, "(Character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a number, '3', here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
                         
-            re = /\cAg/u;  //'A' = ascii x41
+            re = /\cAg/u;  //'A' = ascii x41, parsed as "\x01g"
             matchRegExp("abcdefghi", re, null);
             matchRegExp("\\\\", re, null);
             matchRegExp("////", re, null);
             matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01g");            
             
-            re = /\czA/u;  //'z' = ascii x7A
+            re = /\czA/u;  //'z' = ascii x7A, parsed as "\x1aA"
             matchRegExp("abcdefghi", re, null);
             matchRegExp("\\\\", re, null);
             matchRegExp("////", re, null);
             matchRegExp("YZA\x1aABC", re, "\x1aA");   
             
-            assert.throws(() => eval("\"\".match(/\\c/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/\\c/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by no character here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/\\c-d/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/\\c-d/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a dash, '-', here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/ab\\c_$/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/ab\\c_$/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by an underscore, '_', here.",
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/ab\\c\\d/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/ab\\c\\d/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a backslash, '\\', here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
-            assert.throws(() => eval("\"\".match(/ab\\c3/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+            assert.throws(() => eval("\"\".match(/ab\\c3/u)"), SyntaxError, "(Non-character class) Expected an error because escaped c must be followed by a letter when unicode flag is present, but is followed by a number, '3', here.", 
                         "Invalid regular expression: invalid escape in unicode pattern");
         }
     },
     {
         name : "Control character edge cases",
         body : function () 
         {
-            re = /[\c-g]+/; //'-' = ascii x2D
+            re = /[\c-g]+/; //'-' = ascii x2D, parsed as [\\c-g]+ 
             matchRegExp("abcdefghi", re, "cdefg");
             matchRegExp("\\\\", re, "\\\\");
             matchRegExp("////", re, null);
             matchRegExp("\x0d", re, null);
             matchRegExp("aobd\\f\\d", re, "d\\f\\d");            
             
-            re = /[\c-]+/; //'-' = ascii x2D
+            re = /[\c-]+/; //'-' = ascii x2D, parsed as [\\c-]+
             matchRegExp("abcdefghi", re, "c");
-            matchRegExp("\x0d", re, null);
+            matchRegExp("\x0dc--c", re, "c--c");
             matchRegExp("\\\\", re, "\\\\");
             matchRegExp("////", re, null);
             matchRegExp("aobd\\f\\d", re, "\\");  
@@ -198,4 +192,4 @@ var tests = [
 
 testRunner.runTests(tests, {
     verbose : WScript.Arguments[0] != "summary"
-});
+});

Original file line number	Diff line number	Diff line change
`@@ -2167,7 +2167,7 @@ namespace UnifiedRegex`
`2167`	`2167`	`{`
`2168`	`2168`	`if (unicodeFlagPresent)`
`2169`	`2169`	`{`
`2170`		`- //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
	`2170`	`+ //We have a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
`2171`	`2171`	`//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.`
`2172`	`2172`	`Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb`
`2173`	`2173`	`}`
`@@ -2212,7 +2212,7 @@ namespace UnifiedRegex`
`2212`	`2212`	`{`
`2213`	`2213`	`if (prevprevWasACharSetAndPartOfRange)`
`2214`	`2214`	`{`
`2215`		`- //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
	`2215`	`+ //We have a range containing a character class; if the unicode flag is also present, we end up having to throw a "Syntax" error here`
`2216`	`2216`	`//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.`
`2217`	`2217`	`if (unicodeFlagPresent)`
`2218`	`2218`	`{`
`@@ -2500,7 +2500,7 @@ namespace UnifiedRegex`
`2500`	`2500`	`}`
`2501`	`2501`	`else`
`2502`	`2502`	`{`
`2503`		`- DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar SPEC #prod-annexB-Term`
	`2503`	`+ DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar spec #prod-annexB-Term`
`2504`	`2504`
`2505`	`2505`	`if (!IsEOF())`
`2506`	`2506`	`{`
`@@ -2633,15 +2633,15 @@ namespace UnifiedRegex`
`2633`	`2633`	`standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);`
`2634`	`2634`	`return deferredSetNode;`
`2635`	`2635`	`case 'c':`
`2636`		`- if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word`
	`2636`	`+ if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word character`
`2637`	`2637`	`{`
`2638`	`2638`	`c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);`
`2639`	`2639`	`ECConsume();`
`2640`	`2640`	`// fall-through for identity escape`
`2641`	`2641`	`}`
`2642`	`2642`	`else`
`2643`	`2643`	`{`
`2644`		`- // If the lookahead is a non-alphanumeric and not a dash('-'), then treat '\' and 'c' separately.`
	`2644`	`+ // If the lookahead is a non-alphanumeric and not an underscore ('_'), then treat '\' and 'c' separately.`
`2645`	`2645`	`//#sec-regular-expression-patterns-semantics`
`2646`	`2646`	`ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.`
`2647`	`2647`	`deferredCharNode->cs[0] = '\\';`