Strictly parse escape sequences in identifiers.

cscott · cscott · commit 3f4fba4bccb6 · 2016-02-01T17:42:23.000-05:00
Make our escape sequence parsing match the CSS 2.1 scanner grammar (https://www.w3.org/TR/CSS21/grammar.html#scanner). In particular, don't accept "\\\r" as a valid escape sequence, and swallow the trailing whitespace character after every hex escape (not just 2-digit ones). We also tweak `readName` in the tokenizer to unescape the names it reads, which makes `@ch\041 rset` correctly parse as a `CHARSET_SYM`.
diff --git a/src/css/TokenStream.js b/src/css/TokenStream.js
@@ -1,7 +1,7 @@
 /*global Tokens, TokenStreamBase*/
 
 var h = /^[0-9a-fA-F]$/,
-    nonascii = /^[\u0080-\uFFFF]$/,
+    nonascii = /^[\u00A0-\uFFFF]$/,
     nl = /\n|\r\n|\r|\f/,
     whitespace = /\u0009|\u000a|\u000c|\u000d|\u0020/;
 
@@ -27,7 +27,7 @@ function isNewLine(c){
 }
 
 function isNameStart(c){
-    return c !== null && (/[a-z_\u0080-\uFFFF\\]/i.test(c));
+    return c !== null && (/[a-z_\u00A0-\uFFFF\\]/i.test(c));
 }
 
 function isNameChar(c){
@@ -214,6 +214,19 @@ TokenStream.prototype = mix(new TokenStreamBase(), {
                     token = this.htmlCommentStartToken(c, startLine, startCol);
                     break;
 
+                /*
+                 * Potential tokens:
+                 * - IDENT
+                 * - CHAR
+                 */
+                case "\\":
+                    if (/[^\r\n\f]/.test(reader.peek())) {
+                        token = this.identOrFunctionToken(c, startLine, startCol);
+                    } else {
+                        token = this.charToken(c, startLine, startCol);
+                    }
+                    break;
+
                 /*
                  * Potential tokens:
                  * - UNICODE_RANGE
@@ -942,8 +955,13 @@ TokenStream.prototype = mix(new TokenStreamBase(), {
 
         while(true){
             if (c == "\\"){
-                ident += this.readEscape(reader.read());
-                c = reader.peek();
+                if (/^[^\r\n\f]$/.test(reader.peek(2))) {
+                    ident += this.readEscape(reader.read(), true);
+                    c = reader.peek();
+                } else {
+                    // Bad escape sequence.
+                    break;
+                }
             } else if(c && isNameChar(c)){
                 ident += reader.read();
                 c = reader.peek();
@@ -955,7 +973,7 @@ TokenStream.prototype = mix(new TokenStreamBase(), {
         return ident;
     },
 
-    readEscape: function(first){
+    readEscape: function(first, unescape){
         var reader  = this._reader,
             cssEscape = first || "",
             i       = 0,
@@ -968,13 +986,31 @@ TokenStream.prototype = mix(new TokenStreamBase(), {
             } while(c && isHexDigit(c) && ++i < 6);
         }
 
-        if (cssEscape.length == 3 && /\s/.test(c) ||
-            cssEscape.length == 7 || cssEscape.length == 1){
+        if (cssEscape.length === 1) {
+            if (/^[^\r\n\f0-9a-f]$/.test(c)) {
                 reader.read();
+                if (unescape) { return c; }
+            } else {
+                // We should never get here (readName won't call readEscape
+                // if the escape sequence is bad).
+                throw new Error("Bad escape sequence.");
+            }
+        } else if (c === '\r') {
+            reader.read();
+            if (reader.peek() === '\n') {
+                c += reader.read();
+            }
+        } else if (/^[ \t\n\f]$/.test(c)) {
+            reader.read();
         } else {
             c = "";
         }
 
+        if (unescape) {
+            var cp = parseInt(cssEscape.slice(first.length), 16);
+            return String.fromCodePoint ? String.fromCodePoint(cp) :
+                String.fromCharCode(cp);
+        }
         return cssEscape + c;
     },
 
diff --git a/tests/css/Parser.js b/tests/css/Parser.js
@@ -1836,7 +1836,7 @@
 
             Assert.isInstanceOf(Selector, result, "Result should be an instance of Selector.");
             Assert.isInstanceOf(SelectorPart, result.parts[0], "First part should be a SelectorPart.");
-            Assert.areEqual("#\\31 a2b3c", result.parts[0].toString(), "Selector should be correct.");
+            Assert.areEqual("#1a2b3c", result.parts[0].toString(), "Selector should be correct.");
             Assert.areEqual(1, result.parts.length, "Should be one part.");
         }
 
diff --git a/tests/css/TokenStream.js b/tests/css/TokenStream.js
@@ -138,7 +138,9 @@
             "#h\\0fllo"         : [CSSTokens.HASH],
             "#ffeeff"           : [CSSTokens.HASH],
             "#\\31 a2b3c"        : [CSSTokens.HASH],
-            "#r0\\.5"            : [CSSTokens.HASH]
+            "#r0\\.5"            : [CSSTokens.HASH],
+            // Invalid escape sequence
+            "#a\\\r"             : [CSSTokens.HASH, CSSTokens.CHAR, CSSTokens.S]
         }
     }));
 
@@ -150,6 +152,7 @@
 
         var atRules = {
             "@charset"      : CSSTokens.CHARSET_SYM,
+            "@ch\\041 rset" : CSSTokens.CHARSET_SYM,
             "@import"       : CSSTokens.IMPORT_SYM,
             "@page"         : CSSTokens.PAGE_SYM,
             "@media"        : CSSTokens.MEDIA_SYM,

Original file line number	Diff line number	Diff line change
`@@ -1836,7 +1836,7 @@`
`1836`	`1836`
`1837`	`1837`	`Assert.isInstanceOf(Selector, result, "Result should be an instance of Selector.");`
`1838`	`1838`	`Assert.isInstanceOf(SelectorPart, result.parts[0], "First part should be a SelectorPart.");`
`1839`		`- Assert.areEqual("#\\31 a2b3c", result.parts[0].toString(), "Selector should be correct.");`
	`1839`	`+ Assert.areEqual("#1a2b3c", result.parts[0].toString(), "Selector should be correct.");`
`1840`	`1840`	`Assert.areEqual(1, result.parts.length, "Should be one part.");`
`1841`	`1841`	`}`
`1842`	`1842`