diff --git a/src/com/google/javascript/jscomp/regex/RegExpTree.java b/src/com/google/javascript/jscomp/regex/RegExpTree.java
index 2e824f6c014..494f6b4b226 100644
--- a/src/com/google/javascript/jscomp/regex/RegExpTree.java
+++ b/src/com/google/javascript/jscomp/regex/RegExpTree.java
@@ -399,7 +399,7 @@ private RegExpTree parseCharset() {
         CharRanges ieExplicits = CharRanges.EMPTY;
         while (pos < limit && pattern.charAt(pos) != ']') {
           char ch = pattern.charAt(pos);
-          char start;
+          int start;
           if (ch == '\\') {
             ++pos;
             char possibleGroupName = pattern.charAt(pos);
@@ -414,7 +414,7 @@ private RegExpTree parseCharset() {
             start = ch;
             ++pos;
           }
-          char end = start;
+          int end = start;
           if (pos + 1 < limit && pattern.charAt(pos) == '-'
               && pattern.charAt(pos + 1) != ']') {
             ++pos;
@@ -464,7 +464,7 @@ private RegExpTree parseCharset() {
        * contexts, so contexts must filter those instead.
        * E.g. '\b' means a different thing inside a charset than without.
        */
-      private char parseEscapeChar() {
+      private int parseEscapeChar() {
         char ch = pattern.charAt(pos++);
         switch (ch) {
           case 'b': return '\b';
@@ -472,7 +472,12 @@ private char parseEscapeChar() {
           case 'n': return '\n';
           case 'r': return '\r';
           case 't': return '\t';
-          case 'u': return parseHex(4);
+          case 'u':
+            if (flags.contains("u") && pos < limit && pattern.charAt(pos) == '{') {
+              return parseUnicodeEscape();
+            } else {
+              return parseHex(4);
+            }
           case 'v': return '\u000b';
           case 'x': return parseHex(2);
           default:
@@ -599,7 +604,7 @@ private RegExpTree parseEscape() {
               ++pos;
               return new Charset(charGroup, CharRanges.EMPTY);
             }
-            return new Text("" + parseEscapeChar());
+            return new Text(new String(Character.toChars(parseEscapeChar())));
         }
       }
 
@@ -630,6 +635,59 @@ private char parseHex(int n) {
         return (char) result;
       }
 
+      /**
+       * Parses a braced code point escape, entered with {@code pos} on the opening '{'.
+       *
+       * <p>Consumes the hex digits and the closing '}', leaving {@code pos} just past the '}'.
+       *
+       * @return the parsed code point, at most 0x10FFFF
+       * @throws IllegalArgumentException if the braces are empty or never closed, hold a
+       *     non-hex character, or encode a value above 0x10FFFF
+       */
+      private int parseUnicodeEscape() {
+        checkState(pattern.charAt(pos) == '{');
+        int start = pos++;
+        // Check the bound before peeking: a pattern that ends right after '{' must be
+        // reported as malformed below, not surface as StringIndexOutOfBoundsException.
+        if (pos < limit && pattern.charAt(pos) == '}') {
+          throw new IllegalArgumentException(
+              "Invalid unicode escape: " + pattern.substring(start, ++pos));
+        }
+        int result = 0;
+        boolean closed = false;
+        while (pos < limit) {
+          char ch = pattern.charAt(pos++);
+          if (ch == '}') {
+            closed = true;
+            break;
+          }
+          int digit;
+          if ('0' <= ch && ch <= '9') {
+            digit = ch - '0';
+          } else if ('a' <= ch && ch <= 'f') {
+            digit = ch + (10 - 'a');
+          } else if ('A' <= ch && ch <= 'F') {
+            digit = ch + (10 - 'A');
+          } else {
+            throw new IllegalArgumentException("Invalid character in unicode escape: " + ch);
+          }
+          // Saturate once out of range: more digits cannot bring the value back, and shifting
+          // further would overflow int and could wrap into range (u{100000061} -> 0x61).
+          if (result <= 0x10FFFF) {
+            result = (result << 4) | digit;
+          }
+        }
+        if (!closed) {
+          throw new IllegalArgumentException(
+              "Malformed unicode escape: expected '}' after " + pattern.substring(start, pos));
+        }
+        if (result > 0x10FFFF) {
+          throw new IllegalArgumentException(
+              "Unicode must not be greater than 0x10FFFF: " + pattern.substring(start, pos));
+        }
+        return result;
+      }
+
       private boolean isRepetitionStart(char ch) {
         switch (ch) {
           case '?':
diff --git a/test/com/google/javascript/jscomp/parsing/ParserTest.java b/test/com/google/javascript/jscomp/parsing/ParserTest.java
index 0b0d425c719..e528d468410 100644
--- a/test/com/google/javascript/jscomp/parsing/ParserTest.java
+++ b/test/com/google/javascript/jscomp/parsing/ParserTest.java
@@ -4894,6 +4894,14 @@ public void testRegExpError() {
     parseError("/\b.\\/", "Expected '/' in regular expression literal");
   }
 
+  @Test
+  public void testRegExpUnicode() {
+    assertNodeEquality(parse("/\\u10fA/"), script(expr(regex("\\u10fA"))));
+    assertNodeEquality(parse("/\\u{10fA}/u"), script(expr(regex("\\u{10fA}", "u"))));
+    assertNodeEquality(parse("/\\u{1fA}/u"), script(expr(regex("\\u{1fA}", "u"))));
+    assertNodeEquality(parse("/\\u{10FFFF}/u"), script(expr(regex("\\u{10FFFF}", "u"))));
+  }
+
   @Test
   public void testRegExpFlags() {
     // Various valid combinations.
@@ -6556,6 +6564,10 @@ private static Node regex(String regex) {
     return new Node(Token.REGEXP, Node.newString(regex));
   }
 
+  private static Node regex(String regex, String flag) {
+    return new Node(Token.REGEXP, Node.newString(regex), Node.newString(flag));
+  }
+
   /**
    * Verify that the given code has the given parse errors.
    * @return If in IDE mode, returns a partial tree.
diff --git a/test/com/google/javascript/jscomp/regex/RegExpTreeTest.java b/test/com/google/javascript/jscomp/regex/RegExpTreeTest.java
index e363fca4393..2110430e31e 100644
--- a/test/com/google/javascript/jscomp/regex/RegExpTreeTest.java
+++ b/test/com/google/javascript/jscomp/regex/RegExpTreeTest.java
@@ -193,4 +193,36 @@ public void testBackreferencingTreatedAsStringIfNoGroup() {
     // (?: ) in expected output serves same purpose as above test
     assertRegexCompilesTo("[(?<foo>)]\\k<foo>", "", "(?:[()<>?fo]k<foo>)");
   }
+
+  @Test
+  public void testValidUnicodeEscape() {
+    assertRegexCompilesTo("\\u0061", "", "a");
+    assertRegexCompilesTo("\\u10b1", "u", "\\u10b1");
+    assertRegexCompilesTo("\\u{61}", "u", "a");
+    assertRegexCompilesTo("\\u{10b1}", "u", "\\u10b1");
+    assertRegexCompilesTo("\\u{1bc}", "u", "\\u01bc");
+    assertRegexCompilesTo("\\u{100A3}", "u", "\\ud800\\udca3");
+  }
+
+  @Test
+  public void testInvalidUnicodeEscape() {
+    assertRegexThrowsExceptionThat("\\u{a012", "u")
+        .hasMessageThat()
+        .isEqualTo("Malformed unicode escape: expected '}' after {a012");
+    assertRegexThrowsExceptionThat("\\u{", "u")
+        .hasMessageThat()
+        .isEqualTo("Malformed unicode escape: expected '}' after {");
+    assertRegexThrowsExceptionThat("\\u{}", "u")
+        .hasMessageThat()
+        .isEqualTo("Invalid unicode escape: {}");
+    assertRegexThrowsExceptionThat("\\u{10za}", "u")
+        .hasMessageThat()
+        .isEqualTo("Invalid character in unicode escape: z");
+    assertRegexThrowsExceptionThat("\\u{FFFFFF}", "u")
+        .hasMessageThat()
+        .isEqualTo("Unicode must not be greater than 0x10FFFF: {FFFFFF}");
+    assertRegexThrowsExceptionThat("\\u{100000061}", "u")
+        .hasMessageThat()
+        .isEqualTo("Unicode must not be greater than 0x10FFFF: {100000061}");
+  }
 }