[GR-38034] Fix escaping of "/" in Regexp#source

andrykonchin · andrykonchin · commit 2d0ae5e3343a · 2022-06-30T10:14:02.000Z
PullRequest: truffleruby/3415
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ Compatibility:
 
 * Fix `Array#fill` to raise `TypeError` instead of `ArgumentError` when the length argument is not numeric (#2652, @andrykonchin).
 * Warn when a global variable is not initialized (#2595, @andrykonchin).
+* Fix escaping of `/` by `Regexp#source` (#2569, @andrykonchin).
 
 Performance:
 
diff --git a/spec/ruby/core/regexp/source_spec.rb b/spec/ruby/core/regexp/source_spec.rb
@@ -9,8 +9,26 @@
     /x(.)xz/.source.should == "x(.)xz"
   end
 
-  it "will remove escape characters" do
-    /foo\/bar/.source.should == "foo/bar"
+  it "keeps escape sequences as is" do
+    /\x20\+/.source.should == '\x20\+'
+  end
+
+  describe "escaping" do
+    it "keeps escaping of metacharacter" do
+      /\$/.source.should == "\\$"
+    end
+
+    it "keeps escaping of metacharacter used as a terminator" do
+      %r+\++.source.should == "\\+"
+    end
+
+    it "removes escaping of non-metacharacter used as a terminator" do
+      %r@\@@.source.should == "@"
+    end
+
+    it "keeps escaping of non-metacharacter not used as a terminator" do
+      /\@/.source.should == "\\@"
+    end
   end
 
   not_supported_on :opal do
diff --git a/spec/ruby/language/regexp/escapes_spec.rb b/spec/ruby/language/regexp/escapes_spec.rb
@@ -2,8 +2,10 @@
 require_relative '../../spec_helper'
 require_relative '../fixtures/classes'
 
+# TODO: synchronize with spec/core/regexp/new_spec.rb -
+#       escaping is also tested there
 describe "Regexps with escape characters" do
-  it "they're supported" do
+  it "supports escape sequences" do
     /\t/.match("\t").to_a.should == ["\t"] # horizontal tab
     /\v/.match("\v").to_a.should == ["\v"] # vertical tab
     /\n/.match("\n").to_a.should == ["\n"] # newline
@@ -15,16 +17,16 @@
     # \nnn         octal char            (encoded byte value)
   end
 
-  it "support quoting meta-characters via escape sequence" do
-    /\\/.match("\\").to_a.should == ["\\"]
-    /\//.match("/").to_a.should == ["/"]
+  it "supports quoting meta-characters via escape sequence" do
     # parenthesis, etc
     /\(/.match("(").to_a.should == ["("]
     /\)/.match(")").to_a.should == [")"]
     /\[/.match("[").to_a.should == ["["]
     /\]/.match("]").to_a.should == ["]"]
     /\{/.match("{").to_a.should == ["{"]
     /\}/.match("}").to_a.should == ["}"]
+    /\</.match("<").to_a.should == ["<"]
+    /\>/.match(">").to_a.should == [">"]
     # alternation separator
     /\|/.match("|").to_a.should == ["|"]
     # quantifiers
@@ -37,11 +39,81 @@
     /\$/.match("$").to_a.should == ["$"]
   end
 
+  it "supports quoting meta-characters via escape sequence when used as a terminator" do
+    # parenthesis, etc
+    # literals such as %r[[, %r((, etc. are forbidden
+    %r(\().match("(").to_a.should == ["("]
+    %r(\)).match(")").to_a.should == [")"]
+    %r)\().match("(").to_a.should == ["("]
+    %r)\)).match(")").to_a.should == [")"]
+
+    %r[\[].match("[").to_a.should == ["["]
+    %r[\]].match("]").to_a.should == ["]"]
+    %r]\[].match("[").to_a.should == ["["]
+    %r]\]].match("]").to_a.should == ["]"]
+
+    %r{\{}.match("{").to_a.should == ["{"]
+    %r{\}}.match("}").to_a.should == ["}"]
+    %r}\{}.match("{").to_a.should == ["{"]
+    %r}\}}.match("}").to_a.should == ["}"]
+
+    %r<\<>.match("<").to_a.should == ["<"]
+    %r<\>>.match(">").to_a.should == [">"]
+    %r>\<>.match("<").to_a.should == ["<"]
+    %r>\>>.match(">").to_a.should == [">"]
+
+    # alternation separator
+    %r|\||.match("|").to_a.should == ["|"]
+    # quantifiers
+    %r?\??.match("?").to_a.should == ["?"]
+    %r.\...match(".").to_a.should == ["."]
+    %r*\**.match("*").to_a.should == ["*"]
+    %r+\++.match("+").to_a.should == ["+"]
+    # line anchors
+    %r^\^^.match("^").to_a.should == ["^"]
+    %r$\$$.match("$").to_a.should == ["$"]
+  end
+
+  it "supports quoting non-meta-characters via escape sequence when used as a terminator" do
+    non_meta_character_terminators = [
+      '!', '"', '#', '%', '&', "'", ',', '-', ':', ';', '@', '_', '`', '/', '=', '~'
+    ]
+
+    non_meta_character_terminators.each do |c|
+      pattern = eval("%r" + c + "\\" + c + c)
+      pattern.match(c).to_a.should == [c]
+    end
+  end
+
+  it "does not change semantics of escaped non-meta-character when used as a terminator" do
+    all_terminators = [*("!".."/"), *(":".."@"), *("[".."`"), *("{".."~")]
+    meta_character_terminators = ["$", "^", "*", "+", ".", "?", "|", "}", ")", ">", "]"]
+    special_cases = ['(', '{', '[', '<', '\\']
+
+    # This should be equivalent to:
+    #   [ '!', '"', '#', '%', '&', "'", ',', '-', ':', ';', '@', '_', '`', '/', '=', '~' ]
+    non_meta_character_terminators = all_terminators - meta_character_terminators - special_cases
+
+    non_meta_character_terminators.each do |c|
+      pattern = eval("%r" + c + "\\" + c + c)
+      pattern.should == /#{c}/
+    end
+  end
+
+  it "does not change semantics of escaped meta-character when used as a terminator" do
+    meta_character_terminators = ["$", "^", "*", "+", ".", "?", "|", "}", ")", ">", "]"]
+
+    meta_character_terminators.each do |c|
+      pattern = eval("%r" + c + "\\" + c + c)
+      pattern.should == eval("/\\#{c}/")
+    end
+  end
+
   it "allows any character to be escaped" do
     /\y/.match("y").to_a.should == ["y"]
   end
 
-  it "support \\x (hex characters)" do
+  it "supports \\x (hex characters)" do
     /\xA/.match("\nxyz").to_a.should == ["\n"]
     /\x0A/.match("\n").to_a.should == ["\n"]
     /\xAA/.match("\nA").should be_nil
@@ -53,7 +125,7 @@
     # \x{7HHHHHHH} wide hexadecimal char (character code point value)
   end
 
-  it "support \\c (control characters)" do
+  it "supports \\c (control characters)" do
     #/\c \c@\c`/.match("\00\00\00").to_a.should == ["\00\00\00"]
     /\c#\cc\cC/.match("\03\03\03").to_a.should == ["\03\03\03"]
     /\c'\cG\cg/.match("\a\a\a").to_a.should == ["\a\a\a"]
diff --git a/spec/ruby/language/regexp_spec.rb b/spec/ruby/language/regexp_spec.rb
@@ -96,7 +96,6 @@
     /./.match("\0").to_a.should == ["\0"]
   end
 
-
   it "supports | (alternations)" do
     /a|b/.match("a").to_a.should == ["a"]
   end
@@ -161,26 +160,6 @@
     pattern.should_not =~ 'T'
   end
 
-  escapable_terminators =  ['!', '"', '#', '%', '&', "'", ',', '-', ':', ';', '@', '_', '`']
-
-  it "supports escaping characters when used as a terminator" do
-    escapable_terminators.each do |c|
-      ref = "(?-mix:#{c})"
-      pattern = eval("%r" + c + "\\" + c + c)
-      pattern.to_s.should == ref
-    end
-  end
-
-  it "treats an escaped non-escapable character normally when used as a terminator" do
-    all_terminators = [*("!".."/"), *(":".."@"), *("[".."`"), *("{".."~")]
-    special_cases = ['(', '{', '[', '<', '\\', '=', '~']
-    (all_terminators - special_cases - escapable_terminators).each do |c|
-      ref = "(?-mix:\\#{c})"
-      pattern = eval("%r" + c + "\\" + c + c)
-      pattern.to_s.should == ref
-    end
-  end
-
   it "support handling unicode 9.0 characters with POSIX bracket expressions" do
     char_lowercase = "\u{104D8}" # OSAGE SMALL LETTER A
     /[[:lower:]]/.match(char_lowercase).to_s.should == char_lowercase
diff --git a/src/main/java/org/truffleruby/parser/lexer/StringTerm.java b/src/main/java/org/truffleruby/parser/lexer/StringTerm.java
@@ -45,10 +45,6 @@
 import static org.truffleruby.parser.lexer.RubyLexer.isHexChar;
 import static org.truffleruby.parser.lexer.RubyLexer.isOctChar;
 
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
 import org.jcodings.Encoding;
 import org.truffleruby.core.regexp.RegexpOptions;
 import org.truffleruby.core.rope.Rope;
@@ -60,10 +56,6 @@
 
 public class StringTerm extends StrTerm {
 
-    // Chanacters that can be escaped in a %r style regexp literal when they are also the terminator.
-    private static final Set<Character> REGEXP_ESCAPABLE_TERMINATORS = new HashSet<>(
-            Arrays.asList(new Character[]{ '!', '"', '#', '%', '&', '\'', ',', '-', ':', ';', '@', '_', '`' }));
-
     // Expand variables, Indentation of final marker
     private int flags;
 
@@ -356,12 +348,11 @@ public int parseStringIntoBuffer(RubyLexer lexer, RopeBuilder buffer, Encoding e
 
                         if (regexp) {
                             if (c == end && !simple_re_meta(c)) {
-                                buffer.append('\\');
                                 buffer.append(c);
                                 continue;
                             }
                             lexer.pushback(c);
-                            parseEscapeIntoBuffer(regexp, lexer, buffer);
+                            parseEscapeIntoBuffer(lexer, buffer);
 
                             if (hasNonAscii && buffer.getEncoding() != enc[0]) {
                                 mixedEscape(lexer, buffer.getEncoding(), enc[0]);
@@ -419,9 +410,6 @@ public int parseStringIntoBuffer(RubyLexer lexer, RopeBuilder buffer, Encoding e
     }
 
     private boolean simple_re_meta(int c) {
-        if (c == end) {
-            return true;
-        }
         switch (c) {
             case '$':
             case '*':
@@ -442,12 +430,12 @@ private boolean simple_re_meta(int c) {
 
     // Was a goto in original ruby lexer
     @SuppressWarnings("fallthrough")
-    private void escaped(boolean regexp, RubyLexer lexer, RopeBuilder buffer) {
+    private void escaped(RubyLexer lexer, RopeBuilder buffer) {
         int c;
 
         switch (c = lexer.nextc()) {
             case '\\':
-                parseEscapeIntoBuffer(regexp, lexer, buffer);
+                parseEscapeIntoBuffer(lexer, buffer);
                 break;
             case EOF:
                 lexer.compile_error("Invalid escape character syntax");
@@ -457,7 +445,7 @@ private void escaped(boolean regexp, RubyLexer lexer, RopeBuilder buffer) {
     }
 
     @SuppressWarnings("fallthrough")
-    private void parseEscapeIntoBuffer(boolean regexp, RubyLexer lexer, RopeBuilder buffer) {
+    private void parseEscapeIntoBuffer(RubyLexer lexer, RopeBuilder buffer) {
         int c;
 
         switch (c = lexer.nextc()) {
@@ -505,41 +493,24 @@ private void parseEscapeIntoBuffer(boolean regexp, RubyLexer lexer, RopeBuilder
                     lexer.compile_error("Invalid escape character syntax");
                 }
                 buffer.append(new byte[]{ '\\', 'M', '-' });
-                escaped(regexp, lexer, buffer);
+                escaped(lexer, buffer);
                 break;
             case 'C':
                 if ((lexer.nextc()) != '-') {
                     lexer.compile_error("Invalid escape character syntax");
                 }
                 buffer.append(new byte[]{ '\\', 'C', '-' });
-                escaped(regexp, lexer, buffer);
+                escaped(lexer, buffer);
                 break;
             case 'c':
                 buffer.append(new byte[]{ '\\', 'c' });
-                escaped(regexp, lexer, buffer);
+                escaped(lexer, buffer);
                 break;
             case EOF:
                 lexer.compile_error("Invalid escape character syntax");
             default:
-                if (regexp) {
-                    simpleRegexpEscape(buffer, c);
-                } else {
-                    simpleStringEscape(buffer, c);
-                }
-        }
-    }
-
-    private void simpleRegexpEscape(RopeBuilder buffer, int c) {
-        if (c == end && REGEXP_ESCAPABLE_TERMINATORS.contains((char) c)) {
-            buffer.append(c);
-        } else {
-            buffer.append('\\');
-            buffer.append(c);
+                buffer.append('\\');
+                buffer.append(c);
         }
     }
-
-    private void simpleStringEscape(RopeBuilder buffer, int c) {
-        buffer.append('\\');
-        buffer.append(c);
-    }
 }
diff --git a/src/main/java/org/truffleruby/parser/parser/ParserSupport.java b/src/main/java/org/truffleruby/parser/parser/ParserSupport.java
@@ -645,7 +645,7 @@ private void handleUselessWarn(ParseNode node, String useless) {
                 "Useless use of " + useless + " in void context.");
     }
 
-    /** Check to see if current node is an useless statement. If useless a warning if printed.
+    /** Check to see if the current node is a useless statement. If it is useless, a warning is printed.
      *
      * @param node to be checked. */
     public void checkUselessStatement(ParseNode node) {
@@ -1802,7 +1802,7 @@ public RopeWithEncoding setRegexpEncoding(RegexpParseNode end, Rope value) {
 
     protected ClassicRegexp checkRegexpSyntax(Rope value, RegexpOptions options) {
         try {
-            // This is only for syntax checking but this will as a side-effect create an entry in the regexp cache.
+            // This is only for syntax checking, but as a side effect it creates an entry in the regexp cache.
             return new ClassicRegexp(
                     getConfiguration().getContext(),
                     value,