Skip to content

Commit 2d0ae5e

Browse files
committed
[GR-38034] Fix escaping of "/" in Regexp#source
PullRequest: truffleruby/3415
2 parents b131d3c + 81578cf commit 2d0ae5e

File tree

6 files changed

+110
-69
lines changed

6 files changed

+110
-69
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Compatibility:
1111

1212
* Fix `Array#fill` to raise `TypeError` instead of `ArgumentError` when the length argument is not numeric (#2652, @andrykonchin).
1313
* Warn when a global variable is not initialized (#2595, @andrykonchin).
14+
* Fix escaping of `/` by `Regexp#source` (#2569, @andrykonchin).
1415

1516
Performance:
1617

spec/ruby/core/regexp/source_spec.rb

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,26 @@
99
/x(.)xz/.source.should == "x(.)xz"
1010
end
1111

12-
it "will remove escape characters" do
13-
/foo\/bar/.source.should == "foo/bar"
12+
it "keeps escape sequences as is" do
13+
/\x20\+/.source.should == '\x20\+'
14+
end
15+
16+
describe "escaping" do
17+
it "keeps escaping of metacharacter" do
18+
/\$/.source.should == "\\$"
19+
end
20+
21+
it "keeps escaping of metacharacter used as a terminator" do
22+
%r+\++.source.should == "\\+"
23+
end
24+
25+
it "removes escaping of non-metacharacter used as a terminator" do
26+
%r@\@@.source.should == "@"
27+
end
28+
29+
it "keeps escaping of non-metacharacter not used as a terminator" do
30+
/\@/.source.should == "\\@"
31+
end
1432
end
1533

1634
not_supported_on :opal do

spec/ruby/language/regexp/escapes_spec.rb

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
require_relative '../../spec_helper'
33
require_relative '../fixtures/classes'
44

5+
# TODO: synchronize with spec/core/regexp/new_spec.rb -
6+
# escaping is also tested there
57
describe "Regexps with escape characters" do
6-
it "they're supported" do
8+
it "supports escape sequences" do
79
/\t/.match("\t").to_a.should == ["\t"] # horizontal tab
810
/\v/.match("\v").to_a.should == ["\v"] # vertical tab
911
/\n/.match("\n").to_a.should == ["\n"] # newline
@@ -15,16 +17,16 @@
1517
# \nnn octal char (encoded byte value)
1618
end
1719

18-
it "support quoting meta-characters via escape sequence" do
19-
/\\/.match("\\").to_a.should == ["\\"]
20-
/\//.match("/").to_a.should == ["/"]
20+
it "supports quoting meta-characters via escape sequence" do
2121
# parenthesis, etc
2222
/\(/.match("(").to_a.should == ["("]
2323
/\)/.match(")").to_a.should == [")"]
2424
/\[/.match("[").to_a.should == ["["]
2525
/\]/.match("]").to_a.should == ["]"]
2626
/\{/.match("{").to_a.should == ["{"]
2727
/\}/.match("}").to_a.should == ["}"]
28+
/\</.match("<").to_a.should == ["<"]
29+
/\>/.match(">").to_a.should == [">"]
2830
# alternation separator
2931
/\|/.match("|").to_a.should == ["|"]
3032
# quantifiers
@@ -37,11 +39,81 @@
3739
/\$/.match("$").to_a.should == ["$"]
3840
end
3941

42+
it "supports quoting meta-characters via escape sequence when used as a terminator" do
43+
# parenthesis, etc
44+
# %r[[, %r((, etc literals - are forbidden
45+
%r(\().match("(").to_a.should == ["("]
46+
%r(\)).match(")").to_a.should == [")"]
47+
%r)\().match("(").to_a.should == ["("]
48+
%r)\)).match(")").to_a.should == [")"]
49+
50+
%r[\[].match("[").to_a.should == ["["]
51+
%r[\]].match("]").to_a.should == ["]"]
52+
%r]\[].match("[").to_a.should == ["["]
53+
%r]\]].match("]").to_a.should == ["]"]
54+
55+
%r{\{}.match("{").to_a.should == ["{"]
56+
%r{\}}.match("}").to_a.should == ["}"]
57+
%r}\{}.match("{").to_a.should == ["{"]
58+
%r}\}}.match("}").to_a.should == ["}"]
59+
60+
%r<\<>.match("<").to_a.should == ["<"]
61+
%r<\>>.match(">").to_a.should == [">"]
62+
%r>\<>.match("<").to_a.should == ["<"]
63+
%r>\>>.match(">").to_a.should == [">"]
64+
65+
# alternation separator
66+
%r|\||.match("|").to_a.should == ["|"]
67+
# quantifiers
68+
%r?\??.match("?").to_a.should == ["?"]
69+
%r.\...match(".").to_a.should == ["."]
70+
%r*\**.match("*").to_a.should == ["*"]
71+
%r+\++.match("+").to_a.should == ["+"]
72+
# line anchors
73+
%r^\^^.match("^").to_a.should == ["^"]
74+
%r$\$$.match("$").to_a.should == ["$"]
75+
end
76+
77+
it "supports quoting non-meta-characters via escape sequence when used as a terminator" do
78+
non_meta_character_terminators = [
79+
'!', '"', '#', '%', '&', "'", ',', '-', ':', ';', '@', '_', '`', '/', '=', '~'
80+
]
81+
82+
non_meta_character_terminators.each do |c|
83+
pattern = eval("%r" + c + "\\" + c + c)
84+
pattern.match(c).to_a.should == [c]
85+
end
86+
end
87+
88+
it "does not change semantics of escaped non-meta-character when used as a terminator" do
89+
all_terminators = [*("!".."/"), *(":".."@"), *("[".."`"), *("{".."~")]
90+
meta_character_terminators = ["$", "^", "*", "+", ".", "?", "|", "}", ")", ">", "]"]
91+
special_cases = ['(', '{', '[', '<', '\\']
92+
93+
# it should be equivalent to
94+
# [ '!', '"', '#', '%', '&', "'", ',', '-', ':', ';', '@', '_', '`', '/', '=', '~' ]
95+
non_meta_character_terminators = all_terminators - meta_character_terminators - special_cases
96+
97+
non_meta_character_terminators.each do |c|
98+
pattern = eval("%r" + c + "\\" + c + c)
99+
pattern.should == /#{c}/
100+
end
101+
end
102+
103+
it "does not change semantics of escaped meta-character when used as a terminator" do
104+
meta_character_terminators = ["$", "^", "*", "+", ".", "?", "|", "}", ")", ">", "]"]
105+
106+
meta_character_terminators.each do |c|
107+
pattern = eval("%r" + c + "\\" + c + c)
108+
pattern.should == eval("/\\#{c}/")
109+
end
110+
end
111+
40112
it "allows any character to be escaped" do
41113
/\y/.match("y").to_a.should == ["y"]
42114
end
43115

44-
it "support \\x (hex characters)" do
116+
it "supports \\x (hex characters)" do
45117
/\xA/.match("\nxyz").to_a.should == ["\n"]
46118
/\x0A/.match("\n").to_a.should == ["\n"]
47119
/\xAA/.match("\nA").should be_nil
@@ -53,7 +125,7 @@
53125
# \x{7HHHHHHH} wide hexadecimal char (character code point value)
54126
end
55127

56-
it "support \\c (control characters)" do
128+
it "supports \\c (control characters)" do
57129
#/\c \c@\c`/.match("\00\00\00").to_a.should == ["\00\00\00"]
58130
/\c#\cc\cC/.match("\03\03\03").to_a.should == ["\03\03\03"]
59131
/\c'\cG\cg/.match("\a\a\a").to_a.should == ["\a\a\a"]

spec/ruby/language/regexp_spec.rb

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@
9696
/./.match("\0").to_a.should == ["\0"]
9797
end
9898

99-
10099
it "supports | (alternations)" do
101100
/a|b/.match("a").to_a.should == ["a"]
102101
end
@@ -161,26 +160,6 @@
161160
pattern.should_not =~ 'T'
162161
end
163162

164-
escapable_terminators = ['!', '"', '#', '%', '&', "'", ',', '-', ':', ';', '@', '_', '`']
165-
166-
it "supports escaping characters when used as a terminator" do
167-
escapable_terminators.each do |c|
168-
ref = "(?-mix:#{c})"
169-
pattern = eval("%r" + c + "\\" + c + c)
170-
pattern.to_s.should == ref
171-
end
172-
end
173-
174-
it "treats an escaped non-escapable character normally when used as a terminator" do
175-
all_terminators = [*("!".."/"), *(":".."@"), *("[".."`"), *("{".."~")]
176-
special_cases = ['(', '{', '[', '<', '\\', '=', '~']
177-
(all_terminators - special_cases - escapable_terminators).each do |c|
178-
ref = "(?-mix:\\#{c})"
179-
pattern = eval("%r" + c + "\\" + c + c)
180-
pattern.to_s.should == ref
181-
end
182-
end
183-
184163
it "support handling unicode 9.0 characters with POSIX bracket expressions" do
185164
char_lowercase = "\u{104D8}" # OSAGE SMALL LETTER A
186165
/[[:lower:]]/.match(char_lowercase).to_s.should == char_lowercase

src/main/java/org/truffleruby/parser/lexer/StringTerm.java

Lines changed: 9 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,6 @@
4545
import static org.truffleruby.parser.lexer.RubyLexer.isHexChar;
4646
import static org.truffleruby.parser.lexer.RubyLexer.isOctChar;
4747

48-
import java.util.Arrays;
49-
import java.util.HashSet;
50-
import java.util.Set;
51-
5248
import org.jcodings.Encoding;
5349
import org.truffleruby.core.regexp.RegexpOptions;
5450
import org.truffleruby.core.rope.Rope;
@@ -60,10 +56,6 @@
6056

6157
public class StringTerm extends StrTerm {
6258

63-
// Chanacters that can be escaped in a %r style regexp literal when they are also the terminator.
64-
private static final Set<Character> REGEXP_ESCAPABLE_TERMINATORS = new HashSet<>(
65-
Arrays.asList(new Character[]{ '!', '"', '#', '%', '&', '\'', ',', '-', ':', ';', '@', '_', '`' }));
66-
6759
// Expand variables, Indentation of final marker
6860
private int flags;
6961

@@ -356,12 +348,11 @@ public int parseStringIntoBuffer(RubyLexer lexer, RopeBuilder buffer, Encoding e
356348

357349
if (regexp) {
358350
if (c == end && !simple_re_meta(c)) {
359-
buffer.append('\\');
360351
buffer.append(c);
361352
continue;
362353
}
363354
lexer.pushback(c);
364-
parseEscapeIntoBuffer(regexp, lexer, buffer);
355+
parseEscapeIntoBuffer(lexer, buffer);
365356

366357
if (hasNonAscii && buffer.getEncoding() != enc[0]) {
367358
mixedEscape(lexer, buffer.getEncoding(), enc[0]);
@@ -419,9 +410,6 @@ public int parseStringIntoBuffer(RubyLexer lexer, RopeBuilder buffer, Encoding e
419410
}
420411

421412
private boolean simple_re_meta(int c) {
422-
if (c == end) {
423-
return true;
424-
}
425413
switch (c) {
426414
case '$':
427415
case '*':
@@ -442,12 +430,12 @@ private boolean simple_re_meta(int c) {
442430

443431
// Was a goto in original ruby lexer
444432
@SuppressWarnings("fallthrough")
445-
private void escaped(boolean regexp, RubyLexer lexer, RopeBuilder buffer) {
433+
private void escaped(RubyLexer lexer, RopeBuilder buffer) {
446434
int c;
447435

448436
switch (c = lexer.nextc()) {
449437
case '\\':
450-
parseEscapeIntoBuffer(regexp, lexer, buffer);
438+
parseEscapeIntoBuffer(lexer, buffer);
451439
break;
452440
case EOF:
453441
lexer.compile_error("Invalid escape character syntax");
@@ -457,7 +445,7 @@ private void escaped(boolean regexp, RubyLexer lexer, RopeBuilder buffer) {
457445
}
458446

459447
@SuppressWarnings("fallthrough")
460-
private void parseEscapeIntoBuffer(boolean regexp, RubyLexer lexer, RopeBuilder buffer) {
448+
private void parseEscapeIntoBuffer(RubyLexer lexer, RopeBuilder buffer) {
461449
int c;
462450

463451
switch (c = lexer.nextc()) {
@@ -505,41 +493,24 @@ private void parseEscapeIntoBuffer(boolean regexp, RubyLexer lexer, RopeBuilder
505493
lexer.compile_error("Invalid escape character syntax");
506494
}
507495
buffer.append(new byte[]{ '\\', 'M', '-' });
508-
escaped(regexp, lexer, buffer);
496+
escaped(lexer, buffer);
509497
break;
510498
case 'C':
511499
if ((lexer.nextc()) != '-') {
512500
lexer.compile_error("Invalid escape character syntax");
513501
}
514502
buffer.append(new byte[]{ '\\', 'C', '-' });
515-
escaped(regexp, lexer, buffer);
503+
escaped(lexer, buffer);
516504
break;
517505
case 'c':
518506
buffer.append(new byte[]{ '\\', 'c' });
519-
escaped(regexp, lexer, buffer);
507+
escaped(lexer, buffer);
520508
break;
521509
case EOF:
522510
lexer.compile_error("Invalid escape character syntax");
523511
default:
524-
if (regexp) {
525-
simpleRegexpEscape(buffer, c);
526-
} else {
527-
simpleStringEscape(buffer, c);
528-
}
529-
}
530-
}
531-
532-
private void simpleRegexpEscape(RopeBuilder buffer, int c) {
533-
if (c == end && REGEXP_ESCAPABLE_TERMINATORS.contains((char) c)) {
534-
buffer.append(c);
535-
} else {
536-
buffer.append('\\');
537-
buffer.append(c);
512+
buffer.append('\\');
513+
buffer.append(c);
538514
}
539515
}
540-
541-
private void simpleStringEscape(RopeBuilder buffer, int c) {
542-
buffer.append('\\');
543-
buffer.append(c);
544-
}
545516
}

src/main/java/org/truffleruby/parser/parser/ParserSupport.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,7 @@ private void handleUselessWarn(ParseNode node, String useless) {
645645
"Useless use of " + useless + " in void context.");
646646
}
647647

648-
/** Check to see if current node is an useless statement. If useless a warning if printed.
648+
/** Check to see if current node is a useless statement. If useless a warning is printed.
649649
*
650650
* @param node to be checked. */
651651
public void checkUselessStatement(ParseNode node) {
@@ -1802,7 +1802,7 @@ public RopeWithEncoding setRegexpEncoding(RegexpParseNode end, Rope value) {
18021802

18031803
protected ClassicRegexp checkRegexpSyntax(Rope value, RegexpOptions options) {
18041804
try {
1805-
// This is only for syntax checking but this will as a side-effect create an entry in the regexp cache.
1805+
// This is only for syntax checking but this will as a side effect create an entry in the regexp cache.
18061806
return new ClassicRegexp(
18071807
getConfiguration().getContext(),
18081808
value,

0 commit comments

Comments
 (0)