Skip to content

Commit 2fbec1c

Browse files
authored
Merge pull request #3411 from Earlopain/parser-translator-better-regexp-handling
Better handle regexp in the parser translator
2 parents 00fae31 + e1c75f3 commit 2fbec1c

File tree

3 files changed

+29
-12
lines changed

3 files changed

+29
-12
lines changed

lib/prism/translation/parser/compiler.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1507,7 +1507,7 @@ def visit_regular_expression_node(node)
15071507
elsif node.content.include?("\n")
15081508
string_nodes_from_line_continuations(node.unescaped, node.content, node.content_loc.start_offset, node.opening)
15091509
else
1510-
[builder.string_internal(token(node.content_loc))]
1510+
[builder.string_internal([node.unescaped, srange(node.content_loc)])]
15111511
end
15121512

15131513
builder.regexp_compose(

lib/prism/translation/parser/lexer.rb

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -637,18 +637,34 @@ def trim_heredoc_whitespace(string, heredoc)
637637
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
638638
private_constant :DELIMITER_SYMETRY
639639

640+
641+
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
642+
REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
643+
private_constant :REGEXP_META_CHARACTERS
644+
640645
# Apply Ruby string escaping rules
641646
def unescape_string(string, quote)
642647
# In single-quoted heredocs, everything is taken literally.
643648
return string if quote == "<<'"
644649

645-
# TODO: Implement regexp escaping
646-
return string if quote == "/" || quote.start_with?("%r")
647-
648650
# OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
649651
return string unless string.include?("\\")
650652

651-
if interpolation?(quote)
653+
# Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
654+
delimiter = quote[-1]
655+
656+
if regexp?(quote)
657+
# Should be escaped handled to single-quoted heredocs. The only character that is
658+
# allowed to be escaped is the delimiter, except when that also has special meaning
659+
# in the regexp. Since all the symetry delimiters have special meaning, they don't need
660+
# to be considered separately.
661+
if REGEXP_META_CHARACTERS.include?(delimiter)
662+
string
663+
else
664+
# There can never be an even amount of backslashes. It would be a syntax error.
665+
string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
666+
end
667+
elsif interpolation?(quote)
652668
# Appending individual escape sequences may force the string out of its intended
653669
# encoding. Start out with binary and force it back later.
654670
result = "".b
@@ -693,12 +709,6 @@ def unescape_string(string, quote)
693709

694710
result
695711
else
696-
if quote == "'"
697-
delimiter = "'"
698-
else
699-
delimiter = quote[2]
700-
end
701-
702712
delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
703713
string.gsub(/\\([\\#{delimiters}])/, '\1')
704714
end
@@ -730,6 +740,11 @@ def interpolation?(quote)
730740
quote != "'" && !quote.start_with?("%q", "%w", "%i")
731741
end
732742

743+
# Regexp allow interpolation but are handled differently during unescaping
744+
def regexp?(quote)
745+
quote == "/" || quote.start_with?("%r")
746+
end
747+
733748
# Determine if the string is part of a %-style array.
734749
def percent_array?(quote)
735750
quote.start_with?("%w", "%W", "%i", "%I")

test/prism/ruby/parser_test.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,15 @@ class ParserTest < TestCase
6969

7070
# https://github.com/whitequark/parser/issues/950
7171
"whitequark/dedenting_interpolating_heredoc_fake_line_continuation.txt",
72+
73+
# Contains an escaped multibyte character. This is supposed to drop to backslash
74+
"seattlerb/regexp_escape_extended.txt",
7275
]
7376

7477
# These files are either failing to parse or failing to translate, so we'll
7578
# skip them for now.
7679
skip_all = skip_incorrect | [
7780
"unescaping.txt",
78-
"seattlerb/bug190.txt",
7981
"seattlerb/heredoc_with_extra_carriage_returns_windows.txt",
8082
"seattlerb/heredoc_with_only_carriage_returns_windows.txt",
8183
"seattlerb/heredoc_with_only_carriage_returns.txt",

0 commit comments

Comments
 (0)