Skip to content

Commit 5cc71e1

Browse files
authored
Merge pull request #3344 from Earlopain/parser-translator-escaping-all-kinds-of-strings
Better escaping handling for all strings in the parser translator
2 parents cd3ced2 + 73aeba6 commit 5cc71e1

File tree

2 files changed

+57
-38
lines changed

2 files changed

+57
-38
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class Lexer
201201
]
202202

203203
# Heredocs are complex and require us to keep track of a bit of info to refer to later
204-
HeredocData = Struct.new(:identifier, :common_whitespace, :quote, keyword_init: true)
204+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
205205

206206
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
207207

@@ -234,6 +234,7 @@ def to_a
234234
length = lexed.length
235235

236236
heredoc_stack = Array.new
237+
quote_stack = Array.new
237238

238239
while index < length
239240
token, state = lexed[index]
@@ -312,22 +313,28 @@ def to_a
312313
value = ""
313314
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
314315
index += 1
315-
elsif basic_quotes && next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
316-
# the parser gem doesn't simplify strings when its value ends in a newline
317-
unless (string_value = next_token.value).end_with?("\n")
318-
next_location = token.location.join(next_next_token.location)
319-
value = unescape_string(string_value)
320-
type = :tSTRING
321-
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
322-
index += 2
316+
elsif value.start_with?("'", '"', "%")
317+
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
318+
# the parser gem doesn't simplify strings when its value ends in a newline
319+
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
320+
next_location = token.location.join(next_next_token.location)
321+
value = unescape_string(string_value, value)
322+
type = :tSTRING
323+
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
324+
index += 2
325+
tokens << [type, [value, location]]
326+
327+
next
328+
end
323329
end
330+
331+
quote_stack.push(value)
324332
elsif token.type == :HEREDOC_START
325333
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
326334
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
327335
heredoc = HeredocData.new(
328336
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
329337
common_whitespace: 0,
330-
quote: quote,
331338
)
332339

333340
if quote == "`"
@@ -347,20 +354,19 @@ def to_a
347354
end
348355

349356
heredoc_stack.push(heredoc)
357+
quote_stack.push(value)
350358
end
351359
when :tSTRING_CONTENT
352360
if (lines = token.value.lines).one?
353361
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
354362
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
355363
# The parser gem only removes indentation when the heredoc is not nested
356364
not_nested = heredoc_stack.size == 1
357-
current_heredoc = heredoc_stack.last
358-
if is_first_token_on_line && not_nested && current_heredoc.common_whitespace > 0
359-
value = trim_heredoc_whitespace(value, heredoc)
360-
end
361-
if current_heredoc
362-
value = unescape_heredoc(value, heredoc)
365+
if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
366+
value = trim_heredoc_whitespace(value, current_heredoc)
363367
end
368+
369+
value = unescape_string(value, quote_stack.last)
364370
else
365371
# When the parser gem encounters a line continuation inside of a multiline string,
366372
# it emits a single string node. The backslash (and remaining newline) is removed.
@@ -386,7 +392,7 @@ def to_a
386392

387393
if emit
388394
end_offset = start_offset + current_line.bytesize + adjustment
389-
tokens << [:tSTRING_CONTENT, [unescape_string(current_line), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
395+
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
390396
start_offset = end_offset
391397
current_line = +""
392398
adjustment = 0
@@ -405,6 +411,8 @@ def to_a
405411
value = value[0]
406412
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
407413
end
414+
415+
quote_stack.pop
408416
when :tSYMBEG
409417
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
410418
next_location = token.location.join(next_token.location)
@@ -413,6 +421,8 @@ def to_a
413421
value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
414422
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
415423
index += 1
424+
else
425+
quote_stack.push(value)
416426
end
417427
when :tFID
418428
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@@ -422,10 +432,15 @@ def to_a
422432
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
423433
type = :tBACK_REF2
424434
end
435+
quote_stack.push(value)
425436
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
426437
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
427438
index += 1
428439
end
440+
441+
quote_stack.push(value)
442+
when :tREGEXP_BEG
443+
quote_stack.push(value)
429444
end
430445

431446
tokens << [type, [value, location]]
@@ -541,11 +556,6 @@ def trim_heredoc_whitespace(string, heredoc)
541556
string[trimmed_characters..]
542557
end
543558

544-
# Naive string escaping handling. Should be closer to the "unescape_heredoc" method
545-
def unescape_string(string)
546-
string.gsub("\\\\", "\\")
547-
end
548-
549559
# Escape sequences that have special meaning and should appear unescaped in the resulting string.
550560
ESCAPES = {
551561
"a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
@@ -554,15 +564,34 @@ def unescape_string(string)
554564
}.freeze
555565
private_constant :ESCAPES
556566

567+
# When one of these delimiters is encountered, then the other
568+
# one is allowed to be escaped as well.
569+
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
570+
private_constant :DELIMITER_SYMETRY
571+
557572
# TODO: Does not handle "\u1234" and other longer-form escapes.
558-
def unescape_heredoc(string, heredoc)
573+
def unescape_string(string, quote)
559574
# In single-quoted heredocs, everything is taken literally.
560-
return string if heredoc.quote == "'"
575+
return string if quote == "<<'"
576+
577+
# TODO: Implement regexp escaping
578+
return string if quote == "/" || quote.start_with?("%r")
579+
580+
if quote == "'" || quote.start_with?("%q") || quote.start_with?("%w") || quote.start_with?("%i")
581+
if quote == "'"
582+
delimiter = "'"
583+
else
584+
delimiter = quote[2]
585+
end
561586

562-
# When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
563-
# and "\\\\n" becomes "\\n". Unknown escapes sequences, like "\\o" simply become "o".
564-
string.gsub(/\\./) do |match|
565-
ESCAPES[match[1]] || match[1]
587+
delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
588+
string.gsub(/\\([\\#{delimiters}])/, '\1')
589+
else
590+
# When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
591+
# and "\\\\n" becomes "\\n". Unknown escape sequences, like "\\o", simply become "o".
592+
string.gsub(/\\./) do |match|
593+
ESCAPES[match[1]] || match[1]
594+
end
566595
end
567596
end
568597
end

test/prism/ruby/parser_test.rb

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ class ParserTest < TestCase
9393
"methods.txt",
9494
"strings.txt",
9595
"tilde_heredocs.txt",
96-
"xstring_with_backslash.txt",
9796
"seattlerb/backticks_interpolation_line.txt",
9897
"seattlerb/bug169.txt",
9998
"seattlerb/case_in.txt",
@@ -103,28 +102,19 @@ class ParserTest < TestCase
103102
"seattlerb/difficult6__8.txt",
104103
"seattlerb/dsym_esc_to_sym.txt",
105104
"seattlerb/heredoc_unicode.txt",
106-
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
107-
"seattlerb/heredoc_with_carriage_return_escapes.txt",
108105
"seattlerb/module_comments.txt",
109106
"seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
110107
"seattlerb/parse_line_block_inline_comment.txt",
111108
"seattlerb/parse_line_block_inline_multiline_comment.txt",
112-
"seattlerb/parse_line_dstr_escaped_newline.txt",
113109
"seattlerb/parse_line_heredoc.txt",
114-
"seattlerb/parse_line_multiline_str_literal_n.txt",
115-
"seattlerb/parse_line_str_with_newline_escape.txt",
116110
"seattlerb/pct_w_heredoc_interp_nested.txt",
117-
"seattlerb/qw_escape_term.txt",
118111
"seattlerb/read_escape_unicode_curlies.txt",
119112
"seattlerb/read_escape_unicode_h4.txt",
120113
"seattlerb/required_kwarg_no_value.txt",
121114
"seattlerb/slashy_newlines_within_string.txt",
122-
"seattlerb/str_double_escaped_newline.txt",
123115
"seattlerb/str_evstr_escape.txt",
124-
"seattlerb/str_newline_hash_line_number.txt",
125116
"seattlerb/TestRubyParserShared.txt",
126117
"unparser/corpus/literal/assignment.txt",
127-
"unparser/corpus/semantic/opasgn.txt",
128118
"whitequark/args.txt",
129119
"whitequark/beginless_erange_after_newline.txt",
130120
"whitequark/beginless_irange_after_newline.txt",

0 commit comments

Comments
 (0)