Skip to content

Commit b751c75

Browse files
committed
Better handle heredoc escaping in the parser translator
This does a few things: (1) after a line continuation in a heredoc, emit only a single token; (2) dedent backtick (`) heredocs; (3) properly handle escapes in single- and double-quoted heredocs. Some of the heredoc escaping logic should be applied to strings as well, but I'll leave that for a different PR.
1 parent 8bbdc4f commit b751c75

File tree

2 files changed

+67
-33
lines changed

2 files changed

+67
-33
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 67 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class Lexer
201201
]
202202

203203
# Heredocs are complex and require us to keep track of a bit of info to refer to later
204-
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
204+
HeredocData = Struct.new(:identifier, :common_whitespace, :quote, keyword_init: true)
205205

206206
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
207207

@@ -316,7 +316,7 @@ def to_a
316316
# the parser gem doesn't simplify strings when its value ends in a newline
317317
unless (string_value = next_token.value).end_with?("\n")
318318
next_location = token.location.join(next_next_token.location)
319-
value = string_value.gsub("\\\\", "\\")
319+
value = unescape_string(string_value)
320320
type = :tSTRING
321321
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
322322
index += 2
@@ -327,19 +327,23 @@ def to_a
327327
heredoc = HeredocData.new(
328328
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
329329
common_whitespace: 0,
330+
quote: quote,
330331
)
331332

332333
if quote == "`"
333334
type = :tXSTRING_BEG
334-
value = "<<`"
335-
else
336-
# The parser gem trims whitespace from squiggly heredocs. We must record
337-
# the most common whitespace to later remove.
338-
if heredoc_type == "~"
339-
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
340-
end
335+
end
341336

342-
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
337+
# The parser gem trims whitespace from squiggly heredocs. We must record
338+
# the most common whitespace to later remove.
339+
if heredoc_type == "~" || heredoc_type == "`"
340+
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
341+
end
342+
343+
if quote == "'" || quote == '"' || quote == "`"
344+
value = "<<#{quote}"
345+
else
346+
value = '<<"'
343347
end
344348

345349
heredoc_stack.push(heredoc)
@@ -350,31 +354,43 @@ def to_a
350354
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
351355
# The parser gem only removes indentation when the heredoc is not nested
352356
not_nested = heredoc_stack.size == 1
353-
if is_first_token_on_line && not_nested && (heredoc = heredoc_stack[0]).common_whitespace > 0
357+
current_heredoc = heredoc_stack.last
358+
if is_first_token_on_line && not_nested && current_heredoc.common_whitespace > 0
354359
value = trim_heredoc_whitespace(value, heredoc)
355360
end
361+
if current_heredoc
362+
value = unescape_heredoc(value, heredoc)
363+
end
356364
else
365+
# When the parser gem encounters a line continuation inside of a multiline string,
366+
# it emits a single string node. The backslash (and remaining newline) is removed.
367+
current_line = +""
368+
adjustment = 0
357369
start_offset = offset_cache[token.location.start_offset]
358-
lines.map do |line|
359-
newline = line.end_with?("\r\n") ? "\r\n" : "\n"
370+
emit = false
371+
372+
lines.each.with_index do |line, index|
360373
chomped_line = line.chomp
361-
if match = chomped_line.match(/(?<backslashes>\\+)\z/)
362-
adjustment = match[:backslashes].size / 2
363-
adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
364-
if match[:backslashes].size.odd?
365-
adjusted_line.delete_suffix!("\\")
366-
adjustment += 2
367-
else
368-
adjusted_line << newline
369-
end
374+
375+
# When the line ends with an odd number of backslashes, it must be a line continuation.
376+
if chomped_line[/\\{1,}\z/]&.length&.odd?
377+
chomped_line.delete_suffix!("\\")
378+
current_line << chomped_line
379+
adjustment += 2
380+
# If the string ends with a line continuation emit the remainder
381+
emit = index == lines.count - 1
370382
else
371-
adjusted_line = line
372-
adjustment = 0
383+
current_line << line
384+
emit = true
373385
end
374386

375-
end_offset = start_offset + adjusted_line.bytesize + adjustment
376-
tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
377-
start_offset = end_offset
387+
if emit
388+
end_offset = start_offset + current_line.bytesize + adjustment
389+
tokens << [:tSTRING_CONTENT, [unescape_string(current_line), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
390+
start_offset = end_offset
391+
current_line = +""
392+
adjustment = 0
393+
end
378394
end
379395
next
380396
end
@@ -524,6 +540,30 @@ def trim_heredoc_whitespace(string, heredoc)
524540

525541
string[trimmed_characters..]
526542
end
543+
544+
# Collapses every escaped backslash (two consecutive backslash characters) in
# +string+ into a single backslash and returns the result as a new string.
# Pairs are consumed left to right without overlapping, so an odd trailing
# backslash is left untouched.
def unescape_string(string)
  # The block form sidesteps backslash processing in replacement strings.
  string.gsub("\\\\") { "\\" }
end
547+
548+
# Escape sequences that have special meaning and should appear unescaped in the
# resulting string. Each key is the single character that follows the backslash
# in the source text; each value is the character that the escape stands for.
ESCAPES = {
  "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
  "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
  # The backslash entry is keyed by ONE backslash character: lookups are made
  # with the single character after the escape introducer, so a two-character
  # key could never be found.
  "v" => "\v", "\\" => "\\"
}.freeze
554+
private_constant :ESCAPES
555+
556+
# Resolves backslash escapes in a piece of heredoc content.
#
# +string+ is the raw heredoc text and +heredoc+ is any object responding to
# +quote+ (the quote character the heredoc was opened with). Returns +string+
# itself for single-quoted heredocs, otherwise a new string with the escapes
# replaced.
#
# TODO: Does not handle "\u1234" and other longer-form escapes.
def unescape_heredoc(string, heredoc)
  if heredoc.quote == "'"
    # Single-quoted heredocs take their content literally.
    string
  else
    # Otherwise every backslash plus the character after it is rewritten:
    # known escapes (backslash + "n") turn into the character they denote
    # (a newline), an escaped backslash becomes one backslash, and unknown
    # escape sequences such as backslash + "o" simply become "o".
    string.gsub(/\\./) do |escape|
      escaped_char = escape[1]
      ESCAPES.fetch(escaped_char, escaped_char)
    end
  end
end
527567
end
528568
end
529569
end

test/prism/ruby/parser_test.rb

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,9 @@ class ParserTest < TestCase
102102
"seattlerb/difficult6__7.txt",
103103
"seattlerb/difficult6__8.txt",
104104
"seattlerb/dsym_esc_to_sym.txt",
105-
"seattlerb/heredoc__backslash_dos_format.txt",
106-
"seattlerb/heredoc_backslash_nl.txt",
107105
"seattlerb/heredoc_unicode.txt",
108106
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
109107
"seattlerb/heredoc_with_carriage_return_escapes.txt",
110-
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt",
111-
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt",
112108
"seattlerb/module_comments.txt",
113109
"seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
114110
"seattlerb/parse_line_block_inline_comment.txt",
@@ -128,14 +124,12 @@ class ParserTest < TestCase
128124
"seattlerb/str_newline_hash_line_number.txt",
129125
"seattlerb/TestRubyParserShared.txt",
130126
"unparser/corpus/literal/assignment.txt",
131-
"unparser/corpus/literal/dstr.txt",
132127
"unparser/corpus/semantic/opasgn.txt",
133128
"whitequark/args.txt",
134129
"whitequark/beginless_erange_after_newline.txt",
135130
"whitequark/beginless_irange_after_newline.txt",
136131
"whitequark/bug_ascii_8bit_in_literal.txt",
137132
"whitequark/bug_def_no_paren_eql_begin.txt",
138-
"whitequark/dedenting_heredoc.txt",
139133
"whitequark/forward_arg_with_open_args.txt",
140134
"whitequark/lbrace_arg_after_command_args.txt",
141135
"whitequark/multiple_pattern_matches.txt",

0 commit comments

Comments
 (0)