Skip to content

Commit d1697b3

Browse files
committed
Further refine string handling in the parser translator
Mostly around newlines and line continuation.
* Percent arrays need special backslash handling in the AST
* Fix offset issue for heredocs with many line continuations (used wrong variable as index access)
* More refined rules on when to simplify string tokens
* Handle line continuations in squiggly heredocs
* Correctly dedent squiggly heredocs with interpolation
* Consider `:'foo'` and `%s[foo]` to not be interpolation
1 parent de7bb68 commit d1697b3

File tree

10 files changed

+724
-241
lines changed

10 files changed

+724
-241
lines changed

lib/prism/translation/parser/compiler.rb

Lines changed: 95 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,29 @@ def visit_and_node(node)
7474
# []
7575
# ^^
7676
def visit_array_node(node)
77-
builder.array(token(node.opening_loc), visit_all(node.elements), token(node.closing_loc))
77+
if node.opening&.start_with?("%w", "%W", "%i", "%I")
78+
elements = node.elements.flat_map do |element|
79+
if element.is_a?(StringNode)
80+
if element.content.include?("\n")
81+
string_nodes_from_line_continuations(element.unescaped, element.content, element.content_loc.start_offset, node.opening)
82+
else
83+
[builder.string_internal([element.unescaped, srange(element.content_loc)])]
84+
end
85+
elsif element.is_a?(InterpolatedStringNode)
86+
builder.string_compose(
87+
token(element.opening_loc),
88+
string_nodes_from_interpolation(element, node.opening),
89+
token(element.closing_loc)
90+
)
91+
else
92+
[visit(element)]
93+
end
94+
end
95+
else
96+
elements = visit_all(node.elements)
97+
end
98+
99+
builder.array(token(node.opening_loc), elements, token(node.closing_loc))
78100
end
79101

80102
# foo => [bar]
@@ -1085,19 +1107,9 @@ def visit_interpolated_string_node(node)
10851107
return visit_heredoc(node) { |children, closing| builder.string_compose(token(node.opening_loc), children, closing) }
10861108
end
10871109

1088-
parts = node.parts.flat_map do |part|
1089-
# When the content of a string node is split across multiple lines, the
1090-
# parser gem creates individual string nodes for each line the content is part of.
1091-
if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
1092-
string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, node.opening)
1093-
else
1094-
visit(part)
1095-
end
1096-
end
1097-
10981110
builder.string_compose(
10991111
token(node.opening_loc),
1100-
parts,
1112+
string_nodes_from_interpolation(node, node.opening),
11011113
token(node.closing_loc)
11021114
)
11031115
end
@@ -1116,14 +1128,14 @@ def visit_interpolated_symbol_node(node)
11161128
# ^^^^^^^^^^^^
11171129
def visit_interpolated_x_string_node(node)
11181130
if node.heredoc?
1119-
visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
1120-
else
1121-
builder.xstring_compose(
1122-
token(node.opening_loc),
1123-
visit_all(node.parts),
1124-
token(node.closing_loc)
1125-
)
1131+
return visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
11261132
end
1133+
1134+
builder.xstring_compose(
1135+
token(node.opening_loc),
1136+
string_nodes_from_interpolation(node, node.opening),
1137+
token(node.closing_loc)
1138+
)
11271139
end
11281140

11291141
# -> { it }
@@ -2011,13 +2023,6 @@ def visit_block(call, block)
20112023
end
20122024
end
20132025

2014-
# The parser gem automatically converts \r\n to \n, meaning our offsets
2015-
# need to be adjusted to always subtract 1 from the length.
2016-
def chomped_bytesize(line)
2017-
chomped = line.chomp
2018-
chomped.bytesize + (chomped == line ? 0 : 1)
2019-
end
2020-
20212026
# Visit a heredoc that can be either a string or an xstring.
20222027
def visit_heredoc(node)
20232028
children = Array.new
@@ -2086,55 +2091,88 @@ def within_pattern
20862091
end
20872092
end
20882093

2094+
def string_nodes_from_interpolation(node, opening)
2095+
node.parts.flat_map do |part|
2096+
# When the content of a string node is split across multiple lines, the
2097+
# parser gem creates individual string nodes for each line the content is part of.
2098+
if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
2099+
string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, opening)
2100+
else
2101+
visit(part)
2102+
end
2103+
end
2104+
end
2105+
20892106
# Create parser string nodes from a single prism node. The parser gem
20902107
# "glues" strings together when a line continuation is encountered.
20912108
def string_nodes_from_line_continuations(unescaped, escaped, start_offset, opening)
20922109
unescaped = unescaped.lines
20932110
escaped = escaped.lines
2111+
percent_array = opening&.start_with?("%w", "%W", "%i", "%I")
2112+
2113+
# Non-interpolating strings
2114+
if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i")
2115+
current_length = 0
2116+
current_line = +""
20942117

2095-
escaped_lengths = []
2096-
normalized_lengths = []
2097-
# Keeps track of where an unescaped line should start a new token. An unescaped
2098-
# \n would otherwise be indistinguishable from the actual newline at the end of
2099-
# the line. The parser gem only emits a new string node at "real" newlines,
2100-
# line continuations don't start a new node as well.
2101-
do_next_tokens = []
2102-
2103-
if opening&.end_with?("'")
2104-
escaped.each do |line|
2105-
escaped_lengths << line.bytesize
2106-
normalized_lengths << chomped_bytesize(line)
2107-
do_next_tokens << true
2118+
escaped.filter_map.with_index do |escaped_line, index|
2119+
unescaped_line = unescaped.fetch(index, "")
2120+
current_length += escaped_line.bytesize
2121+
current_line << unescaped_line
2122+
2123+
# Glue line continuations together. Only %w and %i arrays can contain these.
2124+
if percent_array && escaped_line[/(\\)*\n$/, 1]&.length&.odd?
2125+
next unless index == escaped.count - 1
2126+
end
2127+
s = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_length)])
2128+
start_offset += escaped_line.bytesize
2129+
current_line = +""
2130+
current_length = 0
2131+
s
21082132
end
21092133
else
2134+
escaped_lengths = []
2135+
normalized_lengths = []
2136+
# Keeps track of where an unescaped line should start a new token. An unescaped
2137+
# \n would otherwise be indistinguishable from the actual newline at the end of
2138+
# the line. The parser gem only emits a new string node at "real" newlines,
2139+
# line continuations don't start a new node either.
2140+
do_next_tokens = []
2141+
21102142
escaped
21112143
.chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
21122144
.each do |lines|
21132145
escaped_lengths << lines.sum(&:bytesize)
2114-
normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
21152146
unescaped_lines_count = lines.sum do |line|
21162147
line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
21172148
end
2118-
do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
2149+
extra = 1
2150+
extra = lines.count if percent_array # Account for line continuations in percent arrays
2151+
2152+
normalized_lengths.concat(Array.new(unescaped_lines_count + extra, 0))
2153+
normalized_lengths[-1] = lines.sum { |line| line.bytesize }
2154+
do_next_tokens.concat(Array.new(unescaped_lines_count + extra, false))
21192155
do_next_tokens[-1] = true
21202156
end
2121-
end
2122-
2123-
current_line = +""
2124-
current_normalized_length = 0
21252157

2126-
unescaped.filter_map.with_index do |unescaped_line, index|
2127-
current_line << unescaped_line
2128-
current_normalized_length += normalized_lengths.fetch(index, 0)
2129-
2130-
if do_next_tokens[index]
2131-
inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
2132-
start_offset += escaped_lengths.fetch(index, 0)
2133-
current_line = +""
2134-
current_normalized_length = 0
2135-
inner_part
2136-
else
2137-
nil
2158+
current_line = +""
2159+
current_normalized_length = 0
2160+
2161+
emitted_count = 0
2162+
unescaped.filter_map.with_index do |unescaped_line, index|
2163+
current_line << unescaped_line
2164+
current_normalized_length += normalized_lengths.fetch(index, 0)
2165+
2166+
if do_next_tokens[index]
2167+
inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
2168+
start_offset += escaped_lengths.fetch(emitted_count, 0)
2169+
current_line = +""
2170+
current_normalized_length = 0
2171+
emitted_count += 1
2172+
inner_part
2173+
else
2174+
nil
2175+
end
21382176
end
21392177
end
21402178
end

lib/prism/translation/parser/lexer.rb

Lines changed: 59 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -353,11 +353,15 @@ def to_a
353353
location = range(next_location.start_offset, next_location.end_offset)
354354
index += 1
355355
elsif value.start_with?("'", '"', "%")
356-
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
357-
# the parser gem doesn't simplify strings when its value ends in a newline
358-
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
356+
if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
357+
string_value = next_token.value
358+
if simplify_string?(string_value, value)
359359
next_location = token.location.join(next_next_token.location)
360-
value = unescape_string(string_value, value)
360+
if percent_array?(value)
361+
value = percent_array_unescape(string_value)
362+
else
363+
value = unescape_string(string_value, value)
364+
end
361365
type = :tSTRING
362366
location = range(next_location.start_offset, next_location.end_offset)
363367
index += 2
@@ -399,17 +403,31 @@ def to_a
399403
is_percent_array = percent_array?(quote_stack.last)
400404

401405
if (lines = token.value.lines).one?
402-
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
403-
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
404-
# The parser gem only removes indentation when the heredoc is not nested
405-
not_nested = heredoc_stack.size == 1
406-
if is_percent_array
407-
value = percent_array_unescape(value)
408-
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
409-
value = trim_heredoc_whitespace(value, current_heredoc)
410-
end
406+
# Prism usually emits a single token for strings with line continuations.
407+
# For squiggly heredocs they are not joined so we do that manually here.
408+
current_string = +""
409+
current_length = 0
410+
start_offset = token.location.start_offset
411+
while token.type == :STRING_CONTENT
412+
current_length += token.value.bytesize
413+
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
414+
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
415+
# The parser gem only removes indentation when the heredoc is not nested
416+
not_nested = heredoc_stack.size == 1
417+
if is_percent_array
418+
value = percent_array_unescape(token.value)
419+
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
420+
value = trim_heredoc_whitespace(token.value, current_heredoc)
421+
end
411422

412-
value = unescape_string(value, quote_stack.last)
423+
current_string << unescape_string(value, quote_stack.last)
424+
if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
425+
tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
426+
break
427+
end
428+
token = lexed[index][0]
429+
index += 1
430+
end
413431
else
414432
# When the parser gem encounters a line continuation inside of a multiline string,
415433
# it emits a single string node. The backslash (and remaining newline) is removed.
@@ -447,8 +465,8 @@ def to_a
447465
adjustment = 0
448466
end
449467
end
450-
next
451468
end
469+
next
452470
when :tSTRING_DVAR
453471
value = nil
454472
when :tSTRING_END
@@ -571,20 +589,21 @@ def calculate_heredoc_whitespace(heredoc_token_index)
571589
while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
572590
next_token_index += 1
573591
next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
592+
first_token_on_line = next_token.location.start_column == 0
574593

575594
# String content inside nested heredocs and interpolation is ignored
576595
if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
577596
# When interpolation is the first token of a line there is no string
578597
# content to check against. There will be no common whitespace.
579-
if nesting_level == 0 && next_token.location.start_column == 0
598+
if nesting_level == 0 && first_token_on_line
580599
result = 0
581600
end
582601
nesting_level += 1
583602
elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
584603
nesting_level -= 1
585604
# When we encountered the matching heredoc end, we can exit
586605
break if nesting_level == -1
587-
elsif next_token.type == :STRING_CONTENT && nesting_level == 0
606+
elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
588607
common_whitespace = 0
589608
next_token.value[/^\s*/].each_char do |char|
590609
if char == "\t"
@@ -674,8 +693,11 @@ def unescape_string(string, quote)
674693
# Append what was just skipped over, excluding the found backslash.
675694
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
676695

677-
# Simple single-character escape sequences like \n
678-
if (replacement = ESCAPES[scanner.peek(1)])
696+
if scanner.peek(1) == "\n"
697+
# Line continuation
698+
scanner.pos += 1
699+
elsif (replacement = ESCAPES[scanner.peek(1)])
700+
# Simple single-character escape sequences like \n
679701
result.append_as_bytes(replacement)
680702
scanner.pos += 1
681703
elsif (octal = scanner.check(/[0-7]{1,3}/))
@@ -714,6 +736,23 @@ def unescape_string(string, quote)
714736
end
715737
end
716738

739+
# Certain strings are merged into a single string token.
740+
def simplify_string?(value, quote)
741+
case quote
742+
when "'"
743+
# Only simplify 'foo'
744+
!value.include?("\n")
745+
when '"'
746+
# Simplify when every line ends with a line continuation, or it is the last line
747+
value.lines.all? do |line|
748+
!line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
749+
end
750+
else
751+
# %q and similar are never simplified
752+
false
753+
end
754+
end
755+
717756
# In a percent array, certain whitespace can be preceded with a backslash,
718757
# causing the following characters to be part of the previous element.
719758
def percent_array_unescape(string)
@@ -737,7 +776,7 @@ def percent_array_leading_whitespace(string)
737776

738777
# Determine if characters preceded by a backslash should be escaped or not
739778
def interpolation?(quote)
740-
quote != "'" && !quote.start_with?("%q", "%w", "%i")
779+
!quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
741780
end
742781

743782
# Regexps allow interpolation but are handled differently during unescaping

0 commit comments

Comments
 (0)