Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 95 additions & 57 deletions lib/prism/translation/parser/compiler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,29 @@ def visit_and_node(node)
# []
# ^^
#
# Builds the parser gem array node for a prism array node.
#
# Percent arrays (%w, %W, %i, %I) get their elements converted one by one:
# * string elements whose content contains a newline are split into one
#   string node per line via string_nodes_from_line_continuations
# * other string elements become a single string node spanning their content
# * interpolated string elements are composed from their parts
# * anything else is visited normally
#
# All other arrays simply have every element visited.
def visit_array_node(node)
  elements =
    if node.opening&.start_with?("%w", "%W", "%i", "%I")
      # Each branch yields an array (flat_map splices it), because a single
      # prism element can turn into several parser nodes.
      node.elements.flat_map do |element|
        if element.is_a?(StringNode)
          if element.content.include?("\n")
            string_nodes_from_line_continuations(element.unescaped, element.content, element.content_loc.start_offset, node.opening)
          else
            [builder.string_internal([element.unescaped, srange(element.content_loc)])]
          end
        elsif element.is_a?(InterpolatedStringNode)
          # NOTE(review): returned unwrapped; this assumes the composed node
          # does not respond to to_ary, otherwise flat_map would splice it.
          builder.string_compose(
            token(element.opening_loc),
            string_nodes_from_interpolation(element, node.opening),
            token(element.closing_loc)
          )
        else
          [visit(element)]
        end
      end
    else
      visit_all(node.elements)
    end

  builder.array(token(node.opening_loc), elements, token(node.closing_loc))
end

# foo => [bar]
Expand Down Expand Up @@ -1085,19 +1107,9 @@ def visit_interpolated_string_node(node)
return visit_heredoc(node) { |children, closing| builder.string_compose(token(node.opening_loc), children, closing) }
end

parts = node.parts.flat_map do |part|
# When the content of a string node is split across multiple lines, the
# parser gem creates individual string nodes for each line the content is part of.
if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, node.opening)
else
visit(part)
end
end

builder.string_compose(
token(node.opening_loc),
parts,
string_nodes_from_interpolation(node, node.opening),
token(node.closing_loc)
)
end
Expand All @@ -1116,14 +1128,14 @@ def visit_interpolated_symbol_node(node)
# ^^^^^^^^^^^^
#
# Builds the parser gem xstring node for an interpolated x-string.
# Heredocs are delegated to visit_heredoc; everything else is composed from
# the node's parts, with multi-line string parts split line by line through
# string_nodes_from_interpolation.
def visit_interpolated_x_string_node(node)
  if node.heredoc?
    return visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
  end

  builder.xstring_compose(
    token(node.opening_loc),
    string_nodes_from_interpolation(node, node.opening),
    token(node.closing_loc)
  )
end

# -> { it }
Expand Down Expand Up @@ -2011,13 +2023,6 @@ def visit_block(call, block)
end
end

# Byte length of +line+ with its trailing line terminator counted as a
# single byte. The parser gem converts \r\n to \n, so a \r\n ending must
# contribute one byte to the offsets rather than two.
def chomped_bytesize(line)
  body_bytes = line.chomp.bytesize
  # Nothing was chomped: the line has no terminator to account for.
  return body_bytes if body_bytes == line.bytesize

  body_bytes + 1
end

# Visit a heredoc that can be either a string or an xstring.
def visit_heredoc(node)
children = Array.new
Expand Down Expand Up @@ -2086,55 +2091,88 @@ def within_pattern
end
end

# Converts the parts of an interpolated node into parser gem nodes.
#
# The parser gem emits one string node per source line, so a plain string
# part (no opening of its own) whose content spans several lines is split
# up line by line. Every other part is visited as usual.
def string_nodes_from_interpolation(node, opening)
  node.parts.flat_map do |part|
    spans_multiple_lines =
      part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
    next visit(part) unless spans_multiple_lines

    string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, opening)
  end
end

# Create parser string nodes from a single prism node. The parser gem
# "glues" strings together when a line continuation is encountered.
#
# unescaped::    the unescaped content of the prism node
# escaped::      the content exactly as written in the source
# start_offset:: source offset at which the content starts
# opening::      the opening delimiter of the enclosing literal, or nil
#
# Returns an array with one builder string node per emitted chunk.
def string_nodes_from_line_continuations(unescaped, escaped, start_offset, opening)
  unescaped = unescaped.lines
  escaped = escaped.lines
  percent_array = opening&.start_with?("%w", "%W", "%i", "%I")

  # Non-interpolating strings
  if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i")
    current_length = 0
    current_line = +""
    last_index = escaped.count - 1

    escaped.filter_map.with_index do |escaped_line, index|
      unescaped_line = unescaped.fetch(index, "")
      current_length += escaped_line.bytesize
      current_line << unescaped_line

      # Glue line continuations together. Only %w and %i arrays can contain these.
      # The whole backslash run is captured so that an even run (escaped
      # backslashes followed by a real newline) is not mistaken for a continuation.
      if percent_array && escaped_line[/(\\*)\n$/, 1]&.length&.odd?
        next unless index == last_index
      end

      part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_length)])
      start_offset += escaped_line.bytesize
      current_line = +""
      current_length = 0
      part
    end
  else
    escaped_lengths = []
    normalized_lengths = []
    # Keeps track of where an unescaped line should start a new token. An unescaped
    # \n would otherwise be indistinguishable from the actual newline at the end
    # of the line. The parser gem only emits a new string node at "real" newlines,
    # line continuations don't start a new node as well.
    do_next_tokens = []

    escaped
      .chunk_while { |before, _after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
      .each do |lines|
        chunk_bytes = lines.sum(&:bytesize)
        escaped_lengths << chunk_bytes

        # Number of escaped "\n" sequences, each of which adds a line to the unescaped content.
        unescaped_lines_count = lines.sum do |line|
          line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
        end
        # Account for line continuations in percent arrays
        extra = percent_array ? lines.count : 1

        # Only the last unescaped line of a chunk emits a token; it carries
        # the byte length of the whole chunk.
        normalized_lengths.concat(Array.new(unescaped_lines_count + extra, 0))
        normalized_lengths[-1] = chunk_bytes
        do_next_tokens.concat(Array.new(unescaped_lines_count + extra, false))
        do_next_tokens[-1] = true
      end

    current_line = +""
    current_normalized_length = 0

    # escaped_lengths has one entry per emitted token, not per unescaped
    # line, so it is indexed by the number of tokens emitted so far.
    emitted_count = 0
    unescaped.filter_map.with_index do |unescaped_line, index|
      current_line << unescaped_line
      current_normalized_length += normalized_lengths.fetch(index, 0)

      if do_next_tokens[index]
        inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
        start_offset += escaped_lengths.fetch(emitted_count, 0)
        current_line = +""
        current_normalized_length = 0
        emitted_count += 1
        inner_part
      else
        nil
      end
    end
  end
end
Expand Down
79 changes: 59 additions & 20 deletions lib/prism/translation/parser/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -353,11 +353,15 @@ def to_a
location = range(next_location.start_offset, next_location.end_offset)
index += 1
elsif value.start_with?("'", '"', "%")
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
# the parser gem doesn't simplify strings when its value ends in a newline
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
string_value = next_token.value
if simplify_string?(string_value, value)
next_location = token.location.join(next_next_token.location)
value = unescape_string(string_value, value)
if percent_array?(value)
value = percent_array_unescape(string_value)
else
value = unescape_string(string_value, value)
end
type = :tSTRING
location = range(next_location.start_offset, next_location.end_offset)
index += 2
Expand Down Expand Up @@ -399,17 +403,31 @@ def to_a
is_percent_array = percent_array?(quote_stack.last)

if (lines = token.value.lines).one?
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
# The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1
if is_percent_array
value = percent_array_unescape(value)
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
value = trim_heredoc_whitespace(value, current_heredoc)
end
# Prism usually emits a single token for strings with line continuations.
# For squiggly heredocs they are not joined so we do that manually here.
current_string = +""
current_length = 0
start_offset = token.location.start_offset
while token.type == :STRING_CONTENT
current_length += token.value.bytesize
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
# The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1
if is_percent_array
value = percent_array_unescape(token.value)
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
value = trim_heredoc_whitespace(token.value, current_heredoc)
end

value = unescape_string(value, quote_stack.last)
current_string << unescape_string(value, quote_stack.last)
if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
break
end
token = lexed[index][0]
index += 1
end
else
# When the parser gem encounters a line continuation inside of a multiline string,
# it emits a single string node. The backslash (and remaining newline) is removed.
Expand Down Expand Up @@ -447,8 +465,8 @@ def to_a
adjustment = 0
end
end
next
end
next
when :tSTRING_DVAR
value = nil
when :tSTRING_END
Expand Down Expand Up @@ -571,20 +589,21 @@ def calculate_heredoc_whitespace(heredoc_token_index)
while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
next_token_index += 1
next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
first_token_on_line = next_token.location.start_column == 0

# String content inside nested heredocs and interpolation is ignored
if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
# When interpolation is the first token of a line there is no string
# content to check against. There will be no common whitespace.
if nesting_level == 0 && next_token.location.start_column == 0
if nesting_level == 0 && first_token_on_line
result = 0
end
nesting_level += 1
elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
nesting_level -= 1
# When we encountered the matching heredoc end, we can exit
break if nesting_level == -1
elsif next_token.type == :STRING_CONTENT && nesting_level == 0
elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
common_whitespace = 0
next_token.value[/^\s*/].each_char do |char|
if char == "\t"
Expand Down Expand Up @@ -674,8 +693,11 @@ def unescape_string(string, quote)
# Append what was just skipped over, excluding the found backslash.
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))

# Simple single-character escape sequences like \n
if (replacement = ESCAPES[scanner.peek(1)])
if scanner.peek(1) == "\n"
# Line continuation
scanner.pos += 1
elsif (replacement = ESCAPES[scanner.peek(1)])
# Simple single-character escape sequences like \n
result.append_as_bytes(replacement)
scanner.pos += 1
elsif (octal = scanner.check(/[0-7]{1,3}/))
Expand Down Expand Up @@ -714,6 +736,23 @@ def unescape_string(string, quote)
end
end

# Decides whether a string is merged into a single string token.
#
# value:: the raw string content
# quote:: the opening quote of the string
def simplify_string?(value, quote)
  # %q and similar are never simplified
  return false unless quote == "'" || quote == '"'
  # Only simplify 'foo'
  return !value.include?("\n") if quote == "'"

  # Simplify when every line ends with a line continuation, or it is the last line
  value.each_line.all? do |line|
    next true unless line.end_with?("\n")

    line[/(\\*)$/, 1]&.length&.odd?
  end
end

# In a percent array, certain whitespace can be preceded by a backslash,
# causing the following characters to be part of the previous element.
def percent_array_unescape(string)
Expand All @@ -737,7 +776,7 @@ def percent_array_leading_whitespace(string)

# Determine if characters preceded by a backslash should be escaped or not.
#
# Returns false for quotes ending in a single quote and for the
# %q, %w, %i and %s literals; true for every other opening.
def interpolation?(quote)
  !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
end

# Regexp allow interpolation but are handled differently during unescaping
Expand Down
Loading
Loading