Skip to content

Commit 7bfd527

Browse files
committed
Implement squiggly heredocs for the parser translator
In parser, the string content is dedented. This implements these rules as far as I was able to understand them. It's all a bit confusing with spaces/tabs; I keep learning more funny things about them. I referred to the prism implementation, which parser seems to adhere to very closely.
1 parent 7262d09 commit 7bfd527

File tree

2 files changed

+89
-18
lines changed

2 files changed

+89
-18
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 89 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,10 @@ class Lexer
200200
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
201201
]
202202

203-
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
203+
# Heredocs are complex and require us to keep track of a bit of info to refer to later.
# One instance is pushed onto the heredoc stack when a heredoc opener is translated and
# popped again when the corresponding heredoc end token is translated.
#
# identifier        - the heredoc name taken from the opener, without the leading `<<`,
#                     the optional `-`/`~` modifier and the optional surrounding quotes.
# common_whitespace - indentation width to strip from the heredoc's content lines.
#                     Starts at 0 and is only computed for `<<~` heredocs.
204+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
205+
206+
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
204207

205208
# The Parser::Source::Buffer that the tokens were lexed from.
206209
attr_reader :source_buffer
@@ -230,7 +233,7 @@ def to_a
230233
index = 0
231234
length = lexed.length
232235

233-
heredoc_identifier_stack = []
236+
heredoc_stack = Array.new
234237

235238
while index < length
236239
token, state = lexed[index]
@@ -299,9 +302,6 @@ def to_a
299302
when :tSPACE
300303
value = nil
301304
when :tSTRING_BEG
302-
if token.type == :HEREDOC_START
303-
heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
304-
end
305305
next_token = lexed[index][0]
306306
next_next_token = lexed[index + 1][0]
307307
basic_quotes = ["\"", "'"].include?(value)
@@ -321,17 +321,39 @@ def to_a
321321
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
322322
index += 2
323323
end
324-
elsif value.start_with?("<<")
324+
elsif token.type == :HEREDOC_START
325325
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
326+
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
327+
heredoc = HeredocData.new(
328+
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
329+
common_whitespace: 0,
330+
)
331+
326332
if quote == "`"
327333
type = :tXSTRING_BEG
328334
value = "<<`"
329335
else
336+
# The parser gem trims whitespace from squiggly heredocs. We must record
337+
# the smallest indentation shared by all of its lines so it can be removed later.
338+
if heredoc_type == "~"
339+
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
340+
end
341+
330342
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
331343
end
344+
345+
heredoc_stack.push(heredoc)
332346
end
333347
when :tSTRING_CONTENT
334-
unless (lines = token.value.lines).one?
348+
if (lines = token.value.lines).one?
349+
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
350+
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
351+
# The parser gem only removes indentation when the heredoc is not nested
352+
not_nested = heredoc_stack.size == 1
353+
if is_first_token_on_line && not_nested && (heredoc = heredoc_stack[0]).common_whitespace > 0
354+
value = trim_heredoc_whitespace(value, heredoc)
355+
end
356+
else
335357
start_offset = offset_cache[token.location.start_offset]
336358
lines.map do |line|
337359
newline = line.end_with?("\r\n") ? "\r\n" : "\n"
@@ -361,7 +383,7 @@ def to_a
361383
when :tSTRING_END
362384
if token.type == :HEREDOC_END && value.end_with?("\n")
363385
newline_length = value.end_with?("\r\n") ? 2 : 1
364-
value = heredoc_identifier_stack.pop
386+
value = heredoc_stack.pop.identifier
365387
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
366388
elsif token.type == :REGEXP_END
367389
value = value[0]
@@ -443,6 +465,65 @@ def parse_rational(value)
443465
rescue ArgumentError
444466
0r
445467
end
468+
469+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
#
# Scans the lexed tokens starting at +heredoc_token_index+ and returns the
# smallest indentation width found at the start of a content line of the
# heredoc that is currently being opened. Scanning stops at the heredoc end
# token that closes it (or when the tokens run out).
#
# A tab advances the width to the next multiple of 8, every other whitespace
# character counts as 1.
#
# heredoc_token_index - Integer index into +lexed+ at which scanning begins.
#
# Returns the Integer width, or Float::MAX when no line contributed one.
def calculate_heredoc_whitespace(heredoc_token_index)
  next_token_index = heredoc_token_index
  nesting_level = 0
  previous_line = -1
  result = Float::MAX

  while (next_token = lexed.dig(next_token_index, 0))
    next_token_index += 1
    next_next_token = lexed.dig(next_token_index, 0)

    case next_token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # String content inside nested heredocs and interpolation is ignored
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      next unless nesting_level == 0

      leading_whitespace = next_token.value[/^\s*/]
      common_whitespace = 0
      leading_whitespace.each_char do |char|
        common_whitespace = char == "\t" ? (common_whitespace / 8 + 1) * 8 : common_whitespace + 1
      end

      current_line = next_token.location.start_line
      is_first_token_on_line = current_line != previous_line
      # Remember every line we have seen content on, not only the ones that
      # lowered the result. Otherwise content that follows interpolation on
      # the same line would be mistaken for the start of a line.
      previous_line = current_line

      # Blank lines are ignored. Compare character counts rather than the
      # tab-expanded width so that blank lines containing tabs are detected.
      # Whitespace is significant if followed by interpolation.
      whitespace_only = leading_whitespace.length == next_token.value.length &&
        next_next_token&.location&.start_line != current_line

      if is_first_token_on_line && !whitespace_only && common_whitespace < result
        result = common_whitespace
      end
    end
  end

  result
end
509+
510+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
#
# Returns a copy of +string+ with leading spaces and tabs removed, up to a
# width of +heredoc.common_whitespace+. A space has a width of 1, a tab
# advances the width to the next multiple of 8. A tab that would overshoot
# the allowed width is left in place and ends the trimming.
#
# string  - String content of a single heredoc line.
# heredoc - object responding to +common_whitespace+ (the width to remove).
#
# Returns a new String; +string+ itself is not modified.
def trim_heredoc_whitespace(string, heredoc)
  limit = heredoc.common_whitespace
  trimmed_whitespace = 0
  trimmed_characters = 0

  while trimmed_whitespace < limit
    case string[trimmed_characters]
    when "\t"
      trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8
      # Removing this tab would strip more than the common whitespace
      break if trimmed_whitespace > limit
    when " "
      trimmed_whitespace += 1
    else
      # First non-whitespace character (or end of string)
      break
    end
    trimmed_characters += 1
  end

  string[trimmed_characters..]
end
446527
end
447528
end
448529
end

test/prism/ruby/parser_test.rb

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,6 @@ class ParserTest < TestCase
104104
"seattlerb/dsym_esc_to_sym.txt",
105105
"seattlerb/heredoc__backslash_dos_format.txt",
106106
"seattlerb/heredoc_backslash_nl.txt",
107-
"seattlerb/heredoc_squiggly_blank_line_plus_interpolation.txt",
108-
"seattlerb/heredoc_squiggly_blank_lines.txt",
109-
"seattlerb/heredoc_squiggly_interp.txt",
110-
"seattlerb/heredoc_squiggly_tabs_extra.txt",
111-
"seattlerb/heredoc_squiggly_tabs.txt",
112-
"seattlerb/heredoc_squiggly_visually_blank_lines.txt",
113-
"seattlerb/heredoc_squiggly.txt",
114107
"seattlerb/heredoc_unicode.txt",
115108
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
116109
"seattlerb/heredoc_with_carriage_return_escapes.txt",
@@ -143,14 +136,11 @@ class ParserTest < TestCase
143136
"whitequark/bug_ascii_8bit_in_literal.txt",
144137
"whitequark/bug_def_no_paren_eql_begin.txt",
145138
"whitequark/dedenting_heredoc.txt",
146-
"whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt",
147139
"whitequark/forward_arg_with_open_args.txt",
148140
"whitequark/lbrace_arg_after_command_args.txt",
149141
"whitequark/multiple_pattern_matches.txt",
150142
"whitequark/newline_in_hash_argument.txt",
151143
"whitequark/parser_bug_640.txt",
152-
"whitequark/parser_drops_truncated_parts_of_squiggly_heredoc.txt",
153-
"whitequark/ruby_bug_11990.txt",
154144
"whitequark/ruby_bug_14690.txt",
155145
"whitequark/ruby_bug_9669.txt",
156146
"whitequark/slash_newline_in_heredocs.txt",

0 commit comments

Comments
 (0)