Skip to content

Commit 302cc30

Browse files
committed
Implement squiggly heredocs for the parser translator
In the parser gem, the string content of squiggly heredocs is dedented. This implements those rules as far as I was able to understand them. The handling of spaces versus tabs is somewhat confusing, and I keep discovering new quirks. I referred to the prism implementation, which the parser gem seems to follow very closely.
1 parent f5ae6d2 commit 302cc30

File tree

2 files changed

+89
-18
lines changed

2 files changed

+89
-18
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 89 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,10 @@ class Lexer
200200
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
201201
]
202202

203-
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
203+
# Heredocs are complex and require us to keep track of a bit of info to refer to later
204+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
205+
206+
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
204207

205208
# The Parser::Source::Buffer that the tokens were lexed from.
206209
attr_reader :source_buffer
@@ -230,7 +233,7 @@ def to_a
230233
index = 0
231234
length = lexed.length
232235

233-
heredoc_identifier_stack = []
236+
heredoc_stack = Array.new
234237

235238
while index < length
236239
token, state = lexed[index]
@@ -299,9 +302,6 @@ def to_a
299302
when :tSPACE
300303
value = nil
301304
when :tSTRING_BEG
302-
if token.type == :HEREDOC_START
303-
heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
304-
end
305305
next_token = lexed[index][0]
306306
next_next_token = lexed[index + 1][0]
307307
basic_quotes = ["\"", "'"].include?(value)
@@ -321,17 +321,39 @@ def to_a
321321
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
322322
index += 2
323323
end
324-
elsif value.start_with?("<<")
324+
elsif token.type == :HEREDOC_START
325325
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
326+
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
327+
heredoc = HeredocData.new(
328+
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
329+
common_whitespace: 0,
330+
)
331+
326332
if quote == "`"
327333
type = :tXSTRING_BEG
328334
value = "<<`"
329335
else
336+
# The parser gem trims whitespace from squiggly heredocs. We must record
337+
# the most common whitespace to later remove.
338+
if heredoc_type == "~"
339+
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
340+
end
341+
330342
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
331343
end
344+
345+
heredoc_stack.push(heredoc)
332346
end
333347
when :tSTRING_CONTENT
334-
unless (lines = token.value.lines).one?
348+
if (lines = token.value.lines).one?
349+
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
350+
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
351+
# The parser gem only removes indentation when the heredoc is not nested
352+
not_nested = heredoc_stack.size == 1
353+
if is_first_token_on_line && not_nested && (heredoc = heredoc_stack[0]).common_whitespace > 0
354+
value = trim_heredoc_whitespace(value, heredoc)
355+
end
356+
else
335357
start_offset = offset_cache[token.location.start_offset]
336358
lines.map do |line|
337359
newline = line.end_with?("\r\n") ? "\r\n" : "\n"
@@ -361,7 +383,7 @@ def to_a
361383
when :tSTRING_END
362384
if token.type == :HEREDOC_END && value.end_with?("\n")
363385
newline_length = value.end_with?("\r\n") ? 2 : 1
364-
value = heredoc_identifier_stack.pop
386+
value = heredoc_stack.pop.identifier
365387
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
366388
elsif token.type == :REGEXP_END
367389
value = value[0]
@@ -439,6 +461,65 @@ def parse_rational(value)
439461
rescue ArgumentError
440462
0r
441463
end
464+
465+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
#
# Computes the common leading whitespace width of a squiggly heredoc body,
# i.e. the smallest indentation among the lines that count for dedenting.
#
# +heredoc_token_index+ is the index into +lexed+ of the first token to
# inspect (the caller passes the index following the HEREDOC_START token).
# Scanning stops at the HEREDOC_END that closes this heredoc, or when the
# token list is exhausted.
#
# Returns the width in columns, with a tab advancing to the next multiple
# of 8. Returns Float::MAX when no line contributed any indentation (for
# example an empty body), which matches the initial value callers already
# had to cope with.
def calculate_heredoc_whitespace(heredoc_token_index)
  next_token_index = heredoc_token_index
  nesting_level = 0
  previous_line = -1
  result = Float::MAX

  while (next_token = lexed[next_token_index]&.first)
    next_token_index += 1
    next_next_token = lexed[next_token_index]&.first

    # Track the line of every token that is visited, not only of the ones
    # that lowered the minimum. Otherwise string content that follows
    # interpolation on the same line would be mistaken for a line start.
    current_line = next_token.location.start_line
    first_token_on_line = current_line != previous_line
    previous_line = current_line

    case next_token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # A line that begins with interpolation has no leading whitespace at
      # all, so nothing can be removed from any line of this heredoc.
      result = 0 if next_token.type == :EMBEXPR_BEGIN && nesting_level == 0 && first_token_on_line
      # String content inside nested heredocs and interpolation is ignored
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      if nesting_level == 0 && first_token_on_line
        leading_whitespace = next_token.value[/^\s*/]

        common_whitespace = 0
        leading_whitespace.each_char do |char|
          if char == "\t"
            common_whitespace = (common_whitespace / 8 + 1) * 8
          else
            common_whitespace += 1
          end
        end

        # Blank lines do not take part in the calculation. Compare character
        # counts (not the tab-expanded width) to detect them. Whitespace is
        # significant if followed by interpolation on the same line.
        whitespace_only = leading_whitespace.length == next_token.value.length &&
          next_next_token&.location&.start_line != current_line

        result = common_whitespace if !whitespace_only && common_whitespace < result
      end
    end
  end

  result
end
505+
506+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
#
# Strips leading spaces and tabs from +string+ until the removed width
# reaches +heredoc.common_whitespace+. A space is one column wide and a tab
# advances to the next multiple of 8. A tab that would overshoot the target
# width is left in place. Returns the remainder of the string.
def trim_heredoc_whitespace(string, heredoc)
  limit = heredoc.common_whitespace
  removed_width = 0
  cut_at = 0

  while removed_width < limit
    case string[cut_at]
    when "\t"
      removed_width = (removed_width / 8 + 1) * 8
      # Removing this tab would take away more than the common indentation.
      break if removed_width > limit
    when " "
      removed_width += 1
    else
      # Reached content (or the end of the string).
      break
    end

    cut_at += 1
  end

  string[cut_at..]
end
442523
end
443524
end
444525
end

test/prism/ruby/parser_test.rb

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,6 @@ class ParserTest < TestCase
104104
"seattlerb/dsym_esc_to_sym.txt",
105105
"seattlerb/heredoc__backslash_dos_format.txt",
106106
"seattlerb/heredoc_backslash_nl.txt",
107-
"seattlerb/heredoc_squiggly_blank_line_plus_interpolation.txt",
108-
"seattlerb/heredoc_squiggly_blank_lines.txt",
109-
"seattlerb/heredoc_squiggly_interp.txt",
110-
"seattlerb/heredoc_squiggly_tabs_extra.txt",
111-
"seattlerb/heredoc_squiggly_tabs.txt",
112-
"seattlerb/heredoc_squiggly_visually_blank_lines.txt",
113-
"seattlerb/heredoc_squiggly.txt",
114107
"seattlerb/heredoc_unicode.txt",
115108
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
116109
"seattlerb/heredoc_with_carriage_return_escapes.txt",
@@ -150,15 +143,12 @@ class ParserTest < TestCase
150143
"whitequark/bug_ascii_8bit_in_literal.txt",
151144
"whitequark/bug_def_no_paren_eql_begin.txt",
152145
"whitequark/dedenting_heredoc.txt",
153-
"whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt",
154146
"whitequark/forward_arg_with_open_args.txt",
155147
"whitequark/interp_digit_var.txt",
156148
"whitequark/lbrace_arg_after_command_args.txt",
157149
"whitequark/multiple_pattern_matches.txt",
158150
"whitequark/newline_in_hash_argument.txt",
159151
"whitequark/parser_bug_640.txt",
160-
"whitequark/parser_drops_truncated_parts_of_squiggly_heredoc.txt",
161-
"whitequark/ruby_bug_11990.txt",
162152
"whitequark/ruby_bug_14690.txt",
163153
"whitequark/ruby_bug_9669.txt",
164154
"whitequark/slash_newline_in_heredocs.txt",

0 commit comments

Comments
 (0)