@@ -200,7 +200,10 @@ class Lexer
200200 :tEQL , :tLPAREN , :tLPAREN2 , :tLSHFT , :tNL , :tOP_ASGN , :tOROP , :tPIPE , :tSEMI , :tSTRING_DBEG , :tUMINUS , :tUPLUS
201201 ]
202202
203- private_constant :TYPES , :EXPR_BEG , :EXPR_LABEL , :LAMBDA_TOKEN_TYPES , :LPAREN_CONVERSION_TOKEN_TYPES
203+ # Heredocs are complex and require us to keep track of a bit of info to refer to later
204+ HeredocData = Struct . new ( :identifier , :common_whitespace , keyword_init : true )
205+
206+ private_constant :TYPES , :EXPR_BEG , :EXPR_LABEL , :LAMBDA_TOKEN_TYPES , :LPAREN_CONVERSION_TOKEN_TYPES , :HeredocData
204207
205208 # The Parser::Source::Buffer that the tokens were lexed from.
206209 attr_reader :source_buffer
@@ -230,7 +233,7 @@ def to_a
230233 index = 0
231234 length = lexed . length
232235
233- heredoc_identifier_stack = [ ]
236+ heredoc_stack = Array . new
234237
235238 while index < length
236239 token , state = lexed [ index ]
@@ -299,9 +302,6 @@ def to_a
299302 when :tSPACE
300303 value = nil
301304 when :tSTRING_BEG
302- if token . type == :HEREDOC_START
303- heredoc_identifier_stack . push ( value . match ( /<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z / ) [ :heredoc_identifier ] )
304- end
305305 next_token = lexed [ index ] [ 0 ]
306306 next_next_token = lexed [ index + 1 ] [ 0 ]
307307 basic_quotes = [ "\" " , "'" ] . include? ( value )
@@ -321,17 +321,39 @@ def to_a
321321 location = Range . new ( source_buffer , offset_cache [ next_location . start_offset ] , offset_cache [ next_location . end_offset ] )
322322 index += 2
323323 end
324- elsif value . start_with? ( "<<" )
324+ elsif token . type == :HEREDOC_START
325325 quote = value [ 2 ] == "-" || value [ 2 ] == "~" ? value [ 3 ] : value [ 2 ]
326+ heredoc_type = value [ 2 ] == "-" || value [ 2 ] == "~" ? value [ 2 ] : ""
327+ heredoc = HeredocData . new (
328+ identifier : value . match ( /<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z / ) [ :heredoc_identifier ] ,
329+ common_whitespace : 0 ,
330+ )
331+
326332 if quote == "`"
327333 type = :tXSTRING_BEG
328334 value = "<<`"
329335 else
336+ # The parser gem trims whitespace from squiggly heredocs. We must record
337+ # the smallest leading whitespace shared by its lines so it can be removed later.
338+ if heredoc_type == "~"
339+ heredoc . common_whitespace = calculate_heredoc_whitespace ( index )
340+ end
341+
330342 value = "<<#{ quote == "'" || quote == "\" " ? quote : "\" " } "
331343 end
344+
345+ heredoc_stack . push ( heredoc )
332346 end
333347 when :tSTRING_CONTENT
334- unless ( lines = token . value . lines ) . one?
348+ if ( lines = token . value . lines ) . one?
349+ # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
350+ is_first_token_on_line = lexed [ index - 1 ] && token . location . start_line != lexed [ index - 2 ] [ 0 ] . location &.start_line
351+ # The parser gem only removes indentation when the heredoc is not nested
352+ not_nested = heredoc_stack . size == 1
353+ if is_first_token_on_line && not_nested && ( heredoc = heredoc_stack [ 0 ] ) . common_whitespace > 0
354+ value = trim_heredoc_whitespace ( value , heredoc )
355+ end
356+ else
335357 start_offset = offset_cache [ token . location . start_offset ]
336358 lines . map do |line |
337359 newline = line . end_with? ( "\r \n " ) ? "\r \n " : "\n "
@@ -361,7 +383,7 @@ def to_a
361383 when :tSTRING_END
362384 if token . type == :HEREDOC_END && value . end_with? ( "\n " )
363385 newline_length = value . end_with? ( "\r \n " ) ? 2 : 1
364- value = heredoc_identifier_stack . pop
386+ value = heredoc_stack . pop . identifier
365387 location = Range . new ( source_buffer , offset_cache [ token . location . start_offset ] , offset_cache [ token . location . end_offset - newline_length ] )
366388 elsif token . type == :REGEXP_END
367389 value = value [ 0 ]
@@ -439,6 +461,65 @@ def parse_rational(value)
439461 rescue ArgumentError
440462 0 r
441463 end
464+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
#
# Scans the lexed tokens starting at +heredoc_token_index+ and returns the
# smallest leading-whitespace width found on the heredoc's own lines. A tab
# advances the width to the next multiple of 8; every other whitespace
# character counts as 1.
#
# Scanning stops at the HEREDOC_END that closes the heredoc (or when the
# token list runs out). Returns Float::MAX when no line contributed a width.
def calculate_heredoc_whitespace(heredoc_token_index)
  next_token_index = heredoc_token_index
  nesting_level = 0
  previous_line = -1
  result = Float::MAX

  while (next_token = lexed[next_token_index]&.first)
    next_token_index += 1
    next_next_token = lexed[next_token_index]&.first

    case next_token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # String content inside nested heredocs and interpolation is ignored
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      next unless nesting_level == 0

      current_line = next_token.location.start_line
      is_first_token_on_line = current_line != previous_line
      # Remember the line of every top-level content token, not only of the
      # ones that lowered the result. Otherwise content that follows an
      # interpolation on the same line would be mistaken for line-leading.
      previous_line = current_line

      leading_whitespace = next_token.value[/\A\s*/]
      common_whitespace = 0
      leading_whitespace.each_char do |char|
        if char == "\t"
          common_whitespace = (common_whitespace / 8 + 1) * 8
        else
          common_whitespace += 1
        end
      end

      # Whitespace is significant if followed by interpolation. Compare
      # character counts here: the tab-expanded width would make a line
      # holding only tabs look like it has content.
      whitespace_only = leading_whitespace.length == next_token.value.length &&
        next_next_token&.location&.start_line != current_line

      if is_first_token_on_line && !whitespace_only && common_whitespace < result
        result = common_whitespace
      end
    end
  end

  result
end
505+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
#
# Returns +string+ without its leading indentation, removing at most
# +heredoc.common_whitespace+ columns of spaces and tabs. A tab advances the
# removed width to the next multiple of 8; if that would exceed the common
# whitespace, the tab and everything after it are kept.
def trim_heredoc_whitespace(string, heredoc)
  trimmed_whitespace = 0
  trimmed_characters = 0

  while trimmed_whitespace < heredoc.common_whitespace
    case string[trimmed_characters]
    when "\t"
      trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8
      # A tab that overshoots the common whitespace must stay in the string.
      break if trimmed_whitespace > heredoc.common_whitespace
    when " "
      trimmed_whitespace += 1
    else
      # First non-indentation character (or end of string): nothing more to trim.
      break
    end
    trimmed_characters += 1
  end

  string[trimmed_characters..]
end
442523 end
443524 end
444525 end
0 commit comments