@@ -200,7 +200,10 @@ class Lexer
200200 :tEQL , :tLPAREN , :tLPAREN2 , :tLSHFT , :tNL , :tOP_ASGN , :tOROP , :tPIPE , :tSEMI , :tSTRING_DBEG , :tUMINUS , :tUPLUS
201201 ]
202202
203- private_constant :TYPES , :EXPR_BEG , :EXPR_LABEL , :LAMBDA_TOKEN_TYPES , :LPAREN_CONVERSION_TOKEN_TYPES
203+ # Heredocs are complex and require us to keep track of a bit of info to refer to later
204+ HeredocData = Struct . new ( :identifier , :common_whitespace , keyword_init : true )
205+
206+ private_constant :TYPES , :EXPR_BEG , :EXPR_LABEL , :LAMBDA_TOKEN_TYPES , :LPAREN_CONVERSION_TOKEN_TYPES , :HeredocData
204207
205208 # The Parser::Source::Buffer that the tokens were lexed from.
206209 attr_reader :source_buffer
@@ -230,7 +233,7 @@ def to_a
230233 index = 0
231234 length = lexed . length
232235
233- heredoc_identifier_stack = [ ]
236+ heredoc_stack = Array . new
234237
235238 while index < length
236239 token , state = lexed [ index ]
@@ -299,9 +302,6 @@ def to_a
299302 when :tSPACE
300303 value = nil
301304 when :tSTRING_BEG
302- if token . type == :HEREDOC_START
303- heredoc_identifier_stack . push ( value . match ( /<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z / ) [ :heredoc_identifier ] )
304- end
305305 next_token = lexed [ index ] [ 0 ]
306306 next_next_token = lexed [ index + 1 ] [ 0 ]
307307 basic_quotes = [ "\" " , "'" ] . include? ( value )
@@ -321,17 +321,39 @@ def to_a
321321 location = Range . new ( source_buffer , offset_cache [ next_location . start_offset ] , offset_cache [ next_location . end_offset ] )
322322 index += 2
323323 end
324- elsif value . start_with? ( "<<" )
324+ elsif token . type == :HEREDOC_START
325325 quote = value [ 2 ] == "-" || value [ 2 ] == "~" ? value [ 3 ] : value [ 2 ]
326+ heredoc_type = value [ 2 ] == "-" || value [ 2 ] == "~" ? value [ 2 ] : ""
327+ heredoc = HeredocData . new (
328+ identifier : value . match ( /<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z / ) [ :heredoc_identifier ] ,
329+ common_whitespace : 0 ,
330+ )
331+
326332 if quote == "`"
327333 type = :tXSTRING_BEG
328334 value = "<<`"
329335 else
336+ # The parser gem trims whitespace from squiggly heredocs. We must record
337+ # the most common whitespace to later remove.
338+ if heredoc_type == "~"
339+ heredoc . common_whitespace = calculate_heredoc_whitespace ( index )
340+ end
341+
330342 value = "<<#{ quote == "'" || quote == "\" " ? quote : "\" " } "
331343 end
344+
345+ heredoc_stack . push ( heredoc )
332346 end
333347 when :tSTRING_CONTENT
334- unless ( lines = token . value . lines ) . one?
348+ if ( lines = token . value . lines ) . one?
349+ # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
350+ is_first_token_on_line = lexed [ index - 1 ] && token . location . start_line != lexed [ index - 2 ] [ 0 ] . location &.start_line
351+ # The parser gem only removes indentation when the heredoc is not nested
352+ not_nested = heredoc_stack . size == 1
353+ if is_first_token_on_line && not_nested && ( heredoc = heredoc_stack [ 0 ] ) . common_whitespace > 0
354+ value = trim_heredoc_whitespace ( value , heredoc )
355+ end
356+ else
335357 start_offset = offset_cache [ token . location . start_offset ]
336358 lines . map do |line |
337359 newline = line . end_with? ( "\r \n " ) ? "\r \n " : "\n "
@@ -361,7 +383,7 @@ def to_a
361383 when :tSTRING_END
362384 if token . type == :HEREDOC_END && value . end_with? ( "\n " )
363385 newline_length = value . end_with? ( "\r \n " ) ? 2 : 1
364- value = heredoc_identifier_stack . pop
386+ value = heredoc_stack . pop . identifier
365387 location = Range . new ( source_buffer , offset_cache [ token . location . start_offset ] , offset_cache [ token . location . end_offset - newline_length ] )
366388 elsif token . type == :REGEXP_END
367389 value = value [ 0 ]
@@ -443,6 +465,65 @@ def parse_rational(value)
443465 rescue ArgumentError
444466 0 r
445467 end
468+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
#
# Walks the `lexed` token pairs starting at +heredoc_token_index+ and returns
# the smallest indentation width found at the start of a line of string
# content belonging to the current heredoc. A tab advances the width to the
# next multiple of 8; every other whitespace character counts as 1.
#
# heredoc_token_index - index into `lexed` where scanning begins. The scan
#                       stops once a HEREDOC_END/EMBEXPR_END drops the nesting
#                       level below the starting level, so this is expected to
#                       point just past the heredoc's opening token.
#
# Returns the minimal width as an Integer, or Float::MAX when no line
# contributed (no string content, or only whitespace-only lines).
def calculate_heredoc_whitespace(heredoc_token_index)
  next_token_index = heredoc_token_index
  nesting_level = 0
  previous_line = -1
  result = Float::MAX

  while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
    next_token_index += 1
    next_next_token = lexed[next_token_index] && lexed[next_token_index][0]

    case next_token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # String content inside nested heredocs and interpolation is ignored
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      next unless nesting_level == 0

      value = next_token.value
      current_line = next_token.location.start_line
      leading_whitespace = value[/^\s*/]

      common_whitespace = 0
      leading_whitespace.each_char do |char|
        if char == "\t"
          common_whitespace = (common_whitespace / 8 + 1) * 8
        else
          common_whitespace += 1
        end
      end

      # Always remember the line of the last string content we looked at.
      # Otherwise a fragment that follows interpolation on a line whose first
      # fragment did not lower the result would be mistaken for a line start.
      is_first_token_on_line = current_line != previous_line
      previous_line = current_line

      # Whitespace is significant if followed by interpolation.
      # Compare character counts (not the tab-expanded width) so that blank
      # lines containing tabs are recognised as whitespace-only too.
      whitespace_only = leading_whitespace.length == value.length &&
        next_next_token&.location&.start_line != current_line

      if is_first_token_on_line && !whitespace_only && common_whitespace < result
        result = common_whitespace
      end
    end
  end

  result
end
509+
# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
#
# Drops leading spaces and tabs from +string+ until the removed width reaches
# +heredoc.common_whitespace+. A tab advances the width to the next multiple
# of 8 and is kept if consuming it would overshoot the limit.
#
# string  - the String to trim.
# heredoc - object responding to `common_whitespace` (the width to remove).
#
# Returns the remainder of +string+ after the trimmed prefix.
def trim_heredoc_whitespace(string, heredoc)
  limit = heredoc.common_whitespace
  width = 0
  cut = 0

  string.each_char do |char|
    break if width >= limit

    case char
    when "\t"
      width = (width / 8 + 1) * 8
      # A tab that jumps past the limit stays in the string.
      break if width > limit
    when " "
      width += 1
    else
      break
    end

    cut += 1
  end

  string[cut..]
end
446527 end
447528 end
448529 end
0 commit comments