@@ -201,7 +201,7 @@ class Lexer
201201 ]
202202
203203 # Heredocs are complex and require us to keep track of a bit of info to refer to later
# Per-heredoc bookkeeping record (removed and added forms of the definition; the added form gains :quote).
#   identifier        - the heredoc identifier, extracted from the opening token's value.
#   common_whitespace - starts at 0; set from calculate_heredoc_whitespace and consulted when trimming content lines.
#   quote             - the quote value seen at the heredoc opener; unescape_heredoc leaves content untouched when it is "'".
204- HeredocData = Struct . new ( :identifier , :common_whitespace , keyword_init : true )
204+ HeredocData = Struct . new ( :identifier , :common_whitespace , :quote , keyword_init : true )
205205
206206 private_constant :TYPES , :EXPR_BEG , :EXPR_LABEL , :LAMBDA_TOKEN_TYPES , :LPAREN_CONVERSION_TOKEN_TYPES , :HeredocData
207207
@@ -316,7 +316,7 @@ def to_a
316316 # the parser gem doesn't simplify strings when its value ends in a newline
317317 unless ( string_value = next_token . value ) . end_with? ( "\n " )
318318 next_location = token . location . join ( next_next_token . location )
319- value = string_value . gsub ( " \\ \\ " , " \\ " )
319+ value = unescape_string ( string_value )
320320 type = :tSTRING
321321 location = Range . new ( source_buffer , offset_cache [ next_location . start_offset ] , offset_cache [ next_location . end_offset ] )
322322 index += 2
@@ -327,19 +327,23 @@ def to_a
327327 heredoc = HeredocData . new (
328328 identifier : value . match ( /<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z / ) [ :heredoc_identifier ] ,
329329 common_whitespace : 0 ,
330+ quote : quote ,
330331 )
331332
332333 if quote == "`"
333334 type = :tXSTRING_BEG
334- value = "<<`"
335- else
336- # The parser gem trims whitespace from squiggly heredocs. We must record
337- # the most common whitespace to later remove.
338- if heredoc_type == "~"
339- heredoc . common_whitespace = calculate_heredoc_whitespace ( index )
340- end
335+ end
341336
342- value = "<<#{ quote == "'" || quote == "\" " ? quote : "\" " } "
337+ # The parser gem trims whitespace from squiggly heredocs. We must record
338+ # the most common whitespace to later remove.
339+ if heredoc_type == "~" || heredoc_type == "`"
340+ heredoc . common_whitespace = calculate_heredoc_whitespace ( index )
341+ end
342+
343+ if quote == "'" || quote == '"' || quote == "`"
344+ value = "<<#{ quote } "
345+ else
346+ value = '<<"'
343347 end
344348
345349 heredoc_stack . push ( heredoc )
@@ -350,31 +354,43 @@ def to_a
350354 is_first_token_on_line = lexed [ index - 1 ] && token . location . start_line != lexed [ index - 2 ] [ 0 ] . location &.start_line
351355 # The parser gem only removes indentation when the heredoc is not nested
352356 not_nested = heredoc_stack . size == 1
353- if is_first_token_on_line && not_nested && ( heredoc = heredoc_stack [ 0 ] ) . common_whitespace > 0
357+ current_heredoc = heredoc_stack . last
358+ if is_first_token_on_line && not_nested && current_heredoc . common_whitespace > 0
354359 value = trim_heredoc_whitespace ( value , heredoc )
355360 end
361+ if current_heredoc
362+ value = unescape_heredoc ( value , heredoc )
363+ end
356364 else
365+ # When the parser gem encounters a line continuation inside of a multiline string,
366+ # it emits a single string node. The backslash (and remaining newline) is removed.
367+ current_line = +""
368+ adjustment = 0
357369 start_offset = offset_cache [ token . location . start_offset ]
358- lines . map do |line |
359- newline = line . end_with? ( "\r \n " ) ? "\r \n " : "\n "
370+ emit = false
371+
372+ lines . each . with_index do |line , index |
360373 chomped_line = line . chomp
361- if match = chomped_line . match ( /(?<backslashes>\\ +)\z / )
362- adjustment = match [ :backslashes ] . size / 2
363- adjusted_line = chomped_line . delete_suffix ( "\\ " * adjustment )
364- if match [ :backslashes ] . size . odd?
365- adjusted_line . delete_suffix! ( "\\ " )
366- adjustment += 2
367- else
368- adjusted_line << newline
369- end
374+
375+ # When the line ends with an odd number of backslashes, it must be a line continuation.
376+ if chomped_line [ /\\ {1,}\z / ] &.length &.odd?
377+ chomped_line . delete_suffix! ( "\\ " )
378+ current_line << chomped_line
379+ adjustment += 2
380+ # If the string ends with a line continuation emit the remainder
381+ emit = index == lines . count - 1
370382 else
371- adjusted_line = line
372- adjustment = 0
383+ current_line << line
384+ emit = true
373385 end
374386
375- end_offset = start_offset + adjusted_line . bytesize + adjustment
376- tokens << [ :tSTRING_CONTENT , [ adjusted_line , Range . new ( source_buffer , offset_cache [ start_offset ] , offset_cache [ end_offset ] ) ] ]
377- start_offset = end_offset
387+ if emit
388+ end_offset = start_offset + current_line . bytesize + adjustment
389+ tokens << [ :tSTRING_CONTENT , [ unescape_string ( current_line ) , Range . new ( source_buffer , offset_cache [ start_offset ] , offset_cache [ end_offset ] ) ] ]
390+ start_offset = end_offset
391+ current_line = +""
392+ adjustment = 0
393+ end
378394 end
379395 next
380396 end
@@ -524,6 +540,30 @@ def trim_heredoc_whitespace(string, heredoc)
524540
525541 string [ trimmed_characters ..]
526542 end
543+
# Collapses each escaped backslash (two consecutive backslash characters) in
# +string+ into a single backslash, scanning left to right without overlap.
# A lone backslash that is not part of such a pair is left as-is.
#
# @param string [String] raw string content
# @return [String] a new string; the receiver is not mutated
def unescape_string(string)
  # The block form is used so the replacement is taken verbatim; a replacement
  # *string* would have its own backslash sequences interpreted by gsub.
  string.gsub("\\\\") { "\\" }
end
547+
# Escape sequences that have special meaning and should appear unescaped in the resulting string.
# Keys are the single character that follows the backslash in the source text
# (the table is indexed with one character), values are the character it denotes.
ESCAPES = {
  "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
  "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
  "v" => "\v", "\\" => "\\"
}.freeze
554+ private_constant :ESCAPES
555+
# Resolves backslash escapes in a piece of heredoc content.
#
# Single-quoted heredocs are returned untouched. Otherwise every backslash
# followed by a character (other than a newline, which the pattern does not
# match) is replaced: known escapes are looked up in ESCAPES, so "\\n" becomes
# "\n" and "\\\\n" becomes "\\n"; unknown escape sequences such as "\\o"
# simply become "o".
#
# TODO: Does not handle "\u1234" and other longer-form escapes.
#
# @param string [String] heredoc content
# @param heredoc [#quote] record whose +quote+ tells how the heredoc was opened
# @return [String] the content with escapes resolved
def unescape_heredoc(string, heredoc)
  # In single-quoted heredocs, everything is taken literally.
  return string if heredoc.quote == "'"

  string.gsub(/\\./) do |escape|
    escaped_char = escape[1]
    ESCAPES.fetch(escaped_char, escaped_char)
  end
end
527567 end
528568 end
529569 end
0 commit comments