Better handle heredoc escaping in the parser translator

Earlopain · Earlopain · commit b751c75c66fe · 2024-12-24T10:26:58.000+01:00
This does a few things:
* After a line continuation in a heredoc, emit only a single token
* Dedent backtick (`) heredocs
* Properly handle escapes in single- and double-quoted heredocs

Some of the heredoc escaping logic should be applied to strings as well,
but I'll leave that for a different PR.
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
@@ -201,7 +201,7 @@ class Lexer
         ]
 
         # Heredocs are complex and require us to keep track of a bit of info to refer to later
-        HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
+        HeredocData = Struct.new(:identifier, :common_whitespace, :quote, keyword_init: true)
 
         private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
 
@@ -316,7 +316,7 @@ def to_a
                 # the parser gem doesn't simplify strings when its value ends in a newline
                 unless (string_value = next_token.value).end_with?("\n")
                   next_location = token.location.join(next_next_token.location)
-                  value = string_value.gsub("\\\\", "\\")
+                  value = unescape_string(string_value)
                   type = :tSTRING
                   location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
                   index += 2
@@ -327,19 +327,23 @@ def to_a
                 heredoc = HeredocData.new(
                   identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
                   common_whitespace: 0,
+                  quote: quote,
                 )
 
                 if quote == "`"
                   type = :tXSTRING_BEG
-                  value = "<<`"
-                else
-                  # The parser gem trims whitespace from squiggly heredocs. We must record
-                  # the most common whitespace to later remove.
-                  if heredoc_type == "~"
-                    heredoc.common_whitespace = calculate_heredoc_whitespace(index)
-                  end
+                end
 
-                  value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
+                # The parser gem trims whitespace from squiggly heredocs. We must record
+                # the most common whitespace to later remove.
+                if heredoc_type == "~" || heredoc_type == "`"
+                  heredoc.common_whitespace = calculate_heredoc_whitespace(index)
+                end
+
+                if quote == "'" || quote == '"' || quote == "`"
+                  value = "<<#{quote}"
+                else
+                  value = '<<"'
                 end
 
                 heredoc_stack.push(heredoc)
@@ -350,31 +354,43 @@ def to_a
                 is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
                 # The parser gem only removes indentation when the heredoc is not nested
                 not_nested = heredoc_stack.size == 1
-                if is_first_token_on_line && not_nested && (heredoc = heredoc_stack[0]).common_whitespace > 0
+                current_heredoc = heredoc_stack.last
+                if is_first_token_on_line && not_nested && current_heredoc.common_whitespace > 0
                   value = trim_heredoc_whitespace(value, heredoc)
                 end
+                if current_heredoc
+                  value = unescape_heredoc(value, heredoc)
+                end
               else
+                # When the parser gem encounters a line continuation inside of a multiline string,
+                # it emits a single string node. The backslash (and remaining newline) is removed.
+                current_line = +""
+                adjustment = 0
                 start_offset = offset_cache[token.location.start_offset]
-                lines.map do |line|
-                  newline = line.end_with?("\r\n") ? "\r\n" : "\n"
+                emit = false
+
+                lines.each.with_index do |line, index|
                   chomped_line = line.chomp
-                  if match = chomped_line.match(/(?<backslashes>\\+)\z/)
-                    adjustment = match[:backslashes].size / 2
-                    adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
-                    if match[:backslashes].size.odd?
-                      adjusted_line.delete_suffix!("\\")
-                      adjustment += 2
-                    else
-                      adjusted_line << newline
-                    end
+
+                  # When the line ends with an odd number of backslashes, it must be a line continuation.
+                  if chomped_line[/\\{1,}\z/]&.length&.odd?
+                    chomped_line.delete_suffix!("\\")
+                    current_line << chomped_line
+                    adjustment += 2
+                    # If the string ends with a line continuation emit the remainder
+                    emit = index == lines.count - 1
                   else
-                    adjusted_line = line
-                    adjustment = 0
+                    current_line << line
+                    emit = true
                   end
 
-                  end_offset = start_offset + adjusted_line.bytesize + adjustment
-                  tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
-                  start_offset = end_offset
+                  if emit
+                    end_offset = start_offset + current_line.bytesize + adjustment
+                    tokens << [:tSTRING_CONTENT, [unescape_string(current_line), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
+                    start_offset = end_offset
+                    current_line = +""
+                    adjustment = 0
+                  end
                 end
                 next
               end
@@ -524,6 +540,30 @@ def trim_heredoc_whitespace(string, heredoc)
 
           string[trimmed_characters..]
         end
+
+        def unescape_string(string) # Returns a copy of string with every two consecutive backslashes collapsed into one.
+          string.gsub("\\\\", "\\")
+        end
+
+        # Escape sequences that have special and should appear unescaped in the resulting string.
+        ESCAPES = {
+          "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
+          "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
+          "v" => "\v", "\\\\" => "\\"
+        }.freeze
+        private_constant :ESCAPES
+
+        # Resolves backslash escapes in heredoc content. TODO: Does not handle "\u1234" and other longer-form escapes.
+        def unescape_heredoc(string, heredoc)
+          # In single-quoted heredocs, everything is taken literally.
+          return string if heredoc.quote == "'"
+
+          # Otherwise a backslash plus the (non-newline) character after it is resolved. For example, "\\n" becomes "\n",
+          # and "\\\\n" becomes "\\n". Unknown escape sequences, like "\\o", simply become "o".
+          string.gsub(/\\./) do |match|
+            ESCAPES[match[1]] || match[1]
+          end
+        end
       end
     end
   end
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
@@ -102,13 +102,9 @@ class ParserTest < TestCase
       "seattlerb/difficult6__7.txt",
       "seattlerb/difficult6__8.txt",
       "seattlerb/dsym_esc_to_sym.txt",
-      "seattlerb/heredoc__backslash_dos_format.txt",
-      "seattlerb/heredoc_backslash_nl.txt",
       "seattlerb/heredoc_unicode.txt",
       "seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
       "seattlerb/heredoc_with_carriage_return_escapes.txt",
-      "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt",
-      "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt",
       "seattlerb/module_comments.txt",
       "seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
       "seattlerb/parse_line_block_inline_comment.txt",
@@ -128,14 +124,12 @@ class ParserTest < TestCase
       "seattlerb/str_newline_hash_line_number.txt",
       "seattlerb/TestRubyParserShared.txt",
       "unparser/corpus/literal/assignment.txt",
-      "unparser/corpus/literal/dstr.txt",
       "unparser/corpus/semantic/opasgn.txt",
       "whitequark/args.txt",
       "whitequark/beginless_erange_after_newline.txt",
       "whitequark/beginless_irange_after_newline.txt",
       "whitequark/bug_ascii_8bit_in_literal.txt",
       "whitequark/bug_def_no_paren_eql_begin.txt",
-      "whitequark/dedenting_heredoc.txt",
       "whitequark/forward_arg_with_open_args.txt",
       "whitequark/lbrace_arg_after_command_args.txt",
       "whitequark/multiple_pattern_matches.txt",