[ruby/prism] Fix parser translator ast for regex with line continuation

Earlopain · kddnewton · commit a234fd516f82 · 2025-01-11T19:09:05.000-05:00
Turns out, the vast majority of the work was already done when handling the same thing for heredocs. I'm confident this should also apply to actual string nodes (there's even a todo for it), but no tests change if I apply it there too, so I can't say for sure whether the logic would be correct. The individual test files are a bit too large; maybe something else would break that currently passes. Leaving it for later to look more closely into that. ruby/prism@6bba1c54e1
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
@@ -1511,13 +1511,9 @@ def visit_redo_node(node)
         # /foo/
         # ^^^^^
         def visit_regular_expression_node(node)
-          content = node.content
           parts =
-            if content.include?("\n")
-              offset = node.content_loc.start_offset
-              content.lines.map do |line|
-                builder.string_internal([line, srange_offsets(offset, offset += line.bytesize)])
-              end
+            if node.content.include?("\n")
+              string_nodes_from_line_continuations(node, node.content_loc.start_offset, node.opening)
             else
               [builder.string_internal(token(node.content_loc))]
             end
@@ -2074,55 +2070,7 @@ def visit_heredoc(node)
           node.parts.each do |part|
             pushing =
               if part.is_a?(StringNode) && part.unescaped.include?("\n")
-                unescaped = part.unescaped.lines
-                escaped = part.content.lines
-
-                escaped_lengths = []
-                normalized_lengths = []
-                # Keeps track of where an unescaped line should start a new token. An unescaped
-                # \n would otherwise be indistinguishable from the actual newline at the end of
-                # of the line. The parser gem only emits a new string node at "real" newlines,
-                # line continuations don't start a new node as well.
-                do_next_tokens = []
-
-                if node.opening.end_with?("'")
-                  escaped.each do |line|
-                    escaped_lengths << line.bytesize
-                    normalized_lengths << chomped_bytesize(line)
-                    do_next_tokens << true
-                  end
-                else
-                  escaped
-                    .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
-                    .each do |lines|
-                      escaped_lengths << lines.sum(&:bytesize)
-                      normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
-                      unescaped_lines_count = lines.sum do |line|
-                        line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
-                      end
-                      do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
-                      do_next_tokens[-1] = true
-                    end
-                end
-
-                start_offset = part.location.start_offset
-                current_line = +""
-                current_normalized_length = 0
-
-                unescaped.filter_map.with_index do |unescaped_line, index|
-                  current_line << unescaped_line
-                  current_normalized_length += normalized_lengths.fetch(index, 0)
-
-                  if do_next_tokens[index]
-                    inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
-                    start_offset += escaped_lengths.fetch(index, 0)
-                    current_line = +""
-                    current_normalized_length = 0
-                    inner_part
-                  else
-                    nil
-                  end
-                end
+                string_nodes_from_line_continuations(part, part.location.start_offset, node.opening)
               else
                 [visit(part)]
               end
@@ -2172,6 +2120,59 @@ def within_pattern
             parser.pattern_variables.pop
           end
         end
+
+        # Create parser string nodes from a single prism node. The parser gem
+        # "glues" strings together when a line continuation is encountered.
+        def string_nodes_from_line_continuations(node, start_offset, opening)
+          unescaped = node.unescaped.lines
+          escaped = node.content.lines
+
+          escaped_lengths = []
+          normalized_lengths = []
+          # Keeps track of where an unescaped line should start a new token. An unescaped
+          # \n would otherwise be indistinguishable from the actual newline at the end
+          # of the line. The parser gem only emits a new string node at "real" newlines;
+          # line continuations don't start a new node either.
+          do_next_tokens = []
+
+          if opening.end_with?("'")
+            escaped.each do |line|
+              escaped_lengths << line.bytesize
+              normalized_lengths << chomped_bytesize(line)
+              do_next_tokens << true
+            end
+          else
+            escaped
+              .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
+              .each do |lines|
+                escaped_lengths << lines.sum(&:bytesize)
+                normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
+                unescaped_lines_count = lines.sum do |line|
+                  line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
+                end
+                do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
+                do_next_tokens[-1] = true
+              end
+          end
+
+          current_line = +""
+          current_normalized_length = 0
+
+          unescaped.filter_map.with_index do |unescaped_line, index|
+            current_line << unescaped_line
+            current_normalized_length += normalized_lengths.fetch(index, 0)
+
+            if do_next_tokens[index]
+              inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
+              start_offset += escaped_lengths.fetch(index, 0)
+              current_line = +""
+              current_normalized_length = 0
+              inner_part
+            else
+              nil
+            end
+          end
+        end
       end
     end
   end
diff --git a/test/prism/fixtures/regex.txt b/test/prism/fixtures/regex.txt
@@ -46,3 +46,11 @@ tap { /(?<a>)/ =~ to_s }
 def foo(nil:) = /(?<nil>)/ =~ ""
 
 /(?-x:#)/x
+
+/a
+b\
+c\
+d\\\
+e\\
+f\
+/
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
@@ -62,7 +62,6 @@ class ParserTest < TestCase
     # These files are either failing to parse or failing to translate, so we'll
     # skip them for now.
     skip_all = skip_incorrect | [
-      "regex.txt",
       "unescaping.txt",
       "seattlerb/bug190.txt",
       "seattlerb/heredoc_with_extra_carriage_returns_windows.txt",