From 2e1b92670cfa2c05c91b7923c0c168351b53a7d0 Mon Sep 17 00:00:00 2001
From: Earlopain <14981592+Earlopain@users.noreply.github.com>
Date: Sat, 11 Jan 2025 18:48:48 +0100
Subject: [PATCH 1/2] Better comment token handling for the parser translator

There appear to be a bunch of rules that change behaviour for
inline comments, multiple comments one after another, etc.

This seems to line up with reality pretty closely: token differences for RuboCop tests go from 1129 to 619, which seems pretty impressive.
---
 lib/prism/translation/parser/lexer.rb | 45 +++++++++++++++++++++++----
 test/prism/ruby/parser_test.rb        |  9 ------
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index 74751d2fe5..936305e7c3 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -200,6 +200,11 @@ class Lexer
           :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
         ]
 
+        # Types of tokens that are allowed to continue a method call with comments in-between.
+        # For these, the parser gem doesn't emit a newline token after the last comment.
+        COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT]
+        private_constant :COMMENT_CONTINUATION_TYPES
+
         # Heredocs are complex and require us to keep track of a bit of info to refer to later
         HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
 
@@ -236,6 +241,11 @@ def to_a
           heredoc_stack = Array.new
           quote_stack = Array.new
 
+          # The parser gem emits the newline tokens for comments out of order. This saves
+          # that token location to emit at a later time to properly line everything up.
+          # https://github.com/whitequark/parser/issues/1025
+          comment_newline_location = nil
+
           while index < length
             token, state = lexed[index]
             index += 1
@@ -257,23 +267,46 @@ def to_a
               value.delete_prefix!("?")
             when :tCOMMENT
               if token.type == :EMBDOC_BEGIN
-                start_index = index
 
                 while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
                   value += next_token.value
                   index += 1
                 end
 
-                if start_index != index
-                  value += next_token.value
-                  location = range(token.location.start_offset, lexed[index][0].location.end_offset)
-                  index += 1
-                end
+                value += next_token.value
+                location = range(token.location.start_offset, lexed[index][0].location.end_offset)
+                index += 1
               else
                 value.chomp!
                 location = range(token.location.start_offset, token.location.end_offset - 1)
+
+                prev_token = lexed[index - 2][0]
+                next_token = lexed[index][0]
+
+                is_inline_comment = prev_token.location.start_line == token.location.start_line
+                if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+
+                  nl_location = range(token.location.end_offset - 1, token.location.end_offset)
+                  tokens << [:tNL, [nil, nl_location]]
+                  next
+                elsif is_inline_comment && next_token&.type == :COMMENT
+                  comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
+                elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+                  tokens << [:tNL, [nil, comment_newline_location]]
+                  comment_newline_location = nil
+                  next
+                end
               end
             when :tNL
+              next_token = next_token = lexed[index][0]
+              # Newlines after comments are emitted out of order.
+              if next_token&.type == :COMMENT
+                comment_newline_location = location
+                next
+              end
+
               value = nil
             when :tFLOAT
               value = parse_float(value)
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
index 0a0baa52f4..7ec712a4f0 100644
--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@@ -81,29 +81,21 @@ class ParserTest < TestCase
     # These files are failing to translate their lexer output into the lexer
     # output expected by the parser gem, so we'll skip them for now.
     skip_tokens = [
-      "comments.txt",
       "dash_heredocs.txt",
       "dos_endings.txt",
       "embdoc_no_newline_at_end.txt",
-      "heredoc_with_comment.txt",
       "heredocs_with_ignored_newlines.txt",
-      "indented_file_end.txt",
       "methods.txt",
       "strings.txt",
       "tilde_heredocs.txt",
       "seattlerb/backticks_interpolation_line.txt",
       "seattlerb/bug169.txt",
       "seattlerb/case_in.txt",
-      "seattlerb/class_comments.txt",
       "seattlerb/difficult4__leading_dots2.txt",
       "seattlerb/difficult6__7.txt",
       "seattlerb/difficult6__8.txt",
       "seattlerb/dsym_esc_to_sym.txt",
       "seattlerb/heredoc_unicode.txt",
-      "seattlerb/module_comments.txt",
-      "seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
-      "seattlerb/parse_line_block_inline_comment.txt",
-      "seattlerb/parse_line_block_inline_multiline_comment.txt",
       "seattlerb/parse_line_heredoc.txt",
       "seattlerb/pct_w_heredoc_interp_nested.txt",
       "seattlerb/read_escape_unicode_curlies.txt",
@@ -117,7 +109,6 @@ class ParserTest < TestCase
       "whitequark/beginless_erange_after_newline.txt",
       "whitequark/beginless_irange_after_newline.txt",
       "whitequark/bug_ascii_8bit_in_literal.txt",
-      "whitequark/bug_def_no_paren_eql_begin.txt",
       "whitequark/forward_arg_with_open_args.txt",
       "whitequark/kwarg_no_paren.txt",
       "whitequark/lbrace_arg_after_command_args.txt",

From 2bcc4948b9c992a0e8403490af4b930dc819681b Mon Sep 17 00:00:00 2001
From: Earlopain <14981592+Earlopain@users.noreply.github.com>
Date: Sat, 11 Jan 2025 18:49:30 +0100
Subject: [PATCH 2/2] Ignore the parser translator lexer for sorbet

The whole while/case construct makes typechecking difficult because
Sorbet does not recognize that outside variables
may change their value in individual `when` branches.
---
 lib/prism/translation/parser/lexer.rb | 4 ++--
 rakelib/typecheck.rake                | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index 936305e7c3..05354ab29d 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -238,8 +238,8 @@ def to_a
           index = 0
           length = lexed.length
 
-          heredoc_stack = Array.new
-          quote_stack = Array.new
+          heredoc_stack = []
+          quote_stack = []
 
           # The parser gem emits the newline tokens for comments out of order. This saves
           # that token location to emit at a later time to properly line everything up.
diff --git a/rakelib/typecheck.rake b/rakelib/typecheck.rake
index 7a43368555..497282d6f0 100644
--- a/rakelib/typecheck.rake
+++ b/rakelib/typecheck.rake
@@ -37,6 +37,7 @@ namespace :typecheck do
         - ./lib/prism/node_ext.rb
         - ./lib/prism/parse_result.rb
         - ./lib/prism/visitor.rb
+        - ./lib/prism/translation/parser/lexer.rb
         - ./lib/prism/translation/ripper.rb
         - ./lib/prism/translation/ripper/sexp.rb
         - ./lib/prism/translation/ruby_parser.rb