From 2e1b92670cfa2c05c91b7923c0c168351b53a7d0 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Sat, 11 Jan 2025 18:48:48 +0100 Subject: [PATCH 1/2] Better comment token handling for the parser translator There appear to be a bunch of rules, changing behaviour for inline comments, multiple comments one after another, etc. This seems to line up with reality pretty closely: token differences for RuboCop tests go from 1129 to 619, which seems pretty impressive --- lib/prism/translation/parser/lexer.rb | 45 +++++++++++++++++++++++---- test/prism/ruby/parser_test.rb | 9 ------ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index 74751d2fe5..936305e7c3 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -200,6 +200,11 @@ class Lexer :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS ] + # Types of tokens that are allowed to continue a method call with comments in-between. + # For these, the parser gem doesn't emit a newline token after the last comment. + COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT] + private_constant :COMMENT_CONTINUATION_TYPES + # Heredocs are complex and require us to keep track of a bit of info to refer to later HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true) @@ -236,6 +241,11 @@ def to_a heredoc_stack = Array.new quote_stack = Array.new + # The parser gem emits the newline tokens for comments out of order. This saves + # that token location to emit at a later time to properly line everything up. 
+ # https://github.com/whitequark/parser/issues/1025 + comment_newline_location = nil + while index < length token, state = lexed[index] index += 1 @@ -257,23 +267,46 @@ def to_a value.delete_prefix!("?") when :tCOMMENT if token.type == :EMBDOC_BEGIN - start_index = index while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1) value += next_token.value index += 1 end - if start_index != index - value += next_token.value - location = range(token.location.start_offset, lexed[index][0].location.end_offset) - index += 1 - end + value += next_token.value + location = range(token.location.start_offset, lexed[index][0].location.end_offset) + index += 1 else value.chomp! location = range(token.location.start_offset, token.location.end_offset - 1) + + prev_token = lexed[index - 2][0] + next_token = lexed[index][0] + + is_inline_comment = prev_token.location.start_line == token.location.start_line + if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type) + tokens << [:tCOMMENT, [value, location]] + + nl_location = range(token.location.end_offset - 1, token.location.end_offset) + tokens << [:tNL, [nil, nl_location]] + next + elsif is_inline_comment && next_token&.type == :COMMENT + comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset) + elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type) + tokens << [:tCOMMENT, [value, location]] + tokens << [:tNL, [nil, comment_newline_location]] + comment_newline_location = nil + next + end end when :tNL + next_token = next_token = lexed[index][0] + # Newlines after comments are emitted out of order. 
+ if next_token&.type == :COMMENT + comment_newline_location = location + next + end + value = nil when :tFLOAT value = parse_float(value) diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index 0a0baa52f4..7ec712a4f0 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -81,29 +81,21 @@ class ParserTest < TestCase # These files are failing to translate their lexer output into the lexer # output expected by the parser gem, so we'll skip them for now. skip_tokens = [ - "comments.txt", "dash_heredocs.txt", "dos_endings.txt", "embdoc_no_newline_at_end.txt", - "heredoc_with_comment.txt", "heredocs_with_ignored_newlines.txt", - "indented_file_end.txt", "methods.txt", "strings.txt", "tilde_heredocs.txt", "seattlerb/backticks_interpolation_line.txt", "seattlerb/bug169.txt", "seattlerb/case_in.txt", - "seattlerb/class_comments.txt", "seattlerb/difficult4__leading_dots2.txt", "seattlerb/difficult6__7.txt", "seattlerb/difficult6__8.txt", "seattlerb/dsym_esc_to_sym.txt", "seattlerb/heredoc_unicode.txt", - "seattlerb/module_comments.txt", - "seattlerb/parse_line_block_inline_comment_leading_newlines.txt", - "seattlerb/parse_line_block_inline_comment.txt", - "seattlerb/parse_line_block_inline_multiline_comment.txt", "seattlerb/parse_line_heredoc.txt", "seattlerb/pct_w_heredoc_interp_nested.txt", "seattlerb/read_escape_unicode_curlies.txt", @@ -117,7 +109,6 @@ class ParserTest < TestCase "whitequark/beginless_erange_after_newline.txt", "whitequark/beginless_irange_after_newline.txt", "whitequark/bug_ascii_8bit_in_literal.txt", - "whitequark/bug_def_no_paren_eql_begin.txt", "whitequark/forward_arg_with_open_args.txt", "whitequark/kwarg_no_paren.txt", "whitequark/lbrace_arg_after_command_args.txt", From 2bcc4948b9c992a0e8403490af4b930dc819681b Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Sat, 11 Jan 2025 18:49:30 +0100 Subject: [PATCH 2/2] Ignore the parser translator lexer for 
sorbet The whole `while`/`case` construct makes typechecking difficult because sorbet does not recognize that outside variables may change their value in individual `when` branches. --- lib/prism/translation/parser/lexer.rb | 4 ++-- rakelib/typecheck.rake | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index 936305e7c3..05354ab29d 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -238,8 +238,8 @@ def to_a index = 0 length = lexed.length - heredoc_stack = Array.new - quote_stack = Array.new + heredoc_stack = [] + quote_stack = [] # The parser gem emits the newline tokens for comments out of order. This saves # that token location to emit at a later time to properly line everything up. diff --git a/rakelib/typecheck.rake b/rakelib/typecheck.rake index 7a43368555..497282d6f0 100644 --- a/rakelib/typecheck.rake +++ b/rakelib/typecheck.rake @@ -37,6 +37,7 @@ namespace :typecheck do - ./lib/prism/node_ext.rb - ./lib/prism/parse_result.rb - ./lib/prism/visitor.rb + - ./lib/prism/translation/parser/lexer.rb - ./lib/prism/translation/ripper.rb - ./lib/prism/translation/ripper/sexp.rb - ./lib/prism/translation/ruby_parser.rb