[ruby/prism] Fix parser translator tSPACE tokens for percent arrays

Earlopain · matzbot · commit 566f9463c2be · 2025-01-12T18:54:16.000Z
Tests worked around this, but the incompatibility is not hard to fix. This fixes 17 token incompatibilities in tests here that were previously passing. ruby/prism@101962526d
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
@@ -457,7 +457,15 @@ def to_a
                 location = range(token.location.start_offset, token.location.start_offset + 1)
               end
 
-              quote_stack.pop
+              if percent_array?(quote_stack.pop)
+                prev_token = lexed[index - 2][0] if index - 2 >= 0
+                empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
+                ends_with_whitespace = prev_token&.type == :WORDS_SEP
+                # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
+                if !empty && !ends_with_whitespace
+                  tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
+                end
+              end
             when :tSYMBEG
               if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
                 next_location = token.location.join(next_token.location)
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
@@ -219,13 +219,6 @@ def assert_equal_tokens(expected_tokens, actual_tokens)
           expected_index += 1
           actual_index += 1
 
-          # The parser gem always has a space before a string end in list
-          # literals, but we don't. So we'll skip over the space.
-          if expected_token[0] == :tSPACE && actual_token[0] == :tSTRING_END
-            expected_index += 1
-            next
-          end
-
           # There are a lot of tokens that have very specific meaning according
           # to the context of the parser. We don't expose that information in
           # prism, so we need to normalize these tokens a bit.