Fix binary encoding for parser translator

Earlopain · Earlopain · commit 69cb8b9593e4 · 2025-01-10T21:35:26.000+01:00
Skipping encoding detection is almost always right; only for binary sources should it actually happen.

A symbol containing escapes that are invalid
in UTF-8 would fail to parse, since symbols must be valid in the script encoding.
Additionally, the parser gem would raise an exception somewhere during string handling.
diff --git a/lib/prism/translation/parser.rb b/lib/prism/translation/parser.rb
@@ -51,7 +51,7 @@ def parse(source_buffer)
         source = source_buffer.source
 
         offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
+        result = unwrap(Prism.parse(source, **prism_options), offset_cache)
 
         build_ast(result.value, offset_cache)
       ensure
@@ -64,7 +64,7 @@ def parse_with_comments(source_buffer)
         source = source_buffer.source
 
         offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
+        result = unwrap(Prism.parse(source, **prism_options), offset_cache)
 
         [
           build_ast(result.value, offset_cache),
@@ -83,7 +83,7 @@ def tokenize(source_buffer, recover = false)
         offset_cache = build_offset_cache(source)
         result =
           begin
-            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
+            unwrap(Prism.parse_lex(source, **prism_options), offset_cache)
           rescue ::Parser::SyntaxError
             raise if !recover
           end
@@ -285,6 +285,20 @@ def build_range(location, offset_cache)
         )
       end
 
+      # Builds the keyword options passed to Prism.parse and Prism.parse_lex for the current source buffer.
+      def prism_options
+        options = {
+          filepath: @source_buffer.name,
+          version: convert_for_prism(version),
+          partial_script: true,
+        }
+        # The parser gem always encodes to UTF-8, unless it is binary, so only binary sources keep encoding detection.
+        # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107
+        options[:encoding] = false if @source_buffer.source.encoding != Encoding::BINARY
+
+        options
+      end
+
       # Converts the version format handled by Parser to the format handled by Prism.
       def convert_for_prism(version)
         case version
diff --git a/test/prism/fixtures/encoding_binary.txt b/test/prism/fixtures/encoding_binary.txt
@@ -0,0 +1,9 @@
+# encoding: binary
+
+"\xcd"
+
+:"\xcd"
+
+/#{"\xcd"}/
+
+%W[\xC0]
diff --git a/test/prism/fixtures/encoding_euc_jp.txt b/test/prism/fixtures/encoding_euc_jp.txt
@@ -0,0 +1,6 @@
+# encoding: euc-jp
+
+# \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp
+"\x8E\x01"
+
+%W["\x8E\x01"]
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
@@ -97,6 +97,8 @@ class ParserTest < TestCase
       "dash_heredocs.txt",
       "dos_endings.txt",
       "embdoc_no_newline_at_end.txt",
+      "encoding_binary.txt",
+      "encoding_euc_jp.txt",
       "heredoc_with_comment.txt",
       "heredocs_with_ignored_newlines.txt",
       "indented_file_end.txt",
diff --git a/test/prism/ruby/ruby_parser_test.rb b/test/prism/ruby/ruby_parser_test.rb
@@ -26,6 +26,7 @@ def ==(other)
 module Prism
   class RubyParserTest < TestCase
     todos = [
+      "encoding_euc_jp.txt",
       "newline_terminated.txt",
       "regex_char_width.txt",
       "seattlerb/bug169.txt",
diff --git a/test/prism/snapshots/encoding_binary.txt b/test/prism/snapshots/encoding_binary.txt
diff --git a/test/prism/snapshots/encoding_euc_jp.txt b/test/prism/snapshots/encoding_euc_jp.txt
diff --git a/test/prism/snippets_test.rb b/test/prism/snippets_test.rb
@@ -5,6 +5,7 @@
 module Prism
   class SnippetsTest < TestCase
     except = [
+      "encoding_binary.txt",
       "newline_terminated.txt",
       "seattlerb/begin_rescue_else_ensure_no_bodies.txt",
       "seattlerb/case_in.txt",