Fix parser translator when unescaping invalid utf8

Earlopain · Earlopain · commit 9999ef2a016a · 2025-01-12T16:08:53.000+01:00
1. The string starts out as binary
2. `ち` is appended, forcing it back into utf-8
3. An attempt is made to append some invalid byte sequences, which raises:

> incompatible character encodings: UTF-8 and BINARY (ASCII-8BIT)

This fulfils my wish to use `append_as_bytes`. Unfortunately, that method is rather new,
so it needs a fallback.
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
@@ -638,39 +638,38 @@ def unescape_string(string, quote)
             scanner = StringScanner.new(string)
             while (skipped = scanner.skip_until(/\\/))
               # Append what was just skipped over, excluding the found backslash.
-              result << string.byteslice(scanner.pos - skipped, skipped - 1)
+              append_as_bytes(result, string.byteslice(scanner.pos - skipped, skipped - 1))
 
               # Simple single-character escape sequences like \n
               if (replacement = ESCAPES[scanner.peek(1)])
-                result << replacement
+                append_as_bytes(result, replacement)
                 scanner.pos += 1
               elsif (octal = scanner.check(/[0-7]{1,3}/))
                 # \nnn
-                # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr)
-                result << octal.to_i(8).chr.b
+                append_as_bytes(result, octal.to_i(8).chr)
                 scanner.pos += octal.bytesize
               elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
                 # \xnn
-                result << hex[1..].to_i(16).chr.b
+                append_as_bytes(result, hex[1..].to_i(16).chr)
                 scanner.pos += hex.bytesize
               elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
                 # \unnnn
-                result << unicode[1..].hex.chr(Encoding::UTF_8).b
+                append_as_bytes(result, unicode[1..].hex.chr(Encoding::UTF_8))
                 scanner.pos += unicode.bytesize
               elsif scanner.peek(3) == "u{}"
                 # https://github.com/whitequark/parser/issues/856
                 scanner.pos += 3
               elsif (unicode_parts = scanner.check(/u{.*}/))
                 # \u{nnnn ...}
                 unicode_parts[2..-2].split.each do |unicode|
-                  result << unicode.hex.chr(Encoding::UTF_8).b
+                  append_as_bytes(result, unicode.hex.chr(Encoding::UTF_8))
                 end
                 scanner.pos += unicode_parts.bytesize
               end
             end
 
             # Add remainging chars
-            result << string.byteslice(scanner.pos..)
+            append_as_bytes(result, string.byteslice(scanner.pos..))
 
             result.force_encoding(source_buffer.source.encoding)
 
@@ -687,6 +686,18 @@ def unescape_string(string, quote)
           end
         end
 
+        if RUBY_VERSION >= "3.4"
+          # Append some string without modifying the encoding of the receiver.
+          def append_as_bytes(receiver, input)
+            receiver.append_as_bytes(input)
+          end
+        else
+          # Not as efficient as the one above since every append dups the input.
+          def append_as_bytes(receiver, input)
+            receiver << input.b
+          end
+        end
+
         # Determine if characters preceeded by a backslash should be escaped or not
         def interpolation?(quote)
           quote != "'" && !quote.start_with?("%q", "%w", "%i")
diff --git a/test/prism/fixtures/strings.txt b/test/prism/fixtures/strings.txt
@@ -96,6 +96,8 @@ baz
 
 "\7 \43 \141"
 
+"ち\xE3\x81\xFF"
+
 %[abc]
 
 %(abc)
diff --git a/test/prism/ruby/ruby_parser_test.rb b/test/prism/ruby/ruby_parser_test.rb
@@ -35,6 +35,7 @@ class RubyParserTest < TestCase
       "seattlerb/op_asgn_primary_colon_const_command_call.txt",
       "seattlerb/regexp_esc_C_slash.txt",
       "seattlerb/str_lit_concat_bad_encodings.txt",
+      "strings.txt",
       "unescaping.txt",
       "unparser/corpus/literal/kwbegin.txt",
       "unparser/corpus/literal/send.txt",
diff --git a/test/prism/snapshots/strings.txt b/test/prism/snapshots/strings.txt