Fix parser translator when unescaping invalid utf8

Earlopain · Earlopain · commit e31e94a77566 · 2025-01-12T20:08:28.000+01:00
1. The string starts out as binary
2. `ち` is appended, forcing it back into utf-8
3. An attempt is then made to append some invalid byte sequences, which raises:

> incompatible character encodings: UTF-8 and BINARY (ASCII-8BIT)

This fulfills my earlier wish to use `append_as_bytes`. Unfortunately, that method is rather new
(Ruby 3.4), so it needs a fallback for older Rubies.
diff --git a/lib/prism/polyfill/append_as_bytes.rb b/lib/prism/polyfill/append_as_bytes.rb
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+# Polyfill for String#append_as_bytes, which didn't exist until Ruby 3.4.
+if !("".respond_to?(:append_as_bytes))
+  String.include(
+    Module.new {
+      # Appends the raw bytes of every argument to the receiver, skipping
+      # encoding validation and conversion, and returns the receiver (as the
+      # native method does). Integer arguments contribute their low byte.
+      def append_as_bytes(*args)
+        original_encoding = encoding # steep:ignore
+        # Switch to binary while appending so that bytes which are invalid in
+        # the receiver's encoding cannot raise Encoding::CompatibilityError.
+        force_encoding(Encoding::BINARY) # steep:ignore
+        begin
+          args.each do |arg|
+            bytes = arg.is_a?(Integer) ? (arg & 0xFF).chr : arg.b
+            self.<<(bytes) # steep:ignore
+          end
+        ensure
+          force_encoding(original_encoding) # steep:ignore
+        end
+        self
+      end
+    }
+  )
+end
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require "strscan"
+require_relative "../../polyfill/append_as_bytes"
 
 module Prism
   module Translation
@@ -638,39 +639,38 @@ def unescape_string(string, quote)
             scanner = StringScanner.new(string)
             while (skipped = scanner.skip_until(/\\/))
               # Append what was just skipped over, excluding the found backslash.
-              result << string.byteslice(scanner.pos - skipped, skipped - 1)
+              result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
 
               # Simple single-character escape sequences like \n
               if (replacement = ESCAPES[scanner.peek(1)])
-                result << replacement
+                result.append_as_bytes(replacement)
                 scanner.pos += 1
               elsif (octal = scanner.check(/[0-7]{1,3}/))
                 # \nnn
-                # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr)
-                result << octal.to_i(8).chr.b
+                result.append_as_bytes(octal.to_i(8).chr)
                 scanner.pos += octal.bytesize
               elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
                 # \xnn
-                result << hex[1..].to_i(16).chr.b
+                result.append_as_bytes(hex[1..].to_i(16).chr)
                 scanner.pos += hex.bytesize
               elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
                 # \unnnn
-                result << unicode[1..].hex.chr(Encoding::UTF_8).b
+                result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8))
                 scanner.pos += unicode.bytesize
               elsif scanner.peek(3) == "u{}"
                 # https://github.com/whitequark/parser/issues/856
                 scanner.pos += 3
               elsif (unicode_parts = scanner.check(/u{.*}/))
                 # \u{nnnn ...}
                 unicode_parts[2..-2].split.each do |unicode|
-                  result << unicode.hex.chr(Encoding::UTF_8).b
+                  result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
                 end
                 scanner.pos += unicode_parts.bytesize
               end
             end
 
             # Add remainging chars
-            result << string.byteslice(scanner.pos..)
+            result.append_as_bytes(string.byteslice(scanner.pos..))
 
             result.force_encoding(source_buffer.source.encoding)
 
diff --git a/prism.gemspec b/prism.gemspec
@@ -86,6 +86,7 @@ Gem::Specification.new do |spec|
     "lib/prism/parse_result/errors.rb",
     "lib/prism/parse_result/newlines.rb",
     "lib/prism/pattern.rb",
+    "lib/prism/polyfill/append_as_bytes.rb",
     "lib/prism/polyfill/byteindex.rb",
     "lib/prism/polyfill/unpack1.rb",
     "lib/prism/reflection.rb",
diff --git a/test/prism/fixtures/strings.txt b/test/prism/fixtures/strings.txt
@@ -96,6 +96,8 @@ baz
 
 "\7 \43 \141"
 
+"ち\xE3\x81\xFF"
+
 %[abc]
 
 %(abc)
diff --git a/test/prism/ruby/ruby_parser_test.rb b/test/prism/ruby/ruby_parser_test.rb
@@ -35,6 +35,7 @@ class RubyParserTest < TestCase
       "seattlerb/op_asgn_primary_colon_const_command_call.txt",
       "seattlerb/regexp_esc_C_slash.txt",
       "seattlerb/str_lit_concat_bad_encodings.txt",
+      "strings.txt",
       "unescaping.txt",
       "unparser/corpus/literal/kwbegin.txt",
       "unparser/corpus/literal/send.txt",
diff --git a/test/prism/snapshots/strings.txt b/test/prism/snapshots/strings.txt