From e31e94a77566533726a564eaa129f20b24b1dd07 Mon Sep 17 00:00:00 2001
From: Earlopain <14981592+Earlopain@users.noreply.github.com>
Date: Sun, 12 Jan 2025 20:08:28 +0100
Subject: [PATCH] Fix parser translator when unescaping invalid utf8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. The string starts out as binary
2. `ち` is appended, forcing it back into utf-8
3. Some invalid byte sequences are then appended, which raises:

> incompatible character encodings: UTF-8 and BINARY (ASCII-8BIT)

This follows through on my wish to use `append_as_bytes`. Unfortunately, that method is rather new
(it was added in Ruby 3.4), so it needs a fallback.
---
 lib/prism/polyfill/append_as_bytes.rb |  12 +++
 lib/prism/translation/parser/lexer.rb |  16 ++--
 prism.gemspec                         |   1 +
 test/prism/fixtures/strings.txt       |   2 +
 test/prism/ruby/ruby_parser_test.rb   |   1 +
 test/prism/snapshots/strings.txt      | 120 ++++++++++++++------------
 6 files changed, 87 insertions(+), 65 deletions(-)
 create mode 100644 lib/prism/polyfill/append_as_bytes.rb

diff --git a/lib/prism/polyfill/append_as_bytes.rb b/lib/prism/polyfill/append_as_bytes.rb
new file mode 100644
index 0000000000..6f9b0819a0
--- /dev/null
+++ b/lib/prism/polyfill/append_as_bytes.rb
@@ -0,0 +1,12 @@
+# frozen_string_literal: true
+
+# Polyfill for String#append_as_bytes, which didn't exist until Ruby 3.4.
+if !("".respond_to?(:append_as_bytes))
+  String.include(
+    Module.new {
+      def append_as_bytes(*args)
+        args.each { self.<<(_1.b) } # steep:ignore
+      end
+    }
+  )
+end
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index 49fdd2aea8..b4478123e1 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require "strscan"
+require_relative "../../polyfill/append_as_bytes"
 
 module Prism
   module Translation
@@ -638,24 +639,23 @@ def unescape_string(string, quote)
             scanner = StringScanner.new(string)
             while (skipped = scanner.skip_until(/\\/))
               # Append what was just skipped over, excluding the found backslash.
-              result << string.byteslice(scanner.pos - skipped, skipped - 1)
+              result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
 
               # Simple single-character escape sequences like \n
               if (replacement = ESCAPES[scanner.peek(1)])
-                result << replacement
+                result.append_as_bytes(replacement)
                 scanner.pos += 1
               elsif (octal = scanner.check(/[0-7]{1,3}/))
                 # \nnn
-                # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr)
-                result << octal.to_i(8).chr.b
+                result.append_as_bytes(octal.to_i(8).chr)
                 scanner.pos += octal.bytesize
               elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
                 # \xnn
-                result << hex[1..].to_i(16).chr.b
+                result.append_as_bytes(hex[1..].to_i(16).chr)
                 scanner.pos += hex.bytesize
               elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
                 # \unnnn
-                result << unicode[1..].hex.chr(Encoding::UTF_8).b
+                result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8))
                 scanner.pos += unicode.bytesize
               elsif scanner.peek(3) == "u{}"
                 # https://github.com/whitequark/parser/issues/856
@@ -663,14 +663,14 @@ def unescape_string(string, quote)
               elsif (unicode_parts = scanner.check(/u{.*}/))
                 # \u{nnnn ...}
                 unicode_parts[2..-2].split.each do |unicode|
-                  result << unicode.hex.chr(Encoding::UTF_8).b
+                  result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
                 end
                 scanner.pos += unicode_parts.bytesize
               end
             end
 
             # Add remainging chars
-            result << string.byteslice(scanner.pos..)
+            result.append_as_bytes(string.byteslice(scanner.pos..))
 
             result.force_encoding(source_buffer.source.encoding)
 
diff --git a/prism.gemspec b/prism.gemspec
index e17e092f4e..4af551b263 100644
--- a/prism.gemspec
+++ b/prism.gemspec
@@ -86,6 +86,7 @@ Gem::Specification.new do |spec|
     "lib/prism/parse_result/errors.rb",
     "lib/prism/parse_result/newlines.rb",
     "lib/prism/pattern.rb",
+    "lib/prism/polyfill/append_as_bytes.rb",
     "lib/prism/polyfill/byteindex.rb",
     "lib/prism/polyfill/unpack1.rb",
     "lib/prism/reflection.rb",
diff --git a/test/prism/fixtures/strings.txt b/test/prism/fixtures/strings.txt
index 83f38cb606..030f15a2c9 100644
--- a/test/prism/fixtures/strings.txt
+++ b/test/prism/fixtures/strings.txt
@@ -96,6 +96,8 @@ baz
 
 "\7 \43 \141"
 
+"ち\xE3\x81\xFF"
+
 %[abc]
 
 %(abc)
diff --git a/test/prism/ruby/ruby_parser_test.rb b/test/prism/ruby/ruby_parser_test.rb
index 1d530dd13b..fd1dbf1ac8 100644
--- a/test/prism/ruby/ruby_parser_test.rb
+++ b/test/prism/ruby/ruby_parser_test.rb
@@ -35,6 +35,7 @@ class RubyParserTest < TestCase
       "seattlerb/op_asgn_primary_colon_const_command_call.txt",
       "seattlerb/regexp_esc_C_slash.txt",
       "seattlerb/str_lit_concat_bad_encodings.txt",
+      "strings.txt",
       "unescaping.txt",
       "unparser/corpus/literal/kwbegin.txt",
       "unparser/corpus/literal/send.txt",
diff --git a/test/prism/snapshots/strings.txt b/test/prism/snapshots/strings.txt
index 917e60224f..0e281ba152 100644
--- a/test/prism/snapshots/strings.txt
+++ b/test/prism/snapshots/strings.txt
@@ -1,10 +1,10 @@
-@ ProgramNode (location: (1,0)-(117,15))
+@ ProgramNode (location: (1,0)-(119,15))
 ├── flags: ∅
 ├── locals: []
 └── statements:
-    @ StatementsNode (location: (1,0)-(117,15))
+    @ StatementsNode (location: (1,0)-(119,15))
     ├── flags: ∅
-    └── body: (length: 53)
+    └── body: (length: 54)
         ├── @ StringNode (location: (1,0)-(1,6))
         │   ├── flags: newline
         │   ├── opening_loc: (1,0)-(1,2) = "%%"
@@ -493,109 +493,115 @@
         │   ├── content_loc: (97,1)-(97,12) = "\\7 \\43 \\141"
         │   ├── closing_loc: (97,12)-(97,13) = "\""
         │   └── unescaped: "\a # a"
-        ├── @ StringNode (location: (99,0)-(99,6))
-        │   ├── flags: newline
-        │   ├── opening_loc: (99,0)-(99,2) = "%["
-        │   ├── content_loc: (99,2)-(99,5) = "abc"
-        │   ├── closing_loc: (99,5)-(99,6) = "]"
-        │   └── unescaped: "abc"
+        ├── @ StringNode (location: (99,0)-(99,17))
+        │   ├── flags: newline, forced_utf8_encoding
+        │   ├── opening_loc: (99,0)-(99,1) = "\""
+        │   ├── content_loc: (99,1)-(99,16) = "ち\\xE3\\x81\\xFF"
+        │   ├── closing_loc: (99,16)-(99,17) = "\""
+        │   └── unescaped: "ち\xE3\x81\xFF"
         ├── @ StringNode (location: (101,0)-(101,6))
         │   ├── flags: newline
-        │   ├── opening_loc: (101,0)-(101,2) = "%("
+        │   ├── opening_loc: (101,0)-(101,2) = "%["
         │   ├── content_loc: (101,2)-(101,5) = "abc"
-        │   ├── closing_loc: (101,5)-(101,6) = ")"
+        │   ├── closing_loc: (101,5)-(101,6) = "]"
         │   └── unescaped: "abc"
         ├── @ StringNode (location: (103,0)-(103,6))
         │   ├── flags: newline
-        │   ├── opening_loc: (103,0)-(103,2) = "%@"
+        │   ├── opening_loc: (103,0)-(103,2) = "%("
         │   ├── content_loc: (103,2)-(103,5) = "abc"
-        │   ├── closing_loc: (103,5)-(103,6) = "@"
+        │   ├── closing_loc: (103,5)-(103,6) = ")"
         │   └── unescaped: "abc"
         ├── @ StringNode (location: (105,0)-(105,6))
         │   ├── flags: newline
-        │   ├── opening_loc: (105,0)-(105,2) = "%$"
+        │   ├── opening_loc: (105,0)-(105,2) = "%@"
         │   ├── content_loc: (105,2)-(105,5) = "abc"
-        │   ├── closing_loc: (105,5)-(105,6) = "$"
+        │   ├── closing_loc: (105,5)-(105,6) = "@"
+        │   └── unescaped: "abc"
+        ├── @ StringNode (location: (107,0)-(107,6))
+        │   ├── flags: newline
+        │   ├── opening_loc: (107,0)-(107,2) = "%$"
+        │   ├── content_loc: (107,2)-(107,5) = "abc"
+        │   ├── closing_loc: (107,5)-(107,6) = "$"
         │   └── unescaped: "abc"
-        ├── @ StringNode (location: (107,0)-(107,2))
+        ├── @ StringNode (location: (109,0)-(109,2))
         │   ├── flags: newline
-        │   ├── opening_loc: (107,0)-(107,1) = "?"
-        │   ├── content_loc: (107,1)-(107,2) = "a"
+        │   ├── opening_loc: (109,0)-(109,1) = "?"
+        │   ├── content_loc: (109,1)-(109,2) = "a"
         │   ├── closing_loc: ∅
         │   └── unescaped: "a"
-        ├── @ InterpolatedStringNode (location: (109,0)-(109,6))
+        ├── @ InterpolatedStringNode (location: (111,0)-(111,6))
         │   ├── flags: newline, static_literal
         │   ├── opening_loc: ∅
         │   ├── parts: (length: 2)
-        │   │   ├── @ StringNode (location: (109,0)-(109,2))
+        │   │   ├── @ StringNode (location: (111,0)-(111,2))
         │   │   │   ├── flags: static_literal, frozen
-        │   │   │   ├── opening_loc: (109,0)-(109,1) = "?"
-        │   │   │   ├── content_loc: (109,1)-(109,2) = "a"
+        │   │   │   ├── opening_loc: (111,0)-(111,1) = "?"
+        │   │   │   ├── content_loc: (111,1)-(111,2) = "a"
         │   │   │   ├── closing_loc: ∅
         │   │   │   └── unescaped: "a"
-        │   │   └── @ StringNode (location: (109,3)-(109,6))
+        │   │   └── @ StringNode (location: (111,3)-(111,6))
         │   │       ├── flags: static_literal, frozen
-        │   │       ├── opening_loc: (109,3)-(109,4) = "\""
-        │   │       ├── content_loc: (109,4)-(109,5) = "a"
-        │   │       ├── closing_loc: (109,5)-(109,6) = "\""
+        │   │       ├── opening_loc: (111,3)-(111,4) = "\""
+        │   │       ├── content_loc: (111,4)-(111,5) = "a"
+        │   │       ├── closing_loc: (111,5)-(111,6) = "\""
         │   │       └── unescaped: "a"
         │   └── closing_loc: ∅
-        ├── @ StringNode (location: (111,0)-(111,7))
+        ├── @ StringNode (location: (113,0)-(113,7))
         │   ├── flags: newline
-        │   ├── opening_loc: (111,0)-(111,3) = "%Q{"
-        │   ├── content_loc: (111,3)-(111,6) = "abc"
-        │   ├── closing_loc: (111,6)-(111,7) = "}"
+        │   ├── opening_loc: (113,0)-(113,3) = "%Q{"
+        │   ├── content_loc: (113,3)-(113,6) = "abc"
+        │   ├── closing_loc: (113,6)-(113,7) = "}"
         │   └── unescaped: "abc"
-        ├── @ StringNode (location: (113,0)-(113,5))
+        ├── @ StringNode (location: (115,0)-(115,5))
         │   ├── flags: newline
-        │   ├── opening_loc: (113,0)-(113,2) = "%^"
-        │   ├── content_loc: (113,2)-(113,4) = "\#$"
-        │   ├── closing_loc: (113,4)-(113,5) = "^"
+        │   ├── opening_loc: (115,0)-(115,2) = "%^"
+        │   ├── content_loc: (115,2)-(115,4) = "\#$"
+        │   ├── closing_loc: (115,4)-(115,5) = "^"
         │   └── unescaped: "\#$"
-        ├── @ StringNode (location: (115,0)-(115,4))
+        ├── @ StringNode (location: (117,0)-(117,4))
         │   ├── flags: newline
-        │   ├── opening_loc: (115,0)-(115,2) = "%@"
-        │   ├── content_loc: (115,2)-(115,3) = "#"
-        │   ├── closing_loc: (115,3)-(115,4) = "@"
+        │   ├── opening_loc: (117,0)-(117,2) = "%@"
+        │   ├── content_loc: (117,2)-(117,3) = "#"
+        │   ├── closing_loc: (117,3)-(117,4) = "@"
         │   └── unescaped: "#"
-        └── @ InterpolatedStringNode (location: (117,0)-(117,15))
+        └── @ InterpolatedStringNode (location: (119,0)-(119,15))
             ├── flags: newline
-            ├── opening_loc: (117,0)-(117,1) = "\""
+            ├── opening_loc: (119,0)-(119,1) = "\""
             ├── parts: (length: 2)
-            │   ├── @ EmbeddedStatementsNode (location: (117,1)-(117,12))
+            │   ├── @ EmbeddedStatementsNode (location: (119,1)-(119,12))
             │   │   ├── flags: ∅
-            │   │   ├── opening_loc: (117,1)-(117,3) = "\#{"
+            │   │   ├── opening_loc: (119,1)-(119,3) = "\#{"
             │   │   ├── statements:
-            │   │   │   @ StatementsNode (location: (117,3)-(117,11))
+            │   │   │   @ StatementsNode (location: (119,3)-(119,11))
             │   │   │   ├── flags: ∅
             │   │   │   └── body: (length: 1)
-            │   │   │       └── @ InterpolatedStringNode (location: (117,3)-(117,11))
+            │   │   │       └── @ InterpolatedStringNode (location: (119,3)-(119,11))
             │   │   │           ├── flags: ∅
-            │   │   │           ├── opening_loc: (117,3)-(117,4) = "\""
+            │   │   │           ├── opening_loc: (119,3)-(119,4) = "\""
             │   │   │           ├── parts: (length: 2)
-            │   │   │           │   ├── @ EmbeddedStatementsNode (location: (117,4)-(117,8))
+            │   │   │           │   ├── @ EmbeddedStatementsNode (location: (119,4)-(119,8))
             │   │   │           │   │   ├── flags: ∅
-            │   │   │           │   │   ├── opening_loc: (117,4)-(117,6) = "\#{"
+            │   │   │           │   │   ├── opening_loc: (119,4)-(119,6) = "\#{"
             │   │   │           │   │   ├── statements:
-            │   │   │           │   │   │   @ StatementsNode (location: (117,6)-(117,7))
+            │   │   │           │   │   │   @ StatementsNode (location: (119,6)-(119,7))
             │   │   │           │   │   │   ├── flags: ∅
             │   │   │           │   │   │   └── body: (length: 1)
-            │   │   │           │   │   │       └── @ ConstantReadNode (location: (117,6)-(117,7))
+            │   │   │           │   │   │       └── @ ConstantReadNode (location: (119,6)-(119,7))
             │   │   │           │   │   │           ├── flags: ∅
             │   │   │           │   │   │           └── name: :B
-            │   │   │           │   │   └── closing_loc: (117,7)-(117,8) = "}"
-            │   │   │           │   └── @ StringNode (location: (117,8)-(117,10))
+            │   │   │           │   │   └── closing_loc: (119,7)-(119,8) = "}"
+            │   │   │           │   └── @ StringNode (location: (119,8)-(119,10))
             │   │   │           │       ├── flags: static_literal, frozen
             │   │   │           │       ├── opening_loc: ∅
-            │   │   │           │       ├── content_loc: (117,8)-(117,10) = " C"
+            │   │   │           │       ├── content_loc: (119,8)-(119,10) = " C"
             │   │   │           │       ├── closing_loc: ∅
             │   │   │           │       └── unescaped: " C"
-            │   │   │           └── closing_loc: (117,10)-(117,11) = "\""
-            │   │   └── closing_loc: (117,11)-(117,12) = "}"
-            │   └── @ StringNode (location: (117,12)-(117,14))
+            │   │   │           └── closing_loc: (119,10)-(119,11) = "\""
+            │   │   └── closing_loc: (119,11)-(119,12) = "}"
+            │   └── @ StringNode (location: (119,12)-(119,14))
             │       ├── flags: static_literal, frozen
             │       ├── opening_loc: ∅
-            │       ├── content_loc: (117,12)-(117,14) = " D"
+            │       ├── content_loc: (119,12)-(119,14) = " D"
             │       ├── closing_loc: ∅
             │       └── unescaped: " D"
-            └── closing_loc: (117,14)-(117,15) = "\""
+            └── closing_loc: (119,14)-(119,15) = "\""