From e31e94a77566533726a564eaa129f20b24b1dd07 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Sun, 12 Jan 2025 20:08:28 +0100 Subject: [PATCH] Fix parser translator when unescaping invalid utf8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. The string starts out as binary 2. `ち` is appended, forcing it back into utf-8 3. An attempt is made to append some invalid byte sequences > incompatible character encodings: UTF-8 and BINARY (ASCII-8BIT) This makes use of my wish to use `append_as_bytes`. Unfortunately, that method is rather new, so it needs a fallback --- lib/prism/polyfill/append_as_bytes.rb | 12 +++ lib/prism/translation/parser/lexer.rb | 16 ++-- prism.gemspec | 1 + test/prism/fixtures/strings.txt | 2 + test/prism/ruby/ruby_parser_test.rb | 1 + test/prism/snapshots/strings.txt | 120 ++++++++++++++------------ 6 files changed, 87 insertions(+), 65 deletions(-) create mode 100644 lib/prism/polyfill/append_as_bytes.rb diff --git a/lib/prism/polyfill/append_as_bytes.rb b/lib/prism/polyfill/append_as_bytes.rb new file mode 100644 index 0000000000..6f9b0819a0 --- /dev/null +++ b/lib/prism/polyfill/append_as_bytes.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +# Polyfill for String#append_as_bytes, which didn't exist until Ruby 3.4. 
+if !("".respond_to?(:append_as_bytes)) + String.include( + Module.new { + def append_as_bytes(*args) + args.each { self.<<(_1.b) } # steep:ignore + end + } + ) +end diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index 49fdd2aea8..b4478123e1 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require "strscan" +require_relative "../../polyfill/append_as_bytes" module Prism module Translation @@ -638,24 +639,23 @@ def unescape_string(string, quote) scanner = StringScanner.new(string) while (skipped = scanner.skip_until(/\\/)) # Append what was just skipped over, excluding the found backslash. - result << string.byteslice(scanner.pos - skipped, skipped - 1) + result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1)) # Simple single-character escape sequences like \n if (replacement = ESCAPES[scanner.peek(1)]) - result << replacement + result.append_as_bytes(replacement) scanner.pos += 1 elsif (octal = scanner.check(/[0-7]{1,3}/)) # \nnn - # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr) - result << octal.to_i(8).chr.b + result.append_as_bytes(octal.to_i(8).chr) scanner.pos += octal.bytesize elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/)) # \xnn - result << hex[1..].to_i(16).chr.b + result.append_as_bytes(hex[1..].to_i(16).chr) scanner.pos += hex.bytesize elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/)) # \unnnn - result << unicode[1..].hex.chr(Encoding::UTF_8).b + result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8)) scanner.pos += unicode.bytesize elsif scanner.peek(3) == "u{}" # https://github.com/whitequark/parser/issues/856 @@ -663,14 +663,14 @@ def unescape_string(string, quote) elsif (unicode_parts = scanner.check(/u{.*}/)) # \u{nnnn ...} unicode_parts[2..-2].split.each do |unicode| - result << unicode.hex.chr(Encoding::UTF_8).b + 
result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8)) end scanner.pos += unicode_parts.bytesize end end # Add remainging chars - result << string.byteslice(scanner.pos..) + result.append_as_bytes(string.byteslice(scanner.pos..)) result.force_encoding(source_buffer.source.encoding) diff --git a/prism.gemspec b/prism.gemspec index e17e092f4e..4af551b263 100644 --- a/prism.gemspec +++ b/prism.gemspec @@ -86,6 +86,7 @@ Gem::Specification.new do |spec| "lib/prism/parse_result/errors.rb", "lib/prism/parse_result/newlines.rb", "lib/prism/pattern.rb", + "lib/prism/polyfill/append_as_bytes.rb", "lib/prism/polyfill/byteindex.rb", "lib/prism/polyfill/unpack1.rb", "lib/prism/reflection.rb", diff --git a/test/prism/fixtures/strings.txt b/test/prism/fixtures/strings.txt index 83f38cb606..030f15a2c9 100644 --- a/test/prism/fixtures/strings.txt +++ b/test/prism/fixtures/strings.txt @@ -96,6 +96,8 @@ baz "\7 \43 \141" +"ち\xE3\x81\xFF" + %[abc] %(abc) diff --git a/test/prism/ruby/ruby_parser_test.rb b/test/prism/ruby/ruby_parser_test.rb index 1d530dd13b..fd1dbf1ac8 100644 --- a/test/prism/ruby/ruby_parser_test.rb +++ b/test/prism/ruby/ruby_parser_test.rb @@ -35,6 +35,7 @@ class RubyParserTest < TestCase "seattlerb/op_asgn_primary_colon_const_command_call.txt", "seattlerb/regexp_esc_C_slash.txt", "seattlerb/str_lit_concat_bad_encodings.txt", + "strings.txt", "unescaping.txt", "unparser/corpus/literal/kwbegin.txt", "unparser/corpus/literal/send.txt", diff --git a/test/prism/snapshots/strings.txt b/test/prism/snapshots/strings.txt index 917e60224f..0e281ba152 100644 --- a/test/prism/snapshots/strings.txt +++ b/test/prism/snapshots/strings.txt @@ -1,10 +1,10 @@ -@ ProgramNode (location: (1,0)-(117,15)) +@ ProgramNode (location: (1,0)-(119,15)) ├── flags: ∅ ├── locals: [] └── statements: - @ StatementsNode (location: (1,0)-(117,15)) + @ StatementsNode (location: (1,0)-(119,15)) ├── flags: ∅ - └── body: (length: 53) + └── body: (length: 54) ├── @ StringNode (location: (1,0)-(1,6)) │ 
├── flags: newline │ ├── opening_loc: (1,0)-(1,2) = "%%" @@ -493,109 +493,115 @@ │ ├── content_loc: (97,1)-(97,12) = "\\7 \\43 \\141" │ ├── closing_loc: (97,12)-(97,13) = "\"" │ └── unescaped: "\a # a" - ├── @ StringNode (location: (99,0)-(99,6)) - │ ├── flags: newline - │ ├── opening_loc: (99,0)-(99,2) = "%[" - │ ├── content_loc: (99,2)-(99,5) = "abc" - │ ├── closing_loc: (99,5)-(99,6) = "]" - │ └── unescaped: "abc" + ├── @ StringNode (location: (99,0)-(99,17)) + │ ├── flags: newline, forced_utf8_encoding + │ ├── opening_loc: (99,0)-(99,1) = "\"" + │ ├── content_loc: (99,1)-(99,16) = "ち\\xE3\\x81\\xFF" + │ ├── closing_loc: (99,16)-(99,17) = "\"" + │ └── unescaped: "ち\xE3\x81\xFF" ├── @ StringNode (location: (101,0)-(101,6)) │ ├── flags: newline - │ ├── opening_loc: (101,0)-(101,2) = "%(" + │ ├── opening_loc: (101,0)-(101,2) = "%[" │ ├── content_loc: (101,2)-(101,5) = "abc" - │ ├── closing_loc: (101,5)-(101,6) = ")" + │ ├── closing_loc: (101,5)-(101,6) = "]" │ └── unescaped: "abc" ├── @ StringNode (location: (103,0)-(103,6)) │ ├── flags: newline - │ ├── opening_loc: (103,0)-(103,2) = "%@" + │ ├── opening_loc: (103,0)-(103,2) = "%(" │ ├── content_loc: (103,2)-(103,5) = "abc" - │ ├── closing_loc: (103,5)-(103,6) = "@" + │ ├── closing_loc: (103,5)-(103,6) = ")" │ └── unescaped: "abc" ├── @ StringNode (location: (105,0)-(105,6)) │ ├── flags: newline - │ ├── opening_loc: (105,0)-(105,2) = "%$" + │ ├── opening_loc: (105,0)-(105,2) = "%@" │ ├── content_loc: (105,2)-(105,5) = "abc" - │ ├── closing_loc: (105,5)-(105,6) = "$" + │ ├── closing_loc: (105,5)-(105,6) = "@" + │ └── unescaped: "abc" + ├── @ StringNode (location: (107,0)-(107,6)) + │ ├── flags: newline + │ ├── opening_loc: (107,0)-(107,2) = "%$" + │ ├── content_loc: (107,2)-(107,5) = "abc" + │ ├── closing_loc: (107,5)-(107,6) = "$" │ └── unescaped: "abc" - ├── @ StringNode (location: (107,0)-(107,2)) + ├── @ StringNode (location: (109,0)-(109,2)) │ ├── flags: newline - │ ├── opening_loc: (107,0)-(107,1) = "?" 
- │ ├── content_loc: (107,1)-(107,2) = "a" + │ ├── opening_loc: (109,0)-(109,1) = "?" + │ ├── content_loc: (109,1)-(109,2) = "a" │ ├── closing_loc: ∅ │ └── unescaped: "a" - ├── @ InterpolatedStringNode (location: (109,0)-(109,6)) + ├── @ InterpolatedStringNode (location: (111,0)-(111,6)) │ ├── flags: newline, static_literal │ ├── opening_loc: ∅ │ ├── parts: (length: 2) - │ │ ├── @ StringNode (location: (109,0)-(109,2)) + │ │ ├── @ StringNode (location: (111,0)-(111,2)) │ │ │ ├── flags: static_literal, frozen - │ │ │ ├── opening_loc: (109,0)-(109,1) = "?" - │ │ │ ├── content_loc: (109,1)-(109,2) = "a" + │ │ │ ├── opening_loc: (111,0)-(111,1) = "?" + │ │ │ ├── content_loc: (111,1)-(111,2) = "a" │ │ │ ├── closing_loc: ∅ │ │ │ └── unescaped: "a" - │ │ └── @ StringNode (location: (109,3)-(109,6)) + │ │ └── @ StringNode (location: (111,3)-(111,6)) │ │ ├── flags: static_literal, frozen - │ │ ├── opening_loc: (109,3)-(109,4) = "\"" - │ │ ├── content_loc: (109,4)-(109,5) = "a" - │ │ ├── closing_loc: (109,5)-(109,6) = "\"" + │ │ ├── opening_loc: (111,3)-(111,4) = "\"" + │ │ ├── content_loc: (111,4)-(111,5) = "a" + │ │ ├── closing_loc: (111,5)-(111,6) = "\"" │ │ └── unescaped: "a" │ └── closing_loc: ∅ - ├── @ StringNode (location: (111,0)-(111,7)) + ├── @ StringNode (location: (113,0)-(113,7)) │ ├── flags: newline - │ ├── opening_loc: (111,0)-(111,3) = "%Q{" - │ ├── content_loc: (111,3)-(111,6) = "abc" - │ ├── closing_loc: (111,6)-(111,7) = "}" + │ ├── opening_loc: (113,0)-(113,3) = "%Q{" + │ ├── content_loc: (113,3)-(113,6) = "abc" + │ ├── closing_loc: (113,6)-(113,7) = "}" │ └── unescaped: "abc" - ├── @ StringNode (location: (113,0)-(113,5)) + ├── @ StringNode (location: (115,0)-(115,5)) │ ├── flags: newline - │ ├── opening_loc: (113,0)-(113,2) = "%^" - │ ├── content_loc: (113,2)-(113,4) = "\#$" - │ ├── closing_loc: (113,4)-(113,5) = "^" + │ ├── opening_loc: (115,0)-(115,2) = "%^" + │ ├── content_loc: (115,2)-(115,4) = "\#$" + │ ├── closing_loc: (115,4)-(115,5) = "^" │ └── 
unescaped: "\#$" - ├── @ StringNode (location: (115,0)-(115,4)) + ├── @ StringNode (location: (117,0)-(117,4)) │ ├── flags: newline - │ ├── opening_loc: (115,0)-(115,2) = "%@" - │ ├── content_loc: (115,2)-(115,3) = "#" - │ ├── closing_loc: (115,3)-(115,4) = "@" + │ ├── opening_loc: (117,0)-(117,2) = "%@" + │ ├── content_loc: (117,2)-(117,3) = "#" + │ ├── closing_loc: (117,3)-(117,4) = "@" │ └── unescaped: "#" - └── @ InterpolatedStringNode (location: (117,0)-(117,15)) + └── @ InterpolatedStringNode (location: (119,0)-(119,15)) ├── flags: newline - ├── opening_loc: (117,0)-(117,1) = "\"" + ├── opening_loc: (119,0)-(119,1) = "\"" ├── parts: (length: 2) - │ ├── @ EmbeddedStatementsNode (location: (117,1)-(117,12)) + │ ├── @ EmbeddedStatementsNode (location: (119,1)-(119,12)) │ │ ├── flags: ∅ - │ │ ├── opening_loc: (117,1)-(117,3) = "\#{" + │ │ ├── opening_loc: (119,1)-(119,3) = "\#{" │ │ ├── statements: - │ │ │ @ StatementsNode (location: (117,3)-(117,11)) + │ │ │ @ StatementsNode (location: (119,3)-(119,11)) │ │ │ ├── flags: ∅ │ │ │ └── body: (length: 1) - │ │ │ └── @ InterpolatedStringNode (location: (117,3)-(117,11)) + │ │ │ └── @ InterpolatedStringNode (location: (119,3)-(119,11)) │ │ │ ├── flags: ∅ - │ │ │ ├── opening_loc: (117,3)-(117,4) = "\"" + │ │ │ ├── opening_loc: (119,3)-(119,4) = "\"" │ │ │ ├── parts: (length: 2) - │ │ │ │ ├── @ EmbeddedStatementsNode (location: (117,4)-(117,8)) + │ │ │ │ ├── @ EmbeddedStatementsNode (location: (119,4)-(119,8)) │ │ │ │ │ ├── flags: ∅ - │ │ │ │ │ ├── opening_loc: (117,4)-(117,6) = "\#{" + │ │ │ │ │ ├── opening_loc: (119,4)-(119,6) = "\#{" │ │ │ │ │ ├── statements: - │ │ │ │ │ │ @ StatementsNode (location: (117,6)-(117,7)) + │ │ │ │ │ │ @ StatementsNode (location: (119,6)-(119,7)) │ │ │ │ │ │ ├── flags: ∅ │ │ │ │ │ │ └── body: (length: 1) - │ │ │ │ │ │ └── @ ConstantReadNode (location: (117,6)-(117,7)) + │ │ │ │ │ │ └── @ ConstantReadNode (location: (119,6)-(119,7)) │ │ │ │ │ │ ├── flags: ∅ │ │ │ │ │ │ └── name: :B - │ │ │ 
│ │ └── closing_loc: (117,7)-(117,8) = "}" - │ │ │ │ └── @ StringNode (location: (117,8)-(117,10)) + │ │ │ │ │ └── closing_loc: (119,7)-(119,8) = "}" + │ │ │ │ └── @ StringNode (location: (119,8)-(119,10)) │ │ │ │ ├── flags: static_literal, frozen │ │ │ │ ├── opening_loc: ∅ - │ │ │ │ ├── content_loc: (117,8)-(117,10) = " C" + │ │ │ │ ├── content_loc: (119,8)-(119,10) = " C" │ │ │ │ ├── closing_loc: ∅ │ │ │ │ └── unescaped: " C" - │ │ │ └── closing_loc: (117,10)-(117,11) = "\"" - │ │ └── closing_loc: (117,11)-(117,12) = "}" - │ └── @ StringNode (location: (117,12)-(117,14)) + │ │ │ └── closing_loc: (119,10)-(119,11) = "\"" + │ │ └── closing_loc: (119,11)-(119,12) = "}" + │ └── @ StringNode (location: (119,12)-(119,14)) │ ├── flags: static_literal, frozen │ ├── opening_loc: ∅ - │ ├── content_loc: (117,12)-(117,14) = " D" + │ ├── content_loc: (119,12)-(119,14) = " D" │ ├── closing_loc: ∅ │ └── unescaped: " D" - └── closing_loc: (117,14)-(117,15) = "\"" + └── closing_loc: (119,14)-(119,15) = "\""