Skip to content

Commit e31e94a

Browse files
committed
Fix parser translator when unescaping invalid utf8
1. The string starts out as binary. 2. `ち` is appended, forcing it back into UTF-8. 3. An attempt is then made to append some invalid byte sequences, which raises: "incompatible character encodings: UTF-8 and BINARY (ASCII-8BIT)". This fulfils my wish to use `append_as_bytes`. Unfortunately, that method is rather new, so it needs a fallback.
1 parent 01cbec9 commit e31e94a

File tree

6 files changed

+87
-65
lines changed

6 files changed

+87
-65
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# Polyfill for String#append_as_bytes, which didn't exist until Ruby 3.4.
if !("".respond_to?(:append_as_bytes))
  String.include(
    Module.new {
      # Appends the raw bytes of every argument to the receiver without any
      # encoding validation or conversion, mirroring String#append_as_bytes
      # from Ruby 3.4.
      #
      # Each argument is either a String, whose bytes are appended verbatim,
      # or an Integer, of which only the lowest byte is appended. The
      # receiver keeps its original encoding, even if the resulting byte
      # sequence is invalid in that encoding.
      #
      # Returns the receiver.
      def append_as_bytes(*args)
        original_encoding = encoding

        # Treat the receiver as binary while appending. Otherwise String#<<
        # would either raise Encoding::CompatibilityError (non-ASCII receiver
        # plus non-ASCII binary argument) or silently change the receiver's
        # encoding (ASCII-only receiver plus non-ASCII binary argument).
        force_encoding(Encoding::BINARY)
        begin
          args.each do |arg|
            bytes = arg.is_a?(Integer) ? (arg & 0xFF).chr(Encoding::BINARY) : arg.b
            self.<<(bytes) # steep:ignore
          end
        ensure
          # Restore the encoding even if an argument could not be appended.
          force_encoding(original_encoding)
        end

        self
      end
    }
  )
end

lib/prism/translation/parser/lexer.rb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# frozen_string_literal: true
22

33
require "strscan"
4+
require_relative "../../polyfill/append_as_bytes"
45

56
module Prism
67
module Translation
@@ -638,39 +639,38 @@ def unescape_string(string, quote)
638639
scanner = StringScanner.new(string)
639640
while (skipped = scanner.skip_until(/\\/))
640641
# Append what was just skipped over, excluding the found backslash.
641-
result << string.byteslice(scanner.pos - skipped, skipped - 1)
642+
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
642643

643644
# Simple single-character escape sequences like \n
644645
if (replacement = ESCAPES[scanner.peek(1)])
645-
result << replacement
646+
result.append_as_bytes(replacement)
646647
scanner.pos += 1
647648
elsif (octal = scanner.check(/[0-7]{1,3}/))
648649
# \nnn
649-
# NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr)
650-
result << octal.to_i(8).chr.b
650+
result.append_as_bytes(octal.to_i(8).chr)
651651
scanner.pos += octal.bytesize
652652
elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
653653
# \xnn
654-
result << hex[1..].to_i(16).chr.b
654+
result.append_as_bytes(hex[1..].to_i(16).chr)
655655
scanner.pos += hex.bytesize
656656
elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
657657
# \unnnn
658-
result << unicode[1..].hex.chr(Encoding::UTF_8).b
658+
result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8))
659659
scanner.pos += unicode.bytesize
660660
elsif scanner.peek(3) == "u{}"
661661
# https://github.com/whitequark/parser/issues/856
662662
scanner.pos += 3
663663
elsif (unicode_parts = scanner.check(/u{.*}/))
664664
# \u{nnnn ...}
665665
unicode_parts[2..-2].split.each do |unicode|
666-
result << unicode.hex.chr(Encoding::UTF_8).b
666+
result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
667667
end
668668
scanner.pos += unicode_parts.bytesize
669669
end
670670
end
671671

672672
# Add remaining chars
673-
result << string.byteslice(scanner.pos..)
673+
result.append_as_bytes(string.byteslice(scanner.pos..))
674674

675675
result.force_encoding(source_buffer.source.encoding)
676676

prism.gemspec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ Gem::Specification.new do |spec|
8686
"lib/prism/parse_result/errors.rb",
8787
"lib/prism/parse_result/newlines.rb",
8888
"lib/prism/pattern.rb",
89+
"lib/prism/polyfill/append_as_bytes.rb",
8990
"lib/prism/polyfill/byteindex.rb",
9091
"lib/prism/polyfill/unpack1.rb",
9192
"lib/prism/reflection.rb",

test/prism/fixtures/strings.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ baz
9696

9797
"\7 \43 \141"
9898

99+
"ち\xE3\x81\xFF"
100+
99101
%[abc]
100102

101103
%(abc)

test/prism/ruby/ruby_parser_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class RubyParserTest < TestCase
3535
"seattlerb/op_asgn_primary_colon_const_command_call.txt",
3636
"seattlerb/regexp_esc_C_slash.txt",
3737
"seattlerb/str_lit_concat_bad_encodings.txt",
38+
"strings.txt",
3839
"unescaping.txt",
3940
"unparser/corpus/literal/kwbegin.txt",
4041
"unparser/corpus/literal/send.txt",

test/prism/snapshots/strings.txt

Lines changed: 63 additions & 57 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)