Skip to content

Commit 9999ef2

Browse files
committed
Fix parser translator when unescaping invalid utf8
1. The string starts out as binary. 2. `ち` is appended, forcing it back into UTF-8. 3. Appending some invalid byte sequences is then attempted, which fails with: > incompatible character encodings: UTF-8 and BINARY (ASCII-8BIT) This makes use of my wish to use `append_as_bytes`. Unfortunately, that method is rather new, so it needs a fallback.
1 parent 01cbec9 commit 9999ef2

File tree

4 files changed

+85
-65
lines changed

4 files changed

+85
-65
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -638,39 +638,38 @@ def unescape_string(string, quote)
638638
scanner = StringScanner.new(string)
639639
while (skipped = scanner.skip_until(/\\/))
640640
# Append what was just skipped over, excluding the found backslash.
641-
result << string.byteslice(scanner.pos - skipped, skipped - 1)
641+
append_as_bytes(result, string.byteslice(scanner.pos - skipped, skipped - 1))
642642

643643
# Simple single-character escape sequences like \n
644644
if (replacement = ESCAPES[scanner.peek(1)])
645-
result << replacement
645+
append_as_bytes(result, replacement)
646646
scanner.pos += 1
647647
elsif (octal = scanner.check(/[0-7]{1,3}/))
648648
# \nnn
649-
# NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr)
650-
result << octal.to_i(8).chr.b
649+
append_as_bytes(result, octal.to_i(8).chr)
651650
scanner.pos += octal.bytesize
652651
elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
653652
# \xnn
654-
result << hex[1..].to_i(16).chr.b
653+
append_as_bytes(result, hex[1..].to_i(16).chr)
655654
scanner.pos += hex.bytesize
656655
elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
657656
# \unnnn
658-
result << unicode[1..].hex.chr(Encoding::UTF_8).b
657+
append_as_bytes(result, unicode[1..].hex.chr(Encoding::UTF_8))
659658
scanner.pos += unicode.bytesize
660659
elsif scanner.peek(3) == "u{}"
661660
# https://github.com/whitequark/parser/issues/856
662661
scanner.pos += 3
663662
elsif (unicode_parts = scanner.check(/u{.*}/))
664663
# \u{nnnn ...}
665664
unicode_parts[2..-2].split.each do |unicode|
666-
result << unicode.hex.chr(Encoding::UTF_8).b
665+
append_as_bytes(result, unicode.hex.chr(Encoding::UTF_8))
667666
end
668667
scanner.pos += unicode_parts.bytesize
669668
end
670669
end
671670

672671
# Add remainging chars
673-
result << string.byteslice(scanner.pos..)
672+
append_as_bytes(result, string.byteslice(scanner.pos..))
674673

675674
result.force_encoding(source_buffer.source.encoding)
676675

@@ -687,6 +686,18 @@ def unescape_string(string, quote)
687686
end
688687
end
689688

689+
# Pick the implementation by feature detection instead of comparing
# RUBY_VERSION: version strings compare lexicographically (a hypothetical
# "3.10" would sort below "3.4"), and an alternative Ruby implementation may
# report a given version without providing String#append_as_bytes.
if String.method_defined?(:append_as_bytes)
  # Append the raw bytes of +input+ to +receiver+ without modifying the
  # encoding of the receiver. Returns +receiver+.
  def append_as_bytes(receiver, input)
    receiver.append_as_bytes(input)
  end
else
  # Fallback for Rubies without String#append_as_bytes. Returns +receiver+.
  # Not as efficient as the one above since every append dups the input
  # (String#b returns a binary-encoded copy).
  # NOTE(review): this relies on +receiver+ being binary (or ASCII-only) so
  # that appending a binary string cannot raise an encoding compatibility
  # error - confirm against callers.
  def append_as_bytes(receiver, input)
    receiver << input.b
  end
end
700+
690701
# Determine if characters preceded by a backslash should be escaped or not
691702
def interpolation?(quote)
692703
quote != "'" && !quote.start_with?("%q", "%w", "%i")

test/prism/fixtures/strings.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ baz
9696

9797
"\7 \43 \141"
9898

99+
"ち\xE3\x81\xFF"
100+
99101
%[abc]
100102

101103
%(abc)

test/prism/ruby/ruby_parser_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class RubyParserTest < TestCase
3535
"seattlerb/op_asgn_primary_colon_const_command_call.txt",
3636
"seattlerb/regexp_esc_C_slash.txt",
3737
"seattlerb/str_lit_concat_bad_encodings.txt",
38+
"strings.txt",
3839
"unescaping.txt",
3940
"unparser/corpus/literal/kwbegin.txt",
4041
"unparser/corpus/literal/send.txt",

test/prism/snapshots/strings.txt

Lines changed: 63 additions & 57 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)