diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index 1839c494bb..49fdd2aea8 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require "strscan" + module Prism module Translation class Parser @@ -265,6 +267,8 @@ def to_a end when :tCHARACTER value.delete_prefix!("?") + # Character literals behave similar to double-quoted strings. We can use the same escaping mechanism. + value = unescape_string(value, "?") when :tCOMMENT if token.type == :EMBDOC_BEGIN @@ -606,7 +610,7 @@ def trim_heredoc_whitespace(string, heredoc) ESCAPES = { "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f", "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t", - "v" => "\v", "\\\\" => "\\" + "v" => "\v", "\\" => "\\" }.freeze private_constant :ESCAPES @@ -615,7 +619,7 @@ def trim_heredoc_whitespace(string, heredoc) DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze private_constant :DELIMITER_SYMETRY - # TODO: Does not handle "\u1234" and other longer-form escapes. + # Apply Ruby string escaping rules def unescape_string(string, quote) # In single-quoted heredocs, everything is taken literally. return string if quote == "<<'" @@ -623,12 +627,54 @@ def unescape_string(string, quote) # TODO: Implement regexp escaping return string if quote == "/" || quote.start_with?("%r") + # OPTIMIZATION: Assume that few strings need escaping to speed up the common case. + return string unless string.include?("\\") + if interpolation?(quote) - # In interpolation, escape sequences can be written literally. For example, "\\n" becomes "\n", - # and "\\\\n" becomes "\\n". Unknown escapes sequences, like "\\o" simply become "o". - string.gsub(/\\./) do |match| - ESCAPES[match[1]] || match[1] + # Appending individual escape sequences may force the string out of its intended + # encoding. Start out with binary and force it back later. + result = "".b + + scanner = StringScanner.new(string) + while (skipped = scanner.skip_until(/\\/)) + # Append what was just skipped over, excluding the found backslash. + result << string.byteslice(scanner.pos - skipped, skipped - 1) + + # Simple single-character escape sequences like \n + if (replacement = ESCAPES[scanner.peek(1)]) + result << replacement + scanner.pos += 1 + elsif (octal = scanner.check(/[0-7]{1,3}/)) + # \nnn + # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr) + result << octal.to_i(8).chr.b + scanner.pos += octal.bytesize + elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/)) + # \xnn + result << hex[1..].to_i(16).chr.b + scanner.pos += hex.bytesize + elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/)) + # \unnnn + result << unicode[1..].hex.chr(Encoding::UTF_8).b + scanner.pos += unicode.bytesize + elsif scanner.peek(3) == "u{}" + # https://github.com/whitequark/parser/issues/856 + scanner.pos += 3 + elsif (unicode_parts = scanner.check(/u{.*}/)) + # \u{nnnn ...} + unicode_parts[2..-2].split.each do |unicode| + result << unicode.hex.chr(Encoding::UTF_8).b + end + scanner.pos += unicode_parts.bytesize + end end + + # Add remainging chars + result << string.byteslice(scanner.pos..) + + result.force_encoding(source_buffer.source.encoding) + + result else if quote == "'" delimiter = "'" diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index 7816b7b57a..4ba38bd0c0 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -93,21 +93,16 @@ class ParserTest < TestCase "seattlerb/difficult4__leading_dots2.txt", "seattlerb/difficult6__7.txt", "seattlerb/difficult6__8.txt", - "seattlerb/dsym_esc_to_sym.txt", "seattlerb/heredoc_unicode.txt", "seattlerb/parse_line_heredoc.txt", "seattlerb/pct_w_heredoc_interp_nested.txt", - "seattlerb/read_escape_unicode_curlies.txt", - "seattlerb/read_escape_unicode_h4.txt", "seattlerb/required_kwarg_no_value.txt", "seattlerb/slashy_newlines_within_string.txt", - "seattlerb/str_evstr_escape.txt", "seattlerb/TestRubyParserShared.txt", "unparser/corpus/literal/assignment.txt", "whitequark/args.txt", "whitequark/beginless_erange_after_newline.txt", "whitequark/beginless_irange_after_newline.txt", - "whitequark/bug_ascii_8bit_in_literal.txt", "whitequark/forward_arg_with_open_args.txt", "whitequark/kwarg_no_paren.txt", "whitequark/lbrace_arg_after_command_args.txt",