Skip to content

Commit 110461c

Browse files
Earlopain and kddnewton
authored and committed
[ruby/prism] Implement more string token escaping in the parser translator
This leaves `\c` and `\M` escaping but I don't understand how these should even work yet. Maybe later. ruby/prism@13db3e8cb9
1 parent 81079eb commit 110461c

File tree

2 files changed

+154
-50
lines changed

2 files changed

+154
-50
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# frozen_string_literal: true
22

3+
require "strscan"
4+
35
module Prism
46
module Translation
57
class Parser
@@ -251,6 +253,8 @@ def to_a
251253
end
252254
when :tCHARACTER
253255
value.delete_prefix!("?")
256+
# Character literals behave similarly to double-quoted strings, so we can use the same escaping mechanism.
257+
value = unescape_string(value, "?")
254258
when :tCOMMENT
255259
if token.type == :EMBDOC_BEGIN
256260
start_index = index
@@ -432,6 +436,156 @@ def parse_rational(value)
432436
rescue ArgumentError
433437
0r
434438
end
439+
440+
# Wonky heredoc tab/spaces rules.
441+
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
442+
# Scans the +lexed+ token stream from +heredoc_token_index+ up to the matching
# :HEREDOC_END and returns the smallest indentation width of any line that
# starts with string content. Tabs advance to the next multiple of 8.
#
# Returns Float::MAX when no line contributes an indentation.
def calculate_heredoc_whitespace(heredoc_token_index)
  next_token_index = heredoc_token_index
  nesting_level = 0
  previous_line = -1
  result = Float::MAX

  while (entry = lexed[next_token_index]) && (next_token = entry[0])
    next_token_index += 1
    next_entry = lexed[next_token_index]
    next_next_token = next_entry && next_entry[0]

    case next_token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # String content inside nested heredocs and interpolation is ignored
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      next unless nesting_level == 0

      # Remember the line of every top-level content token, not only of those
      # that lowered the result. Otherwise content following interpolation on
      # an already-seen line would be mistaken for the start of a new line.
      current_line = next_token.location.start_line
      is_first_token_on_line = current_line != previous_line
      previous_line = current_line
      next unless is_first_token_on_line

      leading_whitespace = next_token.value[/^\s*/]
      common_whitespace = 0
      leading_whitespace.each_char do |char|
        if char == "\t"
          common_whitespace = (common_whitespace / 8 + 1) * 8
        else
          common_whitespace += 1
        end
      end

      # Blank lines do not take part in the calculation. Compare character
      # counts (not the tab-expanded width) so tab-only lines are blank too.
      # Whitespace is significant if followed by interpolation on the same line.
      whitespace_only = leading_whitespace.length == next_token.value.length &&
        next_next_token&.location&.start_line != current_line

      result = common_whitespace if !whitespace_only && common_whitespace < result
    end
  end

  result
end
480+
481+
# Wonky heredoc tab/spaces rules.
482+
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
483+
# Removes leading spaces and tabs from +string+ until a width of
# +heredoc.common_whitespace+ columns has been consumed, and returns the rest.
#
# A space is one column wide; a tab advances to the next multiple of 8. A tab
# that would overshoot the requested width is left in place.
def trim_heredoc_whitespace(string, heredoc)
  limit = heredoc.common_whitespace
  trimmed_whitespace = 0
  trimmed_characters = 0

  string.each_char do |char|
    break if trimmed_whitespace >= limit

    case char
    when "\t"
      next_tab_stop = (trimmed_whitespace / 8 + 1) * 8
      break if next_tab_stop > limit
      trimmed_whitespace = next_tab_stop
    when " "
      trimmed_whitespace += 1
    else
      break
    end

    trimmed_characters += 1
  end

  string[trimmed_characters..]
end
498+
499+
# Escape sequences that have special meaning and should appear unescaped in the resulting string.
500+
# Keys are the character that follows the backslash in the source; values are
# the character the escape sequence stands for.
ESCAPES = {
  "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
  "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
  "v" => "\v", "\\" => "\\"
}.freeze
private_constant :ESCAPES
506+
507+
# When one of these delimiters is encountered, then the other
508+
# one is allowed to be escaped as well.
509+
# Maps an opening bracket delimiter to its closing counterpart.
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
private_constant :DELIMITER_SYMETRY
511+
512+
# Apply Ruby string escaping rules
513+
# Resolves backslash escape sequences in +string+, the raw content of a
# literal whose opening delimiter is +quote+ (e.g. <tt>"</tt>, <tt>'</tt>,
# <tt>?</tt>, <tt>%q(</tt>, <tt><<'</tt>).
#
# Returns +string+ itself when nothing has to change, otherwise a new string.
# For interpolating literals the result is built as binary and then tagged
# with the encoding of the source buffer.
#
# Not handled yet: regexp literals are returned verbatim, and for escapes
# without a dedicated branch (such as <tt>\c</tt> and <tt>\M</tt>) only the
# backslash is dropped.
def unescape_string(string, quote)
  # In single-quoted heredocs, everything is taken literally.
  return string if quote == "<<'"

  # TODO: Implement regexp escaping
  return string if quote == "/" || quote.start_with?("%r")

  # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
  return string unless string.include?("\\")

  if interpolation?(quote)
    # Appending individual escape sequences may force the string out of its intended
    # encoding. Start out with binary and force it back later.
    result = "".b

    scanner = StringScanner.new(string)
    while (skipped = scanner.skip_until(/\\/))
      # Append what was just skipped over, excluding the found backslash.
      result << string.byteslice(scanner.pos - skipped, skipped - 1)

      if (replacement = ESCAPES[scanner.peek(1)])
        # Simple single-character escape sequences like \n
        result << replacement
        scanner.pos += 1
      elsif (octal = scanner.check(/[0-7]{1,3}/))
        # \nnn
        # Only the low byte is kept, so values above \377 wrap around
        # instead of raising RangeError from Integer#chr.
        # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr)
        result << (octal.to_i(8) & 0xFF).chr.b
        scanner.pos += octal.bytesize
      elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
        # \xnn
        result << hex[1..].to_i(16).chr.b
        scanner.pos += hex.bytesize
      elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
        # \unnnn
        result << unicode[1..].hex.chr(Encoding::UTF_8).b
        scanner.pos += unicode.bytesize
      elsif scanner.peek(3) == "u{}"
        # https://github.com/whitequark/parser/issues/856
        scanner.pos += 3
      elsif (unicode_parts = scanner.check(/u\{[^}]*\}/))
        # \u{nnnn ...}
        # Stop at the first closing brace so that a later "}" on the same
        # line is not swallowed into the escape sequence.
        unicode_parts[2..-2].split.each do |codepoint|
          result << codepoint.hex.chr(Encoding::UTF_8).b
        end
        scanner.pos += unicode_parts.bytesize
      end
      # Any other escape: the backslash has been dropped and the following
      # characters are copied over unchanged by the next append.
    end

    # Add remaining chars
    result << string.byteslice(scanner.pos..)

    result.force_encoding(source_buffer.source.encoding)

    result
  else
    # Non-interpolating literals only unescape the backslash itself and the
    # literal's own delimiter(s).
    delimiter = quote == "'" ? "'" : quote[2]

    delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
    string.gsub(/\\([\\#{delimiters}])/, '\1')
  end
end
579+
580+
# Determine if characters preceded by a backslash should be escaped or not
581+
# Returns true when +quote+ opens a literal that processes the full set of
# backslash escapes. Returns false for the forms that take their content
# mostly literally: <tt>'</tt>, <tt>%q</tt>, <tt>%w</tt>, <tt>%i</tt> and
# <tt>%s</tt> (the non-interpolating symbol literal).
def interpolation?(quote)
  quote != "'" && !quote.start_with?("%q", "%w", "%i", "%s")
end
584+
585+
# Determine if the string is part of a %-style array.
586+
# Returns true when +quote+ starts with one of the %-array openers
# (<tt>%w</tt>, <tt>%W</tt>, <tt>%i</tt>, <tt>%I</tt>).
def percent_array?(quote)
  quote.start_with?("%w", "%W", "%i", "%I")
end
435589
end
436590
end
437591
end

test/prism/ruby/parser_test.rb

Lines changed: 0 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -81,78 +81,28 @@ class ParserTest < TestCase
8181
# These files are failing to translate their lexer output into the lexer
8282
# output expected by the parser gem, so we'll skip them for now.
8383
skip_tokens = [
84-
"comments.txt",
8584
"dash_heredocs.txt",
86-
"dos_endings.txt",
8785
"embdoc_no_newline_at_end.txt",
88-
"heredoc_with_comment.txt",
8986
"heredocs_with_ignored_newlines.txt",
90-
"indented_file_end.txt",
9187
"methods.txt",
9288
"strings.txt",
9389
"tilde_heredocs.txt",
94-
"xstring_with_backslash.txt",
9590
"seattlerb/backticks_interpolation_line.txt",
9691
"seattlerb/bug169.txt",
9792
"seattlerb/case_in.txt",
98-
"seattlerb/class_comments.txt",
9993
"seattlerb/difficult4__leading_dots2.txt",
10094
"seattlerb/difficult6__7.txt",
10195
"seattlerb/difficult6__8.txt",
102-
"seattlerb/dsym_esc_to_sym.txt",
103-
"seattlerb/heredoc__backslash_dos_format.txt",
104-
"seattlerb/heredoc_backslash_nl.txt",
105-
"seattlerb/heredoc_comma_arg.txt",
106-
"seattlerb/heredoc_squiggly_blank_line_plus_interpolation.txt",
107-
"seattlerb/heredoc_squiggly_blank_lines.txt",
108-
"seattlerb/heredoc_squiggly_interp.txt",
109-
"seattlerb/heredoc_squiggly_tabs_extra.txt",
110-
"seattlerb/heredoc_squiggly_tabs.txt",
111-
"seattlerb/heredoc_squiggly_visually_blank_lines.txt",
112-
"seattlerb/heredoc_squiggly.txt",
11396
"seattlerb/heredoc_unicode.txt",
114-
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
115-
"seattlerb/heredoc_with_carriage_return_escapes.txt",
116-
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt",
117-
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt",
118-
"seattlerb/interpolated_symbol_array_line_breaks.txt",
119-
"seattlerb/interpolated_word_array_line_breaks.txt",
120-
"seattlerb/label_vs_string.txt",
121-
"seattlerb/module_comments.txt",
122-
"seattlerb/non_interpolated_symbol_array_line_breaks.txt",
123-
"seattlerb/non_interpolated_word_array_line_breaks.txt",
124-
"seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
125-
"seattlerb/parse_line_block_inline_comment.txt",
126-
"seattlerb/parse_line_block_inline_multiline_comment.txt",
127-
"seattlerb/parse_line_dstr_escaped_newline.txt",
12897
"seattlerb/parse_line_heredoc.txt",
129-
"seattlerb/parse_line_multiline_str_literal_n.txt",
130-
"seattlerb/parse_line_str_with_newline_escape.txt",
13198
"seattlerb/pct_w_heredoc_interp_nested.txt",
132-
"seattlerb/qsymbols_empty_space.txt",
133-
"seattlerb/qw_escape_term.txt",
134-
"seattlerb/qWords_space.txt",
135-
"seattlerb/read_escape_unicode_curlies.txt",
136-
"seattlerb/read_escape_unicode_h4.txt",
13799
"seattlerb/required_kwarg_no_value.txt",
138100
"seattlerb/slashy_newlines_within_string.txt",
139-
"seattlerb/str_double_escaped_newline.txt",
140-
"seattlerb/str_double_newline.txt",
141-
"seattlerb/str_evstr_escape.txt",
142-
"seattlerb/str_newline_hash_line_number.txt",
143-
"seattlerb/str_single_newline.txt",
144-
"seattlerb/symbols_empty_space.txt",
145101
"seattlerb/TestRubyParserShared.txt",
146102
"unparser/corpus/literal/assignment.txt",
147-
"unparser/corpus/literal/dstr.txt",
148-
"unparser/corpus/semantic/opasgn.txt",
149103
"whitequark/args.txt",
150104
"whitequark/beginless_erange_after_newline.txt",
151105
"whitequark/beginless_irange_after_newline.txt",
152-
"whitequark/bug_ascii_8bit_in_literal.txt",
153-
"whitequark/bug_def_no_paren_eql_begin.txt",
154-
"whitequark/dedenting_heredoc.txt",
155-
"whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt",
156106
"whitequark/forward_arg_with_open_args.txt",
157107
"whitequark/kwarg_no_paren.txt",
158108
"whitequark/lbrace_arg_after_command_args.txt",

0 commit comments

Comments
 (0)