Skip to content

Commit b151e09

Browse files
committed
Better escaping handling for all strings in the parser translator
This implements more of the escaping rules for the kinds of strings that can appear in `%i`/`%I`, `%w`/`%W`, and `%q`/`%Q` (and bare `%`) literals. It takes into account that, for the lowercase `%` notations and for single-quoted strings, the delimiter itself is allowed to be escaped. It also implements the special rule for symmetric delimiters such as `(`, where the closing `)` is allowed to be escaped as well. Regexps are left as is for now, since they have their own special rules.
1 parent a5dc98b commit b151e09

File tree

2 files changed

+56
-38
lines changed

2 files changed

+56
-38
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class Lexer
201201
]
202202

203203
# Heredocs are complex and require us to keep track of a bit of info to refer to later
204-
HeredocData = Struct.new(:identifier, :common_whitespace, :quote, keyword_init: true)
204+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
205205

206206
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
207207

@@ -234,6 +234,7 @@ def to_a
234234
length = lexed.length
235235

236236
heredoc_stack = Array.new
237+
quote_stack = Array.new
237238

238239
while index < length
239240
token, state = lexed[index]
@@ -312,22 +313,28 @@ def to_a
312313
value = ""
313314
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
314315
index += 1
315-
elsif basic_quotes && next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
316-
# the parser gem doesn't simplify strings when its value ends in a newline
317-
unless (string_value = next_token.value).end_with?("\n")
318-
next_location = token.location.join(next_next_token.location)
319-
value = unescape_string(string_value)
320-
type = :tSTRING
321-
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
322-
index += 2
316+
elsif value.start_with?("'", '"', "%")
317+
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
318+
# the parser gem doesn't simplify strings when its value ends in a newline
319+
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
320+
next_location = token.location.join(next_next_token.location)
321+
value = unescape_string(string_value, value)
322+
type = :tSTRING
323+
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
324+
index += 2
325+
tokens << [type, [value, location]]
326+
327+
next
328+
end
323329
end
330+
331+
quote_stack.push(value)
324332
elsif token.type == :HEREDOC_START
325333
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
326334
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
327335
heredoc = HeredocData.new(
328336
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
329337
common_whitespace: 0,
330-
quote: quote,
331338
)
332339

333340
if quote == "`"
@@ -347,20 +354,19 @@ def to_a
347354
end
348355

349356
heredoc_stack.push(heredoc)
357+
quote_stack.push(value)
350358
end
351359
when :tSTRING_CONTENT
352360
if (lines = token.value.lines).one?
353361
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
354362
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
355363
# The parser gem only removes indentation when the heredoc is not nested
356364
not_nested = heredoc_stack.size == 1
357-
current_heredoc = heredoc_stack.last
358-
if is_first_token_on_line && not_nested && current_heredoc.common_whitespace > 0
359-
value = trim_heredoc_whitespace(value, heredoc)
360-
end
361-
if current_heredoc
362-
value = unescape_heredoc(value, heredoc)
365+
if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
366+
value = trim_heredoc_whitespace(value, current_heredoc)
363367
end
368+
369+
value = unescape_string(value, quote_stack.last)
364370
else
365371
# When the parser gem encounters a line continuation inside of a multiline string,
366372
# it emits a single string node. The backslash (and remaining newline) is removed.
@@ -386,7 +392,7 @@ def to_a
386392

387393
if emit
388394
end_offset = start_offset + current_line.bytesize + adjustment
389-
tokens << [:tSTRING_CONTENT, [unescape_string(current_line), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
395+
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
390396
start_offset = end_offset
391397
current_line = +""
392398
adjustment = 0
@@ -405,6 +411,8 @@ def to_a
405411
value = value[0]
406412
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
407413
end
414+
415+
quote_stack.pop
408416
when :tSYMBEG
409417
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
410418
next_location = token.location.join(next_token.location)
@@ -413,6 +421,8 @@ def to_a
413421
value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
414422
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
415423
index += 1
424+
else
425+
quote_stack.push(value)
416426
end
417427
when :tFID
418428
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@@ -422,10 +432,15 @@ def to_a
422432
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
423433
type = :tBACK_REF2
424434
end
435+
quote_stack.push(value)
425436
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
426437
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
427438
index += 1
428439
end
440+
441+
quote_stack.push(value)
442+
when :tREGEXP_BEG
443+
quote_stack.push(value)
429444
end
430445

431446
tokens << [type, [value, location]]
@@ -541,11 +556,6 @@ def trim_heredoc_whitespace(string, heredoc)
541556
string[trimmed_characters..]
542557
end
543558

544-
# Naive string escaping handling. Should be closer to the "unescape_heredoc" method
545-
def unescape_string(string)
546-
string.gsub("\\\\", "\\")
547-
end
548-
549559
# Escape sequences that have special meaning and should appear unescaped in the resulting string.
550560
ESCAPES = {
551561
"a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
@@ -554,15 +564,33 @@ def unescape_string(string)
554564
}.freeze
555565
private_constant :ESCAPES
556566

567+
# When one of these delimiters is encountered, then the other
568+
# one is allowed to be escaped as well.
569+
DELIMITER_SYMETRY = { "[" => "\\\\]", "(" => ")", "{" => "}", "<" => ">" }.freeze
570+
private_constant :DELIMITER_SYMETRY
571+
557572
# TODO: Does not handle "\u1234" and other longer-form escapes.
558-
def unescape_heredoc(string, heredoc)
573+
def unescape_string(string, quote)
559574
# In single-quoted heredocs, everything is taken literally.
560-
return string if heredoc.quote == "'"
575+
return string if quote == "<<'"
576+
577+
# TODO: Implement regexp escaping
578+
return string if quote == "/" || quote.start_with?("%r")
579+
580+
if quote == "'" || quote.start_with?("%q") || quote.start_with?("%w") || quote.start_with?("%i")
581+
if quote == "'"
582+
delimiter = "'"
583+
else
584+
delimiter = quote[2]
585+
end
561586

562-
# When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
563-
# and "\\\\n" becomes "\\n". Unknown escapes sequences, like "\\o" simply become "o".
564-
string.gsub(/\\./) do |match|
565-
ESCAPES[match[1]] || match[1]
587+
string.gsub(/\\([\\#{delimiter}#{DELIMITER_SYMETRY[delimiter]}])/, '\1')
588+
else
589+
# When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
590+
# and "\\\\n" becomes "\\n". Unknown escapes sequences, like "\\o" simply become "o".
591+
string.gsub(/\\./) do |match|
592+
ESCAPES[match[1]] || match[1]
593+
end
566594
end
567595
end
568596
end

test/prism/ruby/parser_test.rb

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ class ParserTest < TestCase
9393
"methods.txt",
9494
"strings.txt",
9595
"tilde_heredocs.txt",
96-
"xstring_with_backslash.txt",
9796
"seattlerb/backticks_interpolation_line.txt",
9897
"seattlerb/bug169.txt",
9998
"seattlerb/case_in.txt",
@@ -103,28 +102,19 @@ class ParserTest < TestCase
103102
"seattlerb/difficult6__8.txt",
104103
"seattlerb/dsym_esc_to_sym.txt",
105104
"seattlerb/heredoc_unicode.txt",
106-
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
107-
"seattlerb/heredoc_with_carriage_return_escapes.txt",
108105
"seattlerb/module_comments.txt",
109106
"seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
110107
"seattlerb/parse_line_block_inline_comment.txt",
111108
"seattlerb/parse_line_block_inline_multiline_comment.txt",
112-
"seattlerb/parse_line_dstr_escaped_newline.txt",
113109
"seattlerb/parse_line_heredoc.txt",
114-
"seattlerb/parse_line_multiline_str_literal_n.txt",
115-
"seattlerb/parse_line_str_with_newline_escape.txt",
116110
"seattlerb/pct_w_heredoc_interp_nested.txt",
117-
"seattlerb/qw_escape_term.txt",
118111
"seattlerb/read_escape_unicode_curlies.txt",
119112
"seattlerb/read_escape_unicode_h4.txt",
120113
"seattlerb/required_kwarg_no_value.txt",
121114
"seattlerb/slashy_newlines_within_string.txt",
122-
"seattlerb/str_double_escaped_newline.txt",
123115
"seattlerb/str_evstr_escape.txt",
124-
"seattlerb/str_newline_hash_line_number.txt",
125116
"seattlerb/TestRubyParserShared.txt",
126117
"unparser/corpus/literal/assignment.txt",
127-
"unparser/corpus/semantic/opasgn.txt",
128118
"whitequark/args.txt",
129119
"whitequark/beginless_erange_after_newline.txt",
130120
"whitequark/beginless_irange_after_newline.txt",

0 commit comments

Comments
 (0)