@@ -201,7 +201,7 @@ class Lexer
201201 ]
202202
203203 # Heredocs are complex and require us to keep track of a bit of info to refer to later
204- HeredocData = Struct . new ( :identifier , :common_whitespace , :quote , keyword_init : true )
204+ HeredocData = Struct . new ( :identifier , :common_whitespace , keyword_init : true )
205205
206206 private_constant :TYPES , :EXPR_BEG , :EXPR_LABEL , :LAMBDA_TOKEN_TYPES , :LPAREN_CONVERSION_TOKEN_TYPES , :HeredocData
207207
@@ -234,6 +234,7 @@ def to_a
234234 length = lexed . length
235235
236236 heredoc_stack = Array . new
237+ quote_stack = Array . new
237238
238239 while index < length
239240 token , state = lexed [ index ]
@@ -312,22 +313,28 @@ def to_a
312313 value = ""
313314 location = Range . new ( source_buffer , offset_cache [ next_location . start_offset ] , offset_cache [ next_location . end_offset ] )
314315 index += 1
315- elsif basic_quotes && next_token &.type == :STRING_CONTENT && next_token . value . lines . count <= 1 && next_next_token &.type == :STRING_END
316- # the parser gem doesn't simplify strings when its value ends in a newline
317- unless ( string_value = next_token . value ) . end_with? ( "\n " )
318- next_location = token . location . join ( next_next_token . location )
319- value = unescape_string ( string_value )
320- type = :tSTRING
321- location = Range . new ( source_buffer , offset_cache [ next_location . start_offset ] , offset_cache [ next_location . end_offset ] )
322- index += 2
316+ elsif value . start_with? ( "'" , '"' , "%" )
317+ if next_token &.type == :STRING_CONTENT && next_token . value . lines . count <= 1 && next_next_token &.type == :STRING_END
318+ # the parser gem doesn't simplify strings when its value ends in a newline
319+ if !( string_value = next_token . value ) . end_with? ( "\n " ) && basic_quotes
320+ next_location = token . location . join ( next_next_token . location )
321+ value = unescape_string ( string_value , value )
322+ type = :tSTRING
323+ location = Range . new ( source_buffer , offset_cache [ next_location . start_offset ] , offset_cache [ next_location . end_offset ] )
324+ index += 2
325+ tokens << [ type , [ value , location ] ]
326+
327+ next
328+ end
323329 end
330+
331+ quote_stack . push ( value )
324332 elsif token . type == :HEREDOC_START
325333 quote = value [ 2 ] == "-" || value [ 2 ] == "~" ? value [ 3 ] : value [ 2 ]
326334 heredoc_type = value [ 2 ] == "-" || value [ 2 ] == "~" ? value [ 2 ] : ""
327335 heredoc = HeredocData . new (
328336 identifier : value . match ( /<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z / ) [ :heredoc_identifier ] ,
329337 common_whitespace : 0 ,
330- quote : quote ,
331338 )
332339
333340 if quote == "`"
@@ -347,20 +354,19 @@ def to_a
347354 end
348355
349356 heredoc_stack . push ( heredoc )
357+ quote_stack . push ( value )
350358 end
351359 when :tSTRING_CONTENT
352360 if ( lines = token . value . lines ) . one?
353361 # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
354362 is_first_token_on_line = lexed [ index - 1 ] && token . location . start_line != lexed [ index - 2 ] [ 0 ] . location &.start_line
355363 # The parser gem only removes indentation when the heredoc is not nested
356364 not_nested = heredoc_stack . size == 1
357- current_heredoc = heredoc_stack . last
358- if is_first_token_on_line && not_nested && current_heredoc . common_whitespace > 0
359- value = trim_heredoc_whitespace ( value , heredoc )
360- end
361- if current_heredoc
362- value = unescape_heredoc ( value , heredoc )
365+ if is_first_token_on_line && not_nested && ( current_heredoc = heredoc_stack . last ) . common_whitespace > 0
366+ value = trim_heredoc_whitespace ( value , current_heredoc )
363367 end
368+
369+ value = unescape_string ( value , quote_stack . last )
364370 else
365371 # When the parser gem encounters a line continuation inside of a multiline string,
366372 # it emits a single string node. The backslash (and remaining newline) is removed.
@@ -386,7 +392,7 @@ def to_a
386392
387393 if emit
388394 end_offset = start_offset + current_line . bytesize + adjustment
389- tokens << [ :tSTRING_CONTENT , [ unescape_string ( current_line ) , Range . new ( source_buffer , offset_cache [ start_offset ] , offset_cache [ end_offset ] ) ] ]
395+ tokens << [ :tSTRING_CONTENT , [ unescape_string ( current_line , quote_stack . last ) , Range . new ( source_buffer , offset_cache [ start_offset ] , offset_cache [ end_offset ] ) ] ]
390396 start_offset = end_offset
391397 current_line = +""
392398 adjustment = 0
@@ -405,6 +411,8 @@ def to_a
405411 value = value [ 0 ]
406412 location = Range . new ( source_buffer , offset_cache [ token . location . start_offset ] , offset_cache [ token . location . start_offset + 1 ] )
407413 end
414+
415+ quote_stack . pop
408416 when :tSYMBEG
409417 if ( next_token = lexed [ index ] [ 0 ] ) && next_token . type != :STRING_CONTENT && next_token . type != :EMBEXPR_BEGIN && next_token . type != :EMBVAR && next_token . type != :STRING_END
410418 next_location = token . location . join ( next_token . location )
@@ -413,6 +421,8 @@ def to_a
413421 value = { "~@" => "~" , "!@" => "!" } . fetch ( value , value )
414422 location = Range . new ( source_buffer , offset_cache [ next_location . start_offset ] , offset_cache [ next_location . end_offset ] )
415423 index += 1
424+ else
425+ quote_stack . push ( value )
416426 end
417427 when :tFID
418428 if !tokens . empty? && tokens . dig ( -1 , 0 ) == :kDEF
@@ -422,10 +432,15 @@ def to_a
422432 if ( next_token = lexed [ index ] [ 0 ] ) && next_token . type != :STRING_CONTENT && next_token . type != :STRING_END
423433 type = :tBACK_REF2
424434 end
435+ quote_stack . push ( value )
425436 when :tSYMBOLS_BEG , :tQSYMBOLS_BEG , :tWORDS_BEG , :tQWORDS_BEG
426437 if ( next_token = lexed [ index ] [ 0 ] ) && next_token . type == :WORDS_SEP
427438 index += 1
428439 end
440+
441+ quote_stack . push ( value )
442+ when :tREGEXP_BEG
443+ quote_stack . push ( value )
429444 end
430445
431446 tokens << [ type , [ value , location ] ]
@@ -541,11 +556,6 @@ def trim_heredoc_whitespace(string, heredoc)
541556 string [ trimmed_characters ..]
542557 end
543558
544- # Naive string escaping handling. Should be closer to the "unescape_heredoc" method
545- def unescape_string ( string )
546- string . gsub ( "\\ \\ " , "\\ " )
547- end
548-
549559 # Escape sequences that have special meaning and should appear unescaped in the resulting string.
550560 ESCAPES = {
551561 "a" => "\a " , "b" => "\b " , "e" => "\e " , "f" => "\f " ,
@@ -554,15 +564,33 @@ def unescape_string(string)
554564 } . freeze
555565 private_constant :ESCAPES
556566
567+ # When a literal is opened with one of these delimiters, its matching
568+ # closing delimiter is allowed to be escaped as well.
569+ DELIMITER_SYMETRY = { "[" => "\\ \\ ]" , "(" => ")" , "{" => "}" , "<" => ">" } . freeze
570+ private_constant :DELIMITER_SYMETRY
571+
# Removes escape sequences from the content of a string-like literal. Which
# rules apply is decided by +quote+, the opening token of the enclosing literal.
#
# TODO: Does not handle "\u1234" and other longer-form escapes.
#
# string - the raw source text of the string content.
# quote  - the opening token of the enclosing literal (for example "'", "%q[",
#          "/" or "<<'"), or nil when no enclosing literal is being tracked.
#
# Returns the unescaped string; the input is returned as-is when nothing applies.
def unescape_string(string, quote)
  # Without a known enclosing literal there are no escaping rules to pick from,
  # so pass the content through instead of raising on nil.
  return string if quote.nil?

  # In single-quoted heredocs, everything is taken literally.
  return string if quote == "<<'"

  # TODO: Implement regexp escaping
  return string if quote == "/" || quote.start_with?("%r")

  # Every rule below only rewrites backslash sequences.
  return string unless string.include?("\\")

  # NOTE(review): other non-interpolating openers (e.g. single-quoted symbols)
  # are not listed here and take the double-quoted branch - confirm intended.
  if quote == "'" || quote.start_with?("%q", "%w", "%i")
    delimiter = quote == "'" ? "'" : quote[2]

    # Only the backslash itself and the literal's own delimiters can be escaped.
    # The delimiters are regexp-escaped because characters such as "[" or "]"
    # would otherwise change the meaning of the character class they are put in.
    delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
    string.gsub(/\\([\\#{delimiters}])/, '\1')
  else
    # When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
    # and "\\\\n" becomes "\\n". Unknown escape sequences, like "\\o" simply become "o".
    string.gsub(/\\./) { |match| ESCAPES.fetch(match[1], match[1]) }
  end
end
568596 end
0 commit comments