@@ -199,10 +199,18 @@ class Lexer
         # The following token types are listed as those classified as `tLPAREN`.
         LPAREN_CONVERSION_TOKEN_TYPES = [
           :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
-          :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
+          :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
         ]
 
-        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
+        # Types of tokens that are allowed to continue a method call with comments in-between.
+        # For these, the parser gem doesn't emit a newline token after the last comment.
+        COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT]
+        private_constant :COMMENT_CONTINUATION_TYPES
+
+        # Heredocs are complex and require us to keep track of a bit of info to refer to later
+        HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
+
+        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
 
         # The Parser::Source::Buffer that the tokens were lexed from.
         attr_reader :source_buffer
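Note: the HeredocData struct added above tracks each heredoc's identifier plus, for squiggly heredocs, the common leading whitespace that the parser gem strips. A minimal standalone sketch of that "common whitespace" notion, assuming the usual <<~ semantics (this is not the translator's calculate_heredoc_whitespace helper):

    # Smallest indentation across non-blank body lines; <<~ removes it from every line.
    body_lines = ["    foo\n", "      bar\n", "\n"]
    common_whitespace = body_lines
      .reject { |line| line.strip.empty? }
      .map { |line| line[/\A[ \t]*/].length }
      .min
    puts common_whitespace # => 4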
@@ -232,7 +240,13 @@ def to_a
           index = 0
           length = lexed.length
 
-          heredoc_identifier_stack = []
+          heredoc_stack = []
+          quote_stack = []
+
+          # The parser gem emits the newline tokens for comments out of order. This saves
+          # that token location to emit at a later time to properly line everything up.
+          # https://github.com/whitequark/parser/issues/1025
+          comment_newline_location = nil
 
           while index < length
             token, state = lexed[index]
@@ -241,7 +255,7 @@ def to_a
 
             type = TYPES.fetch(token.type)
             value = token.value
-            location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
+            location = range(token.location.start_offset, token.location.end_offset)
 
             case type
             when :kDO
@@ -257,32 +271,55 @@ def to_a
               value = unescape_string(value, "?")
             when :tCOMMENT
               if token.type == :EMBDOC_BEGIN
-                start_index = index
 
                 while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
                   value += next_token.value
                   index += 1
                 end
 
-                if start_index != index
-                  value += next_token.value
-                  location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
-                  index += 1
-                end
+                value += next_token.value
+                location = range(token.location.start_offset, lexed[index][0].location.end_offset)
+                index += 1
               else
                 value.chomp!
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
+                location = range(token.location.start_offset, token.location.end_offset - 1)
+
+                prev_token = lexed[index - 2][0]
+                next_token = lexed[index][0]
+
+                is_inline_comment = prev_token.location.start_line == token.location.start_line
+                if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+
+                  nl_location = range(token.location.end_offset - 1, token.location.end_offset)
+                  tokens << [:tNL, [nil, nl_location]]
+                  next
+                elsif is_inline_comment && next_token&.type == :COMMENT
+                  comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
+                elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+                  tokens << [:tNL, [nil, comment_newline_location]]
+                  comment_newline_location = nil
+                  next
+                end
               end
             when :tNL
+              next_token = lexed[index][0]
+              # Newlines after comments are emitted out of order.
+              if next_token&.type == :COMMENT
+                comment_newline_location = location
+                next
+              end
+
               value = nil
             when :tFLOAT
               value = parse_float(value)
             when :tIMAGINARY
               value = parse_complex(value)
             when :tINTEGER
               if value.start_with?("+")
-                tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
+                tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
+                location = range(token.location.start_offset + 1, token.location.end_offset)
               end
 
               value = parse_integer(value)
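Note: the comment handling above compensates for the parser gem emitting the newline token for a trailing comment out of order when the expression continues on the next line (see the issue linked in the hunk above). The pattern involved, shown in plain Ruby purely as an illustration:

    value = "abc" # trailing comment; the chain continues on the next line, and per the
      .upcase     # comment above, no newline token is emitted after the last comment
    puts value # => "ABC"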
@@ -303,47 +340,80 @@ def to_a
             when :tSPACE
               value = nil
             when :tSTRING_BEG
-              if token.type == :HEREDOC_START
-                heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
-              end
-              if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
+              next_token = lexed[index][0]
+              next_next_token = lexed[index + 1][0]
+              basic_quotes = ["\"", "'"].include?(value)
+
+              if basic_quotes && next_token&.type == :STRING_END
                 next_location = token.location.join(next_token.location)
                 type = :tSTRING
                 value = ""
-                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                location = range(next_location.start_offset, next_location.end_offset)
                 index += 1
-              elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
-                next_location = token.location.join(next_next_token.location)
-                type = :tSTRING
-                value = next_token.value.gsub("\\\\", "\\")
-                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
-                index += 2
-              elsif value.start_with?("<<")
+              elsif value.start_with?("'", '"', "%")
+                if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
+                  # the parser gem doesn't simplify strings when its value ends in a newline
+                  if !(string_value = next_token.value).end_with?("\n") && basic_quotes
+                    next_location = token.location.join(next_next_token.location)
+                    value = unescape_string(string_value, value)
+                    type = :tSTRING
+                    location = range(next_location.start_offset, next_location.end_offset)
+                    index += 2
+                    tokens << [type, [value, location]]
+
+                    next
+                  end
+                end
+
+                quote_stack.push(value)
+              elsif token.type == :HEREDOC_START
                 quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
+                heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
+                heredoc = HeredocData.new(
+                  identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
+                  common_whitespace: 0,
+                )
+
                 if quote == "`"
                   type = :tXSTRING_BEG
-                  value = "<<`"
+                end
+
+                # The parser gem trims whitespace from squiggly heredocs. We must record
+                # the most common whitespace to later remove.
+                if heredoc_type == "~" || heredoc_type == "`"
+                  heredoc.common_whitespace = calculate_heredoc_whitespace(index)
+                end
+
+                if quote == "'" || quote == '"' || quote == "`"
+                  value = "<<#{quote}"
                 else
-                  value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
+                  value = '<<"'
                 end
+
+                heredoc_stack.push(heredoc)
+                quote_stack.push(value)
               end
             when :tSTRING_CONTENT
-              unless (lines = token.value.lines).one?
+              if (lines = token.value.lines).one?
+                # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
+                is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
+                # The parser gem only removes indentation when the heredoc is not nested
+                not_nested = heredoc_stack.size == 1
+                if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
+                  value = trim_heredoc_whitespace(value, current_heredoc)
+                end
+
+                value = unescape_string(value, quote_stack.last)
+              else
+                # When the parser gem encounters a line continuation inside of a multiline string,
+                # it emits a single string node. The backslash (and remaining newline) is removed.
+                current_line = +""
+                adjustment = 0
                 start_offset = offset_cache[token.location.start_offset]
-                lines.map do |line|
-                  newline = line.end_with?("\r\n") ? "\r\n" : "\n"
+                emit = false
+
+                lines.each.with_index do |line, index|
                   chomped_line = line.chomp
-<<<<<<< HEAD
-                  if match = chomped_line.match(/(?<backslashes>\\+)\z/)
-                    adjustment = match[:backslashes].size / 2
-                    adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
-                    if match[:backslashes].size.odd?
-                      adjusted_line.delete_suffix!("\\")
-                      adjustment += 2
-                    else
-                      adjusted_line << newline
-                    end
-=======
                   backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
                   is_interpolation = interpolation?(quote_stack.last)
                   is_percent_array = percent_array?(quote_stack.last)
@@ -360,15 +430,18 @@ def to_a
                     end
                     # If the string ends with a line continuation emit the remainder
                     emit = index == lines.count - 1
->>>>>>> b6554ad64e (Fix parser translator tokens for backslashes in single-quoted strings and word arrays)
                   else
-                    adjusted_line = line
-                    adjustment = 0
+                    current_line << line
+                    emit = true
                   end
 
-                  end_offset = start_offset + adjusted_line.bytesize + adjustment
-                  tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
-                  start_offset = end_offset
+                  if emit
+                    end_offset = start_offset + current_line.bytesize + adjustment
+                    tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
+                    start_offset = end_offset
+                    current_line = +""
+                    adjustment = 0
+                  end
                 end
                 next
               end
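Note: the tSTRING_CONTENT branch above buffers physical lines into current_line because a trailing backslash in an interpolating literal removes the following newline, so several physical lines collapse into one fragment. An illustration of that Ruby behavior only (not translator code):

    s = "ab\
    cd"
    puts s.include?("\n") # => false, the backslash removed the newline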
@@ -377,20 +450,24 @@ def to_a
             when :tSTRING_END
               if token.type == :HEREDOC_END && value.end_with?("\n")
                 newline_length = value.end_with?("\r\n") ? 2 : 1
-                value = heredoc_identifier_stack.pop
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
+                value = heredoc_stack.pop.identifier
+                location = range(token.location.start_offset, token.location.end_offset - newline_length)
               elsif token.type == :REGEXP_END
                 value = value[0]
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
+                location = range(token.location.start_offset, token.location.start_offset + 1)
               end
+
+              quote_stack.pop
             when :tSYMBEG
               if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
                 next_location = token.location.join(next_token.location)
                 type = :tSYMBOL
                 value = next_token.value
                 value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
-                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                location = range(next_location.start_offset, next_location.end_offset)
                 index += 1
+              else
+                quote_stack.push(value)
               end
             when :tFID
               if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@@ -400,12 +477,21 @@ def to_a
               if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
                 type = :tBACK_REF2
               end
+              quote_stack.push(value)
+            when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
+              if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
+                index += 1
+              end
+
+              quote_stack.push(value)
+            when :tREGEXP_BEG
+              quote_stack.push(value)
             end
 
             tokens << [type, [value, location]]
 
             if token.type == :REGEXP_END
-              tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
+              tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
             end
           end
 
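Note: the quote_stack pushes above record the opening delimiter of whatever literal is currently open (string, heredoc, symbol, word array, regexp) so that unescape_string can apply the escape rules that delimiter implies to each content fragment. The distinction it has to respect, in plain Ruby:

    puts '\n'.length # => 2, single quotes keep the backslash and the "n" literal
    puts "\n".length # => 1, double quotes process the escape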
@@ -414,6 +500,11 @@ def to_a
 
         private
 
+        # Creates a new parser range, taking prism's byte offsets into account.
+        def range(start_offset, end_offset)
+          Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
+        end
+
         # Parse an integer from the string representation.
         def parse_integer(value)
           Integer(value)
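Note: the new range helper funnels every prism offset through offset_cache before building a Parser::Source::Range. A rough sketch of the mapping offset_cache is assumed to provide here (prism byte offsets to the character offsets the parser gem expects on multibyte source); the real cache is built elsewhere in the translator and may differ:

    source = "é = 1"
    offset_cache = []
    source.each_char.with_index do |char, char_index|
      char.bytesize.times { offset_cache << char_index }
    end
    offset_cache << source.length
    puts offset_cache[3] # byte offset 3 (the "=") maps to character offset 2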