Skip to content

Commit 7002222

Browse files
Earlopain and kddnewton
authored and committed
[ruby/prism] Better comment token handling for the parser translator
There appear to be a bunch of rules changing behaviour for inline comments, multiple comments after one another, etc. This seems to line up with reality pretty closely; token differences for RuboCop tests go from 1129 to 619, which seems pretty impressive. ruby/prism@2e1b92670c
1 parent 117d6e1 commit 7002222

File tree

2 files changed

+145
-52
lines changed

2 files changed

+145
-52
lines changed

lib/prism/translation/parser/lexer.rb

Lines changed: 143 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,18 @@ class Lexer
199199
# The following token types are listed as those classified as `tLPAREN`.
200200
LPAREN_CONVERSION_TOKEN_TYPES = [
201201
:kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
202-
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
202+
:tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
203203
]
204204

205-
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
205+
# Types of tokens that are allowed to continue a method call with comments in-between.
206+
# For these, the parser gem doesn't emit a newline token after the last comment.
207+
COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT]
208+
private_constant :COMMENT_CONTINUATION_TYPES
209+
210+
# Heredocs are complex and require us to keep track of a bit of info to refer to later
211+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
212+
213+
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
206214

207215
# The Parser::Source::Buffer that the tokens were lexed from.
208216
attr_reader :source_buffer
@@ -232,7 +240,13 @@ def to_a
232240
index = 0
233241
length = lexed.length
234242

235-
heredoc_identifier_stack = []
243+
heredoc_stack = []
244+
quote_stack = []
245+
246+
# The parser gem emits the newline tokens for comments out of order. This saves
247+
# that token location to emit at a later time to properly line everything up.
248+
# https://github.com/whitequark/parser/issues/1025
249+
comment_newline_location = nil
236250

237251
while index < length
238252
token, state = lexed[index]
@@ -241,7 +255,7 @@ def to_a
241255

242256
type = TYPES.fetch(token.type)
243257
value = token.value
244-
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
258+
location = range(token.location.start_offset, token.location.end_offset)
245259

246260
case type
247261
when :kDO
@@ -257,32 +271,55 @@ def to_a
257271
value = unescape_string(value, "?")
258272
when :tCOMMENT
259273
if token.type == :EMBDOC_BEGIN
260-
start_index = index
261274

262275
while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
263276
value += next_token.value
264277
index += 1
265278
end
266279

267-
if start_index != index
268-
value += next_token.value
269-
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
270-
index += 1
271-
end
280+
value += next_token.value
281+
location = range(token.location.start_offset, lexed[index][0].location.end_offset)
282+
index += 1
272283
else
273284
value.chomp!
274-
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
285+
location = range(token.location.start_offset, token.location.end_offset - 1)
286+
287+
prev_token = lexed[index - 2][0]
288+
next_token = lexed[index][0]
289+
290+
is_inline_comment = prev_token.location.start_line == token.location.start_line
291+
if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
292+
tokens << [:tCOMMENT, [value, location]]
293+
294+
nl_location = range(token.location.end_offset - 1, token.location.end_offset)
295+
tokens << [:tNL, [nil, nl_location]]
296+
next
297+
elsif is_inline_comment && next_token&.type == :COMMENT
298+
comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
299+
elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
300+
tokens << [:tCOMMENT, [value, location]]
301+
tokens << [:tNL, [nil, comment_newline_location]]
302+
comment_newline_location = nil
303+
next
304+
end
275305
end
276306
when :tNL
307+
next_token = next_token = lexed[index][0]
308+
# Newlines after comments are emitted out of order.
309+
if next_token&.type == :COMMENT
310+
comment_newline_location = location
311+
next
312+
end
313+
277314
value = nil
278315
when :tFLOAT
279316
value = parse_float(value)
280317
when :tIMAGINARY
281318
value = parse_complex(value)
282319
when :tINTEGER
283320
if value.start_with?("+")
284-
tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
285-
location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
321+
tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
322+
location = range(token.location.start_offset + 1, token.location.end_offset)
286323
end
287324

288325
value = parse_integer(value)
@@ -303,47 +340,80 @@ def to_a
303340
when :tSPACE
304341
value = nil
305342
when :tSTRING_BEG
306-
if token.type == :HEREDOC_START
307-
heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
308-
end
309-
if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
343+
next_token = lexed[index][0]
344+
next_next_token = lexed[index + 1][0]
345+
basic_quotes = ["\"", "'"].include?(value)
346+
347+
if basic_quotes && next_token&.type == :STRING_END
310348
next_location = token.location.join(next_token.location)
311349
type = :tSTRING
312350
value = ""
313-
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
351+
location = range(next_location.start_offset, next_location.end_offset)
314352
index += 1
315-
elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
316-
next_location = token.location.join(next_next_token.location)
317-
type = :tSTRING
318-
value = next_token.value.gsub("\\\\", "\\")
319-
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
320-
index += 2
321-
elsif value.start_with?("<<")
353+
elsif value.start_with?("'", '"', "%")
354+
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
355+
# the parser gem doesn't simplify strings when its value ends in a newline
356+
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
357+
next_location = token.location.join(next_next_token.location)
358+
value = unescape_string(string_value, value)
359+
type = :tSTRING
360+
location = range(next_location.start_offset, next_location.end_offset)
361+
index += 2
362+
tokens << [type, [value, location]]
363+
364+
next
365+
end
366+
end
367+
368+
quote_stack.push(value)
369+
elsif token.type == :HEREDOC_START
322370
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
371+
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
372+
heredoc = HeredocData.new(
373+
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
374+
common_whitespace: 0,
375+
)
376+
323377
if quote == "`"
324378
type = :tXSTRING_BEG
325-
value = "<<`"
379+
end
380+
381+
# The parser gem trims whitespace from squiggly heredocs. We must record
382+
# the most common whitespace to later remove.
383+
if heredoc_type == "~" || heredoc_type == "`"
384+
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
385+
end
386+
387+
if quote == "'" || quote == '"' || quote == "`"
388+
value = "<<#{quote}"
326389
else
327-
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
390+
value = '<<"'
328391
end
392+
393+
heredoc_stack.push(heredoc)
394+
quote_stack.push(value)
329395
end
330396
when :tSTRING_CONTENT
331-
unless (lines = token.value.lines).one?
397+
if (lines = token.value.lines).one?
398+
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
399+
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
400+
# The parser gem only removes indentation when the heredoc is not nested
401+
not_nested = heredoc_stack.size == 1
402+
if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
403+
value = trim_heredoc_whitespace(value, current_heredoc)
404+
end
405+
406+
value = unescape_string(value, quote_stack.last)
407+
else
408+
# When the parser gem encounters a line continuation inside of a multiline string,
409+
# it emits a single string node. The backslash (and remaining newline) is removed.
410+
current_line = +""
411+
adjustment = 0
332412
start_offset = offset_cache[token.location.start_offset]
333-
lines.map do |line|
334-
newline = line.end_with?("\r\n") ? "\r\n" : "\n"
413+
emit = false
414+
415+
lines.each.with_index do |line, index|
335416
chomped_line = line.chomp
336-
<<<<<<< HEAD
337-
if match = chomped_line.match(/(?<backslashes>\\+)\z/)
338-
adjustment = match[:backslashes].size / 2
339-
adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
340-
if match[:backslashes].size.odd?
341-
adjusted_line.delete_suffix!("\\")
342-
adjustment += 2
343-
else
344-
adjusted_line << newline
345-
end
346-
=======
347417
backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
348418
is_interpolation = interpolation?(quote_stack.last)
349419
is_percent_array = percent_array?(quote_stack.last)
@@ -360,15 +430,18 @@ def to_a
360430
end
361431
# If the string ends with a line continuation emit the remainder
362432
emit = index == lines.count - 1
363-
>>>>>>> b6554ad64e (Fix parser translator tokens for backslashes in single-quoted strings and word arrays)
364433
else
365-
adjusted_line = line
366-
adjustment = 0
434+
current_line << line
435+
emit = true
367436
end
368437

369-
end_offset = start_offset + adjusted_line.bytesize + adjustment
370-
tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
371-
start_offset = end_offset
438+
if emit
439+
end_offset = start_offset + current_line.bytesize + adjustment
440+
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
441+
start_offset = end_offset
442+
current_line = +""
443+
adjustment = 0
444+
end
372445
end
373446
next
374447
end
@@ -377,20 +450,24 @@ def to_a
377450
when :tSTRING_END
378451
if token.type == :HEREDOC_END && value.end_with?("\n")
379452
newline_length = value.end_with?("\r\n") ? 2 : 1
380-
value = heredoc_identifier_stack.pop
381-
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
453+
value = heredoc_stack.pop.identifier
454+
location = range(token.location.start_offset, token.location.end_offset - newline_length)
382455
elsif token.type == :REGEXP_END
383456
value = value[0]
384-
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
457+
location = range(token.location.start_offset, token.location.start_offset + 1)
385458
end
459+
460+
quote_stack.pop
386461
when :tSYMBEG
387462
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
388463
next_location = token.location.join(next_token.location)
389464
type = :tSYMBOL
390465
value = next_token.value
391466
value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
392-
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
467+
location = range(next_location.start_offset, next_location.end_offset)
393468
index += 1
469+
else
470+
quote_stack.push(value)
394471
end
395472
when :tFID
396473
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@@ -400,12 +477,21 @@ def to_a
400477
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
401478
type = :tBACK_REF2
402479
end
480+
quote_stack.push(value)
481+
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
482+
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
483+
index += 1
484+
end
485+
486+
quote_stack.push(value)
487+
when :tREGEXP_BEG
488+
quote_stack.push(value)
403489
end
404490

405491
tokens << [type, [value, location]]
406492

407493
if token.type == :REGEXP_END
408-
tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
494+
tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
409495
end
410496
end
411497

@@ -414,6 +500,11 @@ def to_a
414500

415501
private
416502

503+
# Creates a new parser range, taking prisms byte offsets into account
504+
def range(start_offset, end_offset)
505+
Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
506+
end
507+
417508
# Parse an integer from the string representation.
418509
def parse_integer(value)
419510
Integer(value)

test/prism/fixtures/ranges.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
(..2)
44

5+
foo ((1..1))
6+
57
1...2
68

79
foo[...2]

0 commit comments

Comments (0)