Skip to content

Commit 2541426

Browse files
eregon and Earlopain committed
Add Ripper :on_sp events for Prism.lex_compat and Prism::Translation::Ripper
* Handle line continuations.
* Handle space at the end of file in LexCompat.

Co-authored-by: Earlopain <[email protected]>
1 parent 6d5f343 commit 2541426

File tree

8 files changed

+109
-24
lines changed

8 files changed

+109
-24
lines changed

lib/prism.rb

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ def initialize(version)
6161
# Prism::lex_compat(source, **options) -> LexCompat::Result
6262
#
6363
# Returns a parse result whose value is an array of tokens that closely
64-
# resembles the return value of Ripper::lex. The main difference is that the
65-
# `:on_sp` token is not emitted.
64+
# resembles the return value of Ripper::lex.
6665
#
6766
# For supported options, see Prism::parse.
6867
def self.lex_compat(source, **options)
@@ -72,9 +71,8 @@ def self.lex_compat(source, **options)
7271
# :call-seq:
7372
# Prism::lex_ripper(source) -> Array
7473
#
75-
# This lexes with the Ripper lex. It drops any space events but otherwise
76-
# returns the same tokens. Raises SyntaxError if the syntax in source is
77-
# invalid.
74+
# This wraps the result of Ripper.lex. It produces almost exactly the
75+
# same tokens. Raises SyntaxError if the syntax in source is invalid.
7876
def self.lex_ripper(source)
7977
LexRipper.new(source).result # steep:ignore
8078
end

lib/prism/lex_compat.rb

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def state
226226
end
227227

228228
# Tokens where state should be ignored
229-
# used for :on_comment, :on_heredoc_end, :on_embexpr_end
229+
# used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
230230
class IgnoreStateToken < Token
231231
def ==(other) # :nodoc:
232232
self[0...-1] == other[0...-1]
@@ -611,10 +611,10 @@ def self.build(opening)
611611
BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
612612
private_constant :BOM_FLUSHED
613613

614-
attr_reader :source, :options
614+
attr_reader :options
615615

616-
def initialize(source, **options)
617-
@source = source
616+
def initialize(code, **options)
617+
@code = code
618618
@options = options
619619
end
620620

@@ -624,12 +624,14 @@ def result
624624
state = :default
625625
heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
626626

627-
result = Prism.lex(source, **options)
627+
result = Prism.lex(@code, **options)
628+
source = result.source
628629
result_value = result.value
629630
previous_state = nil #: State?
630631
last_heredoc_end = nil #: Integer?
632+
eof_token = nil
631633

632-
bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
634+
bom = source.slice(0..2) == "\xEF\xBB\xBF"
633635

634636
result_value.each_with_index do |(token, lex_state), index|
635637
lineno = token.location.start_line
@@ -741,6 +743,7 @@ def result
741743

742744
Token.new([[lineno, column], event, value, lex_state])
743745
when :on_eof
746+
eof_token = token
744747
previous_token = result_value[index - 1][0]
745748

746749
# If we're at the end of the file and the previous token was a
@@ -763,7 +766,7 @@ def result
763766
end_offset += 3
764767
end
765768

766-
tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
769+
tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset...end_offset), lex_state])
767770
end
768771
end
769772

@@ -857,7 +860,88 @@ def result
857860
# We sort by location to compare against Ripper's output
858861
tokens.sort_by!(&:location)
859862

860-
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
863+
# Add :on_sp tokens
864+
tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)
865+
866+
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
867+
end
868+
869+
def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
870+
new_tokens = []
871+
872+
prev_token_state = Translation::Ripper::Lexer::State.new(Translation::Ripper::EXPR_BEG)
873+
prev_token_end = bom ? 3 : 0
874+
875+
tokens.each do |token|
876+
line, column = token.location
877+
start_offset = source.line_to_byte_offset(line) + column
878+
start_offset += 3 if line == 1 && bom
879+
880+
if start_offset > prev_token_end
881+
sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
882+
sp_line = source.line(prev_token_end)
883+
sp_column = source.column(prev_token_end)
884+
# Ripper reports columns on line 1 without counting the BOM
885+
sp_column -= 3 if sp_line == 1 && bom
886+
continuation_index = sp_value.byteindex("\\")
887+
888+
# ripper emits up to three :on_sp tokens when line continuations are used
889+
if continuation_index
890+
next_whitespace_index = continuation_index + 1
891+
next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
892+
next_whitespace_index += 1
893+
first_whitespace = sp_value[0...continuation_index]
894+
continuation = sp_value[continuation_index...next_whitespace_index]
895+
second_whitespace = sp_value[next_whitespace_index..]
896+
897+
new_tokens << IgnoreStateToken.new([
898+
[sp_line, sp_column],
899+
:on_sp,
900+
first_whitespace,
901+
prev_token_state
902+
]) unless first_whitespace.empty?
903+
904+
new_tokens << IgnoreStateToken.new([
905+
[sp_line, sp_column + continuation_index],
906+
:on_sp,
907+
continuation,
908+
prev_token_state
909+
])
910+
911+
new_tokens << IgnoreStateToken.new([
912+
[sp_line + 1, 0],
913+
:on_sp,
914+
second_whitespace,
915+
prev_token_state
916+
]) unless second_whitespace.empty?
917+
else
918+
new_tokens << IgnoreStateToken.new([
919+
[sp_line, sp_column],
920+
:on_sp,
921+
sp_value,
922+
prev_token_state
923+
])
924+
end
925+
end
926+
927+
new_tokens << token
928+
prev_token_state = token.state
929+
prev_token_end = start_offset + token.value.bytesize
930+
end
931+
932+
unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
933+
end_offset = eof_token.location.end_offset
934+
if prev_token_end < end_offset
935+
new_tokens << IgnoreStateToken.new([
936+
[source.line(prev_token_end), source.column(prev_token_end)],
937+
:on_sp,
938+
source.slice(prev_token_end, end_offset - prev_token_end),
939+
prev_token_state
940+
])
941+
end
942+
end
943+
944+
new_tokens
861945
end
862946
end
863947

lib/prism/lex_ripper.rb

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ def result
1919

2020
lex(source).each do |token|
2121
case token[1]
22-
when :on_sp
23-
# skip
2422
when :on_tstring_content
2523
if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
2624
previous[2] << token[2]

lib/prism/parse_result.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,9 @@ def lines
7171
end
7272

7373
# Perform a byteslice on the source code using the given byte offset and
74-
# byte length.
75-
def slice(byte_offset, length)
76-
source.byteslice(byte_offset, length) or raise
74+
# byte length, or using a Range.
75+
def slice(...)
76+
source.byteslice(...) or raise
7777
end
7878

7979
# Converts the line number to a byte offset corresponding to the start of that line

rbi/prism/parse_result.rbi

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@ class Prism::Source
2525
sig { returns(T::Array[String]) }
2626
def lines; end
2727

28-
sig { params(byte_offset: Integer, length: Integer).returns(String) }
29-
def slice(byte_offset, length); end
28+
def slice(...); end
3029

3130
sig { params(byte_offset: Integer).returns(Integer) }
3231
def line(byte_offset); end
test/prism/fixtures/bom_leading_space.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
 p (42)

test/prism/fixtures/bom_spaces.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
p ( 42 )

test/prism/ruby/ripper_test.rb

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class RipperTest < TestCase
3939

4040
# Skip these tests that we haven't implemented yet.
4141
omitted_sexp_raw = [
42+
"bom_leading_space.txt",
43+
"bom_spaces.txt",
4244
"dos_endings.txt",
4345
"heredocs_with_fake_newlines.txt",
4446
"heredocs_with_ignored_newlines.txt",
@@ -92,7 +94,7 @@ def test_lexer
9294
assert_equal(expected, lexer.parse[0].to_a)
9395
assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
9496

95-
assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
97+
assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
9698
assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
9799
end
98100

@@ -121,15 +123,17 @@ def assert_ripper_sexp_raw(source)
121123
def assert_ripper_lex(source)
122124
prism = Translation::Ripper.lex(source)
123125
ripper = Ripper.lex(source)
124-
ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
125-
ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order
126+
127+
# Prism emits tokens by their order in the code, not in parse order
128+
ripper.sort_by! { |elem| elem[0] }
126129

127130
[prism.size, ripper.size].max.times do |i|
128131
expected = ripper[i]
129132
actual = prism[i]
133+
130134
# Since tokens related to heredocs are not emitted in the same order,
131135
# the state also doesn't line up.
132-
if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
136+
if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
133137
expected[3] = actual[3] = nil
134138
end
135139

0 commit comments

Comments
 (0)