@@ -226,7 +226,7 @@ def state
226226 end
227227
228228 # Tokens where state should be ignored
229- # used for :on_comment, :on_heredoc_end, :on_embexpr_end
229+ # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
230230 class IgnoreStateToken < Token
231231 def ==( other ) # :nodoc:
232232 self [ 0 ...-1 ] == other [ 0 ...-1 ]
@@ -611,10 +611,10 @@ def self.build(opening)
611611 BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
612612 private_constant :BOM_FLUSHED
613613
614- attr_reader :source, :options
614+ attr_reader :options
615615
616- def initialize ( source , **options )
617- @source = source
# Save the code to be lexed together with the options that will be
# forwarded to Prism.lex when #result is invoked.
def initialize(code, **options)
  @options = options
  @code = code
end
620620
@@ -624,12 +624,14 @@ def result
624624 state = :default
625625 heredoc_stack = [ [ ] ] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
626626
627- result = Prism . lex ( source , **options )
627+ result = Prism . lex ( @code , **options )
628+ source = result . source
628629 result_value = result . value
629630 previous_state = nil #: State?
630631 last_heredoc_end = nil #: Integer?
632+ eof_token = nil
631633
632- bom = source . byteslice ( 0 ..2 ) == "\xEF \xBB \xBF "
634+ bom = source . slice ( 0 ..2 ) == "\xEF \xBB \xBF "
633635
634636 result_value . each_with_index do |( token , lex_state ) , index |
635637 lineno = token . location . start_line
@@ -741,6 +743,7 @@ def result
741743
742744 Token . new ( [ [ lineno , column ] , event , value , lex_state ] )
743745 when :on_eof
746+ eof_token = token
744747 previous_token = result_value [ index - 1 ] [ 0 ]
745748
746749 # If we're at the end of the file and the previous token was a
@@ -763,7 +766,7 @@ def result
763766 end_offset += 3
764767 end
765768
766- tokens << Token . new ( [ [ lineno , 0 ] , :on_nl , source . byteslice ( start_offset ...end_offset ) , lex_state ] )
769+ tokens << Token . new ( [ [ lineno , 0 ] , :on_nl , source . slice ( start_offset ...end_offset ) , lex_state ] )
767770 end
768771 end
769772
@@ -857,7 +860,88 @@ def result
857860 # We sort by location to compare against Ripper's output
858861 tokens . sort_by! ( &:location )
859862
860- Result . new ( tokens , result . comments , result . magic_comments , result . data_loc , result . errors , result . warnings , Source . for ( source ) )
863+ # Add :on_sp tokens
864+ tokens = add_on_sp_tokens ( tokens , source , result . data_loc , bom , eof_token )
865+
866+ Result . new ( tokens , result . comments , result . magic_comments , result . data_loc , result . errors , result . warnings , source )
867+ end
868+
# Ripper's lexer emits :on_sp tokens for the whitespace between other
# tokens, but Prism's lexer does not. Walk the (location-sorted) token
# list and synthesize an :on_sp token for every byte gap between
# consecutive tokens. IgnoreStateToken is used because the lexer state
# attached to whitespace is not meaningful for comparison.
#
# tokens    - the location-sorted array of translated tokens
# source    - the source object backing the lex result; provides
#             line/column/byte-offset translation and #slice
# data_loc  - the location of the __END__ data section, or nil if absent
# bom       - true if the source starts with a UTF-8 byte order mark
# eof_token - the :on_eof token from the lex result, used to locate any
#             trailing whitespace
#
# Returns a new array containing the original tokens interleaved with
# the synthesized :on_sp tokens.
def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
  new_tokens = []

  # Before any token has been seen, Ripper reports EXPR_BEG; each
  # subsequent :on_sp inherits the state of the token preceding it.
  prev_token_state = Translation::Ripper::Lexer::State.new(Translation::Ripper::EXPR_BEG)
  prev_token_end = bom ? 3 : 0

  tokens.each do |token|
    line, column = token.location
    start_offset = source.line_to_byte_offset(line) + column
    start_offset += 3 if line == 1 && bom

    if start_offset > prev_token_end
      sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
      sp_line = source.line(prev_token_end)
      sp_column = source.column(prev_token_end)
      # Ripper reports columns on line 1 without counting the BOM
      sp_column -= 3 if sp_line == 1 && bom
      continuation_index = sp_value.byteindex("\\")

      # ripper emits up to three :on_sp tokens when line continuations are
      # used: the run before the backslash, the backslash plus its line
      # terminator, and the run after it.
      if continuation_index
        next_whitespace_index = continuation_index + 1
        # Keep an optional \r with the continuation so \r\n terminators
        # stay in one token, then consume the \n itself.
        next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
        next_whitespace_index += 1
        first_whitespace = sp_value[0...continuation_index]
        continuation = sp_value[continuation_index...next_whitespace_index]
        second_whitespace = sp_value[next_whitespace_index..]

        unless first_whitespace.empty?
          new_tokens << IgnoreStateToken.new([
            [sp_line, sp_column],
            :on_sp,
            first_whitespace,
            prev_token_state
          ])
        end

        new_tokens << IgnoreStateToken.new([
          [sp_line, sp_column + continuation_index],
          :on_sp,
          continuation,
          prev_token_state
        ])

        unless second_whitespace.empty?
          new_tokens << IgnoreStateToken.new([
            [sp_line + 1, 0],
            :on_sp,
            second_whitespace,
            prev_token_state
          ])
        end
      else
        new_tokens << IgnoreStateToken.new([
          [sp_line, sp_column],
          :on_sp,
          sp_value,
          prev_token_state
        ])
      end
    end

    new_tokens << token
    prev_token_state = token.state
    prev_token_end = start_offset + token.value.bytesize
  end

  unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
    end_offset = eof_token.location.end_offset
    if prev_token_end < end_offset
      new_tokens << IgnoreStateToken.new([
        [source.line(prev_token_end), source.column(prev_token_end)],
        :on_sp,
        source.slice(prev_token_end, end_offset - prev_token_end),
        prev_token_state
      ])
    end
  end

  new_tokens
end
862946 end
863947
0 commit comments