Skip to content

Commit 6b10f30

Browse files
authored
Advance scanner position by byte length while searching for line (#3612)
And refactor each encoding scanner into a subclass
1 parent 748140c commit 6b10f30

File tree

4 files changed

+408
-52
lines changed

4 files changed

+408
-52
lines changed

lib/ruby_lsp/document.rb

Lines changed: 140 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ module RubyLsp
77
class Document
88
extend T::Generic
99

10+
class InvalidLocationError < StandardError; end
1011
# The maximum number of characters for providing expensive features, like semantic highlighting and diagnostics.
1112
# This is the same number used by the TypeScript extension in VS Code
1213
MAXIMUM_CHARACTERS_FOR_EXPENSIVE_FEATURES = 100_000
@@ -145,7 +146,14 @@ def find_index_by_position(start_pos, end_pos = nil)
145146

146147
#: -> Scanner
147148
def create_scanner
148-
Scanner.new(@source, @encoding)
149+
case @encoding
150+
when Encoding::UTF_8
151+
Utf8Scanner.new(@source)
152+
when Encoding::UTF_16LE
153+
Utf16Scanner.new(@source)
154+
else
155+
Utf32Scanner.new(@source)
156+
end
149157
end
150158

151159
# @abstract
@@ -163,81 +171,169 @@ class Insert < Edit; end
163171
class Replace < Edit; end
164172
class Delete < Edit; end
165173

174+
# Parent class for all position scanners. Scanners are used to translate a position given by the editor into a
175+
# string index that we can use to find the right place in the document source. The logic for finding the correct
176+
# index depends on the encoding negotiated with the editor, so we have different subclasses for each encoding.
177+
# See https://microsoft.github.io/language-server-protocol/specification/#positionEncodingKind for more information
178+
# @abstract
166179
class Scanner
167180
extend T::Sig
168181

169182
LINE_BREAK = 0x0A #: Integer
170183
# After character 0xFFFF, UTF-16 considers characters to have length 2 and we have to account for that
171184
SURROGATE_PAIR_START = 0xFFFF #: Integer
172185

173-
#: (String source, Encoding encoding) -> void
174-
def initialize(source, encoding)
186+
#: -> void
187+
def initialize
175188
@current_line = 0 #: Integer
176189
@pos = 0 #: Integer
177-
@bytes_or_codepoints = encoding == Encoding::UTF_8 ? source.bytes : source.codepoints #: Array[Integer]
178-
@encoding = encoding
179190
end
180191

181-
# Finds the character index inside the source string for a given line and column
192+
# Finds the character index inside the source string for a given line and column. This method always returns the
193+
# character index regardless of whether we are searching positions based on bytes, code units, or codepoints.
194+
# @abstract
195+
#: (Hash[Symbol, untyped] position) -> Integer
196+
def find_char_position(position); end
197+
end
198+
199+
# For the UTF-8 encoding, positions correspond to bytes
200+
class Utf8Scanner < Scanner
201+
#: (String source) -> void
202+
def initialize(source)
203+
super()
204+
@bytes = source.bytes #: Array[Integer]
205+
@character_length = 0 #: Integer
206+
end
207+
208+
# @override
182209
#: (Hash[Symbol, untyped] position) -> Integer
183210
def find_char_position(position)
184-
# Find the character index for the beginning of the requested line
211+
# Each group of bytes is a character. We advance based on the number of bytes to count how many full characters
212+
# we have in the requested offset
185213
until @current_line == position[:line]
186-
@pos += 1 until LINE_BREAK == @bytes_or_codepoints[@pos]
214+
byte = @bytes[@pos] #: Integer?
215+
raise InvalidLocationError unless byte
216+
217+
until LINE_BREAK == byte
218+
@pos += character_byte_length(byte)
219+
@character_length += 1
220+
byte = @bytes[@pos]
221+
raise InvalidLocationError unless byte
222+
end
223+
187224
@pos += 1
225+
@character_length += 1
188226
@current_line += 1
189227
end
190228

191-
# For UTF-8, the code unit length is the same as bytes, but we want to return the character index
192-
requested_position = if @encoding == Encoding::UTF_8
193-
character_offset = 0
194-
i = @pos
195-
196-
# Each group of bytes is a character. We advance based on the number of bytes to count how many full
197-
# characters we have in the requested offset
198-
while i < @pos + position[:character] && i < @bytes_or_codepoints.length
199-
byte = @bytes_or_codepoints[i] #: as !nil
200-
i += if byte < 0x80 # 1-byte character
201-
1
202-
elsif byte < 0xE0 # 2-byte character
203-
2
204-
elsif byte < 0xF0 # 3-byte character
205-
3
206-
else # 4-byte character
207-
4
208-
end
209-
210-
character_offset += 1
229+
# @character_length has the number of characters until the beginning of the line. We don't accumulate on it for
230+
# the character part because locating the same position twice must return the same value
231+
line_byte_offset = 0
232+
line_characters = 0
233+
234+
while line_byte_offset < position[:character]
235+
byte = @bytes[@pos + line_byte_offset] #: Integer?
236+
raise InvalidLocationError unless byte
237+
238+
line_byte_offset += character_byte_length(byte)
239+
line_characters += 1
240+
end
241+
242+
@character_length + line_characters
243+
end
244+
245+
private
246+
247+
#: (Integer) -> Integer
248+
def character_byte_length(byte)
249+
if byte < 0x80 # 1-byte character
250+
1
251+
elsif byte < 0xE0 # 2-byte character
252+
2
253+
elsif byte < 0xF0 # 3-byte character
254+
3
255+
else # 4-byte character
256+
4
257+
end
258+
end
259+
end
260+
261+
# For the UTF-16 encoding, positions correspond to UTF-16 code units. Characters above 0xFFFF are encoded as a
262+
# surrogate pair (two code units), so each of them counts as length 2
263+
class Utf16Scanner < Scanner
264+
#: (String) -> void
265+
def initialize(source)
266+
super()
267+
@codepoints = source.codepoints #: Array[Integer]
268+
end
269+
270+
# @override
271+
#: (Hash[Symbol, untyped] position) -> Integer
272+
def find_char_position(position)
273+
# Find the character index for the beginning of the requested line
274+
until @current_line == position[:line]
275+
codepoint = @codepoints[@pos] #: Integer?
276+
raise InvalidLocationError unless codepoint
277+
278+
until LINE_BREAK == @codepoints[@pos]
279+
@pos += 1
280+
codepoint = @codepoints[@pos] #: Integer?
281+
raise InvalidLocationError unless codepoint
211282
end
212283

213-
@pos + character_offset
214-
else
215-
@pos + position[:character]
284+
@pos += 1
285+
@current_line += 1
216286
end
217287

218288
# The final position is the beginning of the line plus the requested column. If the encoding is UTF-16, we also
219289
# need to adjust for surrogate pairs
220-
if @encoding == Encoding::UTF_16LE
221-
requested_position -= utf_16_character_position_correction(@pos, requested_position)
290+
line_characters = 0
291+
line_code_units = 0
292+
293+
while line_code_units < position[:character]
294+
code_point = @codepoints[@pos + line_characters]
295+
raise InvalidLocationError unless code_point
296+
297+
line_code_units += if code_point > SURROGATE_PAIR_START
298+
2 # Surrogate pair, so we skip the next code unit
299+
else
300+
1 # Single code unit character
301+
end
302+
303+
line_characters += 1
222304
end
223305

224-
requested_position
306+
@pos + line_characters
225307
end
308+
end
226309

227-
# Subtract 1 for each character after 0xFFFF in the current line from the column position, so that we hit the
228-
# right character in the UTF-8 representation
229-
#: (Integer current_position, Integer requested_position) -> Integer
230-
def utf_16_character_position_correction(current_position, requested_position)
231-
utf16_unicode_correction = 0
310+
# For the UTF-32 encoding, positions correspond directly to codepoints
311+
class Utf32Scanner < Scanner
312+
#: (String) -> void
313+
def initialize(source)
314+
super()
315+
@codepoints = source.codepoints #: Array[Integer]
316+
end
232317

233-
until current_position == requested_position
234-
codepoint = @bytes_or_codepoints[current_position]
235-
utf16_unicode_correction += 1 if codepoint && codepoint > SURROGATE_PAIR_START
318+
# @override
319+
#: (Hash[Symbol, untyped] position) -> Integer
320+
def find_char_position(position)
321+
# Find the character index for the beginning of the requested line
322+
until @current_line == position[:line]
323+
codepoint = @codepoints[@pos] #: Integer?
324+
raise InvalidLocationError unless codepoint
325+
326+
until LINE_BREAK == @codepoints[@pos]
327+
@pos += 1
328+
codepoint = @codepoints[@pos] #: Integer?
329+
raise InvalidLocationError unless codepoint
330+
end
236331

237-
current_position += 1
332+
@pos += 1
333+
@current_line += 1
238334
end
239335

240-
utf16_unicode_correction
336+
@pos + position[:character]
241337
end
242338
end
243339
end

test/expectations/code_action_resolve/extract_method_script.exp.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"character": 0
1010
},
1111
"end": {
12-
"line": 2,
12+
"line": 1,
1313
"character": 6
1414
}
1515
},
@@ -37,7 +37,7 @@
3737
"character": 0
3838
}
3939
},
40-
"newText": "def new_method\n a = 5 + 2\n a * 10\n \nend\n\n"
40+
"newText": "def new_method\n a = 5 + 2\n a * 10\nend\n\n"
4141
},
4242
{
4343
"range": {
@@ -46,7 +46,7 @@
4646
"character": 0
4747
},
4848
"end": {
49-
"line": 2,
49+
"line": 1,
5050
"character": 6
5151
}
5252
},

test/requests/code_actions_expectations_test.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,12 @@ def assert_expectations(source, expected)
3737
private
3838

3939
def default_args(source)
40-
end_position = source.lines.count > 1 ? { line: 1, character: 1 } : { line: 0, character: 1 }
40+
end_line = source.lines.count > 1 ? 1 : 0
41+
end_character = source.empty? ? 0 : 1
4142
{
4243
range: {
43-
start: { line: 0, character: 0 }, end: end_position,
44+
start: { line: 0, character: 0 },
45+
end: { line: end_line, character: end_character },
4446
},
4547
context: {
4648
diagnostics: [],

0 commit comments

Comments
 (0)