@@ -7,6 +7,7 @@ module RubyLsp
7
7
class Document
8
8
extend T ::Generic
9
9
10
# Raised by the position scanners when the source runs out before a requested position can be located
class InvalidLocationError < StandardError
end
10
11
# The maximum number of characters for providing expensive features, like semantic highlighting and diagnostics.
11
12
# This is the same number used by the TypeScript extension in VS Code
12
13
MAXIMUM_CHARACTERS_FOR_EXPENSIVE_FEATURES = 100_000
@@ -145,7 +146,14 @@ def find_index_by_position(start_pos, end_pos = nil)
145
146
146
147
# Builds the position scanner that matches the document's encoding: UTF-8 and UTF-16LE each have a dedicated
# scanner, and every other encoding is handled by the UTF-32 scanner
#: -> Scanner
def create_scanner
  scanner_class =
    if Encoding::UTF_8 == @encoding
      Utf8Scanner
    elsif Encoding::UTF_16LE == @encoding
      Utf16Scanner
    else
      Utf32Scanner
    end

  scanner_class.new(@source)
end
150
158
151
159
# @abstract
@@ -163,81 +171,169 @@ class Insert < Edit; end
163
171
# Edit subclass that adds no behavior of its own; presumably tags edits that replace existing text (confirm
# against the Edit definition)
class Replace < Edit
end
164
172
# Edit subclass that adds no behavior of its own; presumably tags edits that remove existing text (confirm
# against the Edit definition)
class Delete < Edit
end
165
173
174
# Base class for the position scanners. A scanner converts a position sent by the editor (line and column) into
# an index into the document source. How a column is measured depends on the encoding negotiated with the editor,
# so each supported encoding has its own subclass.
# See https://microsoft.github.io/language-server-protocol/specification/#positionEncodingKind for more information
# @abstract
class Scanner
  extend T::Sig

  # Value of the line feed character, used by the subclasses to detect the end of a line
  LINE_BREAK = 0x0A #: Integer
  # Characters above 0xFFFF take two code units in UTF-16, which has to be accounted for when counting columns
  SURROGATE_PAIR_START = 0xFFFF #: Integer

  # Starts the scan at the first line and at offset zero. Subclasses advance both values as they locate lines
  #: -> void
  def initialize
    @pos = 0 #: Integer
    @current_line = 0 #: Integer
  end

  # Returns the character index inside the source string for the given line and column. The result is always a
  # character index, no matter whether the subclass measures positions in bytes, code units, or codepoints.
  # @abstract
  #: (Hash[Symbol, untyped] position) -> Integer
  def find_char_position(position)
  end
end
198
+
199
# Scanner for the UTF-8 position encoding, in which the column of a position is measured in bytes
class Utf8Scanner < Scanner
  #: (String source) -> void
  def initialize(source)
    super()
    @bytes = source.bytes #: Array[Integer]
    # Number of characters found before the beginning of the current line
    @character_length = 0 #: Integer
  end

  # Returns the character index for a position whose column is a byte offset within its line. Raises
  # InvalidLocationError when the source ends before the requested line or column is reached.
  # @override
  #: (Hash[Symbol, untyped] position) -> Integer
  def find_char_position(position)
    advance_to_line(position[:line])

    # Only the characters up to the beginning of the line are accumulated in @character_length. The column part is
    # counted separately so that locating the same position twice returns the same value
    @character_length + characters_within(position[:character])
  end

  private

  # Moves the scanner forward to the first byte of the target line, counting every character skipped on the way
  #: (Integer) -> void
  def advance_to_line(target_line)
    while @current_line != target_line
      current = @bytes[@pos]
      raise InvalidLocationError if current.nil?

      # Each group of bytes is one character, so we jump by the byte length of the character we are standing on
      while current != LINE_BREAK
        @pos += character_byte_length(current)
        @character_length += 1
        current = @bytes[@pos]
        raise InvalidLocationError if current.nil?
      end

      # Step over the line break itself, which is one byte and one character
      @current_line += 1
      @pos += 1
      @character_length += 1
    end
  end

  # Counts how many full characters are needed to cover the given number of bytes from the start of the current line
  #: (Integer) -> Integer
  def characters_within(byte_count)
    consumed_bytes = 0
    characters = 0

    until consumed_bytes >= byte_count
      leading_byte = @bytes[@pos + consumed_bytes]
      raise InvalidLocationError if leading_byte.nil?

      consumed_bytes += character_byte_length(leading_byte)
      characters += 1
    end

    characters
  end

  # Number of bytes used by the character that starts with the given byte
  #: (Integer) -> Integer
  def character_byte_length(byte)
    return 1 if byte < 0x80 # 1-byte character
    return 2 if byte < 0xE0 # 2-byte character
    return 3 if byte < 0xF0 # 3-byte character

    4 # 4-byte character
  end
end
260
+
261
# Scanner for the UTF-16 position encoding, in which the column of a position is measured in UTF-16 code units.
# Characters above SURROGATE_PAIR_START count as two code units
class Utf16Scanner < Scanner
  #: (String) -> void
  def initialize(source)
    super()
    @codepoints = source.codepoints #: Array[Integer]
  end

  # Returns the character index for a position whose column is a number of UTF-16 code units within its line.
  # Raises InvalidLocationError when the source ends before the requested line or column is reached.
  # @override
  #: (Hash[Symbol, untyped] position) -> Integer
  def find_char_position(position)
    advance_to_line(position[:line])

    # The final position is the beginning of the line plus the number of characters covered by the column
    @pos + characters_within(position[:character])
  end

  private

  # Moves the scanner forward to the first codepoint of the target line
  #: (Integer) -> void
  def advance_to_line(target_line)
    while @current_line != target_line
      raise InvalidLocationError if @codepoints[@pos].nil?

      while @codepoints[@pos] != LINE_BREAK
        @pos += 1
        raise InvalidLocationError if @codepoints[@pos].nil?
      end

      # Step over the line break itself
      @current_line += 1
      @pos += 1
    end
  end

  # Counts how many characters are needed to cover the given number of code units from the start of the current
  # line
  #: (Integer) -> Integer
  def characters_within(code_unit_count)
    code_units = 0
    characters = 0

    until code_units >= code_unit_count
      codepoint = @codepoints[@pos + characters]
      raise InvalidLocationError if codepoint.nil?

      # A codepoint above the surrogate pair start takes two code units, anything else takes a single one
      code_units += codepoint > SURROGATE_PAIR_START ? 2 : 1
      characters += 1
    end

    characters
  end
end
226
309
227
- # Subtract 1 for each character after 0xFFFF in the current line from the column position, so that we hit the
228
- # right character in the UTF-8 representation
229
- #: (Integer current_position, Integer requested_position) -> Integer
230
- def utf_16_character_position_correction ( current_position , requested_position )
231
- utf16_unicode_correction = 0
310
# Scanner for the UTF-32 position encoding, in which the column of a position is measured directly in codepoints
class Utf32Scanner < Scanner
  #: (String) -> void
  def initialize(source)
    super()
    @codepoints = source.codepoints #: Array[Integer]
  end

  # Returns the character index for a position whose column is a number of codepoints within its line. Raises
  # InvalidLocationError when the source ends before the requested line is reached, or when the requested column
  # lands past the end of the source.
  # @override
  #: (Hash[Symbol, untyped] position) -> Integer
  def find_char_position(position)
    # Find the character index for the beginning of the requested line
    until @current_line == position[:line]
      codepoint = @codepoints[@pos] #: Integer?
      raise InvalidLocationError unless codepoint

      until LINE_BREAK == codepoint
        @pos += 1
        codepoint = @codepoints[@pos]
        raise InvalidLocationError unless codepoint
      end

      # Step over the line break itself
      @pos += 1
      @current_line += 1
    end

    # Codepoints map one to one to characters, so the column can be added directly. An index equal to the source
    # length is still valid (it is the end of the document), but anything beyond it cannot be located, which is the
    # same condition under which the UTF-8 and UTF-16 scanners raise
    char_position = @pos + position[:character]
    raise InvalidLocationError if char_position > @codepoints.length

    char_position
  end
end
243
339
end
0 commit comments