Skip to content

Commit af30123

Browse files
Utf 8 encoding issue (#3583)
* Fixing a bug where UTF-8 encodings resulted in incorrect char offset calculations. * Revert "Fail requests that are searching for a non existing position (#2938)" This reverts commit 0d421a6. * Calculate character length based on encountered bytes --------- Co-authored-by: Vinicius Stock <[email protected]>
1 parent 19e925d commit af30123

File tree

4 files changed

+71
-128
lines changed

4 files changed

+71
-128
lines changed

lib/ruby_lsp/document.rb

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ module RubyLsp
77
class Document
88
extend T::Generic
99

10-
class LocationNotFoundError < StandardError; end
11-
1210
# This maximum number of characters for providing expensive features, like semantic highlighting and diagnostics.
1311
# This is the same number used by the TypeScript extension in VS Code
1412
MAXIMUM_CHARACTERS_FOR_EXPENSIVE_FEATURES = 100_000
@@ -176,7 +174,7 @@ class Scanner
176174
def initialize(source, encoding)
177175
@current_line = 0 #: Integer
178176
@pos = 0 #: Integer
179-
@source = source.codepoints #: Array[Integer]
177+
@bytes_or_codepoints = encoding == Encoding::UTF_8 ? source.bytes : source.codepoints #: Array[Integer]
180178
@encoding = encoding
181179
end
182180

@@ -185,23 +183,40 @@ def initialize(source, encoding)
185183
def find_char_position(position)
186184
# Find the character index for the beginning of the requested line
187185
until @current_line == position[:line]
188-
until LINE_BREAK == @source[@pos]
189-
@pos += 1
186+
@pos += 1 until LINE_BREAK == @bytes_or_codepoints[@pos]
187+
@pos += 1
188+
@current_line += 1
189+
end
190190

191-
if @pos >= @source.length
192-
# Pack the code points back into the original string to provide context in the error message
193-
raise LocationNotFoundError, "Requested position: #{position}\nSource:\n\n#{@source.pack("U*")}"
191+
# For UTF-8, the code unit length is the same as bytes, but we want to return the character index
192+
requested_position = if @encoding == Encoding::UTF_8
193+
character_offset = 0
194+
i = @pos
195+
196+
# Each group of bytes is a character. We advance based on the number of bytes to count how many full
197+
# characters we have in the requested offset
198+
while i < @pos + position[:character] && i < @bytes_or_codepoints.length
199+
byte = @bytes_or_codepoints[i] #: as !nil
200+
i += if byte < 0x80 # 1-byte character
201+
1
202+
elsif byte < 0xE0 # 2-byte character
203+
2
204+
elsif byte < 0xF0 # 3-byte character
205+
3
206+
else # 4-byte character
207+
4
194208
end
209+
210+
character_offset += 1
195211
end
196212

197-
@pos += 1
198-
@current_line += 1
213+
@pos + character_offset
214+
else
215+
@pos + position[:character]
199216
end
200217

201218
# The final position is the beginning of the line plus the requested column. If the encoding is UTF-16, we also
202219
# need to adjust for surrogate pairs
203-
requested_position = @pos + position[:character]
204-
205220
if @encoding == Encoding::UTF_16LE
206221
requested_position -= utf_16_character_position_correction(@pos, requested_position)
207222
end
@@ -216,7 +231,7 @@ def utf_16_character_position_correction(current_position, requested_position)
216231
utf16_unicode_correction = 0
217232

218233
until current_position == requested_position
219-
codepoint = @source[current_position]
234+
codepoint = @bytes_or_codepoints[current_position]
220235
utf16_unicode_correction += 1 if codepoint && codepoint > SURROGATE_PAIR_START
221236

222237
current_position += 1

lib/ruby_lsp/server.rb

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -121,24 +121,12 @@ def process_message(message)
121121
# If a document is deleted before we are able to process all of its enqueued requests, we will try to read it
122122
# from disk and it raise this error. This is expected, so we don't include the `data` attribute to avoid
123123
# reporting these to our telemetry
124-
case e
125-
when Store::NonExistingDocumentError
124+
if e.is_a?(Store::NonExistingDocumentError)
126125
send_message(Error.new(
127126
id: message[:id],
128127
code: Constant::ErrorCodes::INVALID_PARAMS,
129128
message: e.full_message,
130129
))
131-
when Document::LocationNotFoundError
132-
send_message(Error.new(
133-
id: message[:id],
134-
code: Constant::ErrorCodes::REQUEST_FAILED,
135-
message: <<~MESSAGE,
136-
Request #{message[:method]} failed to find the target position.
137-
The file might have been modified while the server was in the middle of searching for the target.
138-
If you experience this regularly, please report any findings and extra information on
139-
https://github.com/Shopify/ruby-lsp/issues/2446
140-
MESSAGE
141-
))
142130
else
143131
send_message(Error.new(
144132
id: message[:id],

test/ruby_document_test.rb

Lines changed: 42 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,48 @@ def foo
5959
RUBY
6060
end
6161

62+
def test_multibyte_character_offsets_are_bytes_in_utf8
63+
document = RubyLsp::RubyDocument.new(source: +<<~RUBY, version: 1, uri: @uri, global_state: @global_state)
64+
65+
RUBY
66+
67+
document.push_edits(
68+
[{ range: { start: { line: 0, character: 3 }, end: { line: 0, character: 3 } }, text: "r" }], version: 2
69+
)
70+
71+
assert_equal(<<~RUBY, document.source)
72+
bár
73+
RUBY
74+
end
75+
76+
def test_multibyte_character_offsets_for_3_byte_character
77+
document = RubyLsp::RubyDocument.new(source: +<<~RUBY, version: 1, uri: @uri, global_state: @global_state)
78+
bあ
79+
RUBY
80+
81+
document.push_edits(
82+
[{ range: { start: { line: 0, character: 4 }, end: { line: 0, character: 4 } }, text: "r" }], version: 2
83+
)
84+
85+
assert_equal(<<~RUBY, document.source)
86+
bあr
87+
RUBY
88+
end
89+
90+
def test_multibyte_character_offsets_for_4_byte_character
91+
document = RubyLsp::RubyDocument.new(source: +<<~RUBY, version: 1, uri: @uri, global_state: @global_state)
92+
b🙂
93+
RUBY
94+
95+
document.push_edits(
96+
[{ range: { start: { line: 0, character: 5 }, end: { line: 0, character: 5 } }, text: "r" }], version: 2
97+
)
98+
99+
assert_equal(<<~RUBY, document.source)
100+
b🙂r
101+
RUBY
102+
end
103+
62104
def test_deletion_full_node
63105
document = RubyLsp::RubyDocument.new(source: +<<~RUBY, version: 1, uri: @uri, global_state: @global_state)
64106
def foo
@@ -275,55 +317,6 @@ def test_multi_cursor_edit
275317
RUBY
276318
end
277319

278-
def test_pushing_edits_to_document_with_unicode
279-
document = RubyLsp::RubyDocument.new(source: +<<~RUBY, version: 1, uri: @uri, global_state: @global_state)
280-
chars = ["億"]
281-
RUBY
282-
283-
# Write puts 'a' in incremental edits
284-
document.push_edits(
285-
[{ range: { start: { line: 0, character: 13 }, end: { line: 0, character: 13 } }, text: "\n" }],
286-
version: 2,
287-
)
288-
document.push_edits(
289-
[{ range: { start: { line: 1, character: 0 }, end: { line: 1, character: 0 } }, text: "p" }],
290-
version: 3,
291-
)
292-
document.push_edits(
293-
[{ range: { start: { line: 1, character: 1 }, end: { line: 1, character: 1 } }, text: "u" }],
294-
version: 4,
295-
)
296-
document.push_edits(
297-
[{ range: { start: { line: 1, character: 2 }, end: { line: 1, character: 2 } }, text: "t" }],
298-
version: 5,
299-
)
300-
document.push_edits(
301-
[{ range: { start: { line: 1, character: 3 }, end: { line: 1, character: 3 } }, text: "s" }],
302-
version: 6,
303-
)
304-
document.push_edits(
305-
[{ range: { start: { line: 1, character: 4 }, end: { line: 1, character: 4 } }, text: " " }],
306-
version: 7,
307-
)
308-
document.push_edits(
309-
[{ range: { start: { line: 1, character: 5 }, end: { line: 1, character: 5 } }, text: "'" }],
310-
version: 8,
311-
)
312-
document.push_edits(
313-
[{ range: { start: { line: 1, character: 6 }, end: { line: 1, character: 6 } }, text: "a" }],
314-
version: 9,
315-
)
316-
document.push_edits(
317-
[{ range: { start: { line: 1, character: 7 }, end: { line: 1, character: 7 } }, text: "'" }],
318-
version: 10,
319-
)
320-
321-
assert_equal(<<~RUBY, document.source)
322-
chars = ["億"]
323-
puts 'a'
324-
RUBY
325-
end
326-
327320
def test_document_handle_4_byte_unicode_characters
328321
source = +<<~RUBY
329322
class Foo
@@ -760,29 +753,6 @@ class Foo
760753
assert_nil(document.cache_get("textDocument/codeLens"))
761754
end
762755

763-
def test_locating_a_non_existing_location_raises
764-
document = RubyLsp::RubyDocument.new(source: <<~RUBY.chomp, version: 1, uri: @uri, global_state: @global_state)
765-
class Foo
766-
end
767-
RUBY
768-
769-
# Exactly at the last character doesn't raise
770-
document.locate_node({ line: 1, character: 2 })
771-
772-
# Anything beyond does
773-
error = assert_raises(RubyLsp::Document::LocationNotFoundError) do
774-
document.locate_node({ line: 3, character: 2 })
775-
end
776-
777-
assert_match(/Requested position: {(:)?line[\s:=>]+3, (:)?character[\s:=>]+2}/, error.message)
778-
assert_match(<<~MESSAGE.chomp, error.message)
779-
Source:
780-
781-
class Foo
782-
end
783-
MESSAGE
784-
end
785-
786756
def test_document_tracks_latest_edit_context
787757
document = RubyLsp::RubyDocument.new(source: +<<~RUBY, version: 1, uri: @uri, global_state: @global_state)
788758
class Foo

test/server_test.rb

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -893,36 +893,6 @@ def handle_window_show_message_response(title)
893893
end
894894
end
895895

896-
def test_requests_to_a_non_existing_position_return_error
897-
uri = URI("file:///foo.rb")
898-
899-
@server.process_message({
900-
method: "textDocument/didOpen",
901-
params: {
902-
textDocument: {
903-
uri: uri,
904-
text: "class Foo\nend",
905-
version: 1,
906-
languageId: "ruby",
907-
},
908-
},
909-
})
910-
911-
@server.process_message({
912-
id: 1,
913-
method: "textDocument/completion",
914-
params: {
915-
textDocument: {
916-
uri: uri,
917-
},
918-
position: { line: 10, character: 0 },
919-
},
920-
})
921-
922-
error = find_message(RubyLsp::Error)
923-
assert_match("Request textDocument/completion failed to find the target position.", error.message)
924-
end
925-
926896
def test_cancelling_requests_returns_nil
927897
uri = URI("file:///foo.rb")
928898

0 commit comments

Comments
 (0)