Skip to content

Commit e4d773e

Browse files
wip
1 parent 307864d commit e4d773e

File tree

9 files changed

+306
-190
lines changed

9 files changed

+306
-190
lines changed

crates/djls-server/src/server.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ impl LanguageServer for DjangoLanguageServer {
157157
save: Some(lsp_types::SaveOptions::default().into()),
158158
},
159159
)),
160-
position_encoding: Some(lsp_types::PositionEncodingKind::from(encoding)),
160+
position_encoding: Some(djls_workspace::position_encoding_to_lsp(encoding)),
161161
diagnostic_provider: Some(lsp_types::DiagnosticServerCapabilities::Options(
162162
lsp_types::DiagnosticOptions {
163163
identifier: None,
@@ -172,7 +172,11 @@ impl LanguageServer for DjangoLanguageServer {
172172
name: SERVER_NAME.to_string(),
173173
version: Some(SERVER_VERSION.to_string()),
174174
}),
175-
offset_encoding: Some(encoding.to_string()),
175+
offset_encoding: Some(match encoding {
176+
djls_workspace::PositionEncoding::Utf8 => "utf-8".to_string(),
177+
djls_workspace::PositionEncoding::Utf16 => "utf-16".to_string(),
178+
djls_workspace::PositionEncoding::Utf32 => "utf-32".to_string(),
179+
}),
176180
})
177181
}
178182

crates/djls-server/src/session.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ impl Session {
7777
settings,
7878
workspace,
7979
client_capabilities: params.capabilities.clone(),
80-
position_encoding: PositionEncoding::negotiate(params),
80+
position_encoding: djls_workspace::negotiate_position_encoding(params),
8181
db,
8282
}
8383
}

crates/djls-source/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod db;
22
mod file;
33
mod position;
4+
mod protocol;
45

56
pub use db::Db;
67
pub use file::File;
@@ -9,3 +10,4 @@ pub use position::ByteOffset;
910
pub use position::LineCol;
1011
pub use position::LineIndex;
1112
pub use position::Span;
13+
pub use protocol::PositionEncoding;

crates/djls-source/src/position.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ impl LineIndex {
120120
#[cfg(test)]
121121
mod tests {
122122
use super::*;
123+
use crate::protocol::PositionEncoding;
123124

124125
#[test]
125126
fn test_line_index_unix_endings() {
@@ -164,4 +165,44 @@ mod tests {
164165
assert_eq!(index.to_line_col(ByteOffset(7)), LineCol((1, 0)));
165166
assert_eq!(index.to_line_col(ByteOffset(8)), LineCol((1, 1)));
166167
}
168+
169+
#[test]
170+
fn test_line_col_to_offset_utf16() {
171+
let text = "Hello 🌍 world";
172+
let index = LineIndex::from_text(text);
173+
174+
// "Hello " = 6 UTF-16 units, "🌍" = 2 UTF-16 units
175+
// So position (0, 8) in UTF-16 should be after the emoji
176+
let offset = index
177+
.line_col_to_offset(LineCol((0, 8)), text, PositionEncoding::Utf16)
178+
.expect("Should get offset");
179+
assert_eq!(offset, ByteOffset(10)); // "Hello 🌍" is 10 bytes
180+
181+
// In UTF-8, column 10 is a byte offset: just past the 4-byte emoji, at the space before "world"
182+
let offset_utf8 = index
183+
.line_col_to_offset(LineCol((0, 10)), text, PositionEncoding::Utf8)
184+
.expect("Should get offset");
185+
assert_eq!(offset_utf8, ByteOffset(10));
186+
}
187+
188+
#[test]
189+
fn test_line_col_to_offset_ascii_fast_path() {
190+
let text = "Hello world";
191+
let index = LineIndex::from_text(text);
192+
193+
// For ASCII text, all encodings should give the same result
194+
let offset_utf8 = index
195+
.line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf8)
196+
.expect("Should get offset");
197+
let offset_utf16 = index
198+
.line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf16)
199+
.expect("Should get offset");
200+
let offset_utf32 = index
201+
.line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf32)
202+
.expect("Should get offset");
203+
204+
assert_eq!(offset_utf8, ByteOffset(5));
205+
assert_eq!(offset_utf16, ByteOffset(5));
206+
assert_eq!(offset_utf32, ByteOffset(5));
207+
}
167208
}

crates/djls-source/src/protocol.rs

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
/// Protocol-specific text position handling.
2+
///
3+
/// This module provides types and functions for converting between different
4+
/// text position representations used by various protocols and editors.
5+
use crate::position::ByteOffset;
6+
/// Protocol-specific text position handling.
7+
///
8+
/// This module provides types and functions for converting between different
9+
/// text position representations used by various protocols and editors.
10+
use crate::position::LineCol;
11+
/// Protocol-specific text position handling.
12+
///
13+
/// This module provides types and functions for converting between different
14+
/// text position representations used by various protocols and editors.
15+
use crate::position::LineIndex;
16+
17+
/// Specifies how column positions are counted in text.
18+
///
19+
/// While motivated by LSP (Language Server Protocol) requirements, this enum
20+
/// represents a fundamental choice about text position measurement that any
21+
/// text processing system must make. Different systems count "column" positions
22+
/// differently:
23+
///
24+
/// - Some count bytes (fast but breaks on multi-byte characters)
25+
/// - Some count UTF-16 code units (common in JavaScript/Windows ecosystems)
26+
/// - Some count Unicode codepoints (intuitive but slower)
27+
///
28+
/// This crate provides encoding-aware position conversion to support different
29+
/// client expectations without coupling to specific protocol implementations.
30+
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
31+
pub enum PositionEncoding {
32+
/// Column positions count UTF-8 code units (bytes from line start)
33+
Utf8,
34+
/// Column positions count UTF-16 code units (common in VS Code and Windows editors)
35+
#[default]
36+
Utf16,
37+
/// Column positions count Unicode scalar values (codepoints)
38+
Utf32,
39+
}
40+
41+
impl LineIndex {
42+
/// Convert a line/column position to a byte offset with encoding awareness.
43+
///
44+
/// The `encoding` parameter specifies how the column value should be interpreted:
45+
/// - `PositionEncoding::Utf8`: column is a byte offset from line start
46+
/// - `PositionEncoding::Utf16`: column counts UTF-16 code units
47+
/// - `PositionEncoding::Utf32`: column counts Unicode codepoints
48+
///
49+
/// This method is primarily used to convert protocol-specific positions
50+
/// (which may use different column counting methods) into byte offsets
51+
/// that can be used to index into the actual UTF-8 text.
52+
///
53+
/// # Examples
54+
///
55+
/// ```
56+
/// # use djls_source::{LineIndex, LineCol, ByteOffset, PositionEncoding};
57+
/// let text = "Hello 🌍 world";
58+
/// let index = LineIndex::from_text(text);
59+
///
60+
/// // UTF-16: "Hello " (6) + "🌍" (2 UTF-16 units) = position 8
61+
/// let offset = index.line_col_to_offset(
62+
/// LineCol((0, 8)),
63+
/// text,
64+
/// PositionEncoding::Utf16
65+
/// );
66+
/// assert_eq!(offset, Some(ByteOffset(10))); // "Hello 🌍" is 10 bytes
67+
/// ```
68+
#[must_use]
69+
pub fn line_col_to_offset(
70+
&self,
71+
line_col: LineCol,
72+
text: &str,
73+
encoding: PositionEncoding,
74+
) -> Option<ByteOffset> {
75+
let line = line_col.line();
76+
let character = line_col.column();
77+
78+
// Handle line bounds - if the line is past the last line, return the document length
79+
let line_start_utf8 = match self.lines().get(line as usize) {
80+
Some(start) => *start,
81+
None => return Some(ByteOffset(u32::try_from(text.len()).unwrap_or(u32::MAX))),
82+
};
83+
84+
if character == 0 {
85+
return Some(ByteOffset(line_start_utf8));
86+
}
87+
88+
let next_line_start = self
89+
.lines()
90+
.get(line as usize + 1)
91+
.copied()
92+
.unwrap_or_else(|| u32::try_from(text.len()).unwrap_or(u32::MAX));
93+
94+
let line_text = text.get(line_start_utf8 as usize..next_line_start as usize)?;
95+
96+
// Fast path for ASCII-only lines: every encoding's column equals the byte offset
97+
if line_text.is_ascii() {
98+
let char_offset = character.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
99+
return Some(ByteOffset(line_start_utf8 + char_offset));
100+
}
101+
102+
match encoding {
103+
PositionEncoding::Utf8 => {
104+
// UTF-8: character positions are already byte offsets
105+
let char_offset = character.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
106+
Some(ByteOffset(line_start_utf8 + char_offset))
107+
}
108+
PositionEncoding::Utf16 => {
109+
// UTF-16: count UTF-16 code units
110+
let mut utf16_pos = 0;
111+
let mut utf8_pos = 0;
112+
113+
for c in line_text.chars() {
114+
if utf16_pos >= character {
115+
break;
116+
}
117+
utf16_pos += u32::try_from(c.len_utf16()).unwrap_or(0);
118+
utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
119+
}
120+
121+
// If character position exceeds line length, clamp to line end
122+
Some(ByteOffset(line_start_utf8 + utf8_pos))
123+
}
124+
PositionEncoding::Utf32 => {
125+
// UTF-32: count Unicode code points (characters)
126+
let mut utf8_pos = 0;
127+
128+
for (char_count, c) in line_text.chars().enumerate() {
129+
if char_count >= character as usize {
130+
break;
131+
}
132+
utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
133+
}
134+
135+
// If character position exceeds line length, clamp to line end
136+
Some(ByteOffset(line_start_utf8 + utf8_pos))
137+
}
138+
}
139+
}
140+
}
141+
142+
#[cfg(test)]
143+
mod tests {
144+
use super::*;
145+
146+
#[test]
147+
fn test_line_col_to_offset_utf16() {
148+
let text = "Hello 🌍 world";
149+
let index = LineIndex::from_text(text);
150+
151+
// "Hello " = 6 UTF-16 units, "🌍" = 2 UTF-16 units
152+
// So position (0, 8) in UTF-16 should be after the emoji
153+
let offset = index
154+
.line_col_to_offset(LineCol((0, 8)), text, PositionEncoding::Utf16)
155+
.expect("Should get offset");
156+
assert_eq!(offset, ByteOffset(10)); // "Hello 🌍" is 10 bytes
157+
158+
// In UTF-8, column 10 is a byte offset: just past the 4-byte emoji, at the space before "world"
159+
let offset_utf8 = index
160+
.line_col_to_offset(LineCol((0, 10)), text, PositionEncoding::Utf8)
161+
.expect("Should get offset");
162+
assert_eq!(offset_utf8, ByteOffset(10));
163+
}
164+
165+
#[test]
166+
fn test_line_col_to_offset_ascii_fast_path() {
167+
let text = "Hello world";
168+
let index = LineIndex::from_text(text);
169+
170+
// For ASCII text, all encodings should give the same result
171+
let offset_utf8 = index
172+
.line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf8)
173+
.expect("Should get offset");
174+
let offset_utf16 = index
175+
.line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf16)
176+
.expect("Should get offset");
177+
let offset_utf32 = index
178+
.line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf32)
179+
.expect("Should get offset");
180+
181+
assert_eq!(offset_utf8, ByteOffset(5));
182+
assert_eq!(offset_utf16, ByteOffset(5));
183+
assert_eq!(offset_utf32, ByteOffset(5));
184+
}
185+
}

crates/djls-workspace/src/document.rs

Lines changed: 6 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
//! and diagnostics.
77
88
use djls_source::LineIndex;
9+
use djls_source::PositionEncoding;
910
use tower_lsp_server::lsp_types::Position;
1011
use tower_lsp_server::lsp_types::Range;
1112

12-
use crate::encoding::PositionEncoding;
1313
use crate::language::LanguageId;
1414

1515
/// In-memory representation of an open document in the LSP.
@@ -139,79 +139,17 @@ impl TextDocument {
139139

140140
/// Calculate byte offset from an LSP position using the given line index and text.
141141
///
142-
/// This handles the encoding-aware conversion from LSP positions (line/character)
143-
/// to byte offsets, supporting UTF-8, UTF-16, and UTF-32 encodings.
142+
/// This delegates to the encoding-aware conversion in `djls_source`.
144143
fn calculate_offset(
145144
line_index: &LineIndex,
146145
position: Position,
147146
text: &str,
148147
encoding: PositionEncoding,
149148
) -> Option<u32> {
150-
// Handle line bounds - if line > line_count, return document length
151-
let line_start_utf8 = match line_index.lines().get(position.line as usize) {
152-
Some(start) => *start,
153-
None => return Some(u32::try_from(text.len()).unwrap_or(u32::MAX)), // Past end of document
154-
};
155-
156-
if position.character == 0 {
157-
return Some(line_start_utf8);
158-
}
159-
160-
let next_line_start = line_index
161-
.lines()
162-
.get(position.line as usize + 1)
163-
.copied()
164-
.unwrap_or_else(|| u32::try_from(text.len()).unwrap_or(u32::MAX));
165-
166-
let line_text = text.get(line_start_utf8 as usize..next_line_start as usize)?;
167-
168-
// Fast path optimization for ASCII text, all encodings are equivalent to byte offsets
169-
if line_text.is_ascii() {
170-
let char_offset = position
171-
.character
172-
.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
173-
return Some(line_start_utf8 + char_offset);
174-
}
175-
176-
match encoding {
177-
PositionEncoding::Utf8 => {
178-
// UTF-8: character positions are already byte offsets
179-
let char_offset = position
180-
.character
181-
.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
182-
Some(line_start_utf8 + char_offset)
183-
}
184-
PositionEncoding::Utf16 => {
185-
// UTF-16: count UTF-16 code units
186-
let mut utf16_pos = 0;
187-
let mut utf8_pos = 0;
188-
189-
for c in line_text.chars() {
190-
if utf16_pos >= position.character {
191-
break;
192-
}
193-
utf16_pos += u32::try_from(c.len_utf16()).unwrap_or(0);
194-
utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
195-
}
196-
197-
// If character position exceeds line length, clamp to line end
198-
Some(line_start_utf8 + utf8_pos)
199-
}
200-
PositionEncoding::Utf32 => {
201-
// UTF-32: count Unicode code points (characters)
202-
let mut utf8_pos = 0;
203-
204-
for (char_count, c) in line_text.chars().enumerate() {
205-
if char_count >= position.character as usize {
206-
break;
207-
}
208-
utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
209-
}
210-
211-
// If character position exceeds line length, clamp to line end
212-
Some(line_start_utf8 + utf8_pos)
213-
}
214-
}
149+
let line_col = djls_source::LineCol((position.line, position.character));
150+
line_index
151+
.line_col_to_offset(line_col, text, encoding)
152+
.map(|djls_source::ByteOffset(offset)| offset)
215153
}
216154
}
217155

0 commit comments

Comments
 (0)