Skip to content

Commit 6c77d91

Browse files
clean up position conversion methods (#238)
1 parent 141bd34 commit 6c77d91

File tree

8 files changed

+319
-267
lines changed

8 files changed

+319
-267
lines changed

crates/djls-server/src/server.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ impl LanguageServer for DjangoLanguageServer {
157157
save: Some(lsp_types::SaveOptions::default().into()),
158158
},
159159
)),
160-
position_encoding: Some(lsp_types::PositionEncodingKind::from(encoding)),
160+
position_encoding: Some(djls_workspace::position_encoding_to_lsp(encoding)),
161161
diagnostic_provider: Some(lsp_types::DiagnosticServerCapabilities::Options(
162162
lsp_types::DiagnosticOptions {
163163
identifier: None,

crates/djls-server/src/session.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ impl Session {
7777
settings,
7878
workspace,
7979
client_capabilities: params.capabilities.clone(),
80-
position_encoding: PositionEncoding::negotiate(params),
80+
position_encoding: djls_workspace::negotiate_position_encoding(params),
8181
db,
8282
}
8383
}

crates/djls-source/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod db;
22
mod file;
33
mod position;
4+
mod protocol;
45

56
pub use db::Db;
67
pub use file::File;
@@ -9,3 +10,4 @@ pub use position::ByteOffset;
910
pub use position::LineCol;
1011
pub use position::LineIndex;
1112
pub use position::Span;
13+
pub use protocol::PositionEncoding;

crates/djls-source/src/protocol.rs

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
use std::fmt;
2+
3+
use crate::position::ByteOffset;
4+
use crate::position::LineCol;
5+
use crate::position::LineIndex;
6+
7+
/// The unit in which a "column" is measured within a line of text.
///
/// The choice is motivated by the Language Server Protocol, where client and
/// server agree on one encoding, but it is a general property of any system
/// that reports text positions. The same column number can mean:
///
/// - a number of bytes (cheap to compute, but can land inside a multi-byte
///   character),
/// - a number of UTF-16 code units (the convention in JavaScript and Windows
///   ecosystems), or
/// - a number of Unicode code points (intuitive, but requires walking the
///   text).
///
/// Keeping this as a plain enum lets position conversion stay encoding-aware
/// without tying this crate to a particular protocol implementation.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum PositionEncoding {
    /// Columns are UTF-8 code units, i.e. bytes counted from the line start.
    Utf8,
    /// Columns are UTF-16 code units (common in VS Code and Windows editors).
    ///
    /// This is the variant produced by [`Default`].
    #[default]
    Utf16,
    /// Columns are Unicode scalar values (code points).
    Utf32,
}
30+
31+
impl fmt::Display for PositionEncoding {
32+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
33+
match self {
34+
Self::Utf8 => write!(f, "utf-8"),
35+
Self::Utf16 => write!(f, "utf-16"),
36+
Self::Utf32 => write!(f, "utf-32"),
37+
}
38+
}
39+
}
40+
41+
impl PositionEncoding {
42+
/// Convert a line/column position to a byte offset with encoding awareness.
43+
///
44+
/// The encoding specifies how the column value should be interpreted:
45+
/// - `PositionEncoding::Utf8`: column is a byte offset from line start
46+
/// - `PositionEncoding::Utf16`: column counts UTF-16 code units
47+
/// - `PositionEncoding::Utf32`: column counts Unicode codepoints
48+
///
49+
/// This method is primarily used to convert protocol-specific positions
50+
/// (which may use different column counting methods) into byte offsets
51+
/// that can be used to index into the actual UTF-8 text.
52+
///
53+
/// # Examples
54+
///
55+
/// ```
56+
/// # use djls_source::{LineIndex, LineCol, ByteOffset, PositionEncoding};
57+
/// let text = "Hello 🌍 world";
58+
/// let index = LineIndex::from_text(text);
59+
///
60+
/// // UTF-16: "Hello " (6) + "🌍" (2 UTF-16 units) = position 8
61+
/// let offset = PositionEncoding::Utf16.line_col_to_offset(
62+
/// &index,
63+
/// LineCol((0, 8)),
64+
/// text
65+
/// );
66+
/// assert_eq!(offset, Some(ByteOffset(10))); // "Hello 🌍" is 10 bytes
67+
/// ```
68+
#[must_use]
69+
pub fn line_col_to_offset(
70+
&self,
71+
index: &LineIndex,
72+
line_col: LineCol,
73+
text: &str,
74+
) -> Option<ByteOffset> {
75+
let line = line_col.line();
76+
let character = line_col.column();
77+
78+
// Handle line bounds - if line > line_count, return document length
79+
let line_start_utf8 = match index.lines().get(line as usize) {
80+
Some(start) => *start,
81+
None => return Some(ByteOffset(u32::try_from(text.len()).unwrap_or(u32::MAX))),
82+
};
83+
84+
if character == 0 {
85+
return Some(ByteOffset(line_start_utf8));
86+
}
87+
88+
let next_line_start = index
89+
.lines()
90+
.get(line as usize + 1)
91+
.copied()
92+
.unwrap_or_else(|| u32::try_from(text.len()).unwrap_or(u32::MAX));
93+
94+
let line_text = text.get(line_start_utf8 as usize..next_line_start as usize)?;
95+
96+
// Fast path optimization for ASCII text, all encodings are equivalent to byte offsets
97+
if line_text.is_ascii() {
98+
let char_offset = character.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
99+
return Some(ByteOffset(line_start_utf8 + char_offset));
100+
}
101+
102+
match self {
103+
PositionEncoding::Utf8 => {
104+
// UTF-8: character positions are already byte offsets
105+
let char_offset = character.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
106+
Some(ByteOffset(line_start_utf8 + char_offset))
107+
}
108+
PositionEncoding::Utf16 => {
109+
// UTF-16: count UTF-16 code units
110+
let mut utf16_pos = 0;
111+
let mut utf8_pos = 0;
112+
113+
for c in line_text.chars() {
114+
if utf16_pos >= character {
115+
break;
116+
}
117+
utf16_pos += u32::try_from(c.len_utf16()).unwrap_or(0);
118+
utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
119+
}
120+
121+
// If character position exceeds line length, clamp to line end
122+
Some(ByteOffset(line_start_utf8 + utf8_pos))
123+
}
124+
PositionEncoding::Utf32 => {
125+
// UTF-32: count Unicode code points (characters)
126+
let mut utf8_pos = 0;
127+
128+
for (char_count, c) in line_text.chars().enumerate() {
129+
if char_count >= character as usize {
130+
break;
131+
}
132+
utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
133+
}
134+
135+
// If character position exceeds line length, clamp to line end
136+
Some(ByteOffset(line_start_utf8 + utf8_pos))
137+
}
138+
}
139+
}
140+
}
141+
142+
#[cfg(test)]
143+
mod tests {
144+
use super::*;
145+
146+
#[test]
147+
fn test_position_encoding_display() {
148+
assert_eq!(PositionEncoding::Utf8.to_string(), "utf-8");
149+
assert_eq!(PositionEncoding::Utf16.to_string(), "utf-16");
150+
assert_eq!(PositionEncoding::Utf32.to_string(), "utf-32");
151+
}
152+
153+
#[test]
154+
fn test_line_col_to_offset_utf16() {
155+
let text = "Hello 🌍 world";
156+
let index = LineIndex::from_text(text);
157+
158+
// "Hello " = 6 UTF-16 units, "🌍" = 2 UTF-16 units
159+
// So position (0, 8) in UTF-16 should be after the emoji
160+
let offset = PositionEncoding::Utf16
161+
.line_col_to_offset(&index, LineCol((0, 8)), text)
162+
.expect("Should get offset");
163+
assert_eq!(offset, ByteOffset(10)); // "Hello 🌍" is 10 bytes
164+
165+
// In UTF-8, character 10 would be at the 'r' in 'world'
166+
let offset_utf8 = PositionEncoding::Utf8
167+
.line_col_to_offset(&index, LineCol((0, 10)), text)
168+
.expect("Should get offset");
169+
assert_eq!(offset_utf8, ByteOffset(10));
170+
}
171+
172+
#[test]
173+
fn test_line_col_to_offset_ascii_fast_path() {
174+
let text = "Hello world";
175+
let index = LineIndex::from_text(text);
176+
177+
// For ASCII text, all encodings should give the same result
178+
let offset_utf8 = PositionEncoding::Utf8
179+
.line_col_to_offset(&index, LineCol((0, 5)), text)
180+
.expect("Should get offset");
181+
let offset_utf16 = PositionEncoding::Utf16
182+
.line_col_to_offset(&index, LineCol((0, 5)), text)
183+
.expect("Should get offset");
184+
let offset_utf32 = PositionEncoding::Utf32
185+
.line_col_to_offset(&index, LineCol((0, 5)), text)
186+
.expect("Should get offset");
187+
188+
assert_eq!(offset_utf8, ByteOffset(5));
189+
assert_eq!(offset_utf16, ByteOffset(5));
190+
assert_eq!(offset_utf32, ByteOffset(5));
191+
}
192+
}

0 commit comments

Comments
 (0)