wip

joshuadavidthomas · joshuadavidthomas · commit e4d773ea9f00 · 2025-09-16T21:19:13.000-05:00
diff --git a/crates/djls-server/src/server.rs b/crates/djls-server/src/server.rs
@@ -157,7 +157,7 @@ impl LanguageServer for DjangoLanguageServer {
                         save: Some(lsp_types::SaveOptions::default().into()),
                     },
                 )),
-                position_encoding: Some(lsp_types::PositionEncodingKind::from(encoding)),
+                position_encoding: Some(djls_workspace::position_encoding_to_lsp(encoding)),
                 diagnostic_provider: Some(lsp_types::DiagnosticServerCapabilities::Options(
                     lsp_types::DiagnosticOptions {
                         identifier: None,
@@ -172,7 +172,11 @@ impl LanguageServer for DjangoLanguageServer {
                 name: SERVER_NAME.to_string(),
                 version: Some(SERVER_VERSION.to_string()),
             }),
-            offset_encoding: Some(encoding.to_string()),
+            offset_encoding: Some(match encoding {
+                djls_workspace::PositionEncoding::Utf8 => "utf-8".to_string(),
+                djls_workspace::PositionEncoding::Utf16 => "utf-16".to_string(),
+                djls_workspace::PositionEncoding::Utf32 => "utf-32".to_string(),
+            }),
         })
     }
 
diff --git a/crates/djls-server/src/session.rs b/crates/djls-server/src/session.rs
@@ -77,7 +77,7 @@ impl Session {
             settings,
             workspace,
             client_capabilities: params.capabilities.clone(),
-            position_encoding: PositionEncoding::negotiate(params),
+            position_encoding: djls_workspace::negotiate_position_encoding(params),
             db,
         }
     }
diff --git a/crates/djls-source/src/lib.rs b/crates/djls-source/src/lib.rs
@@ -1,6 +1,7 @@
 mod db;
 mod file;
 mod position;
+mod protocol;
 
 pub use db::Db;
 pub use file::File;
@@ -9,3 +10,4 @@ pub use position::ByteOffset;
 pub use position::LineCol;
 pub use position::LineIndex;
 pub use position::Span;
+pub use protocol::PositionEncoding;
diff --git a/crates/djls-source/src/position.rs b/crates/djls-source/src/position.rs
@@ -120,6 +120,7 @@ impl LineIndex {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::protocol::PositionEncoding;
 
     #[test]
     fn test_line_index_unix_endings() {
@@ -164,4 +165,44 @@ mod tests {
         assert_eq!(index.to_line_col(ByteOffset(7)), LineCol((1, 0)));
         assert_eq!(index.to_line_col(ByteOffset(8)), LineCol((1, 1)));
     }
+
+    #[test]
+    fn test_line_col_to_offset_utf16() {
+        let text = "Hello 🌍 world";
+        let index = LineIndex::from_text(text);
+
+        // "Hello " = 6 UTF-16 units, "🌍" = 2 UTF-16 units
+        // So position (0, 8) in UTF-16 should be after the emoji
+        let offset = index
+            .line_col_to_offset(LineCol((0, 8)), text, PositionEncoding::Utf16)
+            .expect("Should get offset");
+        assert_eq!(offset, ByteOffset(10)); // "Hello 🌍" is 10 bytes
+
+        // In UTF-8, column 10 is a byte offset: just past the 4-byte emoji (6 + 4)
+        let offset_utf8 = index
+            .line_col_to_offset(LineCol((0, 10)), text, PositionEncoding::Utf8)
+            .expect("Should get offset");
+        assert_eq!(offset_utf8, ByteOffset(10));
+    }
+
+    #[test]
+    fn test_line_col_to_offset_ascii_fast_path() {
+        let text = "Hello world";
+        let index = LineIndex::from_text(text);
+
+        // For ASCII text, all encodings should give the same result
+        let offset_utf8 = index
+            .line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf8)
+            .expect("Should get offset");
+        let offset_utf16 = index
+            .line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf16)
+            .expect("Should get offset");
+        let offset_utf32 = index
+            .line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf32)
+            .expect("Should get offset");
+
+        assert_eq!(offset_utf8, ByteOffset(5));
+        assert_eq!(offset_utf16, ByteOffset(5));
+        assert_eq!(offset_utf32, ByteOffset(5));
+    }
 }
diff --git a/crates/djls-source/src/protocol.rs b/crates/djls-source/src/protocol.rs
@@ -0,0 +1,178 @@
+//! Protocol-specific text position handling.
+//!
+//! This module provides types and functions for converting between different
+//! text position representations used by various protocols and editors.
+
+use crate::position::ByteOffset;
+use crate::position::LineCol;
+use crate::position::LineIndex;
+
+/// Specifies how column positions are counted in text.
+///
+/// While motivated by LSP (Language Server Protocol) requirements, this enum
+/// represents a fundamental choice about text position measurement that any
+/// text processing system must make. Different systems count "column" positions
+/// differently:
+///
+/// - Some count bytes (fast but breaks on multi-byte characters)
+/// - Some count UTF-16 code units (common in JavaScript/Windows ecosystems)
+/// - Some count Unicode codepoints (intuitive but slower)
+///
+/// This crate provides encoding-aware position conversion to support different
+/// client expectations without coupling to specific protocol implementations.
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+pub enum PositionEncoding {
+    /// Column positions count UTF-8 code units (bytes from line start)
+    Utf8,
+    /// Column positions count UTF-16 code units (common in VS Code and Windows editors)
+    #[default]
+    Utf16,
+    /// Column positions count Unicode scalar values (codepoints)
+    Utf32,
+}
+
+impl LineIndex {
+    /// Convert a line/column position to a byte offset with encoding awareness.
+    ///
+    /// The `encoding` parameter specifies how the column value should be interpreted:
+    /// - `PositionEncoding::Utf8`: column is a byte offset from line start
+    /// - `PositionEncoding::Utf16`: column counts UTF-16 code units
+    /// - `PositionEncoding::Utf32`: column counts Unicode codepoints
+    ///
+    /// This method is primarily used to convert protocol-specific positions
+    /// (which may use different column counting methods) into byte offsets
+    /// that can be used to index into the actual UTF-8 text.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use djls_source::{LineIndex, LineCol, ByteOffset, PositionEncoding};
+    /// let text = "Hello 🌍 world";
+    /// let index = LineIndex::from_text(text);
+    ///
+    /// // UTF-16: "Hello " (6) + "🌍" (2 UTF-16 units) = position 8
+    /// let offset = index.line_col_to_offset(
+    ///     LineCol((0, 8)),
+    ///     text,
+    ///     PositionEncoding::Utf16
+    /// );
+    /// assert_eq!(offset, Some(ByteOffset(10))); // "Hello 🌍" is 10 bytes
+    /// ```
+    #[must_use]
+    pub fn line_col_to_offset(
+        &self,
+        line_col: LineCol,
+        text: &str,
+        encoding: PositionEncoding,
+    ) -> Option<ByteOffset> {
+        let line = line_col.line();
+        let character = line_col.column();
+
+        // Handle line bounds - if line >= line count, return document length
+        let line_start_utf8 = match self.lines().get(line as usize) {
+            Some(start) => *start,
+            None => return Some(ByteOffset(u32::try_from(text.len()).unwrap_or(u32::MAX))),
+        };
+
+        if character == 0 {
+            return Some(ByteOffset(line_start_utf8));
+        }
+
+        let next_line_start = self
+            .lines()
+            .get(line as usize + 1)
+            .copied()
+            .unwrap_or_else(|| u32::try_from(text.len()).unwrap_or(u32::MAX));
+
+        let line_text = text.get(line_start_utf8 as usize..next_line_start as usize)?;
+
+        // Fast path for ASCII lines: in every encoding the column equals the byte offset
+        if line_text.is_ascii() {
+            let char_offset = character.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
+            return Some(ByteOffset(line_start_utf8 + char_offset));
+        }
+
+        match encoding {
+            PositionEncoding::Utf8 => {
+                // UTF-8: character positions are already byte offsets
+                let char_offset = character.min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
+                Some(ByteOffset(line_start_utf8 + char_offset))
+            }
+            PositionEncoding::Utf16 => {
+                // UTF-16: count UTF-16 code units
+                let mut utf16_pos = 0;
+                let mut utf8_pos = 0;
+
+                for c in line_text.chars() {
+                    if utf16_pos >= character {
+                        break;
+                    }
+                    utf16_pos += u32::try_from(c.len_utf16()).unwrap_or(0);
+                    utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
+                }
+
+                // If character position exceeds line length, clamp to line end
+                Some(ByteOffset(line_start_utf8 + utf8_pos))
+            }
+            PositionEncoding::Utf32 => {
+                // UTF-32: count Unicode code points (characters)
+                let mut utf8_pos = 0;
+
+                for (char_count, c) in line_text.chars().enumerate() {
+                    if char_count >= character as usize {
+                        break;
+                    }
+                    utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
+                }
+
+                // If character position exceeds line length, clamp to line end
+                Some(ByteOffset(line_start_utf8 + utf8_pos))
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_line_col_to_offset_utf16() {
+        let text = "Hello 🌍 world";
+        let index = LineIndex::from_text(text);
+
+        // "Hello " = 6 UTF-16 units, "🌍" = 2 UTF-16 units
+        // So position (0, 8) in UTF-16 should be after the emoji
+        let offset = index
+            .line_col_to_offset(LineCol((0, 8)), text, PositionEncoding::Utf16)
+            .expect("Should get offset");
+        assert_eq!(offset, ByteOffset(10)); // "Hello 🌍" is 10 bytes
+
+        // In UTF-8, column 10 is a byte offset: just past the 4-byte emoji (6 + 4)
+        let offset_utf8 = index
+            .line_col_to_offset(LineCol((0, 10)), text, PositionEncoding::Utf8)
+            .expect("Should get offset");
+        assert_eq!(offset_utf8, ByteOffset(10));
+    }
+
+    #[test]
+    fn test_line_col_to_offset_ascii_fast_path() {
+        let text = "Hello world";
+        let index = LineIndex::from_text(text);
+
+        // For ASCII text, all encodings should give the same result
+        let offset_utf8 = index
+            .line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf8)
+            .expect("Should get offset");
+        let offset_utf16 = index
+            .line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf16)
+            .expect("Should get offset");
+        let offset_utf32 = index
+            .line_col_to_offset(LineCol((0, 5)), text, PositionEncoding::Utf32)
+            .expect("Should get offset");
+
+        assert_eq!(offset_utf8, ByteOffset(5));
+        assert_eq!(offset_utf16, ByteOffset(5));
+        assert_eq!(offset_utf32, ByteOffset(5));
+    }
+}
diff --git a/crates/djls-workspace/src/document.rs b/crates/djls-workspace/src/document.rs
@@ -6,10 +6,10 @@
 //! and diagnostics.
 
 use djls_source::LineIndex;
+use djls_source::PositionEncoding;
 use tower_lsp_server::lsp_types::Position;
 use tower_lsp_server::lsp_types::Range;
 
-use crate::encoding::PositionEncoding;
 use crate::language::LanguageId;
 
 /// In-memory representation of an open document in the LSP.
@@ -139,79 +139,17 @@ impl TextDocument {
 
     /// Calculate byte offset from an LSP position using the given line index and text.
     ///
-    /// This handles the encoding-aware conversion from LSP positions (line/character)
-    /// to byte offsets, supporting UTF-8, UTF-16, and UTF-32 encodings.
+    /// This delegates to the encoding-aware conversion in `djls_source`.
     fn calculate_offset(
         line_index: &LineIndex,
         position: Position,
         text: &str,
         encoding: PositionEncoding,
     ) -> Option<u32> {
-        // Handle line bounds - if line > line_count, return document length
-        let line_start_utf8 = match line_index.lines().get(position.line as usize) {
-            Some(start) => *start,
-            None => return Some(u32::try_from(text.len()).unwrap_or(u32::MAX)), // Past end of document
-        };
-
-        if position.character == 0 {
-            return Some(line_start_utf8);
-        }
-
-        let next_line_start = line_index
-            .lines()
-            .get(position.line as usize + 1)
-            .copied()
-            .unwrap_or_else(|| u32::try_from(text.len()).unwrap_or(u32::MAX));
-
-        let line_text = text.get(line_start_utf8 as usize..next_line_start as usize)?;
-
-        // Fast path optimization for ASCII text, all encodings are equivalent to byte offsets
-        if line_text.is_ascii() {
-            let char_offset = position
-                .character
-                .min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
-            return Some(line_start_utf8 + char_offset);
-        }
-
-        match encoding {
-            PositionEncoding::Utf8 => {
-                // UTF-8: character positions are already byte offsets
-                let char_offset = position
-                    .character
-                    .min(u32::try_from(line_text.len()).unwrap_or(u32::MAX));
-                Some(line_start_utf8 + char_offset)
-            }
-            PositionEncoding::Utf16 => {
-                // UTF-16: count UTF-16 code units
-                let mut utf16_pos = 0;
-                let mut utf8_pos = 0;
-
-                for c in line_text.chars() {
-                    if utf16_pos >= position.character {
-                        break;
-                    }
-                    utf16_pos += u32::try_from(c.len_utf16()).unwrap_or(0);
-                    utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
-                }
-
-                // If character position exceeds line length, clamp to line end
-                Some(line_start_utf8 + utf8_pos)
-            }
-            PositionEncoding::Utf32 => {
-                // UTF-32: count Unicode code points (characters)
-                let mut utf8_pos = 0;
-
-                for (char_count, c) in line_text.chars().enumerate() {
-                    if char_count >= position.character as usize {
-                        break;
-                    }
-                    utf8_pos += u32::try_from(c.len_utf8()).unwrap_or(0);
-                }
-
-                // If character position exceeds line length, clamp to line end
-                Some(line_start_utf8 + utf8_pos)
-            }
-        }
+        let line_col = djls_source::LineCol((position.line, position.character));
+        line_index
+            .line_col_to_offset(line_col, text, encoding)
+            .map(|djls_source::ByteOffset(offset)| offset)
     }
 }
 
diff --git a/crates/djls-workspace/src/encoding.rs b/crates/djls-workspace/src/encoding.rs
diff --git a/crates/djls-workspace/src/lib.rs b/crates/djls-workspace/src/lib.rs
diff --git a/crates/djls-workspace/src/workspace.rs b/crates/djls-workspace/src/workspace.rs

Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,7 @@ impl Session {`
`77`	`77`	`settings,`
`78`	`78`	`workspace,`
`79`	`79`	`client_capabilities: params.capabilities.clone(),`
`80`		`- position_encoding: PositionEncoding::negotiate(params),`
	`80`	`+ position_encoding: djls_workspace::negotiate_position_encoding(params),`
`81`	`81`	`db,`
`82`	`82`	`}`
`83`	`83`	`}`