Frontend now sends UTF-8 byte offsets

lionel- · lionel- · commit a41fec23ce4a · 2026-01-16T08:20:51.000+01:00
diff --git a/crates/amalthea/src/wire/execute_request.rs b/crates/amalthea/src/wire/execute_request.rs
@@ -58,12 +58,10 @@ pub struct JupyterPositronRange {
     pub end: JupyterPositronPosition,
 }
 
-/// See https://jupyter-client.readthedocs.io/en/stable/messaging.html#cursor-pos-unicode-note
-/// regarding choice of offset in unicode points
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct JupyterPositronPosition {
     pub line: u32,
-    /// Column offset in unicode points
+    /// Column offset in UTF-8 bytes
     pub character: u32,
 }
 
@@ -87,132 +85,69 @@ impl ExecuteRequest {
         let Some(positron) = &self.positron else {
             return Ok(None);
         };
-
         let Some(location) = &positron.code_location else {
             return Ok(None);
         };
 
         let uri = Url::parse(&location.uri).context("Failed to parse URI from code location")?;
+        let range = &location.range;
 
-        // Validate that range is not inverted (end must be >= start)
-        if location.range.end.line < location.range.start.line {
-            return Err(anyhow::anyhow!(
-                "Invalid range: end line ({}) is before start line ({})",
-                location.range.end.line,
-                location.range.start.line
-            ));
-        }
-        if location.range.end.line == location.range.start.line &&
-            location.range.end.character < location.range.start.character
+        // Validate that range is not inverted
+        if range.end.line < range.start.line ||
+            (range.end.line == range.start.line && range.end.character < range.start.character)
         {
             return Err(anyhow::anyhow!(
-                "Invalid range: end character ({}) is before start character ({})",
-                location.range.end.character,
-                location.range.start.character
+                "Invalid range: end ({}, {}) is before start ({}, {})",
+                range.end.line,
+                range.end.character,
+                range.start.line,
+                range.start.character
             ));
         }
 
-        // The location maps `self.code` to a range in the document. We'll first
-        // do a sanity check that the span dimensions (end - start) match the
-        // code extents.
-        let span_lines = location.range.end.line - location.range.start.line;
-
-        // For multiline code, the last line's expected length is just `end.character`.
-        // For single-line code, the expected length is `end.character - start.character`.
-        let expected_last_line_chars = if span_lines == 0 {
-            location.range.end.character - location.range.start.character
-        } else {
-            location.range.end.character
-        };
-
-        // Count newlines directly rather than with `lines()`. This correctly
-        // handles trailing newlines as distinct:
-        // - "a\nb\nc" has 2 newlines, spans lines 0-2 (span_lines = 2)
-        // - "a\nb\nc\n" has 3 newlines, spans lines 0-3 (span_lines = 3)
-        let code_line_count = self.code.matches('\n').count();
+        // Validate that the span dimensions match the code extents
+        let span_lines = (range.end.line - range.start.line) as usize;
+        let code_newlines = self.code.matches('\n').count();
 
-        // Sanity check: `code` conforms exactly to expected number of lines in the span
-        if code_line_count != span_lines as usize {
+        if code_newlines != span_lines {
             return Err(anyhow::anyhow!(
-                "Line count mismatch: location spans {} lines, but code has {} newlines",
-                span_lines,
-                code_line_count
+                "Line count mismatch: location spans {span_lines} lines, but code has {code_newlines} newlines"
             ));
         }
 
-        // Get the last line for character count validation.
-        // If code ends with newline, the last "line" in terms of the span is empty.
+        // Validate last line byte length
         let last_line = if self.code.ends_with('\n') {
             ""
         } else {
             self.code.lines().last().unwrap_or("")
         };
-
         let last_line = last_line.strip_suffix('\r').unwrap_or(last_line);
-        let last_line_chars = last_line.chars().count() as u32;
-
-        // Sanity check: the last line has exactly the expected number of characters
-        if last_line_chars != expected_last_line_chars {
-            return Err(anyhow::anyhow!(
-                "Expected last line to have {expected} characters, got {actual}",
-                expected = expected_last_line_chars,
-                actual = last_line_chars
-            ));
-        }
-
-        // Convert start character from unicode code points to UTF-8 bytes
-        let character_start =
-            unicode_char_to_utf8_offset(&self.code, 0, location.range.start.character)?;
 
-        // End character is start + last line byte length (for single line)
-        // or just last line byte length (for multiline, since it's on a new line)
-        let last_line_bytes = last_line.len();
-        let character_end = if span_lines == 0 {
-            character_start + last_line_bytes
+        let expected_bytes = if span_lines == 0 {
+            range.end.character - range.start.character
         } else {
-            last_line_bytes
+            range.end.character
         };
 
-        let start = Position {
-            line: location.range.start.line,
-            character: character_start as u32,
-        };
-        let end = Position {
-            line: location.range.end.line,
-            character: character_end as u32,
-        };
-
-        Ok(Some(CodeLocation { uri, start, end }))
-    }
-}
-
-/// Converts a character position in unicode scalar values to a UTF-8 byte
-/// offset within the specified line.
-fn unicode_char_to_utf8_offset(text: &str, line: u32, character: u32) -> anyhow::Result<usize> {
-    let target_line = text
-        .lines()
-        .nth(line as usize)
-        .ok_or_else(|| anyhow::anyhow!("Line {line} not found in text"))?;
-
-    unicode_char_to_utf8_offset_in_line(target_line, character)
-}
+        if last_line.len() as u32 != expected_bytes {
+            return Err(anyhow::anyhow!(
+                "Expected last line to have {expected_bytes} bytes, got {}",
+                last_line.len()
+            ));
+        }
 
-/// Converts a character count in unicode scalar values to a UTF-8 byte count.
-fn unicode_char_to_utf8_offset_in_line(line: &str, character: u32) -> anyhow::Result<usize> {
-    let line_chars = line.chars().count();
-    if character as usize > line_chars {
-        return Err(anyhow::anyhow!(
-            "Character position {character} exceeds line length ({line_chars})"
-        ));
+        Ok(Some(CodeLocation {
+            uri,
+            start: Position {
+                line: range.start.line,
+                character: range.start.character,
+            },
+            end: Position {
+                line: range.end.line,
+                character: range.end.character,
+            },
+        }))
     }
-
-    let byte_offset = line
-        .char_indices()
-        .nth(character as usize)
-        .map(|(byte_idx, _)| byte_idx)
-        .unwrap_or(line.len());
-
-    Ok(byte_offset)
 }
 
 impl MessageType for ExecuteRequest {
diff --git a/crates/ark/tests/kernel-srcref.rs b/crates/ark/tests/kernel-srcref.rs
@@ -204,7 +204,8 @@ $end
 fn test_execute_request_srcref_location_with_emoji_utf8_shift() {
     let frontend = DummyArkFrontend::lock();
 
-    // Starting at line 3, column 3 (these input positions are in Unicode code points)
+    // Starting at line 3, column 3 (these input positions are in UTF-8 bytes).
+    // The code is 29 UTF-8 bytes (the emoji 🙂 is 4 bytes), so end.character = 3 + 29 = 32.
     let code_location = JupyterPositronLocation {
         uri: "file:///path/to/file.R".to_owned(),
         range: JupyterPositronRange {
@@ -214,19 +215,17 @@ fn test_execute_request_srcref_location_with_emoji_utf8_shift() {
             },
             end: JupyterPositronPosition {
                 line: 2,
-                character: 26 + 3,
+                character: 29 + 3,
             },
         },
     };
 
-    // The function body contains a single emoji character. The input character positions above
-    // are specified as Unicode code points. The srcref we receive reports UTF-8 byte positions,
-    // so the presence of the multibyte emoji shifts the end position by the emoji's extra bytes.
+    // The function body contains a single emoji character which is 4 UTF-8 bytes.
     frontend.execute_request_with_location("fn <- function() \"🙂\"; NULL", |_| (), code_location);
 
-    // `function` starts at column 7 (code point counting), so with a start.character of 3 we get 10.
-    // The function body `"🙂"` ends at UTF-8 byte 20 locally (the closing quote), so with
-    // start.character=3 the reported end becomes 23.
+    // `function` starts at byte 7 in the code, so with start.character=3 we get 10.
+    // The function body `"🙂"` ends at byte 23 locally (the closing quote after the 4-byte emoji),
+    // so with start.character=3 the reported end becomes 23.
     frontend.execute_request(".ps.internal(get_srcref_range(fn))", |result| {
         assert_eq!(
             result,