Skip to content

Commit a41fec2

Browse files
committed
Frontend now sends UTF-8 byte offsets
1 parent 3ed98d1 commit a41fec2

File tree

2 files changed

+43
-109
lines changed

2 files changed

+43
-109
lines changed

crates/amalthea/src/wire/execute_request.rs

Lines changed: 36 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,10 @@ pub struct JupyterPositronRange {
5858
pub end: JupyterPositronPosition,
5959
}
6060

61-
/// See https://jupyter-client.readthedocs.io/en/stable/messaging.html#cursor-pos-unicode-note
62-
/// regarding choice of offset in unicode points
6361
#[derive(Debug, Serialize, Deserialize, Clone)]
6462
pub struct JupyterPositronPosition {
6563
pub line: u32,
66-
/// Column offset in unicode points
64+
/// Column offset in UTF-8 bytes
6765
pub character: u32,
6866
}
6967

@@ -87,132 +85,69 @@ impl ExecuteRequest {
8785
let Some(positron) = &self.positron else {
8886
return Ok(None);
8987
};
90-
9188
let Some(location) = &positron.code_location else {
9289
return Ok(None);
9390
};
9491

9592
let uri = Url::parse(&location.uri).context("Failed to parse URI from code location")?;
93+
let range = &location.range;
9694

97-
// Validate that range is not inverted (end must be >= start)
98-
if location.range.end.line < location.range.start.line {
99-
return Err(anyhow::anyhow!(
100-
"Invalid range: end line ({}) is before start line ({})",
101-
location.range.end.line,
102-
location.range.start.line
103-
));
104-
}
105-
if location.range.end.line == location.range.start.line &&
106-
location.range.end.character < location.range.start.character
95+
// Validate that range is not inverted
96+
if range.end.line < range.start.line ||
97+
(range.end.line == range.start.line && range.end.character < range.start.character)
10798
{
10899
return Err(anyhow::anyhow!(
109-
"Invalid range: end character ({}) is before start character ({})",
110-
location.range.end.character,
111-
location.range.start.character
100+
"Invalid range: end ({}, {}) is before start ({}, {})",
101+
range.end.line,
102+
range.end.character,
103+
range.start.line,
104+
range.start.character
112105
));
113106
}
114107

115-
// The location maps `self.code` to a range in the document. We'll first
116-
// do a sanity check that the span dimensions (end - start) match the
117-
// code extents.
118-
let span_lines = location.range.end.line - location.range.start.line;
119-
120-
// For multiline code, the last line's expected length is just `end.character`.
121-
// For single-line code, the expected length is `end.character - start.character`.
122-
let expected_last_line_chars = if span_lines == 0 {
123-
location.range.end.character - location.range.start.character
124-
} else {
125-
location.range.end.character
126-
};
127-
128-
// Count newlines directly rather than with `lines()`. This correctly
129-
// handles trailing newlines as distinct:
130-
// - "a\nb\nc" has 2 newlines, spans lines 0-2 (span_lines = 2)
131-
// - "a\nb\nc\n" has 3 newlines, spans lines 0-3 (span_lines = 3)
132-
let code_line_count = self.code.matches('\n').count();
108+
// Validate that the span dimensions match the code extents
109+
let span_lines = (range.end.line - range.start.line) as usize;
110+
let code_newlines = self.code.matches('\n').count();
133111

134-
// Sanity check: `code` conforms exactly to expected number of lines in the span
135-
if code_line_count != span_lines as usize {
112+
if code_newlines != span_lines {
136113
return Err(anyhow::anyhow!(
137-
"Line count mismatch: location spans {} lines, but code has {} newlines",
138-
span_lines,
139-
code_line_count
114+
"Line count mismatch: location spans {span_lines} lines, but code has {code_newlines} newlines"
140115
));
141116
}
142117

143-
// Get the last line for character count validation.
144-
// If code ends with newline, the last "line" in terms of the span is empty.
118+
// Validate last line byte length
145119
let last_line = if self.code.ends_with('\n') {
146120
""
147121
} else {
148122
self.code.lines().last().unwrap_or("")
149123
};
150-
151124
let last_line = last_line.strip_suffix('\r').unwrap_or(last_line);
152-
let last_line_chars = last_line.chars().count() as u32;
153-
154-
// Sanity check: the last line has exactly the expected number of characters
155-
if last_line_chars != expected_last_line_chars {
156-
return Err(anyhow::anyhow!(
157-
"Expected last line to have {expected} characters, got {actual}",
158-
expected = expected_last_line_chars,
159-
actual = last_line_chars
160-
));
161-
}
162-
163-
// Convert start character from unicode code points to UTF-8 bytes
164-
let character_start =
165-
unicode_char_to_utf8_offset(&self.code, 0, location.range.start.character)?;
166125

167-
// End character is start + last line byte length (for single line)
168-
// or just last line byte length (for multiline, since it's on a new line)
169-
let last_line_bytes = last_line.len();
170-
let character_end = if span_lines == 0 {
171-
character_start + last_line_bytes
126+
let expected_bytes = if span_lines == 0 {
127+
range.end.character - range.start.character
172128
} else {
173-
last_line_bytes
129+
range.end.character
174130
};
175131

176-
let start = Position {
177-
line: location.range.start.line,
178-
character: character_start as u32,
179-
};
180-
let end = Position {
181-
line: location.range.end.line,
182-
character: character_end as u32,
183-
};
184-
185-
Ok(Some(CodeLocation { uri, start, end }))
186-
}
187-
}
188-
189-
/// Converts a character position in unicode scalar values to a UTF-8 byte
190-
/// offset within the specified line.
191-
fn unicode_char_to_utf8_offset(text: &str, line: u32, character: u32) -> anyhow::Result<usize> {
192-
let target_line = text
193-
.lines()
194-
.nth(line as usize)
195-
.ok_or_else(|| anyhow::anyhow!("Line {line} not found in text"))?;
196-
197-
unicode_char_to_utf8_offset_in_line(target_line, character)
198-
}
132+
if last_line.len() as u32 != expected_bytes {
133+
return Err(anyhow::anyhow!(
134+
"Expected last line to have {expected_bytes} bytes, got {}",
135+
last_line.len()
136+
));
137+
}
199138

200-
/// Converts a character count in unicode scalar values to a UTF-8 byte count.
201-
fn unicode_char_to_utf8_offset_in_line(line: &str, character: u32) -> anyhow::Result<usize> {
202-
let line_chars = line.chars().count();
203-
if character as usize > line_chars {
204-
return Err(anyhow::anyhow!(
205-
"Character position {character} exceeds line length ({line_chars})"
206-
));
139+
Ok(Some(CodeLocation {
140+
uri,
141+
start: Position {
142+
line: range.start.line,
143+
character: range.start.character,
144+
},
145+
end: Position {
146+
line: range.end.line,
147+
character: range.end.character,
148+
},
149+
}))
207150
}
208-
209-
let byte_offset = line
210-
.char_indices()
211-
.nth(character as usize)
212-
.map(|(byte_idx, _)| byte_idx)
213-
.unwrap_or(line.len());
214-
215-
Ok(byte_offset)
216151
}
217152

218153
impl MessageType for ExecuteRequest {

crates/ark/tests/kernel-srcref.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,8 @@ $end
204204
fn test_execute_request_srcref_location_with_emoji_utf8_shift() {
205205
let frontend = DummyArkFrontend::lock();
206206

207-
// Starting at line 3, column 3 (these input positions are in Unicode code points)
207+
// Starting at line 3, column 3 (these input positions are in UTF-8 bytes).
208+
// The code is 29 UTF-8 bytes (the emoji 🙂 is 4 bytes), so end.character = 3 + 29 = 32.
208209
let code_location = JupyterPositronLocation {
209210
uri: "file:///path/to/file.R".to_owned(),
210211
range: JupyterPositronRange {
@@ -214,19 +215,17 @@ fn test_execute_request_srcref_location_with_emoji_utf8_shift() {
214215
},
215216
end: JupyterPositronPosition {
216217
line: 2,
217-
character: 26 + 3,
218+
character: 29 + 3,
218219
},
219220
},
220221
};
221222

222-
// The function body contains a single emoji character. The input character positions above
223-
// are specified as Unicode code points. The srcref we receive reports UTF-8 byte positions,
224-
// so the presence of the multibyte emoji shifts the end position by the emoji's extra bytes.
223+
// The function body contains a single emoji character which is 4 UTF-8 bytes.
225224
frontend.execute_request_with_location("fn <- function() \"🙂\"; NULL", |_| (), code_location);
226225

227-
// `function` starts at column 7 (code point counting), so with a start.character of 3 we get 10.
228-
// The function body `"🙂"` ends at UTF-8 byte 20 locally (the closing quote), so with
229-
// start.character=3 the reported end becomes 23.
226+
// `function` starts at byte 7 in the code, so with start.character=3 we get 10.
227+
// The function body `"🙂"` ends at byte 23 locally (the closing quote after the 4-byte emoji),
228+
// so with start.character=3 the reported end becomes 23.
230229
frontend.execute_request(".ps.internal(get_srcref_range(fn))", |result| {
231230
assert_eq!(
232231
result,

0 commit comments

Comments
 (0)