Skip to content

Commit 5b3dee0

Browse files
bugadani authored and jessebraham committed
Delay processing of incomplete UTF-8 sequences
1 parent 129439a commit 5b3dee0

File tree

2 files changed

+110
-2
lines changed

2 files changed

+110
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1111

1212
### Fixed
1313

14+
- Fixed printing UTF-8 sequences that were read in multiple parts. (#468)
15+
1416
### Changed
1517

1618
### Removed

espflash/src/cli/monitor/mod.rs

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ struct SerialContext<'ctx> {
4343
symbols: Option<Symbols<'ctx>>,
4444
previous_frag: Option<String>,
4545
previous_line: Option<String>,
46+
incomplete_utf8_buffer: Vec<u8>,
4647
}
4748

4849
impl<'ctx> SerialContext<'ctx> {
@@ -52,6 +53,40 @@ impl<'ctx> SerialContext<'ctx> {
5253
..Self::default()
5354
}
5455
}
56+
57+
/// Decodes newly received serial bytes into text, holding back a UTF-8
/// sequence that is cut off at the end of the input.
///
/// Bytes saved by the previous call are placed in front of the output of
/// `normalized` for `buff`. Invalid sequences are replaced with U+FFFD. If the
/// combined input ends in the middle of a sequence, that tail is stored in
/// `incomplete_utf8_buffer` and only the text before it is returned, so the
/// next call can finish the sequence (#457).
fn process_utf8(&mut self, buff: &[u8]) -> String {
    let mut pending = std::mem::take(&mut self.incomplete_utf8_buffer);
    pending.extend(normalized(buff.iter().copied()));

    // Offset just past the last invalid sequence found so far. Everything in
    // front of it is safe to convert lossily, because none of it is a partial
    // sequence that later input could still complete.
    let mut scanned = 0;

    while let Err(err) = std::str::from_utf8(&pending[scanned..]) {
        scanned += err.valid_up_to();

        match err.error_len() {
            // A definitely invalid sequence: step over it and keep scanning.
            // It is replaced when the text is converted below.
            Some(invalid_len) => scanned += invalid_len,

            // The input stops in the middle of a sequence: keep the tail for
            // the next call and return what comes before it.
            None => {
                let tail = pending.split_off(scanned);
                self.incomplete_utf8_buffer = tail;
                return String::from_utf8_lossy(&pending).into_owned();
            }
        }
    }

    // Nothing is left incomplete, so the whole input can be converted. For
    // fully valid input this is a plain copy of the text.
    String::from_utf8_lossy(&pending).into_owned()
}
5590
}
5691

5792
/// Type that ensures that raw mode is disabled when dropped.
@@ -144,8 +179,7 @@ pub fn monitor(
144179

145180
/// Handles and writes the received serial data to the given output stream.
146181
fn handle_serial(ctx: &mut SerialContext, buff: &[u8], out: &mut dyn Write) {
147-
let text: Vec<u8> = normalized(buff.iter().copied()).collect();
148-
let text = String::from_utf8_lossy(&text).to_string();
182+
let text = ctx.process_utf8(buff);
149183

150184
// Split the text into lines, storing the last of which separately if it is
151185
// incomplete (i.e. does not end with '\n') because these need special handling.
@@ -278,3 +312,75 @@ fn handle_key_event(key_event: KeyEvent) -> Option<Vec<u8>> {
278312

279313
key_str.map(|slice| slice.into())
280314
}
315+
316+
#[cfg(test)]
mod test {
    use super::SerialContext;

    /// Fully valid input is returned unchanged by a single call.
    #[test]
    fn returns_valid_strings_immediately() {
        let mut ctx = SerialContext::default();
        assert_eq!(ctx.process_utf8(b"Hello, world!"), "Hello, world!");
    }

    /// Text returned by one call does not show up again in the next one.
    #[test]
    fn does_not_repeat_valid_strings() {
        let mut ctx = SerialContext::default();
        assert_eq!(ctx.process_utf8(b"Hello, world!"), "Hello, world!");
        assert_eq!(ctx.process_utf8(b"Something else"), "Something else");
    }

    /// An invalid byte in the middle of the input becomes U+FFFD.
    #[test]
    fn replaces_invalid_sequence() {
        let mut ctx = SerialContext::default();
        assert_eq!(
            ctx.process_utf8(b"Hello, \xFF world!"),
            "Hello, \u{FFFD} world!"
        );
    }

    /// A held-back partial sequence that is never completed is replaced once
    /// the following input shows it cannot be valid.
    #[test]
    fn can_replace_unfinished_incomplete_sequence() {
        let mut ctx = SerialContext::default();

        let emoji = "🙈".as_bytes();
        let (head, _last) = emoji.split_at(emoji.len() - 1);
        let first_chunk = [&b"Hello, "[..], head].concat();

        assert_eq!(ctx.process_utf8(&first_chunk), "Hello, ");
        assert_eq!(ctx.process_utf8(b" world!"), "\u{FFFD} world!");
    }

    /// A sequence split across two calls is returned whole by the second one.
    #[test]
    fn can_merge_incomplete_sequence() {
        let mut ctx = SerialContext::default();

        let emoji = "🙈".as_bytes();
        let (head, last) = emoji.split_at(emoji.len() - 1);
        let first_chunk = [&b"Hello, "[..], head].concat();

        assert_eq!(ctx.process_utf8(&first_chunk), "Hello, ");
        assert_eq!(ctx.process_utf8(last), "🙈");
    }

    /// Feeds the input in five chunks and checks the concatenated output.
    #[test]
    fn issue_457() {
        let mut ctx = SerialContext::default();

        let chunks: [&[u8]; 5] = [
            &[0x48],
            &[0x65, 0x6C, 0x6C],
            &[
                0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, 0x20, 0x77, 0x69, 0x74,
            ],
            &[
                0x68, 0x20, 0x55, 0x54, 0x46, 0x3A, 0x20, 0x77, 0x79, 0x73, 0x79,
            ],
            &[0xC5, 0x82, 0x61, 0x6D, 0x0A],
        ];

        let result: String = chunks
            .iter()
            .map(|chunk| ctx.process_utf8(chunk))
            .collect();

        assert_eq!(result, "Hello world! with UTF: wysyłam\r\n");
    }
}

0 commit comments

Comments
 (0)