Skip to content

Commit 844cc24

Browse files
committed
Avoid copying when decoding UTF-8 or ASCII
1 parent fac6e6f commit 844cc24

File tree

2 files changed

+55
-3
lines changed

2 files changed

+55
-3
lines changed

src/rewritable_units/text_decoder.rs

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::base::SharedEncoding;
22
use crate::rewriter::RewritingError;
3-
use encoding_rs::{CoderResult, Decoder, Encoding};
3+
use encoding_rs::{CoderResult, Decoder, Encoding, UTF_8};
44

55
pub(crate) struct TextDecoder {
66
encoding: SharedEncoding,
@@ -27,7 +27,6 @@ impl TextDecoder {
2727
) -> Result<(), RewritingError> {
2828
if self.pending_text_streaming_decoder.is_some() {
2929
self.feed_text(&[], true, output_handler)?;
30-
self.pending_text_streaming_decoder = None;
3130
}
3231
Ok(())
3332
}
@@ -40,13 +39,25 @@ impl TextDecoder {
4039
output_handler: &mut dyn FnMut(&str, bool, &'static Encoding) -> Result<(), RewritingError>,
4140
) -> Result<(), RewritingError> {
4241
let encoding = self.encoding.get();
43-
let buffer = self.text_buffer.as_mut_str();
42+
43+
if let Some((utf8_text, rest)) = self.split_utf8_start(raw_input, encoding) {
44+
raw_input = rest;
45+
let really_last = last_in_text_node && rest.is_empty();
46+
47+
(output_handler)(utf8_text, really_last, encoding)?;
48+
49+
if really_last {
50+
debug_assert!(self.pending_text_streaming_decoder.is_none());
51+
return Ok(());
52+
}
53+
};
4454

4555
let decoder = self
4656
.pending_text_streaming_decoder
4757
.get_or_insert_with(|| encoding.new_decoder_without_bom_handling());
4858

4959
loop {
60+
let buffer = self.text_buffer.as_mut_str();
5061
let (status, read, written, ..) =
5162
decoder.decode_to_str(raw_input, buffer, last_in_text_node);
5263

@@ -60,9 +71,49 @@ impl TextDecoder {
6071
}
6172

6273
if finished_decoding {
74+
if last_in_text_node {
75+
self.pending_text_streaming_decoder = None;
76+
}
6377
return Ok(());
6478
}
6579
raw_input = &raw_input[read..];
6680
}
6781
}
82+
83+
/// Fast path for UTF-8 or ASCII prefix
84+
///
85+
/// Returns UTF-8 text to emit + remaining bytes, or `None` if the fast path is not available
86+
#[inline]
87+
fn split_utf8_start<'i>(
88+
&self,
89+
raw_input: &'i [u8],
90+
encoding: &'static Encoding,
91+
) -> Option<(&'i str, &'i [u8])> {
92+
// Can't use the fast path if the decoder may have buffered some bytes
93+
if self.pending_text_streaming_decoder.is_some() {
94+
return None;
95+
}
96+
97+
let text_or_len = if encoding == UTF_8 {
98+
std::str::from_utf8(raw_input).map_err(|err| err.valid_up_to())
99+
} else {
100+
debug_assert!(encoding.is_ascii_compatible());
101+
Err(Encoding::ascii_valid_up_to(raw_input))
102+
};
103+
104+
match text_or_len {
105+
Ok(utf8_text) => Some((utf8_text, &[][..])),
106+
Err(valid_up_to) => {
107+
// The slow path buffers 1KB, and even though this shouldn't matter,
108+
// it is an observable behavior, and it makes bugs worse for text handlers
109+
// that assume they'll get only a single chunk.
110+
if valid_up_to != raw_input.len() && valid_up_to < self.text_buffer.len() {
111+
return None;
112+
}
113+
114+
let (text, rest) = raw_input.split_at_checked(valid_up_to)?;
115+
Some((std::str::from_utf8(text).ok()?, rest))
116+
}
117+
}
118+
}
68119
}

src/rewritable_units/tokens/text_chunk.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,7 @@ mod tests {
410410
macro_rules! skip_eof_chunk {
411411
($c:ident) => {
412412
if $c.last_in_text_node() {
413+
// This is not always true — a replacement char for an incomplete UTF-8 sequence could be flushed last
413414
assert!($c.as_str().is_empty());
414415
return;
415416
}

0 commit comments

Comments
 (0)