Skip to content

Commit a7cc905

Browse files
committed
Optimize decode_utf8
1 parent e424374 commit a7cc905

File tree

2 files changed

+67
-118
lines changed

2 files changed

+67
-118
lines changed

src/pct_enc/mod.rs

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@ pub(crate) mod table;
77
pub use estring::EString;
88
pub use table::Table;
99

10-
use crate::imp::PathEncoder;
10+
use crate::{
11+
imp::PathEncoder,
12+
utf8::{self, Utf8Chunks},
13+
};
1114
use alloc::{
1215
borrow::{Cow, ToOwned},
1316
string::String,
1417
vec::Vec,
1518
};
16-
use core::{cmp::Ordering, hash, iter::FusedIterator, marker::PhantomData, str};
19+
use core::{cmp::Ordering, hash, iter::FusedIterator, marker::PhantomData, mem, str};
1720
use ref_cast::{ref_cast_custom, RefCastCustom};
1821

1922
/// A trait used by [`EStr`] and [`EString`] to specify the table used for encoding.
@@ -559,12 +562,12 @@ pub(crate) enum DecodedUtf8Chunk<'a, 'b> {
559562

560563
impl<'a> Decode<'a> {
561564
pub(crate) fn decode_utf8(self, mut handle_chunk: impl FnMut(DecodedUtf8Chunk<'a, '_>)) {
562-
use crate::utf8::Utf8Chunks;
565+
const BUF_SIZE: usize = 32;
563566

564-
let mut buf = [0; 32];
567+
let mut buf = [0; BUF_SIZE];
565568
let mut len = 0;
566569

567-
'decode: for chunk in self {
570+
for chunk in self {
568571
match chunk {
569572
DecodedChunk::Unencoded(s) => {
570573
if len > 0 {
@@ -582,26 +585,40 @@ impl<'a> Decode<'a> {
582585
buf[len] = x;
583586
len += 1;
584587

585-
if len == buf.len() {
586-
for chunk in Utf8Chunks::new(&buf[..len]) {
587-
if chunk.incomplete() {
588-
handle_chunk(DecodedUtf8Chunk::Decoded {
589-
valid: chunk.valid(),
590-
invalid: &[],
591-
});
592-
593-
let invalid_len = chunk.invalid().len();
594-
buf.copy_within(len - invalid_len..len, 0);
595-
596-
len = invalid_len;
597-
continue 'decode;
598-
}
588+
if len >= BUF_SIZE {
589+
// Normally, all bytes decoded are valid UTF-8, but may contain chars
590+
// that lie across the buffer boundary. Since we forbid `unsafe` and
591+
// sadly have no access to `str::Utf8Chunks` due to MSRV, we want to
592+
// use `str::from_utf8` to successfully parse as many bytes as possible
593+
// when the buffer is full. To do this, we search back for a char
594+
// boundary in the last 3 bytes. If one is found, we feed the prefix
595+
// before that boundary to our own `Utf8Chunks` impl (which uses
596+
// `str::from_utf8` internally) and shift the remaining bytes to
597+
// the front for the next round. Otherwise, we feed the entire buffer,
598+
// which is safe because if the last 3 bytes contain no char boundary,
599+
// either they are valid continuation bytes, or they are invalid and
600+
// cannot become valid when more bytes are added.
601+
602+
let mut split_at = BUF_SIZE - 1;
603+
while split_at >= BUF_SIZE - 3 && !utf8::is_char_boundary(buf[split_at]) {
604+
split_at -= 1;
605+
}
606+
607+
if split_at < BUF_SIZE - 3 {
608+
split_at = BUF_SIZE;
609+
}
610+
611+
let (prefix, rem) = buf.split_at_mut(split_at);
612+
613+
for chunk in Utf8Chunks::new(prefix) {
599614
handle_chunk(DecodedUtf8Chunk::Decoded {
600615
valid: chunk.valid(),
601616
invalid: chunk.invalid(),
602617
});
603618
}
604-
len = 0;
619+
620+
prefix[..rem.len()].copy_from_slice(rem);
621+
len = rem.len();
605622
}
606623
}
607624
}
@@ -681,7 +698,7 @@ impl<'a> Decode<'a> {
681698
Ok(string) => {
682699
string.push_str(valid);
683700
if !invalid.is_empty() {
684-
let mut vec = core::mem::take(string).into_bytes();
701+
let mut vec = mem::take(string).into_bytes();
685702
vec.extend_from_slice(invalid);
686703
buf = Err(vec);
687704
}

src/utf8.rs

Lines changed: 29 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
//! UTF-8 utilities taken from `core::str`, Rust 1.81.
22
3+
use core::str;
4+
35
#[inline]
46
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
57
(byte & (0x7F >> width)) as u32
@@ -34,32 +36,16 @@ pub const fn next_code_point(bytes: &[u8], i: usize) -> (u32, usize) {
3436
}
3537
}
3638

37-
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
38-
// 1 2 3 4 5 6 7 8 9 A B C D E F
39-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
40-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
41-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
42-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
43-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
44-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
45-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
46-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
47-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
48-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
49-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
50-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
51-
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
52-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
53-
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
54-
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
55-
];
56-
5739
const CONT_MASK: u8 = 0b0011_1111;
5840

41+
pub(crate) const fn is_char_boundary(b: u8) -> bool {
42+
// This is bit magic equivalent to: b < 128 || b >= 192
43+
(b as i8) >= -0x40
44+
}
45+
5946
pub struct Utf8Chunk<'a> {
6047
valid: &'a str,
6148
invalid: &'a [u8],
62-
incomplete: bool,
6349
}
6450

6551
impl<'a> Utf8Chunk<'a> {
@@ -70,10 +56,6 @@ impl<'a> Utf8Chunk<'a> {
7056
pub fn invalid(&self) -> &'a [u8] {
7157
self.invalid
7258
}
73-
74-
pub fn incomplete(&self) -> bool {
75-
self.incomplete
76-
}
7759
}
7860

7961
pub struct Utf8Chunks<'a> {
@@ -94,81 +76,31 @@ impl<'a> Iterator for Utf8Chunks<'a> {
9476
return None;
9577
}
9678

97-
const TAG_CONT_U8: u8 = 128;
79+
match str::from_utf8(self.source) {
80+
Ok(valid) => {
81+
self.source = &[];
9882

99-
let mut incomplete = false;
100-
let mut safe_get = |i| {
101-
if let Some(x) = self.source.get(i) {
102-
*x
103-
} else {
104-
incomplete = true;
105-
0
83+
Some(Utf8Chunk {
84+
valid,
85+
invalid: &[],
86+
})
10687
}
107-
};
108-
109-
let mut i = 0;
110-
let mut valid_up_to = 0;
111-
while i < self.source.len() {
112-
let byte = self.source[i];
113-
i += 1;
114-
115-
if byte >= 128 {
116-
let w = UTF8_CHAR_WIDTH[byte as usize];
117-
118-
match w {
119-
2 => {
120-
if safe_get(i) & 192 != TAG_CONT_U8 {
121-
break;
122-
}
123-
i += 1;
124-
}
125-
3 => {
126-
match (byte, safe_get(i)) {
127-
(0xE0, 0xA0..=0xBF) => (),
128-
(0xE1..=0xEC, 0x80..=0xBF) => (),
129-
(0xED, 0x80..=0x9F) => (),
130-
(0xEE..=0xEF, 0x80..=0xBF) => (),
131-
_ => break,
132-
}
133-
i += 1;
134-
if safe_get(i) & 192 != TAG_CONT_U8 {
135-
break;
136-
}
137-
i += 1;
138-
}
139-
4 => {
140-
match (byte, safe_get(i)) {
141-
(0xF0, 0x90..=0xBF) => (),
142-
(0xF1..=0xF3, 0x80..=0xBF) => (),
143-
(0xF4, 0x80..=0x8F) => (),
144-
_ => break,
145-
}
146-
i += 1;
147-
if safe_get(i) & 192 != TAG_CONT_U8 {
148-
break;
149-
}
150-
i += 1;
151-
if safe_get(i) & 192 != TAG_CONT_U8 {
152-
break;
153-
}
154-
i += 1;
155-
}
156-
_ => break,
157-
}
88+
Err(e) => {
89+
let (valid, after_valid) = self.source.split_at(e.valid_up_to());
90+
91+
let (invalid, rem) = if let Some(len) = e.error_len() {
92+
let (invalid, rem) = after_valid.split_at(len);
93+
(invalid, rem)
94+
} else {
95+
(after_valid, &[][..])
96+
};
97+
self.source = rem;
98+
99+
Some(Utf8Chunk {
100+
valid: str::from_utf8(valid).unwrap(),
101+
invalid,
102+
})
158103
}
159-
160-
valid_up_to = i;
161104
}
162-
163-
let (inspected, remaining) = self.source.split_at(i);
164-
self.source = remaining;
165-
166-
let (valid, invalid) = inspected.split_at(valid_up_to);
167-
168-
Some(Utf8Chunk {
169-
valid: core::str::from_utf8(valid).unwrap(),
170-
invalid,
171-
incomplete,
172-
})
173105
}
174106
}

0 commit comments

Comments
 (0)