Optimize decode_utf8

yescallop · yescallop · commit a7cc905bb78e · 2025-11-09T21:54:50.000+08:00
diff --git a/src/pct_enc/mod.rs b/src/pct_enc/mod.rs
@@ -7,13 +7,16 @@ pub(crate) mod table;
 pub use estring::EString;
 pub use table::Table;
 
-use crate::imp::PathEncoder;
+use crate::{
+    imp::PathEncoder,
+    utf8::{self, Utf8Chunks},
+};
 use alloc::{
     borrow::{Cow, ToOwned},
     string::String,
     vec::Vec,
 };
-use core::{cmp::Ordering, hash, iter::FusedIterator, marker::PhantomData, str};
+use core::{cmp::Ordering, hash, iter::FusedIterator, marker::PhantomData, mem, str};
 use ref_cast::{ref_cast_custom, RefCastCustom};
 
 /// A trait used by [`EStr`] and [`EString`] to specify the table used for encoding.
@@ -559,12 +562,12 @@ pub(crate) enum DecodedUtf8Chunk<'a, 'b> {
 
 impl<'a> Decode<'a> {
     pub(crate) fn decode_utf8(self, mut handle_chunk: impl FnMut(DecodedUtf8Chunk<'a, '_>)) {
-        use crate::utf8::Utf8Chunks;
+        const BUF_SIZE: usize = 32;
 
-        let mut buf = [0; 32];
+        let mut buf = [0; BUF_SIZE];
         let mut len = 0;
 
-        'decode: for chunk in self {
+        for chunk in self {
             match chunk {
                 DecodedChunk::Unencoded(s) => {
                     if len > 0 {
@@ -582,26 +585,40 @@ impl<'a> Decode<'a> {
                     buf[len] = x;
                     len += 1;
 
-                    if len == buf.len() {
-                        for chunk in Utf8Chunks::new(&buf[..len]) {
-                            if chunk.incomplete() {
-                                handle_chunk(DecodedUtf8Chunk::Decoded {
-                                    valid: chunk.valid(),
-                                    invalid: &[],
-                                });
-
-                                let invalid_len = chunk.invalid().len();
-                                buf.copy_within(len - invalid_len..len, 0);
-
-                                len = invalid_len;
-                                continue 'decode;
-                            }
+                    if len >= BUF_SIZE {
+                        // Normally, all bytes decoded are valid UTF-8, but may contain chars
+                        // that lie across the buffer boundary. Since we forbid `unsafe` and
+                        // sadly have no access to `str::Utf8Chunks` due to MSRV, we want to
+                        // use `str::from_utf8` to successfully parse as many bytes as possible
+                        // when the buffer is full. To do this, we search back for a char
+                        // boundary in the last 3 bytes. If one is found, we feed the prefix
+                        // before that boundary to our own `Utf8Chunks` impl (which uses
+                        // `str::from_utf8` internally) and shift the remaining bytes to
+                        // the front for the next round. Otherwise, we feed the entire buffer,
+                        // which is safe because if the last 3 bytes contain no char boundary,
+                        // either they are valid continuation bytes, or they are invalid and
+                        // cannot become valid when more bytes are added.
+
+                        let mut split_at = BUF_SIZE - 1;
+                        while split_at >= BUF_SIZE - 3 && !utf8::is_char_boundary(buf[split_at]) {
+                            split_at -= 1;
+                        }
+
+                        if split_at < BUF_SIZE - 3 {
+                            split_at = BUF_SIZE;
+                        }
+
+                        let (prefix, rem) = buf.split_at_mut(split_at);
+
+                        for chunk in Utf8Chunks::new(prefix) {
                             handle_chunk(DecodedUtf8Chunk::Decoded {
                                 valid: chunk.valid(),
                                 invalid: chunk.invalid(),
                             });
                         }
-                        len = 0;
+
+                        prefix[..rem.len()].copy_from_slice(rem);
+                        len = rem.len();
                     }
                 }
             }
@@ -681,7 +698,7 @@ impl<'a> Decode<'a> {
                 Ok(string) => {
                     string.push_str(valid);
                     if !invalid.is_empty() {
-                        let mut vec = core::mem::take(string).into_bytes();
+                        let mut vec = mem::take(string).into_bytes();
                         vec.extend_from_slice(invalid);
                         buf = Err(vec);
                     }
diff --git a/src/utf8.rs b/src/utf8.rs
@@ -1,5 +1,7 @@
 //! UTF-8 utilities taken from `core::str`, Rust 1.81.
 
+use core::str;
+
 #[inline]
 const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
     (byte & (0x7F >> width)) as u32
@@ -34,32 +36,16 @@ pub const fn next_code_point(bytes: &[u8], i: usize) -> (u32, usize) {
     }
 }
 
-const UTF8_CHAR_WIDTH: &[u8; 256] = &[
-    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
-    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
-    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
-];
-
 const CONT_MASK: u8 = 0b0011_1111;
 
+pub(crate) const fn is_char_boundary(b: u8) -> bool {
+    // This is bit magic equivalent to: b < 128 || b >= 192
+    (b as i8) >= -0x40
+}
+
 pub struct Utf8Chunk<'a> {
     valid: &'a str,
     invalid: &'a [u8],
-    incomplete: bool,
 }
 
 impl<'a> Utf8Chunk<'a> {
@@ -70,10 +56,6 @@ impl<'a> Utf8Chunk<'a> {
     pub fn invalid(&self) -> &'a [u8] {
         self.invalid
     }
-
-    pub fn incomplete(&self) -> bool {
-        self.incomplete
-    }
 }
 
 pub struct Utf8Chunks<'a> {
@@ -94,81 +76,31 @@ impl<'a> Iterator for Utf8Chunks<'a> {
             return None;
         }
 
-        const TAG_CONT_U8: u8 = 128;
+        match str::from_utf8(self.source) {
+            Ok(valid) => {
+                self.source = &[];
 
-        let mut incomplete = false;
-        let mut safe_get = |i| {
-            if let Some(x) = self.source.get(i) {
-                *x
-            } else {
-                incomplete = true;
-                0
+                Some(Utf8Chunk {
+                    valid,
+                    invalid: &[],
+                })
             }
-        };
-
-        let mut i = 0;
-        let mut valid_up_to = 0;
-        while i < self.source.len() {
-            let byte = self.source[i];
-            i += 1;
-
-            if byte >= 128 {
-                let w = UTF8_CHAR_WIDTH[byte as usize];
-
-                match w {
-                    2 => {
-                        if safe_get(i) & 192 != TAG_CONT_U8 {
-                            break;
-                        }
-                        i += 1;
-                    }
-                    3 => {
-                        match (byte, safe_get(i)) {
-                            (0xE0, 0xA0..=0xBF) => (),
-                            (0xE1..=0xEC, 0x80..=0xBF) => (),
-                            (0xED, 0x80..=0x9F) => (),
-                            (0xEE..=0xEF, 0x80..=0xBF) => (),
-                            _ => break,
-                        }
-                        i += 1;
-                        if safe_get(i) & 192 != TAG_CONT_U8 {
-                            break;
-                        }
-                        i += 1;
-                    }
-                    4 => {
-                        match (byte, safe_get(i)) {
-                            (0xF0, 0x90..=0xBF) => (),
-                            (0xF1..=0xF3, 0x80..=0xBF) => (),
-                            (0xF4, 0x80..=0x8F) => (),
-                            _ => break,
-                        }
-                        i += 1;
-                        if safe_get(i) & 192 != TAG_CONT_U8 {
-                            break;
-                        }
-                        i += 1;
-                        if safe_get(i) & 192 != TAG_CONT_U8 {
-                            break;
-                        }
-                        i += 1;
-                    }
-                    _ => break,
-                }
+            Err(e) => {
+                let (valid, after_valid) = self.source.split_at(e.valid_up_to());
+
+                let (invalid, rem) = if let Some(len) = e.error_len() {
+                    let (invalid, rem) = after_valid.split_at(len);
+                    (invalid, rem)
+                } else {
+                    (after_valid, &[][..])
+                };
+                self.source = rem;
+
+                Some(Utf8Chunk {
+                    valid: str::from_utf8(valid).unwrap(),
+                    invalid,
+                })
             }
-
-            valid_up_to = i;
         }
-
-        let (inspected, remaining) = self.source.split_at(i);
-        self.source = remaining;
-
-        let (valid, invalid) = inspected.split_at(valid_up_to);
-
-        Some(Utf8Chunk {
-            valid: core::str::from_utf8(valid).unwrap(),
-            invalid,
-            incomplete,
-        })
     }
 }