11//! UTF-8 utilities taken from `core::str`, Rust 1.81.
22
3+ use core:: str;
4+
35#[ inline]
46const fn utf8_first_byte ( byte : u8 , width : u32 ) -> u32 {
57 ( byte & ( 0x7F >> width) ) as u32
@@ -34,32 +36,16 @@ pub const fn next_code_point(bytes: &[u8], i: usize) -> (u32, usize) {
3436 }
3537}
3638
37- const UTF8_CHAR_WIDTH : & [ u8 ; 256 ] = & [
38- // 1 2 3 4 5 6 7 8 9 A B C D E F
39- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0
40- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 1
41- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 2
42- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 3
43- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 4
44- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 5
45- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 6
46- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 7
47- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 8
48- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 9
49- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // A
50- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // B
51- 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // C
52- 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // D
53- 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // E
54- 4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // F
55- ] ;
56-
/// Mask selecting the low six bits of a byte.
///
/// NOTE(review): presumably these are the payload bits of a UTF-8
/// continuation byte (`10xx_xxxx`); the code that uses this constant is
/// outside the visible part of the file, so confirm against its call sites.
const CONT_MASK: u8 = 0b0011_1111;
5840
/// Returns `true` if `b` can start a UTF-8 encoded character, i.e. it is not
/// in the continuation-byte range `0x80..=0xBF`.
///
/// Usable in `const` contexts.
pub(crate) const fn is_char_boundary(b: u8) -> bool {
    // Reinterpreted as `i8`, the bytes 0x80..=0xBF become -128..=-65, while
    // every other byte is >= -64 (-0x40). A single signed comparison is
    // therefore equivalent to `b < 128 || b >= 192`.
    (b as i8) >= -0x40
}
45+
/// One item produced while splitting a byte slice into UTF-8 chunks.
///
/// A chunk pairs a run of valid UTF-8 with the invalid bytes that directly
/// follow it. Either part may be empty.
pub struct Utf8Chunk<'a> {
    /// The leading portion of the chunk that is valid UTF-8.
    valid: &'a str,
    /// The bytes following `valid` that are not valid UTF-8; empty when the
    /// chunk ended without encountering an error.
    invalid: &'a [u8],
}
6450
6551impl < ' a > Utf8Chunk < ' a > {
@@ -70,10 +56,6 @@ impl<'a> Utf8Chunk<'a> {
7056 pub fn invalid ( & self ) -> & ' a [ u8 ] {
7157 self . invalid
7258 }
73-
74- pub fn incomplete ( & self ) -> bool {
75- self . incomplete
76- }
7759}
7860
7961pub struct Utf8Chunks < ' a > {
@@ -94,81 +76,31 @@ impl<'a> Iterator for Utf8Chunks<'a> {
9476 return None ;
9577 }
9678
97- const TAG_CONT_U8 : u8 = 128 ;
79+ match str:: from_utf8 ( self . source ) {
80+ Ok ( valid) => {
81+ self . source = & [ ] ;
9882
99- let mut incomplete = false ;
100- let mut safe_get = |i| {
101- if let Some ( x) = self . source . get ( i) {
102- * x
103- } else {
104- incomplete = true ;
105- 0
83+ Some ( Utf8Chunk {
84+ valid,
85+ invalid : & [ ] ,
86+ } )
10687 }
107- } ;
108-
109- let mut i = 0 ;
110- let mut valid_up_to = 0 ;
111- while i < self . source . len ( ) {
112- let byte = self . source [ i] ;
113- i += 1 ;
114-
115- if byte >= 128 {
116- let w = UTF8_CHAR_WIDTH [ byte as usize ] ;
117-
118- match w {
119- 2 => {
120- if safe_get ( i) & 192 != TAG_CONT_U8 {
121- break ;
122- }
123- i += 1 ;
124- }
125- 3 => {
126- match ( byte, safe_get ( i) ) {
127- ( 0xE0 , 0xA0 ..=0xBF ) => ( ) ,
128- ( 0xE1 ..=0xEC , 0x80 ..=0xBF ) => ( ) ,
129- ( 0xED , 0x80 ..=0x9F ) => ( ) ,
130- ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => ( ) ,
131- _ => break ,
132- }
133- i += 1 ;
134- if safe_get ( i) & 192 != TAG_CONT_U8 {
135- break ;
136- }
137- i += 1 ;
138- }
139- 4 => {
140- match ( byte, safe_get ( i) ) {
141- ( 0xF0 , 0x90 ..=0xBF ) => ( ) ,
142- ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) => ( ) ,
143- ( 0xF4 , 0x80 ..=0x8F ) => ( ) ,
144- _ => break ,
145- }
146- i += 1 ;
147- if safe_get ( i) & 192 != TAG_CONT_U8 {
148- break ;
149- }
150- i += 1 ;
151- if safe_get ( i) & 192 != TAG_CONT_U8 {
152- break ;
153- }
154- i += 1 ;
155- }
156- _ => break ,
157- }
88+ Err ( e) => {
89+ let ( valid, after_valid) = self . source . split_at ( e. valid_up_to ( ) ) ;
90+
91+ let ( invalid, rem) = if let Some ( len) = e. error_len ( ) {
92+ let ( invalid, rem) = after_valid. split_at ( len) ;
93+ ( invalid, rem)
94+ } else {
95+ ( after_valid, & [ ] [ ..] )
96+ } ;
97+ self . source = rem;
98+
99+ Some ( Utf8Chunk {
100+ valid : str:: from_utf8 ( valid) . unwrap ( ) ,
101+ invalid,
102+ } )
158103 }
159-
160- valid_up_to = i;
161104 }
162-
163- let ( inspected, remaining) = self . source . split_at ( i) ;
164- self . source = remaining;
165-
166- let ( valid, invalid) = inspected. split_at ( valid_up_to) ;
167-
168- Some ( Utf8Chunk {
169- valid : core:: str:: from_utf8 ( valid) . unwrap ( ) ,
170- invalid,
171- incomplete,
172- } )
173105 }
174106}
0 commit comments