11//! Operations related to UTF-8 validation.
22
33use super :: Utf8Error ;
4- use crate :: intrinsics:: const_eval_select;
5-
6- /// Returns the initial codepoint accumulator for the first byte.
7- /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
8- /// for width 3, and 3 bits for width 4.
9- #[ inline]
10- const fn utf8_first_byte ( byte : u8 , width : u32 ) -> u32 {
11- ( byte & ( 0x7F >> width) ) as u32
12- }
13-
14- /// Returns the value of `ch` updated with continuation byte `byte`.
15- #[ inline]
16- const fn utf8_acc_cont_byte ( ch : u32 , byte : u8 ) -> u32 {
17- ( ch << 6 ) | ( byte & CONT_MASK ) as u32
18- }
4+ use crate :: intrinsics:: { assume, const_eval_select, disjoint_bitor} ;
195
206/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
217/// bits `10`).
@@ -33,39 +19,51 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
3319#[ unstable( feature = "str_internals" , issue = "none" ) ]
3420#[ inline]
3521pub unsafe fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
36- // Decode UTF-8
37- let x = * bytes. next ( ) ?;
38- if x < 128 {
39- return Some ( x as u32 ) ;
22+ let b1 = * bytes. next ( ) ?;
23+ if b1 < 0x80 {
24+ // 1 byte case (U+00_00 ..= U+00_7F):
25+ // c = b1
26+ return Some ( u32:: from ( b1) ) ;
4027 }
4128
42- // Multibyte case follows
43- // Decode from a byte combination out of: [[[x y] z] w]
44- // NOTE: Performance is sensitive to the exact formulation here
45- let init = utf8_first_byte ( x, 2 ) ;
46- // SAFETY: `bytes` produces an UTF-8-like string,
47- // so the iterator must produce a value here.
48- let y = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
49- let mut ch = utf8_acc_cont_byte ( init, y) ;
50- if x >= 0xE0 {
51- // [[x y z] w] case
52- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53- // SAFETY: `bytes` produces an UTF-8-like string,
54- // so the iterator must produce a value here.
55- let z = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
56- let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
57- ch = init << 12 | y_z;
58- if x >= 0xF0 {
59- // [x y z w] case
60- // use only the lower 3 bits of `init`
61- // SAFETY: `bytes` produces an UTF-8-like string,
62- // so the iterator must produce a value here.
63- let w = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
64- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
65- }
29+ // SAFETY: `bytes` produces a UTF-8-like string, so every byte this closure reads exists and is a continuation byte
30+ let mut next_byte = || unsafe {
31+ let b = * bytes. next ( ) . unwrap_unchecked ( ) ;
32+ assume ( utf8_is_cont_byte ( b) ) ;
33+ b
34+ } ;
35+
36+ // SAFETY: `c << 6` has its low 6 bits clear and `b & CONT_MASK` only has low 6 bits, so the operands share no set bits
37+ let combine = |c : u32 , b : u8 | unsafe { disjoint_bitor ( c << 6 , u32:: from ( b & CONT_MASK ) ) } ;
38+
39+ let b2 = next_byte ( ) ;
40+ let c = u32:: from ( b1 & 0x1F ) ;
41+ let c = combine ( c, b2) ;
42+ if b1 < 0xE0 {
43+ // 2 byte case (U+00_80 ..= U+07_FF):
44+ // c = (b1 & 0x1F) << 6
45+ // | (b2 & 0x3F) << 0
46+ return Some ( c) ;
47+ }
48+
49+ let b3 = next_byte ( ) ;
50+ let c = combine ( c, b3) ;
51+ if b1 < 0xF0 {
52+ // 3 byte case (U+08_00 ..= U+FF_FF):
53+ // c = (b1 & 0x1F) << 12
54+ // | (b2 & 0x3F) << 6
55+ // | (b3 & 0x3F) << 0
56+ return Some ( c) ;
6657 }
6758
68- Some ( ch)
59+ let b4 = next_byte ( ) ;
60+ let c = combine ( c, b4) ;
61+ // 4 byte case (U+01_00_00 ..= U+10_FF_FF):
62+ // c = ((b1 & 0x1F) << 18
63+ // | (b2 & 0x3F) << 12
64+ // | (b3 & 0x3F) << 6
65+ // | (b4 & 0x3F) << 0) & 0x1F_FF_FF
66+ Some ( c & 0x1F_FF_FF )
6967}
7068
7169/// Reads the last code point out of a byte iterator (assuming a
@@ -80,36 +78,52 @@ pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
8078where
8179 I : DoubleEndedIterator < Item = & ' a u8 > ,
8280{
83- // Decode UTF-8
84- let w = match * bytes. next_back ( ) ? {
85- next_byte if next_byte < 128 => return Some ( next_byte as u32 ) ,
86- back_byte => back_byte,
81+ let b1 = * bytes. next_back ( ) ?;
82+ if b1 < 0x80 {
83+ // 1 byte case (U+00_00 ..= U+00_7F):
84+ // c = b1
85+ return Some ( u32:: from ( b1) ) ;
86+ }
87+
88+ // SAFETY: `bytes` produces a UTF-8-like string, so every byte this closure reads exists and is non-ASCII (a lead or continuation byte)
89+ let mut next_byte = || unsafe {
90+ let b = * bytes. next_back ( ) . unwrap_unchecked ( ) ;
91+ assume ( !b. is_ascii ( ) ) ;
92+ b
8793 } ;
8894
89- // Multibyte case follows
90- // Decode from a byte combination out of: [x [y [z w]]]
91- let mut ch;
92- // SAFETY: `bytes` produces an UTF-8-like string,
93- // so the iterator must produce a value here.
94- let z = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
95- ch = utf8_first_byte ( z, 2 ) ;
96- if utf8_is_cont_byte ( z) {
97- // SAFETY: `bytes` produces an UTF-8-like string,
98- // so the iterator must produce a value here.
99- let y = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
100- ch = utf8_first_byte ( y, 3 ) ;
101- if utf8_is_cont_byte ( y) {
102- // SAFETY: `bytes` produces an UTF-8-like string,
103- // so the iterator must produce a value here.
104- let x = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
105- ch = utf8_first_byte ( x, 4 ) ;
106- ch = utf8_acc_cont_byte ( ch, y) ;
107- }
108- ch = utf8_acc_cont_byte ( ch, z) ;
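95+ // SAFETY: at each call `c` only holds bits below `n` (6, 12, then 18) and the shifted 6-bit value starts at bit `n`, so the operands share no set bits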
96+ let combine = |c : u32 , b : u8 , n| unsafe { disjoint_bitor ( c, u32:: from ( b & CONT_MASK ) << n) } ;
97+
98+ let b2 = next_byte ( ) ;
99+ let c = u32:: from ( b1 & CONT_MASK ) ;
100+ let c = combine ( c, b2, 6 ) ;
101+ if !utf8_is_cont_byte ( b2) {
102+ // 2 byte case (U+00_80 ..= U+07_FF):
103+ // c = (b2 & 0x3F) << 6
104+ // | (b1 & 0x3F) << 0
105+ return Some ( c) ;
106+ }
107+
108+ let b3 = next_byte ( ) ;
109+ let c = combine ( c, b3, 12 ) ;
110+ if !utf8_is_cont_byte ( b3) {
111+ // 3 byte case (U+08_00 ..= U+FF_FF):
112+ // c = ((b3 & 0x3F) << 12
113+ // | (b2 & 0x3F) << 6
114+ // | (b1 & 0x3F) << 0) & 0xFF_FF
115+ return Some ( c & 0xFF_FF ) ;
109116 }
110- ch = utf8_acc_cont_byte ( ch, w) ;
111117
112- Some ( ch)
118+ let b4 = next_byte ( ) ;
119+ let c = combine ( c, b4, 18 ) ;
120+ // (equivalent to `c | u32::from(b4 & CONT_MASK) << 18`, since the operands share no set bits)
121+ // 4 byte case (U+01_00_00 ..= U+10_FF_FF):
122+ // c = ((b4 & 0x3F) << 18
123+ // | (b3 & 0x3F) << 12
124+ // | (b2 & 0x3F) << 6
125+ // | (b1 & 0x3F) << 0) & 0x1F_FF_FF
126+ Some ( c & 0x1F_FF_FF )
113127}
114128
115129const NONASCII_MASK : usize = usize:: repeat_u8 ( 0x80 ) ;
@@ -280,5 +294,5 @@ pub const fn utf8_char_width(b: u8) -> usize {
280294 UTF8_CHAR_WIDTH [ b as usize ] as usize
281295}
282296
283- /// Mask of the value bits of a continuation byte.
297+ /// Mask of the value bits of a continuation byte (i.e., the lowest 6 bits).
284298const CONT_MASK : u8 = 0b0011_1111 ;
0 commit comments