Skip to content

Commit 54a699b

Browse files
committed
Optimize next_code_point and next_code_point_reverse
By reordering some operations, we can expose some opportunities for CSE (common subexpression elimination). Also convert the series of nested `if` branches to early returns, which IMO makes the code clearer. Comparison of assembly before and after for `next_code_point`: https://godbolt.org/z/9Te84YzhK Comparison of assembly before and after for `next_code_point_reverse`: https://godbolt.org/z/fTx1a7oz1
1 parent 03fcfe0 commit 54a699b

File tree

1 file changed

+85
-71
lines changed

1 file changed

+85
-71
lines changed

library/core/src/str/validations.rs

Lines changed: 85 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,7 @@
11
//! Operations related to UTF-8 validation.
22
33
use super::Utf8Error;
4-
use crate::intrinsics::const_eval_select;
5-
6-
/// Returns the initial codepoint accumulator for the first byte.
7-
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
8-
/// for width 3, and 3 bits for width 4.
9-
#[inline]
10-
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
11-
(byte & (0x7F >> width)) as u32
12-
}
13-
14-
/// Returns the value of `ch` updated with continuation byte `byte`.
15-
#[inline]
16-
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
17-
(ch << 6) | (byte & CONT_MASK) as u32
18-
}
4+
use crate::intrinsics::{assume, const_eval_select, disjoint_bitor};
195

206
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
217
/// bits `10`).
@@ -33,39 +19,51 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
3319
#[unstable(feature = "str_internals", issue = "none")]
3420
#[inline]
3521
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
36-
// Decode UTF-8
37-
let x = *bytes.next()?;
38-
if x < 128 {
39-
return Some(x as u32);
22+
let b1 = *bytes.next()?;
23+
if b1 < 0x80 {
24+
// 1 byte case (U+00_00 ..= U+00_7F):
25+
// c = b1
26+
return Some(u32::from(b1));
4027
}
4128

42-
// Multibyte case follows
43-
// Decode from a byte combination out of: [[[x y] z] w]
44-
// NOTE: Performance is sensitive to the exact formulation here
45-
let init = utf8_first_byte(x, 2);
46-
// SAFETY: `bytes` produces an UTF-8-like string,
47-
// so the iterator must produce a value here.
48-
let y = unsafe { *bytes.next().unwrap_unchecked() };
49-
let mut ch = utf8_acc_cont_byte(init, y);
50-
if x >= 0xE0 {
51-
// [[x y z] w] case
52-
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53-
// SAFETY: `bytes` produces an UTF-8-like string,
54-
// so the iterator must produce a value here.
55-
let z = unsafe { *bytes.next().unwrap_unchecked() };
56-
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
57-
ch = init << 12 | y_z;
58-
if x >= 0xF0 {
59-
// [x y z w] case
60-
// use only the lower 3 bits of `init`
61-
// SAFETY: `bytes` produces an UTF-8-like string,
62-
// so the iterator must produce a value here.
63-
let w = unsafe { *bytes.next().unwrap_unchecked() };
64-
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
65-
}
29+
// SAFETY: `bytes` produces a UTF-8-like string
30+
let mut next_byte = || unsafe {
31+
let b = *bytes.next().unwrap_unchecked();
32+
assume(utf8_is_cont_byte(b));
33+
b
34+
};
35+
36+
// SAFETY: `c << 6` has its low 6 bits clear and `b & CONT_MASK` only occupies the low 6 bits, so the operands share no set bits
37+
let combine = |c: u32, b: u8| unsafe { disjoint_bitor(c << 6, u32::from(b & CONT_MASK)) };
38+
39+
let b2 = next_byte();
40+
let c = u32::from(b1 & 0x1F);
41+
let c = combine(c, b2);
42+
if b1 < 0xE0 {
43+
// 2 byte case (U+00_80 ..= U+07_FF):
44+
// c = (b1 & 0x1F) << 6
45+
// | (b2 & 0x3F) << 0
46+
return Some(c);
47+
}
48+
49+
let b3 = next_byte();
50+
let c = combine(c, b3);
51+
if b1 < 0xF0 {
52+
// 3 byte case (U+08_00 ..= U+FF_FF):
53+
// c = (b1 & 0x1F) << 12
54+
// | (b2 & 0x3F) << 6
55+
// | (b3 & 0x3F) << 0
56+
return Some(c);
6657
}
6758

68-
Some(ch)
59+
let b4 = next_byte();
60+
let c = combine(c, b4);
61+
// 4 byte case (U+01_00_00 ..= U+10_FF_FF):
62+
// c = ((b1 & 0x1F) << 18
63+
// | (b2 & 0x3F) << 12
64+
// | (b3 & 0x3F) << 6
65+
// | (b4 & 0x3F) << 0) & 0x1F_FF_FF
66+
Some(c & 0x1F_FF_FF)
6967
}
7068

7169
/// Reads the last code point out of a byte iterator (assuming a
@@ -80,36 +78,52 @@ pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
8078
where
8179
I: DoubleEndedIterator<Item = &'a u8>,
8280
{
83-
// Decode UTF-8
84-
let w = match *bytes.next_back()? {
85-
next_byte if next_byte < 128 => return Some(next_byte as u32),
86-
back_byte => back_byte,
81+
let b1 = *bytes.next_back()?;
82+
if b1 < 0x80 {
83+
// 1 byte case (U+00_00 ..= U+00_7F):
84+
// c = b1
85+
return Some(u32::from(b1));
86+
}
87+
88+
// SAFETY: `bytes` produces a UTF-8-like string
89+
let mut next_byte = || unsafe {
90+
let b = *bytes.next_back().unwrap_unchecked();
91+
assume(!b.is_ascii());
92+
b
8793
};
8894

89-
// Multibyte case follows
90-
// Decode from a byte combination out of: [x [y [z w]]]
91-
let mut ch;
92-
// SAFETY: `bytes` produces an UTF-8-like string,
93-
// so the iterator must produce a value here.
94-
let z = unsafe { *bytes.next_back().unwrap_unchecked() };
95-
ch = utf8_first_byte(z, 2);
96-
if utf8_is_cont_byte(z) {
97-
// SAFETY: `bytes` produces an UTF-8-like string,
98-
// so the iterator must produce a value here.
99-
let y = unsafe { *bytes.next_back().unwrap_unchecked() };
100-
ch = utf8_first_byte(y, 3);
101-
if utf8_is_cont_byte(y) {
102-
// SAFETY: `bytes` produces an UTF-8-like string,
103-
// so the iterator must produce a value here.
104-
let x = unsafe { *bytes.next_back().unwrap_unchecked() };
105-
ch = utf8_first_byte(x, 4);
106-
ch = utf8_acc_cont_byte(ch, y);
107-
}
108-
ch = utf8_acc_cont_byte(ch, z);
95+
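// SAFETY: at every call site `c` only occupies the low `n` bits, and `(b & CONT_MASK) << n` has those bits clear, so the operands share no set bits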
96+
let combine = |c: u32, b: u8, n| unsafe { disjoint_bitor(c, u32::from(b & CONT_MASK) << n) };
97+
98+
let b2 = next_byte();
99+
let c = u32::from(b1 & CONT_MASK);
100+
let c = combine(c, b2, 6);
101+
if !utf8_is_cont_byte(b2) {
102+
// 2 byte case (U+00_80 ..= U+07_FF):
103+
// c = (b2 & 0x3F) << 6
104+
// | (b1 & 0x3F) << 0
105+
return Some(c);
106+
}
107+
108+
let b3 = next_byte();
109+
let c = combine(c, b3, 12);
110+
if !utf8_is_cont_byte(b3) {
111+
// 3 byte case (U+08_00 ..= U+FF_FF):
112+
// c = ((b3 & 0x3F) << 12
113+
// | (b2 & 0x3F) << 6
114+
// | (b1 & 0x3F) << 0) & 0xFF_FF
115+
return Some(c & 0xFF_FF);
109116
}
110-
ch = utf8_acc_cont_byte(ch, w);
111117

112-
Some(ch)
118+
let b4 = next_byte();
119+
let c = combine(c, b4, 18);
120+
// let c = c | u32::from(b4 & CONT_MASK) << 18;
121+
// 4 byte case (U+01_00_00 ..= U+10_FF_FF):
122+
// c = ((b4 & 0x3F) << 18
123+
// | (b3 & 0x3F) << 12
124+
// | (b2 & 0x3F) << 6
125+
// | (b1 & 0x3F) << 0) & 0x1F_FF_FF
126+
Some(c & 0x1F_FF_FF)
113127
}
114128

115129
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
@@ -280,5 +294,5 @@ pub const fn utf8_char_width(b: u8) -> usize {
280294
UTF8_CHAR_WIDTH[b as usize] as usize
281295
}
282296

283-
/// Mask of the value bits of a continuation byte.
297+
/// Mask of the value bits of a continuation byte (i.e., the lowest 6 bits).
284298
const CONT_MASK: u8 = 0b0011_1111;

0 commit comments

Comments
 (0)