Skip to content

Commit 03fcfe0

Browse files
committed
Add exhaustive tests for next_code_point and next_code_point_reverse
There are only 0x110000 (1,114,112) possible code points, 0 through 0x10FFFF, so we can exhaustively test all of them.
1 parent a2d45f7 commit 03fcfe0

File tree

4 files changed

+98
-2
lines changed

4 files changed

+98
-2
lines changed

library/alloctests/tests/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#![feature(round_char_boundary)]
2626
#![feature(slice_partition_dedup)]
2727
#![feature(string_from_utf8_lossy_owned)]
28+
#![feature(str_internals)]
29+
#![feature(char_internals)]
2830
#![feature(string_remove_matches)]
2931
#![feature(const_btree_len)]
3032
#![feature(const_trait_impl)]

library/alloctests/tests/str.rs

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,99 @@ fn test_total_ord() {
11561156
assert_eq!("22".cmp("1234"), Greater);
11571157
}
11581158

1159+
// There are only 1,114,112 code points (including surrogates for WTF-8). So we
1160+
// can test `next_code_point` and `next_code_point_reverse` exhaustively on all
1161+
// possible inputs.
1162+
1163+
/// Asserts that encoding `codepoint` with `encode_utf8_raw` and decoding it with
1164+
/// `next_code_point` yields `codepoint` again, and that a second decode returns `None`.
1165+
fn test_next_code_point(codepoint: u32) {
1166+
let mut bytes = [0; 4];
1167+
let mut bytes = std::char::encode_utf8_raw(codepoint, &mut bytes).iter();
1168+
1169+
// SAFETY: `bytes` iterates over the output of `encode_utf8_raw`, which is UTF-8-like
1170+
let got = unsafe { core::str::next_code_point(&mut bytes) };
1171+
assert_eq!(got, Some(codepoint));
1172+
1173+
// SAFETY: whatever is left in `bytes` (expected: nothing) is still UTF-8-like
1174+
let got = unsafe { core::str::next_code_point(&mut bytes) };
1175+
assert_eq!(got, None);
1176+
}
1177+
1178+
/// Same round-trip check as `test_next_code_point`, but decoding with `next_code_point_reverse`.
1179+
fn test_next_code_point_reverse(codepoint: u32) {
1180+
let mut bytes = [0; 4];
1181+
let mut bytes = std::char::encode_utf8_raw(codepoint, &mut bytes).iter();
1182+
1183+
// SAFETY: `bytes` iterates over the output of `encode_utf8_raw`, which is UTF-8-like
1184+
let got = unsafe { core::str::next_code_point_reverse(&mut bytes) };
1185+
assert_eq!(got, Some(codepoint));
1186+
1187+
// SAFETY: whatever is left in `bytes` (expected: nothing) is still UTF-8-like
1188+
let got = unsafe { core::str::next_code_point_reverse(&mut bytes) };
1189+
assert_eq!(got, None);
1190+
}
1191+
1192+
#[test]
1193+
fn test_next_code_point_1byte() {
1194+
for c in 0..0x80 { // U+0000..=U+007F: the 1-byte encodings
1195+
test_next_code_point(c);
1196+
}
1197+
}
1198+
1199+
#[test]
1200+
fn test_next_code_point_2byte() {
1201+
for c in 0x80..0x800 { // U+0080..=U+07FF: the 2-byte encodings
1202+
test_next_code_point(c);
1203+
}
1204+
}
1205+
1206+
#[test]
1207+
#[cfg(not(miri))] // Disabled on Miri because it is too slow
1208+
fn test_next_code_point_3byte() {
1209+
for c in 0x800..0x10_000 { // U+0800..=U+FFFF: the 3-byte encodings, surrogates included
1210+
test_next_code_point(c);
1211+
}
1212+
}
1213+
1214+
#[test]
1215+
#[cfg(not(miri))] // Disabled on Miri because it is too slow
1216+
fn test_next_code_point_4byte() {
1217+
for c in 0x10_000..=u32::from(char::MAX) { // U+10000..=U+10FFFF: the 4-byte encodings
1218+
test_next_code_point(c);
1219+
}
1220+
}
1221+
1222+
#[test]
1223+
fn test_next_code_point_reverse_1byte() {
1224+
for c in 0..0x80 { // U+0000..=U+007F: the 1-byte encodings
1225+
test_next_code_point_reverse(c);
1226+
}
1227+
}
1228+
1229+
#[test]
1230+
fn test_next_code_point_reverse_2byte() {
1231+
for c in 0x80..0x800 { // U+0080..=U+07FF: the 2-byte encodings
1232+
test_next_code_point_reverse(c);
1233+
}
1234+
}
1235+
1236+
#[test]
1237+
#[cfg(not(miri))] // Disabled on Miri because it is too slow
1238+
fn test_next_code_point_reverse_3byte() {
1239+
for c in 0x800..0x10_000 { // U+0800..=U+FFFF: the 3-byte encodings, surrogates included
1240+
test_next_code_point_reverse(c);
1241+
}
1242+
}
1243+
1244+
#[test]
1245+
#[cfg(not(miri))] // Disabled on Miri because it is too slow
1246+
fn test_next_code_point_reverse_4byte() {
1247+
for c in 0x10_000..=u32::from(char::MAX) { // U+10000..=U+10FFFF: the 4-byte encodings
1248+
test_next_code_point_reverse(c);
1249+
}
1250+
}
1251+
11591252
#[test]
11601253
fn test_iterator() {
11611254
let s = "ศไทย中华Việt Nam";

library/core/src/str/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
5858
#[stable(feature = "rust1", since = "1.0.0")]
5959
pub use traits::FromStr;
6060
#[unstable(feature = "str_internals", issue = "none")]
61-
pub use validations::{next_code_point, utf8_char_width};
61+
pub use validations::{next_code_point, next_code_point_reverse, utf8_char_width};
6262

6363
#[inline(never)]
6464
#[cold]

library/core/src/str/validations.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,9 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
7474
/// # Safety
7575
///
7676
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
77+
#[unstable(feature = "str_internals", issue = "none")]
7778
#[inline]
78-
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
79+
pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
7980
where
8081
I: DoubleEndedIterator<Item = &'a u8>,
8182
{

0 commit comments

Comments
 (0)