Skip to content

Commit 48d0413

Browse files
committed
Add exhaustive tests for next_code_point and next_code_point_reverse
There are only 0x110000 (1,114,112) possible codepoints, U+0000 through U+10FFFF, so we can exhaustively test all of them.
1 parent f3fd3ef commit 48d0413

File tree

4 files changed

+83
-2
lines changed

4 files changed

+83
-2
lines changed

library/alloctests/tests/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#![feature(iter_next_chunk)]
2626
#![feature(slice_partition_dedup)]
2727
#![feature(string_from_utf8_lossy_owned)]
28+
#![feature(str_internals)]
29+
#![feature(char_internals)]
2830
#![feature(string_remove_matches)]
2931
#![feature(const_btree_len)]
3032
#![feature(const_trait_impl)]

library/alloctests/tests/str.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,84 @@ fn test_total_ord() {
11561156
assert_eq!("22".cmp("1234"), Greater);
11571157
}
11581158

1159+
// There are only 1,114,112 code points (including surrogates for WTF-8). So we
1160+
// can test `next_code_point` and `next_code_point_reverse` exhaustively on all
1161+
// possible inputs.
1162+
1163+
/// Assert that encoding a codepoint with `encode_utf8_raw` and then decoding it
1164+
/// with `next_code_point` preserves the codepoint.
1165+
fn test_next_code_point(codepoint: u32) {
1166+
let mut bytes = [0; 4];
1167+
let mut bytes = std::char::encode_utf8_raw(codepoint, &mut bytes).iter();
1168+
1169+
// SAFETY: `bytes` is UTF8-like
1170+
let got = unsafe { core::str::next_code_point(&mut bytes) };
1171+
assert_eq!(got, Some(codepoint));
1172+
1173+
// SAFETY: `bytes` is UTF8-like
1174+
let got = unsafe { core::str::next_code_point(&mut bytes) };
1175+
assert_eq!(got, None);
1176+
}
1177+
1178+
/// The same but for `next_code_point_reverse`.
1179+
fn test_next_code_point_reverse(codepoint: u32) {
1180+
let mut bytes = [0; 4];
1181+
let mut bytes = std::char::encode_utf8_raw(codepoint, &mut bytes).iter();
1182+
1183+
// SAFETY: `bytes` is UTF8-like
1184+
let got = unsafe { core::str::next_code_point_reverse(&mut bytes) };
1185+
assert_eq!(got, Some(codepoint));
1186+
1187+
// SAFETY: `bytes` is UTF8-like
1188+
let got = unsafe { core::str::next_code_point_reverse(&mut bytes) };
1189+
assert_eq!(got, None);
1190+
}
1191+
1192+
#[test]
1193+
#[cfg_attr(miri, ignore)] // Disabled on Miri because it is too slow
1194+
fn test_next_code_point_exhaustive() {
1195+
for c in 0..=u32::from(char::MAX) {
1196+
test_next_code_point(c);
1197+
}
1198+
}
1199+
1200+
#[test]
1201+
#[cfg_attr(miri, ignore)] // Disabled on Miri because it is too slow
1202+
fn test_next_code_point_reverse_exhaustive() {
1203+
for c in 0..=u32::from(char::MAX) {
1204+
test_next_code_point_reverse(c);
1205+
}
1206+
}
1207+
1208+
#[rustfmt::skip]
1209+
const CODEPOINT_BOUNDARIES: &[u32] = &[
1210+
// 1 byte codepoints (U+0000 ..= U+007F):
1211+
0x0000, 0x007F,
1212+
1213+
// 2 byte codepoints (U+0080 ..= U+07FF):
1214+
0x0080, 0x07FF,
1215+
1216+
// 3 byte codepoints (U+0800 ..= U+FFFF):
1217+
0x0800, 0xFFFF,
1218+
1219+
// 4 byte codepoints (U+01_0000 ..= U+10_FFFF):
1220+
0x01_0000, 0x10_FFFF,
1221+
];
1222+
1223+
#[test]
1224+
fn test_next_code_point_boundary_conditions() {
1225+
for c in CODEPOINT_BOUNDARIES {
1226+
test_next_code_point(*c);
1227+
}
1228+
}
1229+
1230+
#[test]
1231+
fn test_next_code_point_reverse_boundary_conditions() {
1232+
for c in CODEPOINT_BOUNDARIES {
1233+
test_next_code_point_reverse(*c);
1234+
}
1235+
}
1236+
11591237
#[test]
11601238
fn test_iterator() {
11611239
let s = "ศไทย中华Việt Nam";

library/core/src/str/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
5858
#[stable(feature = "rust1", since = "1.0.0")]
5959
pub use traits::FromStr;
6060
#[unstable(feature = "str_internals", issue = "none")]
61-
pub use validations::{next_code_point, utf8_char_width};
61+
pub use validations::{next_code_point, next_code_point_reverse, utf8_char_width};
6262

6363
#[inline(never)]
6464
#[cold]

library/core/src/str/validations.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,9 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
7474
/// # Safety
7575
///
7676
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
77+
#[unstable(feature = "str_internals", issue = "none")]
7778
#[inline]
78-
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
79+
pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
7980
where
8081
I: DoubleEndedIterator<Item = &'a u8>,
8182
{

0 commit comments

Comments
 (0)