Add exhaustive tests for next_code_point and next_code_point_reverse

Kmeakin · Kmeakin · commit 03fcfe096111 · 2025-07-08T00:48:38.000+01:00
There are only 0x110000 (1,114,112) possible code points (0 through
0x10FFFF), so we can exhaustively test all of them.
diff --git a/library/alloctests/tests/lib.rs b/library/alloctests/tests/lib.rs
@@ -25,6 +25,8 @@
 #![feature(round_char_boundary)]
 #![feature(slice_partition_dedup)]
 #![feature(string_from_utf8_lossy_owned)]
+#![feature(str_internals)]
+#![feature(char_internals)]
 #![feature(string_remove_matches)]
 #![feature(const_btree_len)]
 #![feature(const_trait_impl)]
diff --git a/library/alloctests/tests/str.rs b/library/alloctests/tests/str.rs
@@ -1156,6 +1156,99 @@ fn test_total_ord() {
     assert_eq!("22".cmp("1234"), Greater);
 }
 
+// There are only 1,114,112 code points (including surrogates for WTF-8). So we
+// can test `next_code_point` and `next_code_point_reverse` exhaustively on all
+// possible inputs.
+
+/// Assert that encoding a codepoint with `encode_utf8_raw` and then decoding it
+/// with `next_code_point` yields that codepoint once, followed by `None`.
+fn test_next_code_point(codepoint: u32) {
+    let mut bytes = [0; 4];
+    let mut bytes = std::char::encode_utf8_raw(codepoint, &mut bytes).iter();
+
+    // SAFETY: `bytes` iterates over the output of `encode_utf8_raw` (UTF-8-like)
+    let got = unsafe { core::str::next_code_point(&mut bytes) };
+    assert_eq!(got, Some(codepoint));
+
+    // SAFETY: same iterator; any remainder of that sequence is expected to be empty
+    let got = unsafe { core::str::next_code_point(&mut bytes) };
+    assert_eq!(got, None);
+}
+
+/// Assert that `encode_utf8_raw` then `next_code_point_reverse` round-trips the codepoint.
+fn test_next_code_point_reverse(codepoint: u32) {
+    let mut bytes = [0; 4];
+    let mut bytes = std::char::encode_utf8_raw(codepoint, &mut bytes).iter();
+
+    // SAFETY: `bytes` iterates over the output of `encode_utf8_raw` (UTF-8-like)
+    let got = unsafe { core::str::next_code_point_reverse(&mut bytes) };
+    assert_eq!(got, Some(codepoint));
+
+    // SAFETY: same iterator; any remainder of that sequence is expected to be empty
+    let got = unsafe { core::str::next_code_point_reverse(&mut bytes) };
+    assert_eq!(got, None);
+}
+
+#[test]
+fn test_next_code_point_1byte() {
+    for c in 0..0x80 {
+        test_next_code_point(c);
+    }
+}
+
+#[test]
+fn test_next_code_point_2byte() {
+    for c in 0x80..0x800 {
+        test_next_code_point(c);
+    }
+}
+
+#[test]
+#[cfg(not(miri))] // Disabled on Miri because it is too slow
+fn test_next_code_point_3byte() {
+    for c in 0x800..0x10_000 {
+        test_next_code_point(c);
+    }
+}
+
+#[test]
+#[cfg(not(miri))] // Disabled on Miri because it is too slow
+fn test_next_code_point_4byte() {
+    for c in 0x10_000..=u32::from(char::MAX) {
+        test_next_code_point(c);
+    }
+}
+
+#[test]
+fn test_next_code_point_reverse_1byte() {
+    for c in 0..0x80 {
+        test_next_code_point_reverse(c);
+    }
+}
+
+#[test]
+fn test_next_code_point_reverse_2byte() {
+    for c in 0x80..0x800 {
+        test_next_code_point_reverse(c);
+    }
+}
+
+#[test]
+#[cfg(not(miri))] // Disabled on Miri because it is too slow
+fn test_next_code_point_reverse_3byte() {
+    for c in 0x800..0x10_000 {
+        test_next_code_point_reverse(c);
+    }
+}
+
+#[test]
+#[cfg(not(miri))] // Disabled on Miri because it is too slow
+fn test_next_code_point_reverse_4byte() {
+    for c in 0x10_000..=u32::from(char::MAX) {
+        test_next_code_point_reverse(c);
+    }
+}
+
 #[test]
 fn test_iterator() {
     let s = "ศไทย中华Việt Nam";
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -58,7 +58,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use traits::FromStr;
 #[unstable(feature = "str_internals", issue = "none")]
-pub use validations::{next_code_point, utf8_char_width};
+pub use validations::{next_code_point, next_code_point_reverse, utf8_char_width};
 
 #[inline(never)]
 #[cold]
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -74,8 +74,9 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
 /// # Safety
 ///
 /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
+#[unstable(feature = "str_internals", issue = "none")]
 #[inline]
-pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
+pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
 where
     I: DoubleEndedIterator<Item = &'a u8>,
 {