From 1bb9b151c9f9b5116254827f04add845aff33408 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Mon, 11 Aug 2025 00:57:29 +0000 Subject: [PATCH] refactor: Hard-code `char::is_control` According to https://www.unicode.org/policies/stability_policy.html#Property_Value, the set of codepoints in `Cc` will never change. So we can hard-code the patterns to match against instead of using a table. --- library/core/src/char/methods.rs | 6 ++++- library/core/src/unicode/mod.rs | 1 - library/core/src/unicode/unicode_data.rs | 25 ------------------- src/tools/unicode-table-generator/src/main.rs | 1 - 4 files changed, 5 insertions(+), 28 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 7ee0962721f5b..61ac7f8a339f6 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -950,7 +950,11 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_control(self) -> bool { - unicode::Cc(self) + // According to + // https://www.unicode.org/policies/stability_policy.html#Property_Value, + // the set of codepoints in `Cc` will never change. + // So we can just hard-code the patterns to match against instead of using a table. + matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}') } /// Returns `true` if this `char` has the `Grapheme_Extend` property. diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 49dbdeb1a6d1c..e1cb69c3c4fe4 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -9,7 +9,6 @@ pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; -pub(crate) use unicode_data::cc::lookup as Cc; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::n::lookup as N; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index b57234bbee9a2..55f64f1e96e66 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -358,31 +358,6 @@ pub mod cased { } } -#[rustfmt::skip] -pub mod cc { - use super::ShortOffsetRunHeader; - - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 1] = [ - ShortOffsetRunHeader::new(0, 1114272), - ]; - static OFFSETS: [u8; 5] = [ - 0, 32, 95, 33, 0, - ]; - pub fn lookup(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } - } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. - unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } - } -} - #[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 6cdb82a87bdff..38e5e8bbdb9cd 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -92,7 +92,6 @@ static PROPERTIES: &[&str] = &[ "Case_Ignorable", "Grapheme_Extend", "White_Space", - "Cc", "N", ];