optimization: Don't include ASCII characters in Unicode tables

Kmeakin · Kmeakin · commit 8bf298a5afe8 · 2025-08-10T21:38:46.000Z
The ASCII subset of Unicode is fixed and will never change, so we don't
need to generate tables for it with every new Unicode version. This
saves a few bytes of static data and speeds up `char::is_control` and
`char::is_grapheme_extended` on ASCII inputs.

Since the table lookup functions exported from the `unicode` module will
give nonsensical errors on ASCII input (and in fact will panic in debug
mode), I had to add some private wrapper methods to `char` which check
for ASCII-ness first.
diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs
@@ -418,9 +418,8 @@ impl str {
         }
 
         fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
-            use core::unicode::{Case_Ignorable, Cased};
-            match iter.skip_while(|&c| Case_Ignorable(c)).next() {
-                Some(c) => Cased(c),
+            match iter.skip_while(|&c| c.is_case_ignorable()).next() {
+                Some(c) => c.is_cased(),
                 None => false,
             }
         }
diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs
@@ -950,7 +950,7 @@ impl char {
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
     pub fn is_control(self) -> bool {
-        unicode::Cc(self)
+        if self.is_ascii() { self.is_ascii_control() } else { unicode::Cc(self) }
     }
 
     /// Returns `true` if this `char` has the `Grapheme_Extend` property.
@@ -965,7 +965,41 @@ impl char {
     #[must_use]
     #[inline]
     pub(crate) fn is_grapheme_extended(self) -> bool {
-        unicode::Grapheme_Extend(self)
+        !self.is_ascii() && unicode::Grapheme_Extend(self)
+    }
+
+    /// Returns `true` if this `char` has the `Cased` property.
+    ///
+    /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
+    /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
+    ///
+    /// [Unicode Standard]: https://www.unicode.org/versions/latest/
+    /// [ucd]: https://www.unicode.org/reports/tr44/
+    /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
+    #[must_use]
+    #[inline]
+    #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+    pub fn is_cased(self) -> bool {
+        if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
+    }
+
+    /// Returns `true` if this `char` has the `Cased` property.
+    ///
+    /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
+    /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
+    ///
+    /// [Unicode Standard]: https://www.unicode.org/versions/latest/
+    /// [ucd]: https://www.unicode.org/reports/tr44/
+    /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
+    #[must_use]
+    #[inline]
+    #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+    pub fn is_case_ignorable(self) -> bool {
+        if self.is_ascii() {
+            matches!(self, '\'' | '.' | ':' | '^' | '`')
+        } else {
+            unicode::Case_Ignorable(self)
+        }
     }
 
     /// Returns `true` if this `char` has one of the general categories for numbers.
diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs
@@ -3,12 +3,12 @@
 
 // for use in alloc, not re-exported in std.
 #[rustfmt::skip]
-pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
-pub use unicode_data::cased::lookup as Cased;
 pub use unicode_data::conversions;
 
 #[rustfmt::skip]
 pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
+pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
+pub(crate) use unicode_data::cased::lookup as Cased;
 pub(crate) use unicode_data::cc::lookup as Cc;
 pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
 pub(crate) use unicode_data::lowercase::lookup as Lowercase;
diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs
@@ -64,6 +64,7 @@ impl RawEmitter {
 
         writeln!(&mut self.file, "#[inline]").unwrap();
         writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
         writeln!(&mut self.file, "    match c as u32 >> 8 {{").unwrap();
         for arm in arms {
             writeln!(&mut self.file, "        {arm},").unwrap();
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
@@ -196,6 +196,7 @@ fn load_data() -> UnicodeData {
                 .into_iter()
                 .flatten()
                 .flat_map(|cp| cp.scalar())
+                .filter(|c| !c.is_ascii())
                 .map(u32::from)
                 .collect::<Vec<_>>();
             (prop, ranges_from_set(&codepoints))
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -98,6 +98,7 @@ impl RawEmitter {
         self.blank_line();
 
         writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
         if first_code_point > 0x7f {
             writeln!(&mut self.file, "    (c as u32) >= {first_code_point:#04x} &&").unwrap();
         }
diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs
@@ -99,6 +99,7 @@ impl RawEmitter {
         if first_code_point > 0x7f {
             writeln!(&mut self.file, "#[inline]").unwrap();
             writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+            writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
             writeln!(&mut self.file, "    (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
                 .unwrap();
             writeln!(&mut self.file, "}}").unwrap();
@@ -107,6 +108,7 @@ impl RawEmitter {
             writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
         } else {
             writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+            writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
         }
         writeln!(&mut self.file, "    const {{").unwrap();
         writeln!(

Original file line number	Diff line number	Diff line change
`@@ -418,9 +418,8 @@ impl str {`
`418`	`418`	`}`
`419`	`419`
`420`	`420`	`fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {`
`421`		`- use core::unicode::{Case_Ignorable, Cased};`
`422`		`- match iter.skip_while(\|&c\| Case_Ignorable(c)).next() {`
`423`		`- Some(c) => Cased(c),`
	`421`	`+ match iter.skip_while(\|&c\| c.is_case_ignorable()).next() {`
	`422`	`+ Some(c) => c.is_cased(),`
`424`	`423`	`None => false,`
`425`	`424`	`}`
`426`	`425`	`}`
Original file line number	Diff line number	Diff line change
`@@ -98,6 +98,7 @@ impl RawEmitter {`
`98`	`98`	`self.blank_line();`
`99`	`99`
`100`	`100`	`writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();`
	`101`	`+ writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();`
`101`	`102`	`if first_code_point > 0x7f {`
`102`	`103`	`writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();`
`103`	`104`	`}`