Skip to content

Commit 8bf298a

Browse files
committed
optimization: Don't include ASCII characters in Unicode tables
The ASCII subset of Unicode is fixed and will never change, so we don't need to generate tables for it with every new Unicode version. This saves a few bytes of static data and speeds up `char::is_control` and `char::is_grapheme_extended` on ASCII inputs. Since the table lookup functions exported from the `unicode` module will give nonsensical results on ASCII input (and in fact will panic in debug mode), I had to add some private wrapper methods to `char` which check for ASCII-ness first.
1 parent 3a3f30b commit 8bf298a

File tree

8 files changed

+331
-252
lines changed

8 files changed

+331
-252
lines changed

library/alloc/src/str.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,8 @@ impl str {
418418
}
419419

420420
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
421-
use core::unicode::{Case_Ignorable, Cased};
422-
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
423-
Some(c) => Cased(c),
421+
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
422+
Some(c) => c.is_cased(),
424423
None => false,
425424
}
426425
}

library/core/src/char/methods.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -950,7 +950,7 @@ impl char {
950950
#[stable(feature = "rust1", since = "1.0.0")]
951951
#[inline]
952952
pub fn is_control(self) -> bool {
953-
unicode::Cc(self)
953+
if self.is_ascii() { self.is_ascii_control() } else { unicode::Cc(self) }
954954
}
955955

956956
/// Returns `true` if this `char` has the `Grapheme_Extend` property.
@@ -965,7 +965,41 @@ impl char {
965965
#[must_use]
966966
#[inline]
967967
pub(crate) fn is_grapheme_extended(self) -> bool {
968-
unicode::Grapheme_Extend(self)
968+
!self.is_ascii() && unicode::Grapheme_Extend(self)
969+
}
970+
971+
/// Returns `true` if this `char` has the `Cased` property.
972+
///
973+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
974+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
975+
///
976+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
977+
/// [ucd]: https://www.unicode.org/reports/tr44/
978+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
979+
#[must_use]
980+
#[inline]
981+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
982+
pub fn is_cased(self) -> bool {
983+
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
984+
}
985+
986+
/// Returns `true` if this `char` has the `Case_Ignorable` property.
987+
///
988+
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
989+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
990+
///
991+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
992+
/// [ucd]: https://www.unicode.org/reports/tr44/
993+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
994+
#[must_use]
995+
#[inline]
996+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
997+
pub fn is_case_ignorable(self) -> bool {
998+
if self.is_ascii() {
999+
matches!(self, '\'' | '.' | ':' | '^' | '`')
1000+
} else {
1001+
unicode::Case_Ignorable(self)
1002+
}
9691003
}
9701004

9711005
/// Returns `true` if this `char` has one of the general categories for numbers.

library/core/src/unicode/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33

44
// for use in alloc, not re-exported in std.
55
#[rustfmt::skip]
6-
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
7-
pub use unicode_data::cased::lookup as Cased;
86
pub use unicode_data::conversions;
97

108
#[rustfmt::skip]
119
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
10+
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
11+
pub(crate) use unicode_data::cased::lookup as Cased;
1212
pub(crate) use unicode_data::cc::lookup as Cc;
1313
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1414
pub(crate) use unicode_data::lowercase::lookup as Lowercase;

library/core/src/unicode/unicode_data.rs

Lines changed: 286 additions & 245 deletions
Large diffs are not rendered by default.

src/tools/unicode-table-generator/src/cascading_map.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ impl RawEmitter {
6464

6565
writeln!(&mut self.file, "#[inline]").unwrap();
6666
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
67+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
6768
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
6869
for arm in arms {
6970
writeln!(&mut self.file, " {arm},").unwrap();

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ fn load_data() -> UnicodeData {
196196
.into_iter()
197197
.flatten()
198198
.flat_map(|cp| cp.scalar())
199+
.filter(|c| !c.is_ascii())
199200
.map(u32::from)
200201
.collect::<Vec<_>>();
201202
(prop, ranges_from_set(&codepoints))

src/tools/unicode-table-generator/src/raw_emitter.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ impl RawEmitter {
9898
self.blank_line();
9999

100100
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
101+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
101102
if first_code_point > 0x7f {
102103
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
103104
}

src/tools/unicode-table-generator/src/skiplist.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ impl RawEmitter {
9999
if first_code_point > 0x7f {
100100
writeln!(&mut self.file, "#[inline]").unwrap();
101101
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
102+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
102103
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
103104
.unwrap();
104105
writeln!(&mut self.file, "}}").unwrap();
@@ -107,6 +108,7 @@ impl RawEmitter {
107108
writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
108109
} else {
109110
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
111+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
110112
}
111113
writeln!(&mut self.file, " const {{").unwrap();
112114
writeln!(

0 commit comments

Comments
 (0)