Skip to content

Commit ce72b92

Browse files
Auto merge of #145219 - Kmeakin:km/optimize-unicode-tables, r=<try>
Reduce size of Unicode tables
2 parents 350d0ef + 3d5b2b8 commit ce72b92

File tree

10 files changed

+522
-553
lines changed

10 files changed

+522
-553
lines changed

library/alloc/src/str.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,8 @@ impl str {
418418
}
419419

420420
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
421-
use core::unicode::{Case_Ignorable, Cased};
422-
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
423-
Some(c) => Cased(c),
421+
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
422+
Some(c) => c.is_cased(),
424423
None => false,
425424
}
426425
}

library/core/src/char/methods.rs

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::slice;
66
use crate::str::from_utf8_unchecked_mut;
77
use crate::ub_checks::assert_unsafe_precondition;
88
use crate::unicode::printable::is_printable;
9-
use crate::unicode::{self, conversions};
9+
use crate::unicode::{self, Case_Ignorable, conversions};
1010

1111
impl char {
1212
/// The lowest valid code point a `char` can have, `'\0'`.
@@ -950,7 +950,11 @@ impl char {
950950
#[stable(feature = "rust1", since = "1.0.0")]
951951
#[inline]
952952
pub fn is_control(self) -> bool {
953-
unicode::Cc(self)
953+
// According to
954+
// https://www.unicode.org/policies/stability_policy.html#Property_Value,
955+
// the set of codepoints in `Cc` will never change. So we can hard-code
956+
// the patterns to match against instead of using a table.
957+
matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}')
954958
}
955959

956960
/// Returns `true` if this `char` has the `Grapheme_Extend` property.
@@ -965,7 +969,47 @@ impl char {
965969
#[must_use]
966970
#[inline]
967971
pub(crate) fn is_grapheme_extended(self) -> bool {
968-
unicode::Grapheme_Extend(self)
972+
!self.is_ascii() && unicode::Grapheme_Extend(self)
973+
}
974+
975+
/// Returns `true` if this `char` has the `Cased` derived property.
976+
///
977+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
978+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
979+
///
980+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
981+
/// [ucd]: https://www.unicode.org/reports/tr44/
982+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
983+
#[must_use]
984+
#[inline]
985+
#[doc(hidden)]
986+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987+
pub fn is_cased(self) -> bool {
988+
if self.is_ascii() {
989+
self.is_ascii_alphabetic()
990+
} else {
991+
unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self)
992+
}
993+
}
994+
995+
/// Returns `true` if this `char` has the `Case_Ignorable` property.
996+
///
997+
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
998+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
999+
///
1000+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
1001+
/// [ucd]: https://www.unicode.org/reports/tr44/
1002+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
1003+
#[must_use]
1004+
#[inline]
1005+
#[doc(hidden)]
1006+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1007+
pub fn is_case_ignorable(self) -> bool {
1008+
if self.is_ascii() {
1009+
matches!(self, '\'' | '.' | ':' | '^' | '`')
1010+
} else {
1011+
Case_Ignorable(self)
1012+
}
9691013
}
9701014

9711015
/// Returns `true` if this `char` has one of the general categories for numbers.

library/core/src/unicode/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@
33

44
// for use in alloc, not re-exported in std.
55
#[rustfmt::skip]
6-
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
7-
pub use unicode_data::cased::lookup as Cased;
86
pub use unicode_data::conversions;
97

108
#[rustfmt::skip]
119
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
12-
pub(crate) use unicode_data::cc::lookup as Cc;
10+
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
1311
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1412
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
13+
pub(crate) use unicode_data::lt::lookup as Lt;
1514
pub(crate) use unicode_data::n::lookup as N;
1615
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
1716
pub(crate) use unicode_data::white_space::lookup as White_Space;

library/core/src/unicode/unicode_data.rs

Lines changed: 278 additions & 297 deletions
Large diffs are not rendered by default.

src/tools/unicode-table-generator/src/cascading_map.rs

Lines changed: 0 additions & 77 deletions
This file was deleted.

src/tools/unicode-table-generator/src/case_mapping.rs

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,26 @@ use crate::{UnicodeData, fmt_list};
66

77
const INDEX_MASK: u32 = 1 << 22;
88

9-
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
9+
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
1010
let mut file = String::new();
1111

1212
write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
1313
file.push_str("\n\n");
1414
file.push_str(HEADER.trim_start());
1515
file.push('\n');
16-
file.push_str(&generate_tables("LOWER", &data.to_lower));
16+
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
17+
file.push_str(&lower_tables);
1718
file.push_str("\n\n");
18-
file.push_str(&generate_tables("UPPER", &data.to_upper));
19-
file
19+
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
20+
file.push_str(&upper_tables);
21+
(file, [lower_size, upper_size])
2022
}
2123

22-
fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String {
24+
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
2325
let mut mappings = Vec::with_capacity(data.len());
2426
let mut multis = Vec::new();
2527

26-
for (&key, &(a, b, c)) in data.iter() {
28+
for (&key, &[a, b, c]) in data.iter() {
2729
let key = char::from_u32(key).unwrap();
2830

2931
if key.is_ascii() {
@@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String
4648
}
4749

4850
let mut tables = String::new();
49-
50-
write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings))
51-
.unwrap();
51+
let mut size = 0;
52+
53+
size += size_of_val(mappings.as_slice());
54+
write!(
55+
tables,
56+
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
57+
case,
58+
mappings.len(),
59+
fmt_list(mappings),
60+
)
61+
.unwrap();
5262

5363
tables.push_str("\n\n");
5464

55-
write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis))
56-
.unwrap();
57-
58-
tables
65+
size += size_of_val(multis.as_slice());
66+
write!(
67+
tables,
68+
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
69+
case,
70+
multis.len(),
71+
fmt_list(multis),
72+
)
73+
.unwrap();
74+
75+
(tables, size)
5976
}
6077

6178
struct CharEscape(char);

0 commit comments

Comments
 (0)