Skip to content

Reduce size of Unicode tables #145219

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
5 changes: 2 additions & 3 deletions library/alloc/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -418,9 +418,8 @@ impl str {
}

// Reports whether the first character of `iter` that is not case-ignorable is cased.
// Yields `false` when the iterator runs out before such a character is found.
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
use core::unicode::{Case_Ignorable, Cased};
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
Some(c) => Cased(c),
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
Some(c) => c.is_cased(),
None => false,
}
}
Expand Down
50 changes: 47 additions & 3 deletions library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::slice;
use crate::str::from_utf8_unchecked_mut;
use crate::ub_checks::assert_unsafe_precondition;
use crate::unicode::printable::is_printable;
use crate::unicode::{self, conversions};
use crate::unicode::{self, Case_Ignorable, conversions};

impl char {
/// The lowest valid code point a `char` can have, `'\0'`.
Expand Down Expand Up @@ -950,7 +950,11 @@ impl char {
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn is_control(self) -> bool {
unicode::Cc(self)
// According to
// https://www.unicode.org/policies/stability_policy.html#Property_Value,
// the set of codepoints in `Cc` will never change. So we can hard-code
// the patterns to match against instead of using a table.
//
// The two hard-coded ranges are U+0000..=U+001F and U+007F..=U+009F.
matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}')
}

/// Returns `true` if this `char` has the `Grapheme_Extend` property.
Expand All @@ -965,7 +969,47 @@ impl char {
#[must_use]
#[inline]
pub(crate) fn is_grapheme_extended(self) -> bool {
unicode::Grapheme_Extend(self)
// ASCII input short-circuits to `false`, so the `Grapheme_Extend` table is
// only consulted for non-ASCII characters.
!self.is_ascii() && unicode::Grapheme_Extend(self)
}

/// Reports whether this `char` carries the `Cased` derived property.
///
/// The property is described in Chapter 4 (Character Properties) of the [Unicode Standard],
/// and its data is published in the [Unicode Character Database][ucd] file
/// [`DerivedCoreProperties.txt`].
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
#[must_use]
#[inline]
#[doc(hidden)]
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
pub fn is_cased(self) -> bool {
    // ASCII fast path: answered directly, without touching any table.
    if self.is_ascii() {
        return self.is_ascii_alphabetic();
    }

    // Everything else is looked up in the `Lowercase`, `Uppercase` and `Lt` tables.
    unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self)
}

/// Reports whether this `char` carries the `Case_Ignorable` derived property.
///
/// The property is described in Chapter 4 (Character Properties) of the [Unicode Standard],
/// and its data is published in the [Unicode Character Database][ucd] file
/// [`DerivedCoreProperties.txt`].
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
#[must_use]
#[inline]
#[doc(hidden)]
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
pub fn is_case_ignorable(self) -> bool {
    match self {
        // The ASCII characters that are treated as case-ignorable.
        '\'' | '.' | ':' | '^' | '`' => true,
        // Any other ASCII character is answered without a table lookup.
        c if c.is_ascii() => false,
        // Non-ASCII characters go through the generated table.
        c => Case_Ignorable(c),
    }
}

/// Returns `true` if this `char` has one of the general categories for numbers.
Expand Down
5 changes: 2 additions & 3 deletions library/core/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@

// for use in alloc, not re-exported in std.
#[rustfmt::skip]
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
pub use unicode_data::cased::lookup as Cased;
pub use unicode_data::conversions;

#[rustfmt::skip]
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
pub(crate) use unicode_data::cc::lookup as Cc;
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
pub(crate) use unicode_data::lt::lookup as Lt;
pub(crate) use unicode_data::n::lookup as N;
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
pub(crate) use unicode_data::white_space::lookup as White_Space;
Expand Down
575 changes: 278 additions & 297 deletions library/core/src/unicode/unicode_data.rs

Large diffs are not rendered by default.

77 changes: 0 additions & 77 deletions src/tools/unicode-table-generator/src/cascading_map.rs

This file was deleted.

43 changes: 30 additions & 13 deletions src/tools/unicode-table-generator/src/case_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,26 @@ use crate::{UnicodeData, fmt_list};

const INDEX_MASK: u32 = 1 << 22;

// Builds the generated source text for the case-mapping tables: the `INDEX_MASK`
// constant, then `HEADER`, then the "LOWER" and "UPPER" tables produced by
// `generate_tables` from `data.to_lower` and `data.to_upper`.
// The tuple-returning form also hands back the two sizes reported by
// `generate_tables`, in `[lower, upper]` order.
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
let mut file = String::new();

write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
file.push_str("\n\n");
file.push_str(HEADER.trim_start());
file.push('\n');
file.push_str(&generate_tables("LOWER", &data.to_lower));
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
file.push_str(&lower_tables);
file.push_str("\n\n");
file.push_str(&generate_tables("UPPER", &data.to_upper));
file
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
file.push_str(&upper_tables);
(file, [lower_size, upper_size])
}

fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String {
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
let mut mappings = Vec::with_capacity(data.len());
let mut multis = Vec::new();

for (&key, &(a, b, c)) in data.iter() {
for (&key, &[a, b, c]) in data.iter() {
let key = char::from_u32(key).unwrap();

if key.is_ascii() {
Expand All @@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String
}

let mut tables = String::new();

write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings))
.unwrap();
let mut size = 0;

size += size_of_val(mappings.as_slice());
write!(
tables,
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
case,
mappings.len(),
fmt_list(mappings),
)
.unwrap();

tables.push_str("\n\n");

write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis))
.unwrap();

tables
size += size_of_val(multis.as_slice());
write!(
tables,
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
case,
multis.len(),
fmt_list(multis),
)
.unwrap();

(tables, size)
}

struct CharEscape(char);
Expand Down
Loading
Loading