Skip to content

Commit 6065e4a

Browse files
committed
optimization: eliminate Cased table
`Cased` is a derived property - it is the union of the `Lowercase` property, the `Uppercase` property, and the `Titlecase_Letter` general category. We already have lookup tables for `Lowercase` and `Uppercase`, and `Titlecase_Letter` is small enough to use the `match` strategy. So instead of duplicating a lookup table for `Cased`, just test each of those properties in turn. This will probably be slower than the old lookup table approach, but it is not a public API: it is only used in `string::to_lower` when deciding whether a Greek "sigma" should be mapped to `ς` or to `σ`. This is a very rare case, so it is not performance sensitive.
1 parent 8bf298a commit 6065e4a

File tree

4 files changed

+47
-63
lines changed

4 files changed

+47
-63
lines changed

library/core/src/char/methods.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::slice;
66
use crate::str::from_utf8_unchecked_mut;
77
use crate::ub_checks::assert_unsafe_precondition;
88
use crate::unicode::printable::is_printable;
9-
use crate::unicode::{self, conversions};
9+
use crate::unicode::{self, Case_Ignorable, conversions};
1010

1111
impl char {
1212
/// The lowest valid code point a `char` can have, `'\0'`.
@@ -968,7 +968,7 @@ impl char {
968968
!self.is_ascii() && unicode::Grapheme_Extend(self)
969969
}
970970

971-
/// Returns `true` if this `char` has the `Cased` property.
971+
/// Returns `true` if this `char` has the `Cased` derived property.
972972
///
973973
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
974974
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
@@ -980,12 +980,16 @@ impl char {
980980
#[inline]
981981
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
982982
pub fn is_cased(self) -> bool {
983-
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
983+
if self.is_ascii() {
984+
self.is_ascii_alphabetic()
985+
} else {
986+
unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self)
987+
}
984988
}
985989

986-
/// Returns `true` if this `char` has the `Cased` property.
990+
/// Returns `true` if this `char` has the `Case_Ignorable` derived property.
987991
///
988-
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
992+
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
989993
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
990994
///
991995
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
@@ -998,7 +1002,7 @@ impl char {
9981002
if self.is_ascii() {
9991003
matches!(self, '\'' | '.' | ':' | '^' | '`')
10001004
} else {
1001-
unicode::Case_Ignorable(self)
1005+
Case_Ignorable(self)
10021006
}
10031007
}
10041008

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ pub use unicode_data::conversions;
88
#[rustfmt::skip]
99
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
1010
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
11-
pub(crate) use unicode_data::cased::lookup as Cased;
1211
pub(crate) use unicode_data::cc::lookup as Cc;
1312
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1413
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
14+
pub(crate) use unicode_data::lt::lookup as Lt;
1515
pub(crate) use unicode_data::n::lookup as N;
1616
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
1717
pub(crate) use unicode_data::white_space::lookup as White_Space;

library/core/src/unicode/unicode_data.rs

Lines changed: 35 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
22
// Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist
33
// Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist
4-
// Cased : 403 bytes, 4526 codepoints in 157 ranges (U+0000AA - U+01F18A) using skiplist
54
// Cc : 7 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using skiplist
65
// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist
76
// Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset
7+
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
88
// N : 455 bytes, 1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist
99
// Uppercase : 797 bytes, 1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset
1010
// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading
1111
// to_lower : 11484 bytes
1212
// to_upper : 13432 bytes
13-
// Total : 31420 bytes
13+
// Total : 31050 bytes
1414

1515
#[inline(always)]
1616
const fn bitset_search<
@@ -338,59 +338,6 @@ pub mod case_ignorable {
338338
}
339339
}
340340

341-
#[rustfmt::skip]
342-
pub mod cased {
343-
use super::ShortOffsetRunHeader;
344-
345-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [
346-
ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024),
347-
ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958),
348-
ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264),
349-
ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824),
350-
ShortOffsetRunHeader::new(183, 64256), ShortOffsetRunHeader::new(189, 65313),
351-
ShortOffsetRunHeader::new(193, 66560), ShortOffsetRunHeader::new(197, 67456),
352-
ShortOffsetRunHeader::new(219, 68736), ShortOffsetRunHeader::new(227, 71840),
353-
ShortOffsetRunHeader::new(235, 93760), ShortOffsetRunHeader::new(237, 119808),
354-
ShortOffsetRunHeader::new(239, 120486), ShortOffsetRunHeader::new(276, 122624),
355-
ShortOffsetRunHeader::new(299, 122928), ShortOffsetRunHeader::new(305, 125184),
356-
ShortOffsetRunHeader::new(307, 127280), ShortOffsetRunHeader::new(309, 1241482),
357-
];
358-
static OFFSETS: [u8; 315] = [
359-
170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, 96, 1, 42, 4,
360-
2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1,
361-
5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8,
362-
1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116,
363-
1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4,
364-
5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132,
365-
102, 3, 4, 1, 62, 2, 2, 1, 1, 1, 8, 21, 5, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6,
366-
26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3,
367-
1, 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2,
368-
4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25,
369-
1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0,
370-
62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
371-
];
372-
#[inline]
373-
pub fn lookup(c: char) -> bool {
374-
debug_assert!(!c.is_ascii());
375-
(c as u32) >= 0xaa && lookup_slow(c)
376-
}
377-
378-
#[inline(never)]
379-
fn lookup_slow(c: char) -> bool {
380-
const {
381-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
382-
let mut i = 0;
383-
while i < SHORT_OFFSET_RUNS.len() {
384-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
385-
i += 1;
386-
}
387-
}
388-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
389-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
390-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
391-
}
392-
}
393-
394341
#[rustfmt::skip]
395342
pub mod cc {
396343
use super::ShortOffsetRunHeader;
@@ -605,6 +552,39 @@ pub mod lowercase {
605552
}
606553
}
607554

555+
#[rustfmt::skip]
556+
pub mod lt {
557+
use super::ShortOffsetRunHeader;
558+
559+
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
560+
ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072),
561+
ShortOffsetRunHeader::new(9, 1122301),
562+
];
563+
static OFFSETS: [u8; 21] = [
564+
0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0,
565+
];
566+
#[inline]
567+
pub fn lookup(c: char) -> bool {
568+
debug_assert!(!c.is_ascii());
569+
(c as u32) >= 0x1c5 && lookup_slow(c)
570+
}
571+
572+
#[inline(never)]
573+
fn lookup_slow(c: char) -> bool {
574+
const {
575+
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
576+
let mut i = 0;
577+
while i < SHORT_OFFSET_RUNS.len() {
578+
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
579+
i += 1;
580+
}
581+
}
582+
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
583+
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
584+
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
585+
}
586+
}
587+
608588
#[rustfmt::skip]
609589
pub mod n {
610590
use super::ShortOffsetRunHeader;

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static PROPERTIES: &[&str] = &[
9090
"Alphabetic",
9191
"Lowercase",
9292
"Uppercase",
93-
"Cased",
93+
"Lt",
9494
"Case_Ignorable",
9595
"Grapheme_Extend",
9696
"White_Space",

0 commit comments

Comments
 (0)