Skip to content

Commit 94be7eb

Browse files
committed
optimization: Eliminate Cased table
`Cased` is a derived property: it is the union of the `Lowercase` property, the `Uppercase` property, and the `Titlecase_Letter` (`Lt`) general category. We already have lookup tables for `Lowercase` and `Uppercase`, and `Titlecase_Letter` is very small. So instead of duplicating a lookup table for `Cased`, just test each of those properties in turn. This will probably be slower than the old approach, but it is not a public API: it is only used in `str::to_lowercase` when deciding whether a Greek "sigma" should be mapped to `ς` or to `σ`. This is a very rare case, so it should not be performance sensitive.
1 parent 2cb4e7d commit 94be7eb

File tree

4 files changed

+42
-58
lines changed

4 files changed

+42
-58
lines changed

library/core/src/char/methods.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,11 @@ impl char {
985985
#[doc(hidden)]
986986
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987987
pub fn is_cased(self) -> bool {
988-
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
988+
if self.is_ascii() {
989+
self.is_ascii_alphabetic()
990+
} else {
991+
unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self)
992+
}
989993
}
990994

991995
/// Returns `true` if this `char` has the `Case_Ignorable` property.

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
// for use in alloc, not re-exported in std.
66
#[rustfmt::skip]
77
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
8-
pub use unicode_data::cased::lookup as Cased;
98
pub use unicode_data::conversions;
109

1110
#[rustfmt::skip]
1211
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
1312
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1413
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
14+
pub(crate) use unicode_data::lt::lookup as Lt;
1515
pub(crate) use unicode_data::n::lookup as N;
1616
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
1717
pub(crate) use unicode_data::white_space::lookup as White_Space;

library/core/src/unicode/unicode_data.rs

Lines changed: 35 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
22
// Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist
33
// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist
4-
// Cased : 401 bytes, 4580 codepoints in 156 ranges (U+0000AA - U+01F18A) using skiplist
54
// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist
65
// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset
6+
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
77
// N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using skiplist
88
// Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset
99
// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading
1010
// to_lower : 11708 bytes
1111
// to_upper : 13656 bytes
12-
// Total : 31911 bytes
12+
// Total : 31543 bytes
1313

1414
#[inline(always)]
1515
const fn bitset_search<
@@ -336,59 +336,6 @@ pub mod case_ignorable {
336336
}
337337
}
338338

339-
#[rustfmt::skip]
340-
pub mod cased {
341-
use super::ShortOffsetRunHeader;
342-
343-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [
344-
ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024),
345-
ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958),
346-
ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264),
347-
ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824),
348-
ShortOffsetRunHeader::new(177, 64256), ShortOffsetRunHeader::new(183, 65313),
349-
ShortOffsetRunHeader::new(187, 66560), ShortOffsetRunHeader::new(191, 67456),
350-
ShortOffsetRunHeader::new(213, 68736), ShortOffsetRunHeader::new(221, 71840),
351-
ShortOffsetRunHeader::new(229, 93760), ShortOffsetRunHeader::new(231, 119808),
352-
ShortOffsetRunHeader::new(237, 120486), ShortOffsetRunHeader::new(274, 122624),
353-
ShortOffsetRunHeader::new(297, 122928), ShortOffsetRunHeader::new(303, 125184),
354-
ShortOffsetRunHeader::new(305, 127280), ShortOffsetRunHeader::new(307, 1241482),
355-
];
356-
static OFFSETS: [u8; 313] = [
357-
170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 2, 35, 7, 2, 30, 5, 96, 1, 42, 4,
358-
2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1,
359-
5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8,
360-
1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116,
361-
1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4,
362-
5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132,
363-
102, 3, 4, 1, 77, 20, 6, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, 26, 0, 80, 96,
364-
36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0,
365-
51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 32, 25, 2, 25, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2,
366-
4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25,
367-
1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0,
368-
62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
369-
];
370-
#[inline]
371-
pub fn lookup(c: char) -> bool {
372-
debug_assert!(!c.is_ascii());
373-
(c as u32) >= 0xaa && lookup_slow(c)
374-
}
375-
376-
#[inline(never)]
377-
fn lookup_slow(c: char) -> bool {
378-
const {
379-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
380-
let mut i = 0;
381-
while i < SHORT_OFFSET_RUNS.len() {
382-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
383-
i += 1;
384-
}
385-
}
386-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
387-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
388-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
389-
}
390-
}
391-
392339
#[rustfmt::skip]
393340
pub mod grapheme_extend {
394341
use super::ShortOffsetRunHeader;
@@ -573,6 +520,39 @@ pub mod lowercase {
573520
}
574521
}
575522

523+
#[rustfmt::skip]
524+
pub mod lt {
525+
use super::ShortOffsetRunHeader;
526+
527+
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
528+
ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072),
529+
ShortOffsetRunHeader::new(9, 1122301),
530+
];
531+
static OFFSETS: [u8; 21] = [
532+
0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0,
533+
];
534+
#[inline]
535+
pub fn lookup(c: char) -> bool {
536+
debug_assert!(!c.is_ascii());
537+
(c as u32) >= 0x1c5 && lookup_slow(c)
538+
}
539+
540+
#[inline(never)]
541+
fn lookup_slow(c: char) -> bool {
542+
const {
543+
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
544+
let mut i = 0;
545+
while i < SHORT_OFFSET_RUNS.len() {
546+
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
547+
i += 1;
548+
}
549+
}
550+
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
551+
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
552+
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
553+
}
554+
}
555+
576556
#[rustfmt::skip]
577557
pub mod n {
578558
use super::ShortOffsetRunHeader;

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static PROPERTIES: &[&str] = &[
9090
"Alphabetic",
9191
"Lowercase",
9292
"Uppercase",
93-
"Cased",
93+
"Lt",
9494
"Case_Ignorable",
9595
"Grapheme_Extend",
9696
"White_Space",

0 commit comments

Comments
 (0)