diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 22cdd8ecde022..e772ac25a95c2 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -418,9 +418,8 @@ impl str { } fn case_ignorable_then_cased>(iter: I) -> bool { - use core::unicode::{Case_Ignorable, Cased}; - match iter.skip_while(|&c| Case_Ignorable(c)).next() { - Some(c) => Cased(c), + match iter.skip_while(|&c| c.is_case_ignorable()).next() { + Some(c) => c.is_cased(), None => false, } } diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 7ee0962721f5b..838a4a61fbbd7 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -6,7 +6,7 @@ use crate::slice; use crate::str::from_utf8_unchecked_mut; use crate::ub_checks::assert_unsafe_precondition; use crate::unicode::printable::is_printable; -use crate::unicode::{self, conversions}; +use crate::unicode::{self, Case_Ignorable, conversions}; impl char { /// The lowest valid code point a `char` can have, `'\0'`. @@ -950,7 +950,11 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_control(self) -> bool { - unicode::Cc(self) + // According to + // https://www.unicode.org/policies/stability_policy.html#Property_Value, + // the set of codepoints in `Cc` will never change. So we can hard-code + // the patterns to match against instead of using a table. + matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}') } /// Returns `true` if this `char` has the `Grapheme_Extend` property. @@ -965,7 +969,47 @@ impl char { #[must_use] #[inline] pub(crate) fn is_grapheme_extended(self) -> bool { - unicode::Grapheme_Extend(self) + !self.is_ascii() && unicode::Grapheme_Extend(self) + } + + /// Returns `true` if this `char` has the `Cased` derived property. + /// + /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and + /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + #[must_use] + #[inline] + #[doc(hidden)] + #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] + pub fn is_cased(self) -> bool { + if self.is_ascii() { + self.is_ascii_alphabetic() + } else { + unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self) + } + } + + /// Returns `true` if this `char` has the `Case_Ignorable` property. + /// + /// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and + /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + #[must_use] + #[inline] + #[doc(hidden)] + #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] + pub fn is_case_ignorable(self) -> bool { + if self.is_ascii() { + matches!(self, '\'' | '.' | ':' | '^' | '`') + } else { + Case_Ignorable(self) + } } /// Returns `true` if this `char` has one of the general categories for numbers. diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 49dbdeb1a6d1c..db6f9c5b350fc 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -3,15 +3,14 @@ // for use in alloc, not re-exported in std. #[rustfmt::skip] -pub use unicode_data::case_ignorable::lookup as Case_Ignorable; -pub use unicode_data::cased::lookup as Cased; pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; -pub(crate) use unicode_data::cc::lookup as Cc; +pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; +pub(crate) use unicode_data::lt::lookup as Lt; pub(crate) use unicode_data::n::lookup as N; pub(crate) use unicode_data::uppercase::lookup as Uppercase; pub(crate) use unicode_data::white_space::lookup as White_Space; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index b57234bbee9a2..ae1f1c29f8218 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,4 +1,15 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! +// Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist +// Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist +// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist +// Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset +// Lt : 0 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using match +// N : 455 bytes, 1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist +// Uppercase : 797 bytes, 1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset +// White_Space : 0 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using match +// to_lower : 11484 bytes +// to_upper : 13432 bytes +// Total : 30754 bytes #[inline(always)] const fn bitset_search< @@ -136,93 +147,100 @@ pub mod alphabetic { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 53] = [ - ShortOffsetRunHeader::new(0, 706), ShortOffsetRunHeader::new(16, 4681), - ShortOffsetRunHeader::new(418, 5741), ShortOffsetRunHeader::new(456, 7958), - ShortOffsetRunHeader::new(556, 9398), ShortOffsetRunHeader::new(627, 11264), - ShortOffsetRunHeader::new(629, 12293), ShortOffsetRunHeader::new(667, 13312), - ShortOffsetRunHeader::new(691, 19904), ShortOffsetRunHeader::new(692, 42125), - ShortOffsetRunHeader::new(694, 42509), ShortOffsetRunHeader::new(698, 55204), - ShortOffsetRunHeader::new(788, 63744), ShortOffsetRunHeader::new(793, 64110), - ShortOffsetRunHeader::new(794, 64830), ShortOffsetRunHeader::new(816, 66176), - ShortOffsetRunHeader::new(857, 67383), ShortOffsetRunHeader::new(904, 73440), - ShortOffsetRunHeader::new(1221, 74650), ShortOffsetRunHeader::new(1232, 77712), - ShortOffsetRunHeader::new(1237, 78896), ShortOffsetRunHeader::new(1240, 82939), - ShortOffsetRunHeader::new(1244, 83527), ShortOffsetRunHeader::new(1246, 90368), - ShortOffsetRunHeader::new(1247, 92160), ShortOffsetRunHeader::new(1249, 92729), - ShortOffsetRunHeader::new(1250, 93504), ShortOffsetRunHeader::new(1265, 100344), - ShortOffsetRunHeader::new(1282, 101590), ShortOffsetRunHeader::new(1284, 110576), - ShortOffsetRunHeader::new(1287, 110883), ShortOffsetRunHeader::new(1294, 111356), - ShortOffsetRunHeader::new(1304, 113664), ShortOffsetRunHeader::new(1305, 119808), - ShortOffsetRunHeader::new(1315, 120486), ShortOffsetRunHeader::new(1352, 122624), - ShortOffsetRunHeader::new(1375, 123536), ShortOffsetRunHeader::new(1399, 124112), - ShortOffsetRunHeader::new(1403, 124896), ShortOffsetRunHeader::new(1409, 126464), - ShortOffsetRunHeader::new(1425, 127280), ShortOffsetRunHeader::new(1491, 131072), - ShortOffsetRunHeader::new(1497, 173792), ShortOffsetRunHeader::new(1498, 177978), - ShortOffsetRunHeader::new(1500, 183970), ShortOffsetRunHeader::new(1504, 191457), - ShortOffsetRunHeader::new(1506, 192094), ShortOffsetRunHeader::new(1508, 194560), - ShortOffsetRunHeader::new(1509, 195102), ShortOffsetRunHeader::new(1510, 196608), - ShortOffsetRunHeader::new(1511, 201547), ShortOffsetRunHeader::new(1512, 205744), - ShortOffsetRunHeader::new(1514, 1319856), + ShortOffsetRunHeader::new(0, 706), ShortOffsetRunHeader::new(12, 4681), + ShortOffsetRunHeader::new(414, 5741), ShortOffsetRunHeader::new(452, 7958), + ShortOffsetRunHeader::new(552, 9398), ShortOffsetRunHeader::new(623, 11264), + ShortOffsetRunHeader::new(625, 12293), ShortOffsetRunHeader::new(663, 13312), + ShortOffsetRunHeader::new(687, 19904), ShortOffsetRunHeader::new(688, 42125), + ShortOffsetRunHeader::new(690, 42509), ShortOffsetRunHeader::new(694, 55204), + ShortOffsetRunHeader::new(784, 63744), ShortOffsetRunHeader::new(789, 64110), + ShortOffsetRunHeader::new(790, 64830), ShortOffsetRunHeader::new(812, 66176), + ShortOffsetRunHeader::new(853, 67383), ShortOffsetRunHeader::new(900, 73440), + ShortOffsetRunHeader::new(1217, 74650), ShortOffsetRunHeader::new(1228, 77712), + ShortOffsetRunHeader::new(1233, 78896), ShortOffsetRunHeader::new(1236, 82939), + ShortOffsetRunHeader::new(1240, 83527), ShortOffsetRunHeader::new(1242, 90368), + ShortOffsetRunHeader::new(1243, 92160), ShortOffsetRunHeader::new(1245, 92729), + ShortOffsetRunHeader::new(1246, 93504), ShortOffsetRunHeader::new(1261, 100344), + ShortOffsetRunHeader::new(1278, 101590), ShortOffsetRunHeader::new(1280, 110576), + ShortOffsetRunHeader::new(1283, 110883), ShortOffsetRunHeader::new(1290, 111356), + ShortOffsetRunHeader::new(1300, 113664), ShortOffsetRunHeader::new(1301, 119808), + ShortOffsetRunHeader::new(1311, 120486), ShortOffsetRunHeader::new(1348, 122624), + ShortOffsetRunHeader::new(1371, 123536), ShortOffsetRunHeader::new(1395, 124112), + ShortOffsetRunHeader::new(1399, 124896), ShortOffsetRunHeader::new(1405, 126464), + ShortOffsetRunHeader::new(1421, 127280), ShortOffsetRunHeader::new(1487, 131072), + ShortOffsetRunHeader::new(1493, 173792), ShortOffsetRunHeader::new(1494, 177978), + ShortOffsetRunHeader::new(1496, 183970), ShortOffsetRunHeader::new(1500, 191457), + ShortOffsetRunHeader::new(1502, 192094), ShortOffsetRunHeader::new(1504, 194560), + ShortOffsetRunHeader::new(1505, 195102), ShortOffsetRunHeader::new(1506, 196608), + ShortOffsetRunHeader::new(1507, 201547), ShortOffsetRunHeader::new(1508, 205744), + ShortOffsetRunHeader::new(1510, 1319856), ]; - static OFFSETS: [u8; 1515] = [ - 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 0, 4, 12, 14, 5, 7, 1, 1, 1, 86, 1, 29, - 18, 1, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 2, 1, 6, 41, - 39, 14, 1, 1, 1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, - 10, 3, 2, 1, 16, 48, 13, 101, 24, 33, 9, 2, 4, 1, 5, 24, 2, 19, 19, 25, 7, 11, 5, 24, 1, 6, - 8, 1, 8, 42, 10, 12, 3, 7, 6, 76, 1, 16, 1, 3, 4, 15, 13, 19, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, - 3, 4, 3, 8, 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, - 1, 2, 1, 2, 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, - 1, 2, 1, 5, 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, - 3, 8, 2, 2, 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, 2, - 3, 3, 3, 12, 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 13, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, - 2, 1, 3, 2, 1, 2, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 6, 2, 1, - 4, 13, 3, 12, 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 6, 1, 3, 1, 18, 3, - 24, 1, 9, 1, 1, 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, 24, - 1, 1, 1, 19, 1, 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 19, 4, 16, 1, 36, 67, - 55, 1, 1, 2, 5, 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, - 41, 1, 4, 2, 33, 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, 6, 3, - 0, 2, 17, 1, 26, 5, 75, 3, 11, 7, 20, 11, 21, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, 19, 14, - 1, 4, 1, 67, 89, 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, 54, 28, 4, - 63, 2, 20, 50, 1, 23, 2, 11, 3, 49, 52, 1, 15, 1, 8, 51, 42, 2, 4, 10, 44, 1, 11, 14, 55, - 22, 3, 10, 36, 2, 11, 5, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 19, 34, 11, 0, 2, 6, 2, - 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, - 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, - 1, 11, 2, 4, 5, 5, 4, 1, 17, 41, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 2, 56, 7, 1, - 16, 23, 9, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, 25, 9, 7, 5, 2, - 5, 4, 86, 6, 3, 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 67, 46, 2, 0, 3, 16, - 10, 2, 20, 47, 5, 8, 3, 113, 39, 9, 2, 103, 2, 67, 2, 2, 1, 1, 1, 8, 21, 20, 1, 33, 24, 52, - 12, 68, 1, 1, 44, 6, 3, 1, 1, 3, 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 5, - 1, 55, 9, 14, 18, 23, 3, 69, 1, 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, - 1, 43, 1, 14, 6, 123, 21, 0, 12, 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, - 1, 1, 1, 2, 1, 2, 1, 108, 33, 0, 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, - 89, 3, 6, 2, 6, 2, 6, 2, 3, 35, 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, - 29, 3, 49, 47, 32, 13, 30, 5, 43, 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, - 8, 52, 12, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, - 6, 1, 42, 1, 9, 69, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, 10, 22, - 10, 26, 70, 56, 6, 2, 64, 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, 27, 54, - 10, 22, 10, 19, 13, 18, 110, 73, 55, 51, 13, 51, 13, 40, 34, 28, 3, 1, 5, 23, 250, 42, 1, 2, - 3, 2, 16, 3, 55, 1, 3, 29, 10, 1, 8, 22, 42, 18, 46, 21, 27, 23, 9, 70, 43, 5, 10, 57, 9, 1, - 13, 25, 23, 51, 17, 4, 8, 35, 3, 1, 9, 64, 1, 4, 9, 2, 10, 1, 1, 1, 35, 18, 1, 34, 2, 1, 6, - 4, 62, 7, 1, 1, 1, 4, 1, 15, 1, 10, 7, 57, 23, 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, - 2, 2, 2, 2, 3, 1, 6, 1, 5, 7, 28, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, 1, 1, 4, 1, 2, 3, - 1, 1, 1, 44, 66, 1, 3, 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, 25, 6, 34, 63, 1, 1, - 3, 1, 59, 54, 2, 1, 71, 27, 2, 14, 21, 7, 185, 57, 103, 64, 31, 8, 2, 1, 2, 8, 1, 2, 1, 30, - 1, 2, 2, 2, 2, 4, 93, 8, 2, 46, 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, 72, 5, 1, 18, 73, 199, - 33, 31, 9, 1, 45, 1, 7, 1, 1, 49, 30, 2, 22, 1, 14, 73, 7, 1, 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, - 1, 2, 2, 24, 6, 1, 2, 1, 37, 1, 2, 1, 4, 1, 1, 0, 23, 9, 17, 1, 41, 3, 3, 111, 1, 79, 0, - 102, 111, 17, 196, 0, 97, 15, 0, 17, 6, 25, 0, 5, 0, 0, 47, 0, 0, 7, 31, 17, 79, 17, 30, 18, - 48, 16, 4, 31, 21, 5, 19, 0, 45, 211, 64, 128, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 2, 14, 0, - 8, 0, 41, 10, 0, 4, 1, 7, 1, 2, 1, 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 5, 13, 3, 9, - 7, 10, 4, 1, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, - 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, - 25, 1, 31, 1, 25, 1, 8, 0, 31, 6, 6, 213, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 112, 45, - 10, 7, 16, 1, 0, 30, 18, 44, 0, 28, 228, 30, 2, 1, 0, 7, 1, 4, 1, 2, 1, 15, 1, 197, 59, 68, - 3, 1, 3, 1, 0, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, - 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, 1, - 10, 1, 17, 5, 3, 1, 5, 1, 17, 0, 26, 6, 26, 6, 26, 0, 0, 32, 0, 6, 222, 2, 0, 14, 0, 15, 0, - 0, 0, 0, 0, 5, 0, 0, + static OFFSETS: [u8; 1511] = [ + 170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 0, 4, 12, 14, 5, 7, 1, 1, 1, 86, 1, 29, 18, 1, 2, 2, + 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 2, 1, 6, 41, 39, 14, 1, 1, + 1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, 10, 3, 2, 1, + 16, 48, 13, 101, 24, 33, 9, 2, 4, 1, 5, 24, 2, 19, 19, 25, 7, 11, 5, 24, 1, 6, 8, 1, 8, 42, + 10, 12, 3, 7, 6, 76, 1, 16, 1, 3, 4, 15, 13, 19, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, 3, 4, 3, 8, + 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, 2, + 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, 1, 5, + 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, + 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, 2, 3, 3, 3, 12, + 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 13, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, 2, 1, 3, 2, + 1, 2, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 6, 2, 1, 4, 13, 3, + 12, 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 6, 1, 3, 1, 18, 3, 24, 1, 9, 1, + 1, 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, 24, 1, 1, 1, 19, + 1, 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 19, 4, 16, 1, 36, 67, 55, 1, 1, 2, 5, + 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, 41, 1, 4, 2, 33, + 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, 6, 3, 0, 2, 17, 1, 26, + 5, 75, 3, 11, 7, 20, 11, 21, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, 19, 14, 1, 4, 1, 67, 89, + 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, 54, 28, 4, 63, 2, 20, 50, 1, + 23, 2, 11, 3, 49, 52, 1, 15, 1, 8, 51, 42, 2, 4, 10, 44, 1, 11, 14, 55, 22, 3, 10, 36, 2, + 11, 5, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 19, 34, 11, 0, 2, 6, 2, 38, 2, 6, 2, 8, 1, + 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, 1, + 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 11, 2, 4, 5, 5, + 4, 1, 17, 41, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 2, 56, 7, 1, 16, 23, 9, 7, 1, + 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, 25, 9, 7, 5, 2, 5, 4, 86, 6, 3, + 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 67, 46, 2, 0, 3, 16, 10, 2, 20, 47, + 5, 8, 3, 113, 39, 9, 2, 103, 2, 67, 2, 2, 1, 1, 1, 8, 21, 20, 1, 33, 24, 52, 12, 68, 1, 1, + 44, 6, 3, 1, 1, 3, 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 5, 1, 55, 9, 14, + 18, 23, 3, 69, 1, 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, 1, 43, 1, 14, + 6, 123, 21, 0, 12, 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, 1, 1, 1, 2, 1, + 2, 1, 108, 33, 0, 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, 89, 3, 6, 2, 6, + 2, 6, 2, 3, 35, 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, 29, 3, 49, 47, 32, + 13, 30, 5, 43, 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, 8, 52, 12, 11, 1, + 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, 6, 1, 42, 1, 9, + 69, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, 10, 22, 10, 26, 70, 56, + 6, 2, 64, 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, 27, 54, 10, 22, 10, 19, + 13, 18, 110, 73, 55, 51, 13, 51, 13, 40, 34, 28, 3, 1, 5, 23, 250, 42, 1, 2, 3, 2, 16, 3, + 55, 1, 3, 29, 10, 1, 8, 22, 42, 18, 46, 21, 27, 23, 9, 70, 43, 5, 10, 57, 9, 1, 13, 25, 23, + 51, 17, 4, 8, 35, 3, 1, 9, 64, 1, 4, 9, 2, 10, 1, 1, 1, 35, 18, 1, 34, 2, 1, 6, 4, 62, 7, 1, + 1, 1, 4, 1, 15, 1, 10, 7, 57, 23, 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, 2, 2, + 3, 1, 6, 1, 5, 7, 28, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, 1, 1, 4, 1, 2, 3, 1, 1, 1, 44, + 66, 1, 3, 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, 25, 6, 34, 63, 1, 1, 3, 1, 59, 54, + 2, 1, 71, 27, 2, 14, 21, 7, 185, 57, 103, 64, 31, 8, 2, 1, 2, 8, 1, 2, 1, 30, 1, 2, 2, 2, 2, + 4, 93, 8, 2, 46, 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, 72, 5, 1, 18, 73, 199, 33, 31, 9, 1, + 45, 1, 7, 1, 1, 49, 30, 2, 22, 1, 14, 73, 7, 1, 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, 1, 2, 2, 24, + 6, 1, 2, 1, 37, 1, 2, 1, 4, 1, 1, 0, 23, 9, 17, 1, 41, 3, 3, 111, 1, 79, 0, 102, 111, 17, + 196, 0, 97, 15, 0, 17, 6, 25, 0, 5, 0, 0, 47, 0, 0, 7, 31, 17, 79, 17, 30, 18, 48, 16, 4, + 31, 21, 5, 19, 0, 45, 211, 64, 128, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 2, 14, 0, 8, 0, 41, + 10, 0, 4, 1, 7, 1, 2, 1, 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 5, 13, 3, 9, 7, 10, 4, + 1, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, + 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, + 1, 25, 1, 8, 0, 31, 6, 6, 213, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 112, 45, 10, 7, 16, + 1, 0, 30, 18, 44, 0, 28, 228, 30, 2, 1, 0, 7, 1, 4, 1, 2, 1, 15, 1, 197, 59, 68, 3, 1, 3, 1, + 0, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 2, + 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, 1, 10, 1, 17, + 5, 3, 1, 5, 1, 17, 0, 26, 6, 26, 6, 26, 0, 0, 32, 0, 6, 222, 2, 0, 14, 0, 15, 0, 0, 0, 0, 0, + 5, 0, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xaa && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -242,133 +260,69 @@ pub mod case_ignorable { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 37] = [ - ShortOffsetRunHeader::new(0, 688), ShortOffsetRunHeader::new(21, 4957), - ShortOffsetRunHeader::new(273, 5906), ShortOffsetRunHeader::new(275, 8125), - ShortOffsetRunHeader::new(385, 11388), ShortOffsetRunHeader::new(419, 12293), - ShortOffsetRunHeader::new(431, 40981), ShortOffsetRunHeader::new(443, 42232), - ShortOffsetRunHeader::new(445, 42508), ShortOffsetRunHeader::new(447, 64286), - ShortOffsetRunHeader::new(543, 65024), ShortOffsetRunHeader::new(547, 66045), - ShortOffsetRunHeader::new(577, 67456), ShortOffsetRunHeader::new(583, 68097), - ShortOffsetRunHeader::new(589, 68900), ShortOffsetRunHeader::new(601, 69291), - ShortOffsetRunHeader::new(609, 71727), ShortOffsetRunHeader::new(733, 71995), - ShortOffsetRunHeader::new(737, 72752), ShortOffsetRunHeader::new(765, 73459), - ShortOffsetRunHeader::new(795, 78896), ShortOffsetRunHeader::new(807, 90398), - ShortOffsetRunHeader::new(811, 92912), ShortOffsetRunHeader::new(815, 93504), - ShortOffsetRunHeader::new(821, 94031), ShortOffsetRunHeader::new(825, 110576), - ShortOffsetRunHeader::new(833, 113821), ShortOffsetRunHeader::new(839, 118528), - ShortOffsetRunHeader::new(843, 119143), ShortOffsetRunHeader::new(847, 121344), - ShortOffsetRunHeader::new(857, 122880), ShortOffsetRunHeader::new(869, 123566), - ShortOffsetRunHeader::new(885, 124139), ShortOffsetRunHeader::new(889, 125136), - ShortOffsetRunHeader::new(893, 127995), ShortOffsetRunHeader::new(897, 917505), - ShortOffsetRunHeader::new(899, 2032112), + ShortOffsetRunHeader::new(0, 688), ShortOffsetRunHeader::new(11, 4957), + ShortOffsetRunHeader::new(263, 5906), ShortOffsetRunHeader::new(265, 8125), + ShortOffsetRunHeader::new(375, 11388), ShortOffsetRunHeader::new(409, 12293), + ShortOffsetRunHeader::new(421, 40981), ShortOffsetRunHeader::new(433, 42232), + ShortOffsetRunHeader::new(435, 42508), ShortOffsetRunHeader::new(437, 64286), + ShortOffsetRunHeader::new(533, 65024), ShortOffsetRunHeader::new(537, 66045), + ShortOffsetRunHeader::new(567, 67456), ShortOffsetRunHeader::new(573, 68097), + ShortOffsetRunHeader::new(579, 68900), ShortOffsetRunHeader::new(591, 69291), + ShortOffsetRunHeader::new(599, 71727), ShortOffsetRunHeader::new(723, 71995), + ShortOffsetRunHeader::new(727, 72752), ShortOffsetRunHeader::new(755, 73459), + ShortOffsetRunHeader::new(785, 78896), ShortOffsetRunHeader::new(797, 90398), + ShortOffsetRunHeader::new(801, 92912), ShortOffsetRunHeader::new(805, 93504), + ShortOffsetRunHeader::new(811, 94031), ShortOffsetRunHeader::new(815, 110576), + ShortOffsetRunHeader::new(823, 113821), ShortOffsetRunHeader::new(829, 118528), + ShortOffsetRunHeader::new(833, 119143), ShortOffsetRunHeader::new(837, 121344), + ShortOffsetRunHeader::new(847, 122880), ShortOffsetRunHeader::new(859, 123566), + ShortOffsetRunHeader::new(875, 124139), ShortOffsetRunHeader::new(879, 125136), + ShortOffsetRunHeader::new(883, 127995), ShortOffsetRunHeader::new(887, 917505), + ShortOffsetRunHeader::new(889, 2032112), ]; - static OFFSETS: [u8; 905] = [ - 39, 1, 6, 1, 11, 1, 35, 1, 1, 1, 71, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, - 1, 1, 251, 7, 207, 1, 5, 1, 49, 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, - 1, 10, 21, 16, 1, 101, 8, 1, 10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, - 24, 43, 3, 44, 1, 7, 2, 5, 9, 41, 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1, - 58, 1, 4, 4, 8, 1, 20, 2, 26, 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, - 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, - 61, 1, 12, 1, 50, 1, 3, 1, 55, 1, 1, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, - 5, 2, 20, 2, 28, 2, 57, 2, 4, 4, 8, 1, 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, - 98, 1, 2, 9, 9, 1, 1, 7, 73, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, - 102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, - 29, 2, 30, 2, 30, 2, 64, 2, 1, 7, 8, 1, 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118, - 3, 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 31, - 49, 4, 48, 1, 1, 5, 1, 1, 5, 1, 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, - 58, 8, 2, 2, 64, 6, 82, 3, 1, 13, 1, 7, 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1, - 3, 11, 3, 13, 3, 13, 3, 13, 2, 12, 5, 8, 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1, - 16, 13, 51, 33, 0, 2, 113, 3, 125, 1, 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, - 93, 3, 0, 1, 0, 6, 0, 1, 98, 4, 1, 10, 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 103, 3, 3, - 2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, - 2, 2, 17, 1, 21, 2, 66, 6, 2, 2, 2, 2, 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, - 1, 1, 27, 1, 14, 2, 5, 2, 1, 1, 100, 5, 9, 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3, - 1, 12, 16, 34, 1, 2, 1, 169, 1, 7, 1, 6, 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, - 0, 1, 226, 1, 149, 5, 0, 6, 1, 42, 1, 9, 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 38, 1, - 26, 5, 1, 1, 0, 2, 79, 4, 70, 11, 49, 4, 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, - 1, 4, 1, 10, 1, 50, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, - 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, 2, 3, 1, 37, 7, 3, 5, 70, 6, 13, 1, 1, 1, 1, 1, 14, 2, 85, - 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, 2, 1, 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, - 106, 1, 1, 1, 2, 6, 1, 1, 101, 1, 1, 1, 2, 4, 1, 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, - 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, - 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2, 3, 1, 1, 1, 0, 2, 11, 2, 52, 5, 5, 1, - 1, 1, 23, 1, 0, 17, 6, 15, 0, 12, 3, 3, 0, 5, 59, 7, 9, 4, 0, 3, 40, 2, 0, 1, 63, 17, 64, 2, - 1, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, 0, 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, 55, - 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160, 14, 0, - 1, 61, 4, 0, 5, 254, 2, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0, + static OFFSETS: [u8; 895] = [ + 168, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, 1, 1, 251, 7, 207, 1, 5, 1, 49, + 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, 1, 10, 21, 16, 1, 101, 8, 1, 10, + 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, 24, 43, 3, 44, 1, 7, 2, 5, 9, 41, + 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1, 58, 1, 4, 4, 8, 1, 20, 2, 26, 1, 2, + 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, + 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, 61, 1, 12, 1, 50, 1, 3, 1, 55, 1, 1, 3, + 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, 5, 2, 20, 2, 28, 2, 57, 2, 4, 4, 8, 1, + 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, 98, 1, 2, 9, 9, 1, 1, 7, 73, 2, 27, 1, + 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, 102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, + 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, 29, 2, 30, 2, 30, 2, 64, 2, 1, 7, 8, 1, + 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, + 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 31, 49, 4, 48, 1, 1, 5, 1, 1, 5, 1, 40, 9, + 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, 64, 6, 82, 3, 1, 13, 1, 7, + 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1, 3, 11, 3, 13, 3, 13, 3, 13, 2, 12, 5, 8, + 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1, 16, 13, 51, 33, 0, 2, 113, 3, 125, 1, 15, + 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, 93, 3, 0, 1, 0, 6, 0, 1, 98, 4, 1, 10, 1, + 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 103, 3, 3, 2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, + 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, 2, 2, 17, 1, 21, 2, 66, 6, 2, 2, 2, 2, 12, + 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, 1, 27, 1, 14, 2, 5, 2, 1, 1, 100, 5, 9, + 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3, 1, 12, 16, 34, 1, 2, 1, 169, 1, 7, 1, 6, 1, + 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, 0, 1, 226, 1, 149, 5, 0, 6, 1, 42, 1, 9, 0, + 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 38, 1, 26, 5, 1, 1, 0, 2, 79, 4, 70, 11, 49, 4, + 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, 1, 4, 1, 10, 1, 50, 3, 36, 5, 1, 8, 62, + 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, 2, 3, + 1, 37, 7, 3, 5, 70, 6, 13, 1, 1, 1, 1, 1, 14, 2, 85, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, + 2, 1, 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, 1, 1, + 1, 2, 4, 1, 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, + 9, 6, 2, 3, 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, + 7, 1, 1, 72, 2, 3, 1, 1, 1, 0, 2, 11, 2, 52, 5, 5, 1, 1, 1, 23, 1, 0, 17, 6, 15, 0, 12, 3, + 3, 0, 5, 59, 7, 9, 4, 0, 3, 40, 2, 0, 1, 63, 17, 64, 2, 1, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, + 0, 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, + 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160, 14, 0, 1, 61, 4, 0, 5, 254, 2, 0, 7, 109, 8, + 0, 5, 0, 1, 30, 96, 128, 240, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } - } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. - unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xa8 && lookup_slow(c) } -} - -#[rustfmt::skip] -pub mod cased { - use super::ShortOffsetRunHeader; - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [ - ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(55, 5024), - ShortOffsetRunHeader::new(65, 7296), ShortOffsetRunHeader::new(69, 7958), - ShortOffsetRunHeader::new(78, 9398), ShortOffsetRunHeader::new(153, 11264), - ShortOffsetRunHeader::new(155, 42560), ShortOffsetRunHeader::new(167, 43824), - ShortOffsetRunHeader::new(187, 64256), ShortOffsetRunHeader::new(193, 65313), - ShortOffsetRunHeader::new(197, 66560), ShortOffsetRunHeader::new(201, 67456), - ShortOffsetRunHeader::new(223, 68736), ShortOffsetRunHeader::new(231, 71840), - ShortOffsetRunHeader::new(239, 93760), ShortOffsetRunHeader::new(241, 119808), - ShortOffsetRunHeader::new(243, 120486), ShortOffsetRunHeader::new(280, 122624), - ShortOffsetRunHeader::new(303, 122928), ShortOffsetRunHeader::new(309, 125184), - ShortOffsetRunHeader::new(311, 127280), ShortOffsetRunHeader::new(313, 1241482), - ]; - static OFFSETS: [u8; 319] = [ - 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, - 96, 1, 42, 4, 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, - 41, 0, 38, 1, 1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, - 2, 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, - 13, 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, - 4, 1, 6, 4, 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, - 1, 0, 46, 18, 30, 132, 102, 3, 4, 1, 62, 2, 2, 1, 1, 1, 8, 21, 5, 1, 3, 0, 43, 1, 14, 6, 80, - 0, 7, 12, 5, 0, 26, 6, 26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, - 1, 7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 0, 85, 1, - 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, - 1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, - 8, 0, 10, 1, 20, 6, 6, 0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, - ]; - pub fn lookup(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } - } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. - unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } - } -} - -#[rustfmt::skip] -pub mod cc { - use super::ShortOffsetRunHeader; - - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 1] = [ - ShortOffsetRunHeader::new(0, 1114272), - ]; - static OFFSETS: [u8; 5] = [ - 0, 32, 95, 33, 0, - ]; - pub fn lookup(c: char) -> bool { + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -437,6 +391,7 @@ pub mod grapheme_extend { ]; #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); (c as u32) >= 0x300 && lookup_slow(c) } @@ -459,7 +414,7 @@ pub mod grapheme_extend { #[rustfmt::skip] pub mod lowercase { static BITSET_CHUNKS_MAP: [u8; 123] = [ - 14, 17, 0, 0, 9, 0, 0, 12, 13, 10, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 12, 17, 0, 0, 9, 0, 0, 13, 14, 10, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 15, 0, 8, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, @@ -471,37 +426,37 @@ pub mod lowercase { [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 14, 56, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 67, 43, 0, 52, 48, 50, 33], - [0, 0, 0, 0, 10, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 9, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 16, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27], [0, 0, 0, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 34, 17, 23, 53, 54, 49, 47, 7, 35, 42, 0, 28, 12, 31], [0, 0, 46, 0, 56, 56, 56, 0, 22, 22, 69, 22, 36, 25, 24, 37], [0, 5, 70, 0, 29, 15, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 66, 34, 17, 23, 53, 54, 49, 47, 8, 35, 42, 0, 28, 13, 31], - [11, 60, 0, 6, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 32, 0], + [10, 60, 0, 6, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 32, 0], [16, 26, 22, 38, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [16, 51, 2, 21, 68, 9, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [16, 51, 2, 21, 68, 8, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0], [16, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [65, 41, 55, 12, 77, 63, 18, 1, 7, 64, 76, 20, 73, 74, 4, 45], + [65, 41, 55, 11, 66, 63, 18, 13, 1, 64, 76, 20, 73, 74, 4, 45], ]; static BITSET_CANONICAL: [u64; 56] = [ 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111111111111111110000000000000000000000000011111111111111111111, + 0b0000111111111111111111111111110000000000000000000000000011111111, 0b1010101010101010101010101010101010101010101010101010100000000010, 0b0000000000000111111111111111111111111111111111111111111111111111, 0b1111111111111111111111000000000000000000000000001111110111111111, 0b1000000000000010000000000000000000000000000000000000000000000000, 0b0000111111111111111111111111111111111111000000000000000000000000, - 0b0000111111111111111111111111110000000000000000000000000011111111, 0b1111111111111111111111111111111111111111111111111010101010000101, 0b1111111111111111111111111111111100000000000000000000000000000000, 0b1111111111111111111111111111110000000000000000000000000000000000, 0b1111111111111111111111110000000000000000000000000000000000000000, 0b1111111111111111111111000000000000000000000000001111111111101111, 0b1111111111111111111100000000000000000000000000010000000000000000, + 0b1111111111111111110000000000000000000000000011111111111111111111, 0b1111111111111111000000111111111111110111111111111111111111111111, 0b1111111111111111000000000000000000000000000000000100001111000000, 0b1111111111111111000000000000000000000000000000000000000000000000, @@ -545,13 +500,15 @@ pub mod lowercase { 0b1110011001010001001011010010101001001110001001000011000100101001, 0b1110101111000000000000000000000000001111111111111111111111111100, ]; - static BITSET_MAPPING: [(u8, u8); 22] = [ - (0, 64), (1, 188), (1, 186), (1, 183), (1, 176), (1, 109), (1, 124), (1, 126), (1, 66), - (1, 70), (1, 77), (2, 146), (2, 144), (2, 83), (3, 93), (3, 147), (3, 133), (4, 12), (4, 6), - (5, 187), (6, 78), (7, 132), + static BITSET_MAPPING: [(u8, u8); 21] = [ + (0, 64), (1, 184), (1, 182), (1, 179), (1, 172), (1, 161), (1, 146), (1, 144), (1, 140), + (1, 136), (1, 132), (2, 146), (2, 144), (2, 83), (3, 93), (3, 147), (3, 133), (4, 12), + (4, 6), (5, 187), (6, 78), ]; pub const fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xaa && super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, @@ -562,48 +519,76 @@ pub mod lowercase { } } +#[rustfmt::skip] +pub mod lt { + #[inline] + pub const fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + match c as u32 { + 0x1c5 => true, + 0x1c8 => true, + 0x1cb => true, + 0x1f2 => true, + 0x1f88..=0x1f8f => true, + 0x1f98..=0x1f9f => true, + 0x1fa8..=0x1faf => true, + 0x1fbc => true, + 0x1fcc => true, + 0x1ffc => true, + _ => false, + } + } +} + #[rustfmt::skip] pub mod n { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 42] = [ - ShortOffsetRunHeader::new(0, 1632), ShortOffsetRunHeader::new(9, 2406), - ShortOffsetRunHeader::new(15, 4160), ShortOffsetRunHeader::new(49, 4969), - ShortOffsetRunHeader::new(53, 5870), ShortOffsetRunHeader::new(55, 6470), - ShortOffsetRunHeader::new(63, 8304), ShortOffsetRunHeader::new(79, 9312), - ShortOffsetRunHeader::new(89, 10102), ShortOffsetRunHeader::new(93, 11517), - ShortOffsetRunHeader::new(95, 12295), ShortOffsetRunHeader::new(97, 12690), - ShortOffsetRunHeader::new(103, 42528), ShortOffsetRunHeader::new(115, 43056), - ShortOffsetRunHeader::new(119, 44016), ShortOffsetRunHeader::new(131, 65296), - ShortOffsetRunHeader::new(133, 65799), ShortOffsetRunHeader::new(135, 66273), - ShortOffsetRunHeader::new(141, 67672), ShortOffsetRunHeader::new(153, 68858), - ShortOffsetRunHeader::new(183, 69216), ShortOffsetRunHeader::new(189, 70736), - ShortOffsetRunHeader::new(209, 71248), ShortOffsetRunHeader::new(213, 71904), - ShortOffsetRunHeader::new(221, 72688), ShortOffsetRunHeader::new(225, 73552), - ShortOffsetRunHeader::new(233, 74752), ShortOffsetRunHeader::new(237, 90416), - ShortOffsetRunHeader::new(239, 92768), ShortOffsetRunHeader::new(241, 93552), - ShortOffsetRunHeader::new(249, 93824), ShortOffsetRunHeader::new(251, 118000), - ShortOffsetRunHeader::new(253, 119488), ShortOffsetRunHeader::new(255, 120782), - ShortOffsetRunHeader::new(261, 123200), ShortOffsetRunHeader::new(263, 123632), - ShortOffsetRunHeader::new(265, 124144), ShortOffsetRunHeader::new(267, 125127), - ShortOffsetRunHeader::new(271, 126065), ShortOffsetRunHeader::new(275, 127232), - ShortOffsetRunHeader::new(285, 130032), ShortOffsetRunHeader::new(287, 1244154), + ShortOffsetRunHeader::new(0, 1632), ShortOffsetRunHeader::new(7, 2406), + ShortOffsetRunHeader::new(13, 4160), ShortOffsetRunHeader::new(47, 4969), + ShortOffsetRunHeader::new(51, 5870), ShortOffsetRunHeader::new(53, 6470), + ShortOffsetRunHeader::new(61, 8304), ShortOffsetRunHeader::new(77, 9312), + ShortOffsetRunHeader::new(87, 10102), ShortOffsetRunHeader::new(91, 11517), + ShortOffsetRunHeader::new(93, 12295), ShortOffsetRunHeader::new(95, 12690), + ShortOffsetRunHeader::new(101, 42528), ShortOffsetRunHeader::new(113, 43056), + ShortOffsetRunHeader::new(117, 44016), ShortOffsetRunHeader::new(129, 65296), + ShortOffsetRunHeader::new(131, 65799), ShortOffsetRunHeader::new(133, 66273), + ShortOffsetRunHeader::new(139, 67672), ShortOffsetRunHeader::new(151, 68858), + ShortOffsetRunHeader::new(181, 69216), ShortOffsetRunHeader::new(187, 70736), + ShortOffsetRunHeader::new(207, 71248), ShortOffsetRunHeader::new(211, 71904), + ShortOffsetRunHeader::new(219, 72688), ShortOffsetRunHeader::new(223, 73552), + ShortOffsetRunHeader::new(231, 74752), ShortOffsetRunHeader::new(235, 90416), + ShortOffsetRunHeader::new(237, 92768), ShortOffsetRunHeader::new(239, 93552), + ShortOffsetRunHeader::new(247, 93824), ShortOffsetRunHeader::new(249, 118000), + ShortOffsetRunHeader::new(251, 119488), ShortOffsetRunHeader::new(253, 120782), + ShortOffsetRunHeader::new(259, 123200), ShortOffsetRunHeader::new(261, 123632), + ShortOffsetRunHeader::new(263, 124144), ShortOffsetRunHeader::new(265, 125127), + ShortOffsetRunHeader::new(269, 126065), ShortOffsetRunHeader::new(273, 127232), + ShortOffsetRunHeader::new(283, 130032), ShortOffsetRunHeader::new(285, 1244154), ]; - static OFFSETS: [u8; 289] = [ - 48, 10, 120, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, - 10, 118, 10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, - 70, 20, 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, - 182, 10, 86, 10, 134, 10, 6, 10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, - 1, 0, 1, 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, - 154, 10, 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, - 29, 1, 8, 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 160, 2, 2, 16, 2, 46, 64, 9, - 52, 2, 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 6, 10, 0, 31, 158, 10, 42, 4, 112, - 7, 134, 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 6, 20, - 76, 12, 0, 19, 93, 10, 0, 10, 86, 29, 227, 10, 70, 10, 0, 10, 102, 21, 0, 111, 0, 10, 0, 10, - 86, 10, 134, 10, 1, 7, 0, 10, 0, 23, 0, 10, 0, 20, 12, 20, 108, 25, 0, 50, 0, 10, 0, 10, 0, - 10, 247, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0, + static OFFSETS: [u8; 287] = [ + 178, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, 10, 118, + 10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, 70, 20, + 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, 182, 10, + 86, 10, 134, 10, 6, 10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, 1, 0, 1, + 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, 154, 10, + 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, 29, 1, 8, + 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 160, 2, 2, 16, 2, 46, 64, 9, 52, 2, + 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 6, 10, 0, 31, 158, 10, 42, 4, 112, 7, 134, + 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 6, 20, 76, 12, + 0, 19, 93, 10, 0, 10, 86, 29, 227, 10, 70, 10, 0, 10, 102, 21, 0, 111, 0, 10, 0, 10, 86, 10, + 134, 10, 1, 7, 0, 10, 0, 23, 0, 10, 0, 20, 12, 20, 108, 25, 0, 50, 0, 10, 0, 10, 0, 10, 247, + 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xb2 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -621,34 +606,34 @@ pub mod n { #[rustfmt::skip] pub mod uppercase { static BITSET_CHUNKS_MAP: [u8; 125] = [ - 12, 15, 6, 6, 0, 6, 6, 2, 4, 11, 6, 16, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 5, 6, 14, 6, 10, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, - 6, 6, 9, 6, 3, + 3, 14, 6, 6, 0, 6, 6, 2, 5, 12, 6, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 7, 6, 13, 6, 11, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 16, 6, 6, + 6, 6, 10, 6, 4, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ - [44, 44, 5, 35, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 5, 1], + [44, 44, 5, 35, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 5, 0], [44, 44, 5, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 40, 44, 44, 44, 44, 44, 17, 17, 63, 17, 43, 29, 24, 23], + [44, 44, 40, 44, 44, 44, 44, 44, 17, 17, 62, 17, 43, 29, 24, 23], + [44, 44, 44, 32, 36, 21, 22, 15, 13, 34, 44, 44, 44, 11, 30, 39], [44, 44, 44, 44, 9, 8, 45, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 37, 28, 67, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 0, 44, 44, 44], + [44, 44, 44, 44, 37, 28, 66, 44, 44, 44, 44, 44, 44, 44, 44, 44], [44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 44, 44, 55, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 44, 44, 62, 61, 44, 20, 14, 16, 4], - [44, 44, 44, 44, 56, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 59, 44, 44, 31, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 60, 46, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 49, 44, 32, 36, 21, 22, 15, 13, 34, 44, 44, 44, 11, 30, 39], - [52, 54, 26, 50, 12, 7, 25, 51, 41, 53, 6, 3, 66, 65, 64, 68], - [57, 44, 9, 47, 44, 42, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [58, 19, 2, 18, 10, 48, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [58, 38, 17, 27, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 57, 44, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 44, 44, 49, 44, 44, 44, 44, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 44, 44, 61, 60, 44, 20, 14, 16, 4], + [44, 44, 44, 44, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [44, 44, 53, 44, 44, 31, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [44, 44, 54, 46, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [51, 44, 9, 47, 44, 42, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [52, 19, 2, 18, 10, 48, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [52, 38, 17, 27, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [58, 1, 26, 55, 12, 7, 25, 56, 41, 59, 6, 3, 65, 64, 63, 67], ]; static BITSET_CANONICAL: [u64; 44] = [ - 0b0000011111111111111111111111111000000000000000000000000000000000, 0b0000000000111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111110000000000000000000000000011111111111111, 0b0101010101010101010101010101010101010101010101010101010000000001, 0b0000011111111111111111111111110000000000000000000000000000000001, 0b0000000000100000000000000000000000010101010000010001101011110101, @@ -692,13 +677,15 @@ pub mod uppercase { 0b1111011111111111000000000000000000000000000000000000000000000000, 0b1111111100000000111111110000000000111111000000001111111100000000, ]; - static BITSET_MAPPING: [(u8, u8); 25] = [ - (0, 187), (0, 177), (0, 171), (0, 167), (0, 164), (0, 32), (0, 47), (0, 51), (0, 121), - (0, 117), (0, 109), (1, 150), (1, 148), (1, 142), (1, 134), (1, 131), (1, 64), (2, 164), - (2, 146), (2, 20), (3, 146), (3, 140), (3, 134), (4, 178), (4, 171), + static BITSET_MAPPING: [(u8, u8); 24] = [ + (0, 182), (0, 74), (0, 166), (0, 162), (0, 159), (0, 150), (0, 148), (0, 142), (0, 134), + (0, 131), (0, 64), (1, 66), (1, 70), (1, 83), (1, 12), (1, 8), (2, 164), (2, 146), (2, 20), + (3, 146), (3, 140), (3, 134), (4, 178), (4, 171), ]; pub const fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xc0 && super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, @@ -711,24 +698,18 @@ pub mod uppercase { #[rustfmt::skip] pub mod white_space { - static WHITESPACE_MAP: [u8; 256] = [ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]; #[inline] pub const fn lookup(c: char) -> bool { - match c as u32 >> 8 { - 0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0, - 22 => c as u32 == 0x1680, - 32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0, - 48 => c as u32 == 0x3000, + debug_assert!(!c.is_ascii()); + match c as u32 { + 0x85 => true, + 0xa0 => true, + 0x1680 => true, + 0x2000..=0x200a => true, + 0x2028..=0x2029 => true, + 0x202f => true, + 0x205f => true, + 0x3000 => true, _ => false, } } @@ -772,7 +753,7 @@ pub mod conversions { } } - static LOWERCASE_TABLE: &[(char, u32)] = &[ + static LOWERCASE_TABLE: &[(char, u32); 1434] = &[ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), ('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238), @@ -1122,11 +1103,11 @@ pub mod conversions { ('\u{1e921}', 125251), ]; - static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static LOWERCASE_TABLE_MULTI: &[[char; 3]; 1] = &[ ['i', '\u{307}', '\u{0}'], ]; - static UPPERCASE_TABLE: &[(char, u32)] = &[ + static UPPERCASE_TABLE: &[(char, u32); 1526] = &[ ('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194), ('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199), ('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204), @@ -1499,7 +1480,7 @@ pub mod conversions { ('\u{1e941}', 125215), ('\u{1e942}', 125216), ('\u{1e943}', 125217), ]; - static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static UPPERCASE_TABLE_MULTI: &[[char; 3]; 102] = &[ ['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'], ['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'], ['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'], diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs deleted file mode 100644 index 78a7bba320870..0000000000000 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::collections::HashMap; -use std::fmt::Write as _; -use std::ops::Range; - -use crate::fmt_list; -use crate::raw_emitter::RawEmitter; - -impl RawEmitter { - pub fn emit_cascading_map(&mut self, ranges: &[Range]) -> bool { - let mut map: [u8; 256] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]; - - let points = ranges - .iter() - .flat_map(|r| (r.start..r.end).collect::>()) - .collect::>(); - - println!("there are {} points", points.len()); - - // how many distinct ranges need to be counted? - let mut codepoints_by_high_bytes = HashMap::>::new(); - for point in points { - // assert that there is no whitespace over the 0x3000 range. - assert!(point <= 0x3000, "the highest unicode whitespace value has changed"); - let high_bytes = point as usize >> 8; - let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_default(); - codepoints.push(point); - } - - let mut bit_for_high_byte = 1u8; - let mut arms = Vec::::new(); - - let mut high_bytes: Vec = codepoints_by_high_bytes.keys().copied().collect(); - high_bytes.sort(); - for high_byte in high_bytes { - let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); - if codepoints.len() == 1 { - let ch = codepoints.pop().unwrap(); - arms.push(format!("{high_byte} => c as u32 == {ch:#04x}")); - continue; - } - // more than 1 codepoint in this arm - for codepoint in codepoints { - map[(*codepoint & 0xff) as usize] |= bit_for_high_byte; - } - arms.push(format!( - "{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0" - )); - bit_for_high_byte <<= 1; - } - - writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter())) - .unwrap(); - self.bytes_used += 256; - - writeln!(&mut self.file, "#[inline]").unwrap(); - writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); - for arm in arms { - writeln!(&mut self.file, " {arm},").unwrap(); - } - writeln!(&mut self.file, " _ => false,").unwrap(); - writeln!(&mut self.file, " }}").unwrap(); - writeln!(&mut self.file, "}}").unwrap(); - - true - } -} diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 9c6454492e7e0..49aef3ec33ec7 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -6,24 +6,26 @@ use crate::{UnicodeData, fmt_list}; const INDEX_MASK: u32 = 1 << 22; -pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String { +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) { let mut file = String::new(); write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap(); file.push_str("\n\n"); file.push_str(HEADER.trim_start()); file.push('\n'); - file.push_str(&generate_tables("LOWER", &data.to_lower)); + let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower); + file.push_str(&lower_tables); file.push_str("\n\n"); - file.push_str(&generate_tables("UPPER", &data.to_upper)); - file + let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper); + file.push_str(&upper_tables); + (file, [lower_size, upper_size]) } -fn generate_tables(case: &str, data: &BTreeMap) -> String { +fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { let mut mappings = Vec::with_capacity(data.len()); let mut multis = Vec::new(); - for (&key, &(a, b, c)) in data.iter() { + for (&key, &[a, b, c]) in data.iter() { let key = char::from_u32(key).unwrap(); if key.is_ascii() { @@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap) -> String } let mut tables = String::new(); - - write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings)) - .unwrap(); + let mut size = 0; + + size += size_of_val(mappings.as_slice()); + write!( + tables, + "static {}CASE_TABLE: &[(char, u32); {}] = &[{}];", + case, + mappings.len(), + fmt_list(mappings), + ) + .unwrap(); tables.push_str("\n\n"); - write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis)) - .unwrap(); - - tables + size += size_of_val(multis.as_slice()); + write!( + tables, + "static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];", + case, + multis.len(), + fmt_list(multis), + ) + .unwrap(); + + (tables, size) } struct CharEscape(char); diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 6cdb82a87bdff..85dc597b50a89 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -72,37 +72,38 @@ //! or not. use std::collections::{BTreeMap, HashMap}; +use std::fmt; +use std::fmt::Write; use std::ops::Range; use ucd_parse::Codepoints; -mod cascading_map; mod case_mapping; +mod r#match; mod raw_emitter; mod skiplist; mod unicode_download; -use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace}; +use raw_emitter::{RawEmitter, emit_codepoints}; static PROPERTIES: &[&str] = &[ "Alphabetic", "Lowercase", "Uppercase", - "Cased", + "Lt", "Case_Ignorable", "Grapheme_Extend", "White_Space", - "Cc", "N", ]; struct UnicodeData { ranges: Vec<(&'static str, Vec>)>, - to_upper: BTreeMap, - to_lower: BTreeMap, + to_upper: BTreeMap, + to_lower: BTreeMap, } -fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32, u32, u32)> { +fn to_mapping(origin: u32, codepoints: Vec) -> Option<[u32; 3]> { let mut a = None; let mut b = None; let mut c = None; @@ -123,7 +124,7 @@ fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32 } } - Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0))) + Some([a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)]) } static UNICODE_DIRECTORY: &str = "unicode-downloads"; @@ -163,12 +164,12 @@ fn load_data() -> UnicodeData { if let Some(mapped) = row.simple_lowercase_mapping && mapped != row.codepoint { - to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_lower.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } if let Some(mapped) = row.simple_uppercase_mapping && mapped != row.codepoint { - to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } } @@ -187,33 +188,20 @@ fn load_data() -> UnicodeData { } } - let mut properties: HashMap<&'static str, Vec>> = properties + let mut properties: Vec<(&'static str, Vec>)> = properties .into_iter() - .map(|(k, v)| { - ( - k, - v.into_iter() - .flat_map(|codepoints| match codepoints { - Codepoints::Single(c) => c - .scalar() - .map(|ch| ch as u32..ch as u32 + 1) - .into_iter() - .collect::>(), - Codepoints::Range(c) => c - .into_iter() - .flat_map(|c| c.scalar().map(|ch| ch as u32..ch as u32 + 1)) - .collect::>(), - }) - .collect::>>(), - ) + .map(|(prop, codepoints)| { + let codepoints = codepoints + .into_iter() + .flatten() + .flat_map(|cp| cp.scalar()) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .collect::>(); + (prop, ranges_from_set(&codepoints)) }) .collect(); - for ranges in properties.values_mut() { - merge_ranges(ranges); - } - - let mut properties = properties.into_iter().collect::>(); properties.sort_by_key(|p| p.0); UnicodeData { ranges: properties, to_lower, to_upper } } @@ -236,24 +224,25 @@ fn main() { let ranges_by_property = &unicode_data.ranges; if let Some(path) = test_path { - std::fs::write(&path, generate_tests(&write_location, ranges_by_property)).unwrap(); + std::fs::write(&path, generate_tests(&unicode_data).unwrap()).unwrap(); } + let mut table_file = String::new(); + table_file.push_str( + "///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", + ); + let mut total_bytes = 0; let mut modules = Vec::new(); for (property, ranges) in ranges_by_property { let datapoints = ranges.iter().map(|r| r.end - r.start).sum::(); let mut emitter = RawEmitter::new(); - if property == &"White_Space" { - emit_whitespace(&mut emitter, ranges); - } else { - emit_codepoints(&mut emitter, ranges); - } + emit_codepoints(&mut emitter, ranges); modules.push((property.to_lowercase().to_string(), emitter.file)); - println!( - "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}", + table_file.push_str(&format!( + "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", property, emitter.bytes_used, datapoints, @@ -261,15 +250,15 @@ fn main() { ranges.first().unwrap().start, ranges.last().unwrap().end, emitter.desc, - ); + )); total_bytes += emitter.bytes_used; } - - let mut table_file = String::new(); - - table_file.push_str( - "///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", - ); + let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); + for (name, size) in ["to_lower", "to_upper"].iter().zip(sizes) { + table_file.push_str(&format!("// {:16}: {:5} bytes\n", name, size)); + total_bytes += size; + } + table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function table_file.push('\n'); @@ -280,7 +269,7 @@ fn main() { table_file.push('\n'); - modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data))); + modules.push((String::from("conversions"), conversions)); for (name, contents) in modules { table_file.push_str("#[rustfmt::skip]\n"); @@ -296,8 +285,6 @@ fn main() { } std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); - - println!("Total table sizes: {total_bytes} bytes"); } fn version() -> String { @@ -337,110 +324,96 @@ fn fmt_list(values: impl IntoIterator) -> String { out } -fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String { +fn generate_tests(data: &UnicodeData) -> Result { let mut s = String::new(); - s.push_str("#![allow(incomplete_features, unused)]\n"); - s.push_str("#![feature(const_generics)]\n\n"); - s.push_str("\n#[allow(unused)]\nuse std::hint;\n"); - s.push_str(&format!("#[path = \"{data_path}\"]\n")); - s.push_str("mod unicode_data;\n\n"); - - s.push_str("\nfn main() {\n"); - - for (property, ranges) in ranges { - s.push_str(&format!(r#" println!("Testing {property}");"#)); - s.push('\n'); - s.push_str(&format!(" {}_true();\n", property.to_lowercase())); - s.push_str(&format!(" {}_false();\n", property.to_lowercase())); - let mut is_true = Vec::new(); - let mut is_false = Vec::new(); - for ch_num in 0..(std::char::MAX as u32) { - if std::char::from_u32(ch_num).is_none() { - continue; - } - if ranges.iter().any(|r| r.contains(&ch_num)) { - is_true.push(ch_num); - } else { - is_false.push(ch_num); - } - } - - s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase())); - generate_asserts(&mut s, property, &is_true, true); - s.push_str(" }\n\n"); - s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase())); - generate_asserts(&mut s, property, &is_false, false); - s.push_str(" }\n\n"); + writeln!(s, "#![feature(core_intrinsics)]")?; + writeln!(s, "#![allow(internal_features, dead_code)]")?; + writeln!(s, "// ignore-tidy-filelength")?; + writeln!(s, "use std::intrinsics;")?; + writeln!(s, "mod unicode_data;")?; + writeln!(s, "fn main() {{")?; + for (property, ranges) in &data.ranges { + let prop = property.to_lowercase(); + writeln!(s, r#" println!("Testing {prop}");"#)?; + writeln!(s, " {prop}_true();")?; + writeln!(s, " {prop}_false();")?; + let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .partition(|c| ranges.iter().any(|r| r.contains(c))); + + writeln!(s, " fn {prop}_true() {{")?; + generate_asserts(&mut s, &prop, &is_true, true)?; + writeln!(s, " }}")?; + + writeln!(s, " fn {prop}_false() {{")?; + generate_asserts(&mut s, &prop, &is_false, false)?; + writeln!(s, " }}")?; } - s.push('}'); - s -} - -fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) { - for range in ranges_from_set(points) { - if range.end == range.start + 1 { - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n", - if truthy { "" } else { "!" }, - property.to_lowercase(), - std::char::from_u32(range.start).unwrap(), - range.start, - )); - } else { - s.push_str(&format!(" for chn in {range:?}u32 {{\n")); - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", - if truthy { "" } else { "!" }, - property.to_lowercase(), - )); - s.push_str(" }\n"); + for (name, conversion) in ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper]) + { + writeln!(s, r#" println!("Testing {name}");"#)?; + for (c, mapping) in conversion { + let c = char::from_u32(*c).unwrap(); + let mapping = mapping.map(|c| char::from_u32(c).unwrap()); + writeln!( + s, + r#" assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});"# + )?; + } + let unmapped: Vec<_> = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .filter(|c| !conversion.contains_key(c)) + .collect(); + let unmapped_ranges = ranges_from_set(&unmapped); + for range in unmapped_ranges { + let start = char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!( + s, + r#" assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);"# + )?; + + writeln!(s, " }}")?; } } -} -fn ranges_from_set(set: &[u32]) -> Vec> { - let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::>>(); - merge_ranges(&mut ranges); - ranges + writeln!(s, "}}")?; + Ok(s) } -fn merge_ranges(ranges: &mut Vec>) { - loop { - let mut new_ranges = Vec::new(); - let mut idx_iter = 0..(ranges.len() - 1); - let mut should_insert_last = true; - while let Some(idx) = idx_iter.next() { - let cur = ranges[idx].clone(); - let next = ranges[idx + 1].clone(); - if cur.end == next.start { - if idx_iter.next().is_none() { - // We're merging the last element - should_insert_last = false; - } - new_ranges.push(cur.start..next.end); - } else { - // We're *not* merging the last element - should_insert_last = true; - new_ranges.push(cur); +fn generate_asserts( + s: &mut String, + prop: &str, + points: &[u32], + truthy: bool, +) -> Result<(), fmt::Error> { + let truthy = if truthy { "" } else { "!" }; + for range in ranges_from_set(points) { + let start = char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + match range.len() { + 1 => writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup({start:?}));")?, + _ => { + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup(c));")?; + writeln!(s, " }}")?; } } - if should_insert_last { - new_ranges.push(ranges.last().unwrap().clone()); - } - if new_ranges.len() == ranges.len() { - *ranges = new_ranges; - break; - } else { - *ranges = new_ranges; - } } + Ok(()) +} - let mut last_end = None; - for range in ranges { - if let Some(last) = last_end { - assert!(range.start > last, "{range:?}"); - } - last_end = Some(range.end); - } +/// Group the elements of `set` into contigous ranges +fn ranges_from_set(set: &[u32]) -> Vec> { + set.chunk_by(|a, b| a + 1 == *b) + .map(|chunk| { + let start = *chunk.first().unwrap(); + let end = *chunk.last().unwrap(); + start..(end + 1) + }) + .collect() } diff --git a/src/tools/unicode-table-generator/src/match.rs b/src/tools/unicode-table-generator/src/match.rs new file mode 100644 index 0000000000000..440178f919e70 --- /dev/null +++ b/src/tools/unicode-table-generator/src/match.rs @@ -0,0 +1,30 @@ +use std::fmt::{self, Write as _}; +use std::ops::Range; + +use crate::raw_emitter::RawEmitter; + +impl RawEmitter { + pub fn emit_match(&mut self, ranges: &[Range]) -> Result<(), fmt::Error> { + let arms: Vec<_> = ranges + .iter() + .map(|range| match range.len() { + 1 => format!("{:#x} => true,", range.start), + + // minus one because inclusive range pattern + _ => format!("{:#x}..={:#x} => true,", range.start, range.end - 1), + }) + .collect(); + + writeln!(self.file, "#[inline]")?; + writeln!(self.file, "pub const fn lookup(c: char) -> bool {{")?; + writeln!(self.file, " debug_assert!(!c.is_ascii());")?; + writeln!(self.file, " match c as u32 {{")?; + for arm in arms { + writeln!(self.file, " {arm}")?; + } + writeln!(self.file, " _ => false,")?; + writeln!(self.file, " }}")?; + writeln!(self.file, "}}")?; + Ok(()) + } +} diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index e9e0efc459442..db7c6b42af17e 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -98,6 +98,7 @@ impl RawEmitter { self.blank_line(); writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); if first_code_point > 0x7f { writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap(); } @@ -155,6 +156,12 @@ impl RawEmitter { pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { emitter.blank_line(); + if ranges.len() <= 10 { + emitter.emit_match(ranges).unwrap(); + emitter.desc = String::from("match"); + return; + } + let mut bitset = emitter.clone(); let bitset_ok = bitset.emit_bitset(ranges).is_ok(); @@ -170,15 +177,6 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { } } -pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range]) { - emitter.blank_line(); - - let mut cascading = emitter.clone(); - cascading.emit_cascading_map(ranges); - *emitter = cascading; - emitter.desc = String::from("cascading"); -} - struct Canonicalized { canonical_words: Vec, canonicalized_words: Vec<(u8, u8)>, @@ -339,10 +337,13 @@ impl Canonicalized { // We'll probably always have some slack though so this loop will still // be needed. for &w in unique_words { - unique_mapping.entry(w).or_insert_with(|| { + if !unique_mapping.contains_key(&w) { + assert_eq!( + unique_mapping.insert(w, UniqueMapping::Canonical(canonical_words.len())), + None + ); canonical_words.push(w); - UniqueMapping::Canonical(canonical_words.len()) - }); + } } assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len()); assert_eq!(unique_mapping.len(), unique_words.len()); diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs index 34c9802e122f6..660a8f342f7a7 100644 --- a/src/tools/unicode-table-generator/src/skiplist.rs +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -99,6 +99,7 @@ impl RawEmitter { if first_code_point > 0x7f { writeln!(&mut self.file, "#[inline]").unwrap(); writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)") .unwrap(); writeln!(&mut self.file, "}}").unwrap(); @@ -107,6 +108,7 @@ impl RawEmitter { writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap(); } else { writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); } writeln!(&mut self.file, " const {{").unwrap(); writeln!(