Skip to content

Commit 4eb6212

Browse files
authored
Rollup merge of rust-lang#145414 - Kmeakin:km/unicode-table-refactors, r=joshtriplett,tgross35
unicode-table-generator refactors. Split off from rust-lang#145219.
2 parents e97edb0 + c3ce079 commit 4eb6212

File tree

3 files changed

+159
-153
lines changed

3 files changed

+159
-153
lines changed

library/core/src/unicode/unicode_data.rs

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,16 @@
11
///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
2+
// Alphabetic : 1727 bytes, 142759 codepoints in 757 ranges (U+000041 - U+0323B0) using skiplist
3+
// Case_Ignorable : 1053 bytes, 2749 codepoints in 452 ranges (U+000027 - U+0E01F0) using skiplist
4+
// Cased : 407 bytes, 4578 codepoints in 159 ranges (U+000041 - U+01F18A) using skiplist
5+
// Cc : 9 bytes, 65 codepoints in 2 ranges (U+000000 - U+0000A0) using skiplist
6+
// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist
7+
// Lowercase : 935 bytes, 2569 codepoints in 675 ranges (U+000061 - U+01E944) using bitset
8+
// N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist
9+
// Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset
10+
// White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading
11+
// to_lower : 11484 bytes
12+
// to_upper : 13432 bytes
13+
// Total : 31446 bytes
214

315
#[inline(always)]
416
const fn bitset_search<
@@ -747,7 +759,7 @@ pub mod conversions {
747759
}
748760
}
749761

750-
static LOWERCASE_TABLE: &[(char, u32)] = &[
762+
static LOWERCASE_TABLE: &[(char, u32); 1434] = &[
751763
('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228),
752764
('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233),
753765
('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238),
@@ -1097,11 +1109,11 @@ pub mod conversions {
10971109
('\u{1e921}', 125251),
10981110
];
10991111

1100-
static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[
1112+
static LOWERCASE_TABLE_MULTI: &[[char; 3]; 1] = &[
11011113
['i', '\u{307}', '\u{0}'],
11021114
];
11031115

1104-
static UPPERCASE_TABLE: &[(char, u32)] = &[
1116+
static UPPERCASE_TABLE: &[(char, u32); 1526] = &[
11051117
('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194),
11061118
('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199),
11071119
('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204),
@@ -1474,7 +1486,7 @@ pub mod conversions {
14741486
('\u{1e941}', 125215), ('\u{1e942}', 125216), ('\u{1e943}', 125217),
14751487
];
14761488

1477-
static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[
1489+
static UPPERCASE_TABLE_MULTI: &[[char; 3]; 102] = &[
14781490
['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'],
14791491
['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'],
14801492
['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'],

src/tools/unicode-table-generator/src/case_mapping.rs

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,26 @@ use crate::{UnicodeData, fmt_list};
66

77
const INDEX_MASK: u32 = 1 << 22;
88

9-
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
9+
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
1010
let mut file = String::new();
1111

1212
write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
1313
file.push_str("\n\n");
1414
file.push_str(HEADER.trim_start());
1515
file.push('\n');
16-
file.push_str(&generate_tables("LOWER", &data.to_lower));
16+
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
17+
file.push_str(&lower_tables);
1718
file.push_str("\n\n");
18-
file.push_str(&generate_tables("UPPER", &data.to_upper));
19-
file
19+
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
20+
file.push_str(&upper_tables);
21+
(file, [lower_size, upper_size])
2022
}
2123

22-
fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String {
24+
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
2325
let mut mappings = Vec::with_capacity(data.len());
2426
let mut multis = Vec::new();
2527

26-
for (&key, &(a, b, c)) in data.iter() {
28+
for (&key, &[a, b, c]) in data.iter() {
2729
let key = char::from_u32(key).unwrap();
2830

2931
if key.is_ascii() {
@@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String
4648
}
4749

4850
let mut tables = String::new();
49-
50-
write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings))
51-
.unwrap();
51+
let mut size = 0;
52+
53+
size += size_of_val(mappings.as_slice());
54+
write!(
55+
tables,
56+
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
57+
case,
58+
mappings.len(),
59+
fmt_list(mappings),
60+
)
61+
.unwrap();
5262

5363
tables.push_str("\n\n");
5464

55-
write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis))
56-
.unwrap();
57-
58-
tables
65+
size += size_of_val(multis.as_slice());
66+
write!(
67+
tables,
68+
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
69+
case,
70+
multis.len(),
71+
fmt_list(multis),
72+
)
73+
.unwrap();
74+
75+
(tables, size)
5976
}
6077

6178
struct CharEscape(char);

0 commit comments

Comments
 (0)