Skip to content

Commit 4aeb297

Browse files
committed
Revert "unicode_data refactors RUST-147622"
This PR reverts RUST-147622 for several reasons: 1. The RUST-147622 PR would format the generated core library code using an arbitrary `rustfmt` picked up from `PATH`, which will cause hard-to-debug failures when the `rustfmt` used to format the generated unicode data code versus the `rustfmt` used to format the in-tree library code. 2. Previously, the `unicode-table-generator` tests were not run under CI as part of `coretests`, and since for `x86_64-gnu-aux` job we run library `coretests` with `miri`, the generated tests unfortunately caused an unacceptably large Merge CI time regression from ~2 hours to ~3.5 hours, making it the slowest Merge CI job (and thus the new bottleneck). 3. This PR also has an unintended effect of causing a diagnostic regression (RUST-148387), though that's mostly an edge case not properly handled by `rustc` diagnostics. Given that these are three distinct causes with non-trivial fixes, I'm proposing to revert this PR to return us to baseline. This is not prejudice against relanding the changes with these issues addressed, but to alleviate time pressure to address these non-trivial issues.
1 parent f2bae99 commit 4aeb297

File tree

13 files changed

+1592
-4700
lines changed

13 files changed

+1592
-4700
lines changed

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
1818

1919
pub(crate) mod printable;
2020

21-
mod rt;
2221
#[allow(unreachable_pub)]
23-
pub mod unicode_data;
22+
mod unicode_data;
2423

2524
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
2625
/// `char` and `str` methods are based on.

library/core/src/unicode/unicode_data.rs

Lines changed: 1215 additions & 1322 deletions
Large diffs are not rendered by default.

library/coretests/tests/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@
116116
#![feature(try_find)]
117117
#![feature(try_trait_v2)]
118118
#![feature(uint_bit_width)]
119-
#![feature(unicode_internals)]
120119
#![feature(unsize)]
121120
#![feature(unwrap_infallible)]
122121
// tidy-alphabetical-end

library/coretests/tests/unicode.rs

Lines changed: 0 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1,101 +1,5 @@
1-
use core::unicode::unicode_data;
2-
use std::ops::RangeInclusive;
3-
4-
mod test_data;
5-
61
#[test]
72
pub fn version() {
83
let (major, _minor, _update) = core::char::UNICODE_VERSION;
94
assert!(major >= 10);
105
}
11-
12-
#[track_caller]
13-
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
14-
let mut start = '\u{80}';
15-
for range in ranges {
16-
for c in start..*range.start() {
17-
assert!(!lookup(c), "{c:?}");
18-
}
19-
for c in range.clone() {
20-
assert!(lookup(c), "{c:?}");
21-
}
22-
start = char::from_u32(*range.end() as u32 + 1).unwrap();
23-
}
24-
for c in start..=char::MAX {
25-
assert!(!lookup(c), "{c:?}");
26-
}
27-
}
28-
29-
#[track_caller]
30-
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
31-
let mut start = '\u{80}';
32-
for &(key, val) in ranges {
33-
for c in start..key {
34-
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
35-
}
36-
assert_eq!(lookup(key), val, "{key:?}");
37-
start = char::from_u32(key as u32 + 1).unwrap();
38-
}
39-
for c in start..=char::MAX {
40-
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
41-
}
42-
}
43-
44-
#[test]
45-
#[cfg_attr(miri, ignore)]
46-
fn alphabetic() {
47-
test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
48-
}
49-
50-
#[test]
51-
#[cfg_attr(miri, ignore)]
52-
fn case_ignorable() {
53-
test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
54-
}
55-
56-
#[test]
57-
#[cfg_attr(miri, ignore)]
58-
fn cased() {
59-
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
60-
}
61-
62-
#[test]
63-
#[cfg_attr(miri, ignore)]
64-
fn grapheme_extend() {
65-
test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
66-
}
67-
68-
#[test]
69-
#[cfg_attr(miri, ignore)]
70-
fn lowercase() {
71-
test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
72-
}
73-
74-
#[test]
75-
fn n() {
76-
test_boolean_property(test_data::N, unicode_data::n::lookup);
77-
}
78-
79-
#[test]
80-
#[cfg_attr(miri, ignore)]
81-
fn uppercase() {
82-
test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
83-
}
84-
85-
#[test]
86-
#[cfg_attr(miri, ignore)]
87-
fn white_space() {
88-
test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
89-
}
90-
91-
#[test]
92-
#[cfg_attr(miri, ignore)]
93-
fn to_lowercase() {
94-
test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
95-
}
96-
97-
#[test]
98-
#[cfg_attr(miri, ignore)]
99-
fn to_uppercase() {
100-
test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
101-
}

library/coretests/tests/unicode/test_data.rs

Lines changed: 0 additions & 2928 deletions
This file was deleted.

src/bootstrap/src/core/build_steps/run.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,6 @@ impl Step for UnicodeTableGenerator {
374374
fn run(self, builder: &Builder<'_>) {
375375
let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
376376
cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
377-
cmd.arg(builder.src.join("library/coretests/tests/unicode/test_data.rs"));
378377
cmd.run(builder);
379378
}
380379
}

src/tools/unicode-table-generator/src/cascading_map.rs

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use std::collections::HashMap;
2+
use std::fmt::Write as _;
23
use std::ops::Range;
34

5+
use crate::fmt_list;
46
use crate::raw_emitter::RawEmitter;
5-
use crate::writeln;
67

78
impl RawEmitter {
89
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
@@ -23,6 +24,8 @@ impl RawEmitter {
2324
.flat_map(|r| (r.start..r.end).collect::<Vec<u32>>())
2425
.collect::<Vec<u32>>();
2526

27+
println!("there are {} points", points.len());
28+
2629
// how many distinct ranges need to be counted?
2730
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
2831
for point in points {
@@ -34,41 +37,41 @@ impl RawEmitter {
3437
}
3538

3639
let mut bit_for_high_byte = 1u8;
37-
let mut arms = String::new();
40+
let mut arms = Vec::<String>::new();
3841

3942
let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().copied().collect();
4043
high_bytes.sort();
4144
for high_byte in high_bytes {
4245
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
4346
if codepoints.len() == 1 {
4447
let ch = codepoints.pop().unwrap();
45-
writeln!(arms, "{high_byte:#04x} => c as u32 == {ch:#04x},");
48+
arms.push(format!("{high_byte} => c as u32 == {ch:#04x}"));
4649
continue;
4750
}
4851
// more than 1 codepoint in this arm
4952
for codepoint in codepoints {
5053
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
5154
}
52-
writeln!(
53-
arms,
54-
"{high_byte:#04x} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0,"
55-
);
55+
arms.push(format!(
56+
"{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0"
57+
));
5658
bit_for_high_byte <<= 1;
5759
}
5860

61+
writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
62+
.unwrap();
5963
self.bytes_used += 256;
60-
self.file = format!(
61-
"static WHITESPACE_MAP: [u8; 256] = {map:?};
6264

63-
#[inline]
64-
pub const fn lookup(c: char) -> bool {{
65-
debug_assert!(!c.is_ascii());
66-
match c as u32 >> 8 {{
67-
{arms}\
68-
_ => false,
69-
}}
70-
}}"
71-
);
65+
writeln!(&mut self.file, "#[inline]").unwrap();
66+
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
67+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
68+
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
69+
for arm in arms {
70+
writeln!(&mut self.file, " {arm},").unwrap();
71+
}
72+
writeln!(&mut self.file, " _ => false,").unwrap();
73+
writeln!(&mut self.file, " }}").unwrap();
74+
writeln!(&mut self.file, "}}").unwrap();
7275

7376
true
7477
}
Lines changed: 82 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,27 @@
11
use std::char;
22
use std::collections::BTreeMap;
3+
use std::fmt::{self, Write};
34

4-
use crate::fmt_helpers::Hex;
5-
use crate::{CharEscape, UnicodeData, fmt_list};
5+
use crate::{UnicodeData, fmt_list};
66

77
const INDEX_MASK: u32 = 1 << 22;
88

99
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
10+
let mut file = String::new();
11+
12+
write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
13+
file.push_str("\n\n");
14+
file.push_str(HEADER.trim_start());
15+
file.push('\n');
1016
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
17+
file.push_str(&lower_tables);
18+
file.push_str("\n\n");
1119
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
12-
let file = format!(
13-
"{lower_tables}
14-
{upper_tables}"
15-
);
20+
file.push_str(&upper_tables);
1621
(file, [lower_size, upper_size])
1722
}
1823

1924
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
20-
let case_lower = case.to_lowercase();
21-
let case_upper = case.to_uppercase();
22-
2325
let mut mappings = Vec::with_capacity(data.len());
2426
let mut multis = Vec::new();
2527

@@ -42,49 +44,77 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
4244
INDEX_MASK | (u32::try_from(multis.len()).unwrap() - 1)
4345
};
4446

45-
mappings.push((CharEscape(key), Hex(value)));
47+
mappings.push((CharEscape(key), value));
48+
}
49+
50+
let mut tables = String::new();
51+
let mut size = 0;
52+
53+
size += size_of_val(mappings.as_slice());
54+
write!(
55+
tables,
56+
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
57+
case,
58+
mappings.len(),
59+
fmt_list(mappings),
60+
)
61+
.unwrap();
62+
63+
tables.push_str("\n\n");
64+
65+
size += size_of_val(multis.as_slice());
66+
write!(
67+
tables,
68+
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
69+
case,
70+
multis.len(),
71+
fmt_list(multis),
72+
)
73+
.unwrap();
74+
75+
(tables, size)
76+
}
77+
78+
struct CharEscape(char);
79+
80+
impl fmt::Debug for CharEscape {
81+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
82+
write!(f, "'{}'", self.0.escape_default())
83+
}
84+
}
85+
86+
static HEADER: &str = r"
87+
pub fn to_lower(c: char) -> [char; 3] {
88+
if c.is_ascii() {
89+
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
90+
} else {
91+
LOWERCASE_TABLE
92+
.binary_search_by(|&(key, _)| key.cmp(&c))
93+
.map(|i| {
94+
let u = LOWERCASE_TABLE[i].1;
95+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
96+
// SAFETY: Index comes from statically generated table
97+
unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
98+
})
99+
})
100+
.unwrap_or([c, '\0', '\0'])
46101
}
102+
}
47103
48-
let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice());
49-
let file = format!(
50-
"
51-
#[rustfmt::skip]
52-
static {case}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];
53-
54-
#[rustfmt::skip]
55-
static {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];
56-
57-
#[inline]
58-
pub fn to_{case_lower}(c: char) -> [char; 3] {{
59-
const {{
60-
let mut i = 0;
61-
while i < {case_upper}CASE_TABLE.len() {{
62-
let (_, val) = {case_upper}CASE_TABLE[i];
63-
if val & (1 << 22) == 0 {{
64-
assert!(char::from_u32(val).is_some());
65-
}} else {{
66-
let index = val & ((1 << 22) - 1);
67-
assert!((index as usize) < {case_upper}CASE_TABLE_MULTI.len());
68-
}}
69-
i += 1;
70-
}}
71-
}}
72-
73-
// SAFETY: Just checked that the tables are valid
74-
unsafe {{
75-
super::case_conversion(
76-
c,
77-
|c| c.to_ascii_{case_lower}case(),
78-
{case_upper}CASE_TABLE,
79-
{case_upper}CASE_TABLE_MULTI,
80-
)
81-
}}
82-
}}",
83-
mappings = fmt_list(&mappings),
84-
mappings_len = mappings.len(),
85-
multis = fmt_list(&multis),
86-
multis_len = multis.len(),
87-
);
88-
89-
(file, size)
104+
pub fn to_upper(c: char) -> [char; 3] {
105+
if c.is_ascii() {
106+
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
107+
} else {
108+
UPPERCASE_TABLE
109+
.binary_search_by(|&(key, _)| key.cmp(&c))
110+
.map(|i| {
111+
let u = UPPERCASE_TABLE[i].1;
112+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
113+
// SAFETY: Index comes from statically generated table
114+
unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
115+
})
116+
})
117+
.unwrap_or([c, '\0', '\0'])
118+
}
90119
}
120+
";

0 commit comments

Comments
 (0)