Skip to content

Commit f5711a5

Browse files
committed
Auto merge of rust-lang#148436 - jieyouxu:revert-unicode-generator, r=Zalathar
Revert "unicode_data refactors RUST-147622" This PR reverts RUST-147622 for several reasons: 1. The RUST-147622 PR would format the generated core library code using an arbitrary `rustfmt` picked up from `PATH`, which will cause hard-to-debug failures when the `rustfmt` used to format the generated unicode data code versus the `rustfmt` used to format the in-tree library code produce incompatible formatting. 2. Previously, the `unicode-table-generator` tests were not run under CI as part of `coretests`, and since for `x86_64-gnu-aux` job we run library `coretests` with `miri`, the generated tests unfortunately caused an unacceptably large Merge CI time regression from ~2 hours to ~3.5 hours, making it the slowest Merge CI job (and thus the new bottleneck). 3. This PR also has an unintended effect of causing a diagnostic regression (RUST-148387), though that's mostly an edge case not properly handled by `rustc` diagnostics. Given that these are three distinct causes with non-trivial fixes, I'm proposing to revert this PR to return us to baseline. **This is not prejudice against relanding the changes with these issues addressed, but to alleviate time pressure to address these non-trivial issues.** FYI `@Kmeakin` `@joboet` (PR author/review). Note that these issues are very subtle, so you cannot be reasonably expected to know about them beforehand. This was discussed in: - [#t-infra/bootstrap > &rust-lang#96;unicode_data&rust-lang#96; refactors PR](https://rust-lang.zulipchat.com/#narrow/channel/326414-t-infra.2Fbootstrap/topic/.60unicode_data.60.20refactors.20PR/with/553360227) - [#t-infra > x86_64-gnu-aux job went from ~2 to ~3.5 hours](https://rust-lang.zulipchat.com/#narrow/channel/242791-t-infra/topic/x86_64-gnu-aux.20job.20went.20from.20~2.20to.20~3.2E5.20hours/with/553354955)
2 parents 35ebdf9 + 4aeb297 commit f5711a5

File tree

13 files changed

+1592
-4700
lines changed

13 files changed

+1592
-4700
lines changed

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
1818

1919
pub(crate) mod printable;
2020

21-
mod rt;
2221
#[allow(unreachable_pub)]
23-
pub mod unicode_data;
22+
mod unicode_data;
2423

2524
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
2625
/// `char` and `str` methods are based on.

library/core/src/unicode/unicode_data.rs

Lines changed: 1215 additions & 1322 deletions
Large diffs are not rendered by default.

library/coretests/tests/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@
116116
#![feature(try_find)]
117117
#![feature(try_trait_v2)]
118118
#![feature(uint_bit_width)]
119-
#![feature(unicode_internals)]
120119
#![feature(unsize)]
121120
#![feature(unwrap_infallible)]
122121
// tidy-alphabetical-end

library/coretests/tests/unicode.rs

Lines changed: 0 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1,101 +1,5 @@
1-
use core::unicode::unicode_data;
2-
use std::ops::RangeInclusive;
3-
4-
mod test_data;
5-
61
#[test]
72
pub fn version() {
83
let (major, _minor, _update) = core::char::UNICODE_VERSION;
94
assert!(major >= 10);
105
}
11-
12-
#[track_caller]
13-
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
14-
let mut start = '\u{80}';
15-
for range in ranges {
16-
for c in start..*range.start() {
17-
assert!(!lookup(c), "{c:?}");
18-
}
19-
for c in range.clone() {
20-
assert!(lookup(c), "{c:?}");
21-
}
22-
start = char::from_u32(*range.end() as u32 + 1).unwrap();
23-
}
24-
for c in start..=char::MAX {
25-
assert!(!lookup(c), "{c:?}");
26-
}
27-
}
28-
29-
#[track_caller]
30-
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
31-
let mut start = '\u{80}';
32-
for &(key, val) in ranges {
33-
for c in start..key {
34-
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
35-
}
36-
assert_eq!(lookup(key), val, "{key:?}");
37-
start = char::from_u32(key as u32 + 1).unwrap();
38-
}
39-
for c in start..=char::MAX {
40-
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
41-
}
42-
}
43-
44-
#[test]
45-
#[cfg_attr(miri, ignore)]
46-
fn alphabetic() {
47-
test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
48-
}
49-
50-
#[test]
51-
#[cfg_attr(miri, ignore)]
52-
fn case_ignorable() {
53-
test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
54-
}
55-
56-
#[test]
57-
#[cfg_attr(miri, ignore)]
58-
fn cased() {
59-
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
60-
}
61-
62-
#[test]
63-
#[cfg_attr(miri, ignore)]
64-
fn grapheme_extend() {
65-
test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
66-
}
67-
68-
#[test]
69-
#[cfg_attr(miri, ignore)]
70-
fn lowercase() {
71-
test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
72-
}
73-
74-
#[test]
75-
fn n() {
76-
test_boolean_property(test_data::N, unicode_data::n::lookup);
77-
}
78-
79-
#[test]
80-
#[cfg_attr(miri, ignore)]
81-
fn uppercase() {
82-
test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
83-
}
84-
85-
#[test]
86-
#[cfg_attr(miri, ignore)]
87-
fn white_space() {
88-
test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
89-
}
90-
91-
#[test]
92-
#[cfg_attr(miri, ignore)]
93-
fn to_lowercase() {
94-
test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
95-
}
96-
97-
#[test]
98-
#[cfg_attr(miri, ignore)]
99-
fn to_uppercase() {
100-
test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
101-
}

library/coretests/tests/unicode/test_data.rs

Lines changed: 0 additions & 2928 deletions
This file was deleted.

src/bootstrap/src/core/build_steps/run.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,6 @@ impl Step for UnicodeTableGenerator {
374374
fn run(self, builder: &Builder<'_>) {
375375
let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
376376
cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
377-
cmd.arg(builder.src.join("library/coretests/tests/unicode/test_data.rs"));
378377
cmd.run(builder);
379378
}
380379
}

src/tools/unicode-table-generator/src/cascading_map.rs

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use std::collections::HashMap;
2+
use std::fmt::Write as _;
23
use std::ops::Range;
34

5+
use crate::fmt_list;
46
use crate::raw_emitter::RawEmitter;
5-
use crate::writeln;
67

78
impl RawEmitter {
89
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
@@ -23,6 +24,8 @@ impl RawEmitter {
2324
.flat_map(|r| (r.start..r.end).collect::<Vec<u32>>())
2425
.collect::<Vec<u32>>();
2526

27+
println!("there are {} points", points.len());
28+
2629
// how many distinct ranges need to be counted?
2730
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
2831
for point in points {
@@ -34,41 +37,41 @@ impl RawEmitter {
3437
}
3538

3639
let mut bit_for_high_byte = 1u8;
37-
let mut arms = String::new();
40+
let mut arms = Vec::<String>::new();
3841

3942
let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().copied().collect();
4043
high_bytes.sort();
4144
for high_byte in high_bytes {
4245
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
4346
if codepoints.len() == 1 {
4447
let ch = codepoints.pop().unwrap();
45-
writeln!(arms, "{high_byte:#04x} => c as u32 == {ch:#04x},");
48+
arms.push(format!("{high_byte} => c as u32 == {ch:#04x}"));
4649
continue;
4750
}
4851
// more than 1 codepoint in this arm
4952
for codepoint in codepoints {
5053
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
5154
}
52-
writeln!(
53-
arms,
54-
"{high_byte:#04x} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0,"
55-
);
55+
arms.push(format!(
56+
"{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0"
57+
));
5658
bit_for_high_byte <<= 1;
5759
}
5860

61+
writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
62+
.unwrap();
5963
self.bytes_used += 256;
60-
self.file = format!(
61-
"static WHITESPACE_MAP: [u8; 256] = {map:?};
6264

63-
#[inline]
64-
pub const fn lookup(c: char) -> bool {{
65-
debug_assert!(!c.is_ascii());
66-
match c as u32 >> 8 {{
67-
{arms}\
68-
_ => false,
69-
}}
70-
}}"
71-
);
65+
writeln!(&mut self.file, "#[inline]").unwrap();
66+
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
67+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
68+
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
69+
for arm in arms {
70+
writeln!(&mut self.file, " {arm},").unwrap();
71+
}
72+
writeln!(&mut self.file, " _ => false,").unwrap();
73+
writeln!(&mut self.file, " }}").unwrap();
74+
writeln!(&mut self.file, "}}").unwrap();
7275

7376
true
7477
}
Lines changed: 82 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,27 @@
11
use std::char;
22
use std::collections::BTreeMap;
3+
use std::fmt::{self, Write};
34

4-
use crate::fmt_helpers::Hex;
5-
use crate::{CharEscape, UnicodeData, fmt_list};
5+
use crate::{UnicodeData, fmt_list};
66

77
const INDEX_MASK: u32 = 1 << 22;
88

99
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
10+
let mut file = String::new();
11+
12+
write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
13+
file.push_str("\n\n");
14+
file.push_str(HEADER.trim_start());
15+
file.push('\n');
1016
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
17+
file.push_str(&lower_tables);
18+
file.push_str("\n\n");
1119
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
12-
let file = format!(
13-
"{lower_tables}
14-
{upper_tables}"
15-
);
20+
file.push_str(&upper_tables);
1621
(file, [lower_size, upper_size])
1722
}
1823

1924
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
20-
let case_lower = case.to_lowercase();
21-
let case_upper = case.to_uppercase();
22-
2325
let mut mappings = Vec::with_capacity(data.len());
2426
let mut multis = Vec::new();
2527

@@ -42,49 +44,77 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
4244
INDEX_MASK | (u32::try_from(multis.len()).unwrap() - 1)
4345
};
4446

45-
mappings.push((CharEscape(key), Hex(value)));
47+
mappings.push((CharEscape(key), value));
48+
}
49+
50+
let mut tables = String::new();
51+
let mut size = 0;
52+
53+
size += size_of_val(mappings.as_slice());
54+
write!(
55+
tables,
56+
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
57+
case,
58+
mappings.len(),
59+
fmt_list(mappings),
60+
)
61+
.unwrap();
62+
63+
tables.push_str("\n\n");
64+
65+
size += size_of_val(multis.as_slice());
66+
write!(
67+
tables,
68+
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
69+
case,
70+
multis.len(),
71+
fmt_list(multis),
72+
)
73+
.unwrap();
74+
75+
(tables, size)
76+
}
77+
78+
struct CharEscape(char);
79+
80+
impl fmt::Debug for CharEscape {
81+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
82+
write!(f, "'{}'", self.0.escape_default())
83+
}
84+
}
85+
86+
static HEADER: &str = r"
87+
pub fn to_lower(c: char) -> [char; 3] {
88+
if c.is_ascii() {
89+
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
90+
} else {
91+
LOWERCASE_TABLE
92+
.binary_search_by(|&(key, _)| key.cmp(&c))
93+
.map(|i| {
94+
let u = LOWERCASE_TABLE[i].1;
95+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
96+
// SAFETY: Index comes from statically generated table
97+
unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
98+
})
99+
})
100+
.unwrap_or([c, '\0', '\0'])
46101
}
102+
}
47103
48-
let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice());
49-
let file = format!(
50-
"
51-
#[rustfmt::skip]
52-
static {case}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];
53-
54-
#[rustfmt::skip]
55-
static {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];
56-
57-
#[inline]
58-
pub fn to_{case_lower}(c: char) -> [char; 3] {{
59-
const {{
60-
let mut i = 0;
61-
while i < {case_upper}CASE_TABLE.len() {{
62-
let (_, val) = {case_upper}CASE_TABLE[i];
63-
if val & (1 << 22) == 0 {{
64-
assert!(char::from_u32(val).is_some());
65-
}} else {{
66-
let index = val & ((1 << 22) - 1);
67-
assert!((index as usize) < {case_upper}CASE_TABLE_MULTI.len());
68-
}}
69-
i += 1;
70-
}}
71-
}}
72-
73-
// SAFETY: Just checked that the tables are valid
74-
unsafe {{
75-
super::case_conversion(
76-
c,
77-
|c| c.to_ascii_{case_lower}case(),
78-
{case_upper}CASE_TABLE,
79-
{case_upper}CASE_TABLE_MULTI,
80-
)
81-
}}
82-
}}",
83-
mappings = fmt_list(&mappings),
84-
mappings_len = mappings.len(),
85-
multis = fmt_list(&multis),
86-
multis_len = multis.len(),
87-
);
88-
89-
(file, size)
104+
pub fn to_upper(c: char) -> [char; 3] {
105+
if c.is_ascii() {
106+
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
107+
} else {
108+
UPPERCASE_TABLE
109+
.binary_search_by(|&(key, _)| key.cmp(&c))
110+
.map(|i| {
111+
let u = UPPERCASE_TABLE[i].1;
112+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
113+
// SAFETY: Index comes from statically generated table
114+
unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
115+
})
116+
})
117+
.unwrap_or([c, '\0', '\0'])
118+
}
90119
}
120+
";

0 commit comments

Comments
 (0)