diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index b57234bbee9a2..787efcc091454 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,4 +1,16 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! +// Alphabetic : 1727 bytes, 142759 codepoints in 757 ranges (U+000041 - U+0323B0) using skiplist +// Case_Ignorable : 1053 bytes, 2749 codepoints in 452 ranges (U+000027 - U+0E01F0) using skiplist +// Cased : 407 bytes, 4578 codepoints in 159 ranges (U+000041 - U+01F18A) using skiplist +// Cc : 9 bytes, 65 codepoints in 2 ranges (U+000000 - U+0000A0) using skiplist +// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist +// Lowercase : 935 bytes, 2569 codepoints in 675 ranges (U+000061 - U+01E944) using bitset +// N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist +// Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset +// White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading +// to_lower : 11484 bytes +// to_upper : 13432 bytes +// Total : 31446 bytes #[inline(always)] const fn bitset_search< @@ -772,7 +784,7 @@ pub mod conversions { } } - static LOWERCASE_TABLE: &[(char, u32)] = &[ + static LOWERCASE_TABLE: &[(char, u32); 1434] = &[ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), ('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238), @@ -1122,11 +1134,11 @@ pub mod conversions { ('\u{1e921}', 125251), ]; - static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static LOWERCASE_TABLE_MULTI: &[[char; 3]; 1] = &[ ['i', '\u{307}', '\u{0}'], ]; - static UPPERCASE_TABLE: &[(char, u32)] = &[ + static UPPERCASE_TABLE: &[(char, u32); 1526] = &[ ('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194), ('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199), ('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204), @@ -1499,7 +1511,7 @@ pub mod conversions { ('\u{1e941}', 125215), ('\u{1e942}', 125216), ('\u{1e943}', 125217), ]; - static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static UPPERCASE_TABLE_MULTI: &[[char; 3]; 102] = &[ ['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'], ['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'], ['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'], diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 9c6454492e7e0..49aef3ec33ec7 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -6,24 +6,26 @@ use crate::{UnicodeData, fmt_list}; const INDEX_MASK: u32 = 1 << 22; -pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String { +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) { let mut file = String::new(); write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap(); file.push_str("\n\n"); file.push_str(HEADER.trim_start()); file.push('\n'); - file.push_str(&generate_tables("LOWER", &data.to_lower)); + let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower); + file.push_str(&lower_tables); file.push_str("\n\n"); - file.push_str(&generate_tables("UPPER", &data.to_upper)); - file + let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper); + file.push_str(&upper_tables); + (file, [lower_size, upper_size]) } -fn generate_tables(case: &str, data: &BTreeMap) -> String { +fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { let mut mappings = Vec::with_capacity(data.len()); let mut multis = Vec::new(); - for (&key, &(a, b, c)) in data.iter() { + for (&key, &[a, b, c]) in data.iter() { let key = char::from_u32(key).unwrap(); if key.is_ascii() { @@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap) -> String } let mut tables = String::new(); - - write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings)) - .unwrap(); + let mut size = 0; + + size += size_of_val(mappings.as_slice()); + write!( + tables, + "static {}CASE_TABLE: &[(char, u32); {}] = &[{}];", + case, + mappings.len(), + fmt_list(mappings), + ) + .unwrap(); tables.push_str("\n\n"); - write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis)) - .unwrap(); - - tables + size += size_of_val(multis.as_slice()); + write!( + tables, + "static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];", + case, + multis.len(), + fmt_list(multis), + ) + .unwrap(); + + (tables, size) } struct CharEscape(char); diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 6cdb82a87bdff..1d70eebdbc6c2 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -72,6 +72,8 @@ //! or not. use std::collections::{BTreeMap, HashMap}; +use std::fmt; +use std::fmt::Write; use std::ops::Range; use ucd_parse::Codepoints; @@ -98,11 +100,11 @@ static PROPERTIES: &[&str] = &[ struct UnicodeData { ranges: Vec<(&'static str, Vec>)>, - to_upper: BTreeMap, - to_lower: BTreeMap, + to_upper: BTreeMap, + to_lower: BTreeMap, } -fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32, u32, u32)> { +fn to_mapping(origin: u32, codepoints: Vec) -> Option<[u32; 3]> { let mut a = None; let mut b = None; let mut c = None; @@ -123,7 +125,7 @@ fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32 } } - Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0))) + Some([a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)]) } static UNICODE_DIRECTORY: &str = "unicode-downloads"; @@ -163,12 +165,12 @@ fn load_data() -> UnicodeData { if let Some(mapped) = row.simple_lowercase_mapping && mapped != row.codepoint { - to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_lower.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } if let Some(mapped) = row.simple_uppercase_mapping && mapped != row.codepoint { - to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } } @@ -187,33 +189,19 @@ fn load_data() -> UnicodeData { } } - let mut properties: HashMap<&'static str, Vec>> = properties + let mut properties: Vec<(&'static str, Vec>)> = properties .into_iter() - .map(|(k, v)| { - ( - k, - v.into_iter() - .flat_map(|codepoints| match codepoints { - Codepoints::Single(c) => c - .scalar() - .map(|ch| ch as u32..ch as u32 + 1) - .into_iter() - .collect::>(), - Codepoints::Range(c) => c - .into_iter() - .flat_map(|c| c.scalar().map(|ch| ch as u32..ch as u32 + 1)) - .collect::>(), - }) - .collect::>>(), - ) + .map(|(prop, codepoints)| { + let codepoints = codepoints + .into_iter() + .flatten() + .flat_map(|cp| cp.scalar()) + .map(u32::from) + .collect::>(); + (prop, ranges_from_set(&codepoints)) }) .collect(); - for ranges in properties.values_mut() { - merge_ranges(ranges); - } - - let mut properties = properties.into_iter().collect::>(); properties.sort_by_key(|p| p.0); UnicodeData { ranges: properties, to_lower, to_upper } } @@ -236,9 +224,14 @@ fn main() { let ranges_by_property = &unicode_data.ranges; if let Some(path) = test_path { - std::fs::write(&path, generate_tests(&write_location, ranges_by_property)).unwrap(); + std::fs::write(&path, generate_tests(&unicode_data).unwrap()).unwrap(); } + let mut table_file = String::new(); + table_file.push_str( + "///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", + ); + let mut total_bytes = 0; let mut modules = Vec::new(); for (property, ranges) in ranges_by_property { @@ -252,8 +245,8 @@ fn main() { } modules.push((property.to_lowercase().to_string(), emitter.file)); - println!( - "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}", + table_file.push_str(&format!( + "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", property, emitter.bytes_used, datapoints, @@ -261,15 +254,15 @@ fn main() { ranges.first().unwrap().start, ranges.last().unwrap().end, emitter.desc, - ); + )); total_bytes += emitter.bytes_used; } - - let mut table_file = String::new(); - - table_file.push_str( - "///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", - ); + let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); + for (name, size) in ["to_lower", "to_upper"].iter().zip(sizes) { + table_file.push_str(&format!("// {:16}: {:5} bytes\n", name, size)); + total_bytes += size; + } + table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function table_file.push('\n'); @@ -280,7 +273,7 @@ fn main() { table_file.push('\n'); - modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data))); + modules.push((String::from("conversions"), conversions)); for (name, contents) in modules { table_file.push_str("#[rustfmt::skip]\n"); @@ -296,8 +289,6 @@ fn main() { } std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); - - println!("Total table sizes: {total_bytes} bytes"); } fn version() -> String { @@ -337,110 +328,96 @@ fn fmt_list(values: impl IntoIterator) -> String { out } -fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String { +fn generate_tests(data: &UnicodeData) -> Result { let mut s = String::new(); - s.push_str("#![allow(incomplete_features, unused)]\n"); - s.push_str("#![feature(const_generics)]\n\n"); - s.push_str("\n#[allow(unused)]\nuse std::hint;\n"); - s.push_str(&format!("#[path = \"{data_path}\"]\n")); - s.push_str("mod unicode_data;\n\n"); - - s.push_str("\nfn main() {\n"); - - for (property, ranges) in ranges { - s.push_str(&format!(r#" println!("Testing {property}");"#)); - s.push('\n'); - s.push_str(&format!(" {}_true();\n", property.to_lowercase())); - s.push_str(&format!(" {}_false();\n", property.to_lowercase())); - let mut is_true = Vec::new(); - let mut is_false = Vec::new(); - for ch_num in 0..(std::char::MAX as u32) { - if std::char::from_u32(ch_num).is_none() { - continue; - } - if ranges.iter().any(|r| r.contains(&ch_num)) { - is_true.push(ch_num); - } else { - is_false.push(ch_num); - } - } - - s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase())); - generate_asserts(&mut s, property, &is_true, true); - s.push_str(" }\n\n"); - s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase())); - generate_asserts(&mut s, property, &is_false, false); - s.push_str(" }\n\n"); + writeln!(s, "#![feature(core_intrinsics)]")?; + writeln!(s, "#![allow(internal_features, dead_code)]")?; + writeln!(s, "// ignore-tidy-filelength")?; + writeln!(s, "use std::intrinsics;")?; + writeln!(s, "mod unicode_data;")?; + writeln!(s, "fn main() {{")?; + for (property, ranges) in &data.ranges { + let prop = property.to_lowercase(); + writeln!(s, r#" println!("Testing {prop}");"#)?; + writeln!(s, " {prop}_true();")?; + writeln!(s, " {prop}_false();")?; + let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .partition(|c| ranges.iter().any(|r| r.contains(c))); + + writeln!(s, " fn {prop}_true() {{")?; + generate_asserts(&mut s, &prop, &is_true, true)?; + writeln!(s, " }}")?; + + writeln!(s, " fn {prop}_false() {{")?; + generate_asserts(&mut s, &prop, &is_false, false)?; + writeln!(s, " }}")?; } - s.push('}'); - s -} - -fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) { - for range in ranges_from_set(points) { - if range.end == range.start + 1 { - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n", - if truthy { "" } else { "!" }, - property.to_lowercase(), - std::char::from_u32(range.start).unwrap(), - range.start, - )); - } else { - s.push_str(&format!(" for chn in {range:?}u32 {{\n")); - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", - if truthy { "" } else { "!" }, - property.to_lowercase(), - )); - s.push_str(" }\n"); + for (name, conversion) in ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper]) + { + writeln!(s, r#" println!("Testing {name}");"#)?; + for (c, mapping) in conversion { + let c = char::from_u32(*c).unwrap(); + let mapping = mapping.map(|c| char::from_u32(c).unwrap()); + writeln!( + s, + r#" assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});"# + )?; + } + let unmapped: Vec<_> = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .filter(|c| !conversion.contains_key(c)) + .collect(); + let unmapped_ranges = ranges_from_set(&unmapped); + for range in unmapped_ranges { + let start = char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!( + s, + r#" assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);"# + )?; + + writeln!(s, " }}")?; } } -} -fn ranges_from_set(set: &[u32]) -> Vec> { - let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::>>(); - merge_ranges(&mut ranges); - ranges + writeln!(s, "}}")?; + Ok(s) } -fn merge_ranges(ranges: &mut Vec>) { - loop { - let mut new_ranges = Vec::new(); - let mut idx_iter = 0..(ranges.len() - 1); - let mut should_insert_last = true; - while let Some(idx) = idx_iter.next() { - let cur = ranges[idx].clone(); - let next = ranges[idx + 1].clone(); - if cur.end == next.start { - if idx_iter.next().is_none() { - // We're merging the last element - should_insert_last = false; - } - new_ranges.push(cur.start..next.end); - } else { - // We're *not* merging the last element - should_insert_last = true; - new_ranges.push(cur); +fn generate_asserts( + s: &mut String, + prop: &str, + points: &[u32], + truthy: bool, +) -> Result<(), fmt::Error> { + let truthy = if truthy { "" } else { "!" }; + for range in ranges_from_set(points) { + let start = char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + match range.len() { + 1 => writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup({start:?}));")?, + _ => { + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup(c));")?; + writeln!(s, " }}")?; } } - if should_insert_last { - new_ranges.push(ranges.last().unwrap().clone()); - } - if new_ranges.len() == ranges.len() { - *ranges = new_ranges; - break; - } else { - *ranges = new_ranges; - } } + Ok(()) +} - let mut last_end = None; - for range in ranges { - if let Some(last) = last_end { - assert!(range.start > last, "{range:?}"); - } - last_end = Some(range.end); - } +/// Group the elements of `set` into contigous ranges +fn ranges_from_set(set: &[u32]) -> Vec> { + set.chunk_by(|a, b| a + 1 == *b) + .map(|chunk| { + let start = *chunk.first().unwrap(); + let end = *chunk.last().unwrap(); + start..(end + 1) + }) + .collect() }