Skip to content

Commit 0186d4c

Browse files
authored
Rollup merge of rust-lang#147622 - Kmeakin:km/unicode-data/refactors, r=joboet
`unicode_data` refactors Minor refactors to `unicode_data` that occured to me while trying to reduce the size of the tables. Splitting into a separate PR. NFC
2 parents cb183ab + b3fcc06 commit 0186d4c

File tree

6 files changed

+4511
-1216
lines changed

6 files changed

+4511
-1216
lines changed

core/src/unicode/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
1818

1919
pub(crate) mod printable;
2020

21+
mod rt;
2122
#[allow(unreachable_pub)]
22-
mod unicode_data;
23+
pub mod unicode_data;
2324

2425
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
2526
/// `char` and `str` methods are based on.

core/src/unicode/rt.rs

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
//! Runtime support for `unicode_data`.
2+
3+
#[inline(always)]
4+
pub(super) const fn bitset_search<
5+
const N: usize,
6+
const CHUNK_SIZE: usize,
7+
const N1: usize,
8+
const CANONICAL: usize,
9+
const CANONICALIZED: usize,
10+
>(
11+
needle: u32,
12+
chunk_idx_map: &[u8; N],
13+
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
14+
bitset_canonical: &[u64; CANONICAL],
15+
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
16+
) -> bool {
17+
let bucket_idx = (needle / 64) as usize;
18+
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
19+
let chunk_piece = bucket_idx % CHUNK_SIZE;
20+
// FIXME(const-hack): Revert to `slice::get` when slice indexing becomes possible in const.
21+
let chunk_idx = if chunk_map_idx < chunk_idx_map.len() {
22+
chunk_idx_map[chunk_map_idx]
23+
} else {
24+
return false;
25+
};
26+
let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize;
27+
// FIXME(const-hack): Revert to `slice::get` when slice indexing becomes possible in const.
28+
let word = if idx < bitset_canonical.len() {
29+
bitset_canonical[idx]
30+
} else {
31+
let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()];
32+
let mut word = bitset_canonical[real_idx as usize];
33+
let should_invert = mapping & (1 << 6) != 0;
34+
if should_invert {
35+
word = !word;
36+
}
37+
// Lower 6 bits
38+
let quantity = mapping & ((1 << 6) - 1);
39+
if mapping & (1 << 7) != 0 {
40+
// shift
41+
word >>= quantity as u64;
42+
} else {
43+
word = word.rotate_left(quantity as u32);
44+
}
45+
word
46+
};
47+
(word & (1 << (needle % 64) as u64)) != 0
48+
}
49+
50+
#[repr(transparent)]
51+
pub(super) struct ShortOffsetRunHeader(pub(super) u32);
52+
53+
impl ShortOffsetRunHeader {
54+
pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self {
55+
assert!(start_index < (1 << 11));
56+
assert!(prefix_sum < (1 << 21));
57+
58+
Self((start_index as u32) << 21 | prefix_sum)
59+
}
60+
61+
#[inline]
62+
pub(super) const fn start_index(&self) -> usize {
63+
(self.0 >> 21) as usize
64+
}
65+
66+
#[inline]
67+
pub(super) const fn prefix_sum(&self) -> u32 {
68+
self.0 & ((1 << 21) - 1)
69+
}
70+
}
71+
72+
/// # Safety
73+
///
74+
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
75+
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
76+
#[inline(always)]
77+
pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
78+
needle: char,
79+
short_offset_runs: &[ShortOffsetRunHeader; SOR],
80+
offsets: &[u8; OFFSETS],
81+
) -> bool {
82+
let needle = needle as u32;
83+
84+
let last_idx =
85+
match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header.0 << 11) {
86+
Ok(idx) => idx + 1,
87+
Err(idx) => idx,
88+
};
89+
// SAFETY: `last_idx` *cannot* be past the end of the array, as the last
90+
// element is greater than `std::char::MAX` (the largest possible needle)
91+
// as guaranteed by the caller.
92+
//
93+
// So, we cannot have found it (i.e. `Ok(idx) => idx + 1 != length`) and the
94+
// correct location cannot be past it, so `Err(idx) => idx != length` either.
95+
//
96+
// This means that we can avoid bounds checking for the accesses below, too.
97+
//
98+
// We need to use `intrinsics::assume` since the `panic_nounwind` contained
99+
// in `hint::assert_unchecked` may not be optimized out.
100+
unsafe { crate::intrinsics::assume(last_idx < SOR) };
101+
102+
let mut offset_idx = short_offset_runs[last_idx].start_index();
103+
let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
104+
(*next).start_index() - offset_idx
105+
} else {
106+
offsets.len() - offset_idx
107+
};
108+
109+
let prev =
110+
last_idx.checked_sub(1).map(|prev| short_offset_runs[prev].prefix_sum()).unwrap_or(0);
111+
112+
let total = needle - prev;
113+
let mut prefix_sum = 0;
114+
for _ in 0..(length - 1) {
115+
// SAFETY: It is guaranteed that `length <= OFFSETS - offset_idx`,
116+
// so it follows that `length - 1 + offset_idx < OFFSETS`, therefore
117+
// `offset_idx < OFFSETS` is always true in this loop.
118+
//
119+
// We need to use `intrinsics::assume` since the `panic_nounwind` contained
120+
// in `hint::assert_unchecked` may not be optimized out.
121+
unsafe { crate::intrinsics::assume(offset_idx < OFFSETS) };
122+
let offset = offsets[offset_idx];
123+
prefix_sum += offset as u32;
124+
if prefix_sum > total {
125+
break;
126+
}
127+
offset_idx += 1;
128+
}
129+
offset_idx % 2 == 1
130+
}
131+
132+
/// # Safety
133+
/// The second component of each tuple in `table` must either be:
134+
/// - A valid `char`
135+
/// - A value with the high bit (1 << 22) set, and the lower 22 bits
136+
/// being a valid index into `multi`.
137+
#[inline(always)]
138+
pub(super) unsafe fn case_conversion(
139+
c: char,
140+
ascii_fn: fn(char) -> char,
141+
table: &[(char, u32)],
142+
multi: &[[char; 3]],
143+
) -> [char; 3] {
144+
const INDEX_MASK: u32 = 1 << 22;
145+
146+
if c.is_ascii() {
147+
return [ascii_fn(c), '\0', '\0'];
148+
}
149+
150+
let Ok(i) = table.binary_search_by(|&(key, _)| key.cmp(&c)) else {
151+
return [c, '\0', '\0'];
152+
};
153+
154+
let u = table[i].1;
155+
match char::from_u32(u) {
156+
Option::Some(c) => [c, '\0', '\0'],
157+
Option::None => {
158+
// SAFETY: Index comes from statically generated table
159+
unsafe { *multi.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
160+
}
161+
}
162+
}

0 commit comments

Comments
 (0)