Skip to content

Commit 5a57070

Browse files
committed
Add an is_public_assigned predicate.
Add an `is_public_assigned` predicate, which tests whether a given `char` is assigned (`General_Category` != `Unassigned`) in the currently supported version of Unicode and is not Private-Use (`General_Category` != `Private_Use`). This comes up in use cases that are sensitive to the stability of NFC across Unicode versions: an unassigned code point could become assigned in the future, and new normalizations could then apply to it. For further details, see <https://unicode.org/reports/tr15/#Versioning>.
1 parent 3dc2211 commit 5a57070

File tree

4 files changed

+814
-0
lines changed

4 files changed

+814
-0
lines changed

scripts/unicode.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ def _load_unicode_data(self):
9898
self.compat_decomp = {}
9999
self.canon_decomp = {}
100100
self.general_category_mark = []
101+
self.general_category_public_assigned = []
102+
103+
assigned_start = 0;
104+
prev_char_int = -1;
101105

102106
for line in self._fetch("UnicodeData.txt").splitlines():
103107
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
@@ -120,6 +124,15 @@ def _load_unicode_data(self):
120124
if category == 'M' or 'M' in expanded_categories.get(category, []):
121125
self.general_category_mark.append(char_int)
122126

127+
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
128+
if category not in ['Co', 'Cs']:
129+
if char_int != prev_char_int + 1:
130+
self.general_category_public_assigned.append((assigned_start, prev_char_int))
131+
assigned_start = char_int
132+
prev_char_int = char_int
133+
134+
self.general_category_public_assigned.append((assigned_start, prev_char_int))
135+
123136
def _load_cjk_compat_ideograph_variants(self):
124137
for line in self._fetch("StandardizedVariants.txt").splitlines():
125138
strip_comments = line.split('#', 1)[0].strip()
@@ -418,6 +431,30 @@ def gen_combining_mark(general_category_mark, out):
418431
gen_mph_data('combining_mark', general_category_mark, 'u32',
419432
lambda k: '0x{:04x}'.format(k))
420433

434+
def gen_public_assigned(general_category_public_assigned, out):
    """Write the Rust source of `is_public_assigned(c: char) -> bool` to `out`.

    `general_category_public_assigned` is an iterable of `(first, last)`
    code point pairs. Each pair becomes one alternative of a single `match`
    arm that evaluates to `true`: a lone character literal when
    `first == last`, otherwise an inclusive `first..=last` range pattern.
    Every other character falls through to the `_ => false` arm.
    """
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write(" match c {\n")

    for index, (lo, hi) in enumerate(general_category_public_assigned):
        # Only the alternatives after the first are prefixed with `|`.
        out.write(" " if index == 0 else " | ")
        if lo == hi:
            pattern = "'\\u{%s}'\n" % hexify(lo)
        else:
            pattern = "'\\u{%s}'..='\\u{%s}'\n" % (hexify(lo), hexify(hi))
        out.write(pattern)
    out.write(" => true,\n")

    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")
    out.write("\n")
457+
421458
def gen_stream_safe(leading, trailing, out):
422459
# This could be done as a hash but the table is very small.
423460
out.write("#[inline]\n")
@@ -540,6 +577,9 @@ def minimal_perfect_hash(d):
540577
gen_combining_mark(data.general_category_mark, out)
541578
out.write("\n")
542579

580+
gen_public_assigned(data.general_category_public_assigned, out)
581+
out.write("\n")
582+
543583
gen_nfc_qc(data.norm_props, out)
544584
out.write("\n")
545585

src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ pub mod char {
9090
};
9191

9292
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
93+
94+
/// Return whether the given character is assigned (`General_Category` != `Unassigned`)
95+
/// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
96+
/// of Unicode.
97+
pub use crate::tables::is_public_assigned;
9398
}
9499

95100
/// Methods for iterating over strings while applying Unicode normalizations

0 commit comments

Comments
 (0)