Skip to content

Commit 94b9325

Browse files
authored
Merge pull request #78 from sunfishcode/main
Add an `is_public_assigned` predicate.
2 parents 3dc2211 + 5a57070 commit 94b9325

File tree

4 files changed

+814
-0
lines changed

4 files changed

+814
-0
lines changed

scripts/unicode.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ def _load_unicode_data(self):
9898
self.compat_decomp = {}
9999
self.canon_decomp = {}
100100
self.general_category_mark = []
101+
self.general_category_public_assigned = []
102+
103+
assigned_start = 0;
104+
prev_char_int = -1;
101105

102106
for line in self._fetch("UnicodeData.txt").splitlines():
103107
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
@@ -120,6 +124,15 @@ def _load_unicode_data(self):
120124
if category == 'M' or 'M' in expanded_categories.get(category, []):
121125
self.general_category_mark.append(char_int)
122126

127+
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
128+
if category not in ['Co', 'Cs']:
129+
if char_int != prev_char_int + 1:
130+
self.general_category_public_assigned.append((assigned_start, prev_char_int))
131+
assigned_start = char_int
132+
prev_char_int = char_int
133+
134+
self.general_category_public_assigned.append((assigned_start, prev_char_int))
135+
123136
def _load_cjk_compat_ideograph_variants(self):
124137
for line in self._fetch("StandardizedVariants.txt").splitlines():
125138
strip_comments = line.split('#', 1)[0].strip()
@@ -418,6 +431,30 @@ def gen_combining_mark(general_category_mark, out):
418431
gen_mph_data('combining_mark', general_category_mark, 'u32',
419432
lambda k: '0x{:04x}'.format(k))
420433

434+
def gen_public_assigned(general_category_public_assigned, out):
    """Write the Rust source of `is_public_assigned` to `out`.

    The generated function is a single `match` on the character whose first
    arm is an or-pattern built from the supplied ranges.

    general_category_public_assigned: iterable of `(first, last)` code point
        pairs. A pair with `first == last` is emitted as a single character
        pattern; any other pair is emitted as a `first..=last` range pattern.
    out: text sink exposing `write(str)`.

    NOTE(review): `hexify` is defined elsewhere in this script; it is
    presumably the helper that renders a code point as hex digits - confirm.
    """
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write(" match c {\n")

    wrote_pattern = False
    for first, last in general_category_public_assigned:
        # The first alternative is only indented; later ones are joined with `|`.
        out.write(" | " if wrote_pattern else " ")
        wrote_pattern = True
        if first == last:
            out.write("'\\u{%s}'\n" % hexify(first))
        else:
            out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))

    # A match arm with no pattern in front of `=>` does not parse as Rust, so
    # the `true` arm is only emitted when at least one pattern was written.
    if wrote_pattern:
        out.write(" => true,\n")

    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")
    out.write("\n")
457+
421458
def gen_stream_safe(leading, trailing, out):
422459
# This could be done as a hash but the table is very small.
423460
out.write("#[inline]\n")
@@ -540,6 +577,9 @@ def minimal_perfect_hash(d):
540577
gen_combining_mark(data.general_category_mark, out)
541578
out.write("\n")
542579

580+
gen_public_assigned(data.general_category_public_assigned, out)
581+
out.write("\n")
582+
543583
gen_nfc_qc(data.norm_props, out)
544584
out.write("\n")
545585

src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ pub mod char {
9090
};
9191

9292
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
93+
94+
/// Return whether the given character is assigned (`General_Category` != `Unassigned`)
95+
/// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
96+
/// of Unicode.
97+
pub use crate::tables::is_public_assigned;
9398
}
9499

95100
/// Methods for iterating over strings while applying Unicode normalizations

0 commit comments

Comments
 (0)