@@ -98,6 +98,10 @@ def _load_unicode_data(self):
98
98
self .compat_decomp = {}
99
99
self .canon_decomp = {}
100
100
self .general_category_mark = []
101
+ self .general_category_public_assigned = []
102
+
103
+ assigned_start = 0 ;
104
+ prev_char_int = - 1 ;
101
105
102
106
for line in self ._fetch ("UnicodeData.txt" ).splitlines ():
103
107
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
@@ -120,6 +124,15 @@ def _load_unicode_data(self):
120
124
if category == 'M' or 'M' in expanded_categories .get (category , []):
121
125
self .general_category_mark .append (char_int )
122
126
127
+ assert category != 'Cn' , "Unexpected: Unassigned codepoint in UnicodeData.txt"
128
+ if category not in ['Co' , 'Cs' ]:
129
+ if char_int != prev_char_int + 1 :
130
+ self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
131
+ assigned_start = char_int
132
+ prev_char_int = char_int
133
+
134
+ self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
135
+
123
136
def _load_cjk_compat_ideograph_variants (self ):
124
137
for line in self ._fetch ("StandardizedVariants.txt" ).splitlines ():
125
138
strip_comments = line .split ('#' , 1 )[0 ].strip ()
@@ -418,6 +431,30 @@ def gen_combining_mark(general_category_mark, out):
418
431
gen_mph_data ('combining_mark' , general_category_mark , 'u32' ,
419
432
lambda k : '0x{:04x}' .format (k ))
420
433
434
def gen_public_assigned(general_category_public_assigned, out):
    """Write the Rust function ``is_public_assigned(c: char) -> bool`` to ``out``.

    The generated function is a single ``match`` whose first arm lists every
    range in ``general_category_public_assigned`` (joined with ``|``) and
    yields ``true``; everything else falls through to ``_ => false``.

    Args:
        general_category_public_assigned: iterable of ``(first, last)`` code
            point pairs.  A pair with ``first == last`` is emitted as a single
            char pattern, any other pair as a ``first..=last`` range pattern.
        out: object with a ``write(str)`` method that receives the Rust text.

    NOTE(review): ``hexify`` is defined elsewhere in this file; it is presumed
    to return the hex digits of a code point suitable for a Rust ``\\u{...}``
    escape -- confirm against its definition.
    """
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write("    match c {\n")

    # One pattern per range, each on its own line of the generated source.
    patterns = []
    for first, last in general_category_public_assigned:
        if first == last:
            patterns.append("'\\u{%s}'\n" % hexify(first))
        else:
            patterns.append(
                "'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last))
            )

    # A match arm needs at least one pattern: with no ranges, skip the
    # `=> true` arm entirely so the generated Rust still compiles and the
    # function simply returns false for every char.
    if patterns:
        out.write("        ")
        out.write("        | ".join(patterns))
        out.write("        => true,\n")

    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")
457
+
421
458
def gen_stream_safe (leading , trailing , out ):
422
459
# This could be done as a hash but the table is very small.
423
460
out .write ("#[inline]\n " )
@@ -540,6 +577,9 @@ def minimal_perfect_hash(d):
540
577
gen_combining_mark (data .general_category_mark , out )
541
578
out .write ("\n " )
542
579
580
+ gen_public_assigned (data .general_category_public_assigned , out )
581
+ out .write ("\n " )
582
+
543
583
gen_nfc_qc (data .norm_props , out )
544
584
out .write ("\n " )
545
585
0 commit comments