11import regex
22
33
def cpt_to_utf8_str(cpt):
    """Pack an integer code point into four little-endian bytes.

    Despite the name, the result is not UTF-8: it is the 32-bit value of
    ``cpt`` laid out least-significant byte first, i.e. one UTF-32-LE code
    unit when ``cpt`` is a valid Unicode scalar value.

    Args:
        cpt: Non-negative integer that fits in 32 bits.

    Returns:
        A ``bytes`` object of length 4.

    Raises:
        ValueError: If ``cpt`` is negative or does not fit in 32 bits.
    """
    if not 0 <= cpt <= 0xFFFFFFFF:
        raise ValueError("code point out of 32-bit range: {!r}".format(cpt))
    # One stdlib call replaces the per-width branches; the zero padding of
    # the high bytes falls out of the fixed length.
    return cpt.to_bytes(4, "little")
13-
14-
def is_match(codepoint, regex_expr):
    """Report whether the single character ``codepoint`` matches ``regex_expr``.

    Args:
        codepoint: Integer code point to test.
        regex_expr: Pattern handed to ``regex.match``.

    Returns:
        True if the pattern matches at the start of the one-character
        string, False if it does not or if ``codepoint`` cannot be decoded
        into a character.

    Raises:
        regex.error: If ``regex_expr`` is not a valid pattern. This is no
            longer swallowed, since a bad pattern would otherwise look like
            "nothing matches".
    """
    try:
        # Decode explicitly as little-endian, matching the byte layout built
        # by cpt_to_utf8_str. The bare 'utf-32' codec would consume the bytes
        # of U+FEFF as a byte-order mark (yielding an empty string) and would
        # otherwise assume the platform's native byte order.
        char = cpt_to_utf8_str(codepoint).decode('utf-32-le')
    except ValueError:
        # Covers UnicodeDecodeError (a ValueError subclass) for values that
        # are not decodable, as well as out-of-range integers.
        return False
    return regex.match(regex_expr, char) is not None
21-
22-
234def get_matches (regex_expr ):
5+ regex_expr_compiled = regex .compile (regex_expr )
246 unicode_ranges = []
257 current_range = None
268
279 for codepoint in range (0x110000 ):
28- if is_match (codepoint , regex_expr ):
10+ char = chr (codepoint )
11+ if regex_expr_compiled .match (char ):
2912 if current_range is None :
3013 current_range = [codepoint , codepoint ]
3114 else :
@@ -40,27 +23,42 @@ def get_matches(regex_expr):
4023 return unicode_ranges
4124
4225
def print_cat(mode, cat, ranges):
    """Print one C++ table definition to stdout.

    Args:
        mode: ``"range"`` emits
            ``std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_<cat>``;
            ``"map"`` emits ``std::map<uint32_t, uint32_t> unicode_map_<cat>``.
        cat: Suffix appended to the generated C++ variable name.
        ranges: Sized sequence of integer tuples. Each tuple becomes one
            brace-enclosed initializer with its values formatted as 8-digit
            upper-case hexadecimal.

    Raises:
        ValueError: If ``mode`` is neither ``"range"`` nor ``"map"``;
            printing the entries without a header would produce invalid C++.
    """
    headers = {
        "range": "const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{",
        "map": "const std::map<uint32_t, uint32_t> unicode_map_{} = {{",
    }
    if mode not in headers:
        raise ValueError("unknown mode: {!r}".format(mode))
    print(headers[mode].format(cat))  # noqa: NP100

    last = len(ranges) - 1
    for i, values in enumerate(ranges):
        # Four entries per output line; the final entry always ends its line
        # so the closing brace starts at column 0.
        end = ",\n" if (i % 4 == 3 or i == last) else ", "
        hex_values = ["0x%08X" % value for value in values]
        print("{" + ", ".join(hex_values) + "}", end=end)  # noqa: NP100
    print("};")  # noqa: NP100
    print("")  # noqa: NP100
5837
5938
# One unicode_ranges_<name> table per Unicode property class, emitted in this
# order. Each pattern is matched against every code point by get_matches.
for name, pattern in (
    ("number", r'\p{N}'),
    ("letter", r'\p{L}'),
    ("separator", r'\p{Z}'),
    ("accent_mark", r'\p{M}'),
    ("punctuation", r'\p{P}'),
    ("symbol", r'\p{S}'),
    ("control", r'\p{C}'),
):
    print_cat("range", name, get_matches(pattern))

# The whitespace table uses the regex \s class rather than the \p{Z}
# property, which has its own "separator" table above.
print_cat("range", "whitespace", get_matches(r'\s'))
48+
49+
# Build code point -> code point case-conversion tables and emit them as
# unicode_map_lowercase / unicode_map_uppercase.
map_lowercase = []
map_uppercase = []
for codepoint in range(0x110000):
    char = chr(codepoint)
    # str.lower()/str.upper() can return more than one character; only the
    # first is kept so every entry maps one code point to one code point.
    lower = ord(char.lower()[0])
    upper = ord(char.upper()[0])
    # Identity mappings are omitted to keep the tables small.
    if codepoint != lower:
        map_lowercase.append((codepoint, lower))
    if codepoint != upper:
        map_uppercase.append((codepoint, upper))
print_cat("map", "lowercase", map_lowercase)
print_cat("map", "uppercase", map_uppercase)
62+
63+
# TODO: generate unicode_map_nfd
0 commit comments