Skip to content

Commit 1048576

Browse files
Chubercik and dalexeev committed
Automate generation of the char_range.inc file
Co-authored-by: Danil Alexeev <[email protected]>
1 parent 1b7b009 commit 1048576

File tree

2 files changed

+141
-1
lines changed

2 files changed

+141
-1
lines changed

core/string/char_range.inc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
2929
/**************************************************************************/
3030

31+
// This file was generated using the `misc/scripts/char_range_fetch.py` script.
32+
3133
#ifndef CHAR_RANGE_INC
3234
#define CHAR_RANGE_INC
3335

@@ -43,7 +45,7 @@ struct CharRange {
4345

4446
constexpr inline CharRange xid_start[] = {
4547
{ 0x41, 0x5a },
46-
{ 0x5f, 0x5f }, // Underscore technically isn't in XID_Start, but for our purposes it's included.
48+
{ 0x5f, 0x5f },
4749
{ 0x61, 0x7a },
4850
{ 0xaa, 0xaa },
4951
{ 0xb5, 0xb5 },

misc/scripts/char_range_fetch.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
#!/usr/bin/env python3
2+
3+
# Script used to dump char ranges for specific properties from
4+
# the Unicode Character Database to the `char_range.inc` file.
5+
# NOTE: This script is deliberately not integrated into the build system;
6+
# you should run it manually whenever you want to update the data.
7+
8+
import os
9+
import sys
10+
from typing import Final, List, Tuple
11+
from urllib.request import urlopen
12+
13+
if __name__ == "__main__":
    # When executed directly as a script, put the directory two levels above
    # this file on the module search path so the `methods` import below can
    # be resolved from there.
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))

from methods import generate_copyright_header
17+
18+
# Unicode Character Database file that is downloaded and parsed by
# `parse_unicode_data()`. The UCD version is part of the URL path.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"


# Inclusive (start, end) code point ranges, one list per emitted C++ array.
# They start empty and are filled in place by `parse_unicode_data()`.
# Property `XID_Start`, plus the underscore (U+005F).
xid_start: List[Tuple[int, int]] = []
# Property `XID_Continue`.
xid_continue: List[Tuple[int, int]] = []
# Property `Uppercase`.
uppercase_letter: List[Tuple[int, int]] = []
# Property `Lowercase`.
lowercase_letter: List[Tuple[int, int]] = []
# Property `Alphabetic`.
unicode_letter: List[Tuple[int, int]] = []
26+
27+
28+
def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
    """Coalesce inclusive ``(start, end)`` ranges in place.

    The list is sorted, then every run of ranges that overlap or are directly
    adjacent (the next start is at most one past the current end) is replaced
    by a single range spanning the whole run. For input that is already
    sorted and non-overlapping this only joins adjacent ranges.

    Args:
        ranges: Inclusive ranges to merge. The list is modified in place;
            nothing is returned.
    """
    if len(ranges) < 2:
        return

    # Sort a copy first so the result does not depend on the caller's order.
    pending: List[Tuple[int, int]] = sorted(ranges)
    ranges.clear()

    last_start, last_end = pending[0]
    for curr_start, curr_end in pending[1:]:
        if curr_start <= last_end + 1:
            # Overlapping or adjacent: extend the current run. `max` keeps a
            # range nested inside the run from shrinking it.
            last_end = max(last_end, curr_end)
        else:
            ranges.append((last_start, last_end))
            last_start, last_end = curr_start, curr_end

    # Flush the final run.
    ranges.append((last_start, last_end))
47+
48+
49+
def parse_unicode_data() -> None:
    """Download the file at ``URL`` and fill the module-level range lists.

    Each data line is expected to look like ``<code point or range> ;
    <property> # comment``, where a range is written ``XXXX..YYYY`` in
    hexadecimal. Ranges for the properties ``XID_Start``, ``XID_Continue``,
    ``Uppercase``, ``Lowercase`` and ``Alphabetic`` are collected into
    ``xid_start``, ``xid_continue``, ``uppercase_letter``,
    ``lowercase_letter`` and ``unicode_letter`` respectively; every other
    property is ignored. Afterwards each list is passed to ``merge_ranges``.

    Raises:
        ValueError: If a range of a collected property is not valid hex.
        Any error raised by ``urlopen`` is propagated unchanged.
    """
    targets = {
        "XID_Start": xid_start,
        "XID_Continue": xid_continue,
        "Uppercase": uppercase_letter,
        "Lowercase": lowercase_letter,
        "Alphabetic": unicode_letter,
    }

    # Start from empty lists so calling this function again does not append
    # a second copy of every range.
    for target in targets.values():
        target.clear()

    # Read everything inside `with` so the HTTP response is always closed.
    with urlopen(URL) as response:
        lines: List[str] = [raw_line.decode("utf-8") for raw_line in response]

    for line in lines:
        # Drop the trailing comment; comment-only and blank lines become empty.
        data: str = line.split("#", 1)[0].strip()
        if not data:
            continue

        fields: List[str] = data.split(";")
        if len(fields) < 2:
            # Not a `range ; property` record; nothing to collect.
            continue

        target = targets.get(fields[1].strip())
        if target is None:
            continue

        # A single code point is stored as a one-element range.
        range_start, _, range_end = fields[0].strip().partition("..")
        if not range_end:
            range_end = range_start

        target.append((int(range_start, 16), int(range_end, 16)))

    # Underscore technically isn't in XID_Start, but for our purposes it's included.
    xid_start.append((0x005F, 0x005F))
    # Re-sort because the underscore was appended out of order.
    xid_start.sort(key=lambda x: x[0])

    for target in targets.values():
        merge_ranges(target)
88+
89+
90+
def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
    """Render ``range_list`` as the source text of a C++ ``CharRange`` array.

    Args:
        array_name: Identifier used for the generated array.
        range_list: ``(start, end)`` pairs, emitted in order as lowercase
            hexadecimal initializers, one per tab-indented line.

    Returns:
        The array definition followed by one blank line.
    """
    opening: str = f"constexpr inline CharRange {array_name}[] = {{\n"
    rows: List[str] = [f"\t{{ 0x{start:x}, 0x{end:x} }},\n" for start, end in range_list]
    return opening + "".join(rows) + "};\n\n"
99+
100+
101+
def generate_char_range_inc() -> None:
    """Fetch the Unicode data and write ``core/string/char_range.inc``.

    The output is assembled from the copyright header, a fixed preamble
    (include guard, include, source URL and the ``CharRange`` struct), one
    array per range list, and the closing include guard. It is written
    relative to this script's directory with ``\\n`` line endings.
    """
    parse_unicode_data()

    pieces: List[str] = [generate_copyright_header("char_range.inc")]

    pieces.append(f"""
// This file was generated using the `misc/scripts/char_range_fetch.py` script.

#ifndef CHAR_RANGE_INC
#define CHAR_RANGE_INC

#include "core/typedefs.h"

// Unicode Derived Core Properties
// Source: {URL}

struct CharRange {{
\tchar32_t start;
\tchar32_t end;
}};\n\n""")

    # Arrays are emitted in this fixed order.
    arrays = (
        ("xid_start", xid_start),
        ("xid_continue", xid_continue),
        ("uppercase_letter", uppercase_letter),
        ("lowercase_letter", lowercase_letter),
        ("unicode_letter", unicode_letter),
    )
    for array_name, range_list in arrays:
        pieces.append(make_array(array_name, range_list))

    pieces.append("#endif // CHAR_RANGE_INC\n")

    script_dir: str = os.path.dirname(__file__)
    char_range_path: str = os.path.join(script_dir, "../../core/string/char_range.inc")
    with open(char_range_path, "w", newline="\n") as f:
        f.write("".join(pieces))

    print("`char_range.inc` generated successfully.")
135+
136+
137+
if __name__ == "__main__":
    # Regenerate the file only when this module is run as a script, not when
    # it is imported.
    generate_char_range_inc()

0 commit comments

Comments
 (0)