Skip to content

Commit 3a04e01

Browse files
authored
[libc][wctype][codegen] Add generation script for conversion data (#170868)
Closes #170871
1 parent 7d21334 commit 3a04e01

File tree

7 files changed

+918
-0
lines changed

7 files changed

+918
-0
lines changed

libc/src/__support/wctype/lower_to_upper.inc

Lines changed: 400 additions & 0 deletions
Large diffs are not rendered by default.

libc/src/__support/wctype/upper_to_lower.inc

Lines changed: 390 additions & 0 deletions
Large diffs are not rendered by default.

libc/utils/wctype_utils/README.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
This folder contains Python 3 utility scripts for wide characters, for
2+
generating the necessary data used by the internal implementation of ``wctype``
3+
utils. These scripts are meant to be run manually by the maintainers when the
4+
data needs to be updated or a new version of the Unicode data is released. The
5+
generated data and files are then checked into the repository by the maintainers
6+
and built with the internal helper utils found in ``libc/src/__support/wctype``.
7+
Manual modification of the generated files is prohibited.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# ===- Generate conversion data for wctype utils -------*- python -*--==#
2+
#
3+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
# ==------------------------------------------------------------------------==#
8+
9+
10+
def extract_maps_from_unicode_file(
    file_path: str,
) -> tuple[dict[int, int], dict[int, int]]:
    """Extract lower-to-upper and upper-to-lower case mappings.

    The file is read as ``;``-separated records, one per line. Field 0 is the
    code point in hexadecimal and field 2 is the classification. For records
    classified ``Lu`` a non-empty field 13 is recorded as the lowercase
    target; for records classified ``Ll`` a non-empty field 12 is recorded as
    the uppercase target. Records with any other classification are ignored.
    Blank lines and lines starting with ``#`` are skipped.

    Args:
        file_path: Path to the Unicode data file to parse.

    Returns:
        A ``(lower_to_upper, upper_to_lower)`` tuple of dictionaries mapping
        a source code point to its converted code point.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a record has fewer than three fields or a code point
            is not valid hexadecimal.
        IndexError: If an ``Lu``/``Ll`` record lacks the mapping fields.
    """
    lower_to_upper: dict[int, int] = {}
    upper_to_lower: dict[int, int] = {}

    # Explicit encoding keeps parsing independent of the host locale.
    with open(file_path, encoding="utf-8") as file:
        # Iterate lazily instead of materializing every line up front.
        for line in file:
            record = line.strip()
            if not record or record.startswith("#"):
                continue

            fields = record.split(";")
            code_point_hex, _name, classification = fields[:3]
            code_point = int(code_point_hex, 16)

            if classification == "Lu":
                if fields[13]:
                    upper_to_lower[code_point] = int(fields[13], 16)
            elif classification == "Ll":
                if fields[12]:
                    lower_to_upper[code_point] = int(fields[12], 16)

    return (lower_to_upper, upper_to_lower)
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# ===- Hex numbers writer for wctype include files -----------*- python -*--==#
2+
#
3+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
# ==------------------------------------------------------------------------==#
8+
9+
10+
def write_hex_conversions(file_path: str, mappings: dict[int, int]) -> None:
    """Write the given mapping as hex number pairs to the given file path.

    The output starts with a license/banner comment header whose title is
    derived from the file name (text before the first ``.``, with ``_``
    replaced by spaces), followed by one ``{0xFROM, 0xTO}`` line per mapping
    entry in the dictionary's iteration order. Every entry line except the
    last ends with a comma. An empty mapping produces the header only.

    Args:
        file_path: Destination file; created or truncated.
        mappings: Source code point to target code point.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    # Local import so this block does not depend on file-level imports.
    import os

    # basename() understands the platform's separators, so a Windows-style
    # path does not leak directory components into the generated title.
    file_name = os.path.basename(file_path)
    title = file_name.split(".")[0].replace("_", " ")

    header_lines = (
        f"//===-- Auto-generated {title} case mappings table -------*- C++ -*-===//",
        "//",
        "// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.",
        "// See https://llvm.org/LICENSE.txt for license information.",
        "// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception",
        "//",
        "//===----------------------------------------------------------------------===//",
        "// DO NOT EDIT MANUALLY.",
        "// This file is generated by libc/utils/wctype_utils scripts.",
        "// This file is meant to be included directly into LLVM libc code",
        "// Format: {from_codepoint, to_codepoint}",
        f"// Info: {len(mappings)} entries",
        "",
    )
    header = "\n".join(header_lines) + "\n"

    entries = [
        f"{{0x{source:X}, 0x{target:X}}}" for source, target in mappings.items()
    ]

    # Fixed encoding and newline make the generated artifact byte-identical
    # regardless of the platform the script is run on.
    with open(file_path, "w", encoding="utf-8", newline="\n") as file:
        file.write(header)
        if entries:
            # Comma after every entry except the last one.
            file.write(",\n".join(entries) + "\n")

libc/utils/wctype_utils/gen.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/usr/bin/env python3
2+
#
3+
# ===- Run wctype generator ----------------------------------*- python -*--==#
4+
#
5+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6+
# See https://llvm.org/LICENSE.txt for license information.
7+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
#
9+
# ==------------------------------------------------------------------------==#
10+
11+
from conversion.gen_conversion_data import extract_maps_from_unicode_file
12+
from conversion.hex_writer import write_hex_conversions
13+
from sys import argv
14+
from sys import exit
15+
16+
17+
def write_wctype_conversion_data(
    llvm_project_root_path: str, unicode_data_folder_path: str
) -> None:
    """Generate the wctype case-conversion tables and write them to disk.

    Reads ``UnicodeData.txt`` from ``unicode_data_folder_path``, extracts the
    two case mappings, and writes ``lower_to_upper.inc`` followed by
    ``upper_to_lower.inc`` under ``libc/src/__support/wctype`` inside
    ``llvm_project_root_path``.
    """
    unicode_data_file = f"{unicode_data_folder_path}/UnicodeData.txt"
    output_folder = f"{llvm_project_root_path}/libc/src/__support/wctype"

    lower_to_upper, upper_to_lower = extract_maps_from_unicode_file(
        unicode_data_file
    )

    # One output file per table; the file stem doubles as the table name.
    tables = (
        ("lower_to_upper", lower_to_upper),
        ("upper_to_lower", upper_to_lower),
    )
    for stem, table in tables:
        write_hex_conversions(
            file_path=f"{output_folder}/{stem}.inc",
            mappings=table,
        )
32+
33+
34+
def main() -> None:
    """Command-line entry point for the wctype data generator.

    Expects exactly two command-line arguments: the LLVM project root and the
    folder containing the Unicode data files. With any other argument count,
    prints usage information and exits with status 1.
    """
    if len(argv) == 3:
        project_root, unicode_folder = argv[1], argv[2]
        write_wctype_conversion_data(
            llvm_project_root_path=project_root,
            unicode_data_folder_path=unicode_folder,
        )
        print(
            f"wctype conversion data is written to {project_root}/libc/src/__support/wctype/"
        )
        return

    # Wrong argument count: explain how to invoke the script, then fail.
    print("Codegen: wctype data generator script")
    print(
        f"Usage:\n\t{argv[0]} <path-to-llvm-project-root> <path-to-unicode-data-folder>"
    )
    print(
        "INFO: You can download Unicode data files from https://www.unicode.org/Public/UCD/latest/ucd/"
    )
    exit(1)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)