[libc][wctype][codegen] Add generation script for conversion data (#170868)

bassiounix · web-flow · commit 3a04e01f347e · 2025-12-12T16:47:51.000+02:00
Closes #170871
diff --git a/libc/src/__support/wctype/lower_to_upper.inc b/libc/src/__support/wctype/lower_to_upper.inc
diff --git a/libc/src/__support/wctype/upper_to_lower.inc b/libc/src/__support/wctype/upper_to_lower.inc
diff --git a/libc/utils/wctype_utils/README.rst b/libc/utils/wctype_utils/README.rst
@@ -0,0 +1,7 @@
+This folder contains Python 3 utility scripts that generate the wide-character
+data used by the internal implementation of the ``wctype`` utils. These scripts
+are meant to be run manually by the maintainers when the data needs to be
+updated or when a new version of the Unicode data is released. The generated
+data and files are then checked into the repository by the maintainers and
+built with the internal helper utils found in ``libc/src/__support/wctype``.
+Manual modification of the generated files is prohibited.
diff --git a/libc/utils/wctype_utils/conversion/__init__.py b/libc/utils/wctype_utils/conversion/__init__.py
@@ -0,0 +1,3 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libc/utils/wctype_utils/conversion/gen_conversion_data.py b/libc/utils/wctype_utils/conversion/gen_conversion_data.py
@@ -0,0 +1,31 @@
+# ===- Generate conversion data for wctype utils -------*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==------------------------------------------------------------------------==#
+
+
+def extract_maps_from_unicode_file(
+    file_path: str,
+) -> tuple[dict[int, int], dict[int, int]]:
+    """Extracts lower-to-upper and upper-to-lower case mappings.
+
+    Parses a semicolon-separated file laid out like ``UnicodeData.txt``:
+    field 0 is the hexadecimal code point, field 2 the general category, and
+    fields 12 and 13 the simple uppercase and simple lowercase mappings.
+    Only records whose category is ``Ll`` or ``Lu`` are considered, and a
+    record is left out when its relevant mapping field is empty.
+
+    Args:
+        file_path: Path to the ``UnicodeData.txt`` file to parse.
+
+    Returns:
+        A ``(lower_to_upper, upper_to_lower)`` tuple of dictionaries, each
+        mapping a source code point to its converted code point.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        ValueError: If a record has fewer than three fields, or a code point
+            or mapping field is not a hexadecimal number.
+        IndexError: If an ``Lu``/``Ll`` record lacks the mapping field.
+    """
+    uppercase_field = 12
+    lowercase_field = 13
+    lower_to_upper: dict[int, int] = {}
+    upper_to_lower: dict[int, int] = {}
+
+    # Pin the encoding so parsing does not depend on the locale of the machine
+    # running the generator, and stream the file instead of loading it whole.
+    with open(file_path, encoding="utf-8") as file:
+        for line in file:
+            # Tolerate blank lines (e.g. extra newlines at the end of the file)
+            if not line.strip():
+                continue
+
+            line_entries = line.split(";")
+            code_point, _name, classification = line_entries[:3]
+            code_point = int(code_point, 16)
+
+            if classification == "Lu":
+                # Uppercase letter: record its lowercase counterpart, if any
+                if line_entries[lowercase_field]:
+                    upper_to_lower[code_point] = int(
+                        line_entries[lowercase_field], 16
+                    )
+            elif classification == "Ll":
+                # Lowercase letter: record its uppercase counterpart, if any
+                if line_entries[uppercase_field]:
+                    lower_to_upper[code_point] = int(
+                        line_entries[uppercase_field], 16
+                    )
+
+    return (lower_to_upper, upper_to_lower)
diff --git a/libc/utils/wctype_utils/conversion/hex_writer.py b/libc/utils/wctype_utils/conversion/hex_writer.py
@@ -0,0 +1,35 @@
+# ===- Hex numbers writer for wctype include files -----------*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==------------------------------------------------------------------------==#
+
+
+def write_hex_conversions(file_path: str, mappings: dict[int, int]) -> None:
+    """Writes the given mapping as hex numbers to the given file path.
+
+    The file starts with a generated comment header whose title is taken from
+    the file name (text before the first ``.``, underscores turned into
+    spaces). It is followed by one ``{0xFROM, 0xTO}`` entry per line, in the
+    iteration order of ``mappings``. Entries are separated by commas, and the
+    last entry has no trailing comma. Any existing file is overwritten.
+
+    Args:
+        file_path: Destination path of the generated include file.
+        mappings: Source code point to converted code point pairs to emit.
+
+    Raises:
+        OSError: If the destination file cannot be opened for writing.
+    """
+    from os.path import basename
+
+    # Use the platform-aware basename so the title is derived correctly
+    # whichever path separator the caller used.
+    title = basename(file_path).split(".")[0].replace("_", " ")
+    header = (
+        f"""//===-- Auto-generated {title} case mappings table -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+"""
+        + "// DO NOT EDIT MANUALLY.\n"
+        + "// This file is generated by libc/utils/wctype_utils scripts.\n"
+        + "// This file is meant to be included directly into LLVM libc code\n"
+        + "// Format: {from_codepoint, to_codepoint}\n"
+        + f"// Info: {len(mappings)} entries\n\n"
+    )
+
+    # Joining with ",\n" leaves the final entry without a trailing comma
+    entries = [
+        f"{{0x{source:X}, 0x{target:X}}}" for source, target in mappings.items()
+    ]
+    table = ",\n".join(entries)
+    if table:
+        table += "\n"
+
+    # Build the whole content first, then write it in a single call
+    with open(file_path, "w", encoding="utf-8") as file:
+        file.write(header + table)
diff --git a/libc/utils/wctype_utils/gen.py b/libc/utils/wctype_utils/gen.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+#
+# ===- Run wctype generator ----------------------------------*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==------------------------------------------------------------------------==#
+
+from conversion.gen_conversion_data import extract_maps_from_unicode_file
+from conversion.hex_writer import write_hex_conversions
+from sys import argv
+from sys import exit
+
+
+def write_wctype_conversion_data(
+    llvm_project_root_path: str, unicode_data_folder_path: str
+) -> None:
+    """Generates the wctype case-conversion tables and writes them to disk.
+
+    Reads ``UnicodeData.txt`` from the given Unicode data folder and writes
+    ``lower_to_upper.inc`` and ``upper_to_lower.inc`` under
+    ``libc/src/__support/wctype`` inside the given LLVM project root.
+
+    Args:
+        llvm_project_root_path: Path to the root of the llvm-project checkout.
+        unicode_data_folder_path: Folder that contains ``UnicodeData.txt``.
+    """
+    unicode_data_file = f"{unicode_data_folder_path}/UnicodeData.txt"
+    output_folder = f"{llvm_project_root_path}/libc/src/__support/wctype"
+
+    lower_to_upper, upper_to_lower = extract_maps_from_unicode_file(
+        unicode_data_file
+    )
+
+    # Each table goes to an include file named after it
+    tables = (
+        ("lower_to_upper", lower_to_upper),
+        ("upper_to_lower", upper_to_lower),
+    )
+    for table_name, table in tables:
+        write_hex_conversions(
+            file_path=f"{output_folder}/{table_name}.inc",
+            mappings=table,
+        )
+
+
+def main() -> None:
+    """Command-line entry point of the wctype data generator.
+
+    Expects exactly two arguments: the llvm-project root path and the Unicode
+    data folder path. With any other argument count it prints the usage text
+    and exits with status 1. Otherwise it generates the conversion data files
+    and reports where they were written.
+    """
+    if len(argv) == 3:
+        project_root, unicode_folder = argv[1], argv[2]
+        write_wctype_conversion_data(
+            llvm_project_root_path=project_root,
+            unicode_data_folder_path=unicode_folder,
+        )
+        print(
+            f"wctype conversion data is written to {project_root}/libc/src/__support/wctype/"
+        )
+        return
+
+    # Wrong number of arguments: explain how to run the script and bail out
+    help_lines = (
+        "Codegen: wctype data generator script",
+        f"Usage:\n\t{argv[0]} <path-to-llvm-project-root> <path-to-unicode-data-folder>",
+        "INFO: You can download Unicode data files from https://www.unicode.org/Public/UCD/latest/ucd/",
+    )
+    for help_line in help_lines:
+        print(help_line)
+    exit(1)
+
+
+# Run the generator only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
	`2`	`+# See https://llvm.org/LICENSE.txt for license information.`
	`3`	`+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`