Skip to content

Commit eb6da50

Browse files
Move from deprecated codecvt to ICU for Unicode handling on non-Windows platforms (#210)
* Move from codecvt to ICU on non-Windows platforms. * Update setup.py to point to libicu instead of codecvt * Update documentation * Fix CI jobs to pull in libicu as needed on MacOS and Linux * Use clang-format from llvm repo Co-authored-by: Emilio López
1 parent 31ac596 commit eb6da50

File tree

6 files changed

+108
-23
lines changed

6 files changed

+108
-23
lines changed

.github/workflows/ci.yml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ on:
44
push:
55
branches:
66
- master
7+
- libicu-test
78
pull_request:
89
schedule:
910
# run CI every day even if no PRs/merges occur
@@ -17,8 +18,10 @@ jobs:
1718

1819
- name: deps
1920
run: |
21+
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
22+
sudo add-apt-repository -y "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-18 main"
2023
sudo apt-get update
21-
sudo apt-get install -y clang-format
24+
sudo apt-get install -y clang-format-18 libicu-dev
2225
2326
- name: lint
2427
run: |
@@ -44,6 +47,19 @@ jobs:
4447
- uses: actions/checkout@v4
4548
with:
4649
submodules: 'true'
50+
- name: Install ICU (Ubuntu)
51+
if: matrix.platform == 'ubuntu-latest'
52+
run: |
53+
sudo apt-get update
54+
sudo apt-get install -y libicu-dev
55+
- name: Install ICU (macOS)
56+
if: matrix.platform == 'macos-latest'
57+
run: |
58+
brew install icu4c
59+
- name: Set ICU_ROOT for macOS
60+
if: matrix.platform == 'macos-latest'
61+
run: |
62+
echo "ICU_ROOT=$(brew --prefix icu4c)" >> $GITHUB_ENV
4763
- name: Enable ASan+UBSan Sanitizers
4864
if: matrix.build-type == 'Debug'
4965
run: |
@@ -85,6 +101,16 @@ jobs:
85101
- uses: actions/setup-python@v5
86102
with:
87103
python-version: ${{ matrix.python }}
104+
- name: Install ICU (Ubuntu)
105+
if: matrix.platform == 'ubuntu-latest'
106+
run: |
107+
sudo apt-get update
108+
sudo apt-get install -y libicu-dev
109+
- name: Install ICU (macOS)
110+
if: matrix.platform == 'macos-latest'
111+
run: |
112+
brew install icu4c
113+
echo "ICU_ROOT=$(brew --prefix icu4c)" >> $GITHUB_ENV
88114
- name: build
89115
run: |
90116
python -m pip install build wheel setuptools

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,16 @@ More information about `pepy` can be found in its [README](./pepy/README.md).
5252

5353
## Dependencies
5454

55+
### ICU
56+
57+
ICU is required on non-Windows platforms (Linux, macOS). **You do not need ICU on Windows.**
58+
59+
- ICU library (International Components for Unicode)
60+
- Debian/Ubuntu: `sudo apt-get install libicu-dev`
61+
- RedHat/Fedora: `sudo dnf install libicu-devel`
62+
- macOS: `brew install icu4c`
63+
- vcpkg: `vcpkg install icu`
64+
5565
### CMake
5666
* Debian/Ubuntu: `sudo apt-get install cmake`
5767
* RedHat/Fedora: `sudo yum install cmake`
@@ -91,6 +101,17 @@ cmake -G "Visual Studio 16 2019" -A Win64 ..
91101
cmake --build . --config Release
92102
```
93103

104+
### MacOS-specific
105+
106+
When ICU is installed via brew it will not be in the usual library path, so you will need to set the `ICU_ROOT` environment variable to the location where ICU is installed when running cmake.
107+
108+
The path is usually `/opt/homebrew/opt/icu4c@version`, where `version` is the version number of ICU you have installed.
109+
110+
```
111+
ICU_ROOT=/opt/homebrew/opt/icu4c@123 cmake -DCMAKE_BUILD_TYPE=Release ..
112+
ICU_ROOT=/opt/homebrew/opt/icu4c@123 cmake --build .
113+
```
114+
94115
## Testing
95116

96117
You can build the (catch2-based) tests by adding `-DPEPARSE_ENABLE_TESTING=ON` during CMake configuration. Build, and then run with `ctest` or `cmake --build . --target test`.

pe-parser-library/CMakeLists.txt

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,13 @@ list(APPEND PEPARSERLIB_SOURCEFILES
1414
)
1515

1616
# NOTE(ww): On Windows we use the Win32 API's built-in UTF16 conversion
17-
# routines; on other platforms we use codecvt. codecvt is nominally deprecated
18-
# in C++17 and onwards, but will probably be available for quite some time.
19-
# Previous versions of pe-parse used ICU when available, but this caused
20-
# DLL hell on Windows and wasn't worth the additional dependency.
17+
# routines; on other platforms we use ICU (International Components for Unicode).
18+
# Previous versions used codecvt, which was deprecated in C++17.
2119
if(MSVC)
2220
list(APPEND PEPARSERLIB_SOURCEFILES src/unicode_winapi.cpp)
2321
else()
24-
list(APPEND PEPARSERLIB_SOURCEFILES src/unicode_codecvt.cpp)
22+
find_package(ICU COMPONENTS uc REQUIRED)
23+
list(APPEND PEPARSERLIB_SOURCEFILES src/unicode_libicu.cpp)
2524
endif()
2625

2726
add_library(${PROJECT_NAME} ${PEPARSERLIB_SOURCEFILES})
@@ -39,6 +38,11 @@ target_include_directories(
3938
)
4039
target_compile_options(${PROJECT_NAME} PRIVATE ${GLOBAL_CXXFLAGS})
4140

41+
# Link ICU on non-Windows platforms
42+
if(NOT MSVC)
43+
target_link_libraries(${PROJECT_NAME} PRIVATE ICU::uc)
44+
endif()
45+
4246
install(
4347
TARGETS ${PROJECT_NAME}
4448
EXPORT pe-parse-config
Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,37 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2222
THE SOFTWARE.
2323
*/
2424

25-
#include <codecvt>
26-
#include <locale>
2725
#include <pe-parse/to_string.h>
26+
#include <unicode/ustring.h>
27+
#include <unicode/utypes.h>
2828

2929
namespace peparse {
30-
// See
31-
// https://stackoverflow.com/questions/38688417/utf-conversion-functions-in-c11
3230
std::string from_utf16(const UCharString &u) {
33-
std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> convert;
34-
return convert.to_bytes(u);
31+
if (u.empty()) {
32+
return std::string();
33+
}
34+
35+
const UChar *src = reinterpret_cast<const UChar *>(u.data());
36+
int32_t srcLength = static_cast<int32_t>(u.size());
37+
38+
// First pass: determine required buffer size
39+
UErrorCode status = U_ZERO_ERROR;
40+
int32_t destLength = 0;
41+
u_strToUTF8(nullptr, 0, &destLength, src, srcLength, &status);
42+
43+
if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
44+
return std::string(); // Return empty on error (matches current behavior)
45+
}
46+
47+
// Second pass: perform actual conversion
48+
std::string result(static_cast<std::size_t>(destLength), '\0');
49+
status = U_ZERO_ERROR;
50+
u_strToUTF8(&result[0], destLength, nullptr, src, srcLength, &status);
51+
52+
if (U_FAILURE(status)) {
53+
return std::string();
54+
}
55+
56+
return result;
3557
}
3658
} // namespace peparse

pepy/pepy.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,20 @@ using namespace peparse;
4242
return ((pepy_##OBJ *) self)->ATTR; \
4343
}
4444

45-
#define OBJECTGETTER(OBJ, ATTR, DOC) \
46-
{ \
47-
(char *) #ATTR, (getter) pepy_##OBJ##_get_##ATTR, \
48-
(setter) pepy_attr_not_writable, (char *) #DOC, NULL \
49-
}
45+
#define OBJECTGETTER(OBJ, ATTR, DOC) \
46+
{(char *) #ATTR, \
47+
(getter) pepy_##OBJ##_get_##ATTR, \
48+
(setter) pepy_attr_not_writable, \
49+
(char *) #DOC, \
50+
NULL}
5051

5152
/* 'OPTIONAL' references the fact that these are from the Optional Header */
52-
#define OBJECTGETTER_OPTIONAL(ATTR, DOC) \
53-
{ \
54-
(char *) #ATTR, (getter) pepy_parsed_get_optional_##ATTR, \
55-
(setter) pepy_attr_not_writable, (char *) #DOC, NULL \
56-
}
53+
#define OBJECTGETTER_OPTIONAL(ATTR, DOC) \
54+
{(char *) #ATTR, \
55+
(getter) pepy_parsed_get_optional_##ATTR, \
56+
(setter) pepy_attr_not_writable, \
57+
(char *) #DOC, \
58+
NULL}
5759

5860
static PyObject *pepy_error;
5961

setup.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747

4848
INCLUDE_DIRS = []
4949
LIBRARY_DIRS = []
50+
LIBRARIES = []
5051

5152
if platform.system() == "Windows":
5253
SOURCE_FILES.append(
@@ -64,7 +65,7 @@
6465
COMPILE_ARGS = ["/EHsc"]
6566
else:
6667
SOURCE_FILES.append(
67-
os.path.join(here, "pe-parser-library", "src", "unicode_codecvt.cpp")
68+
os.path.join(here, "pe-parser-library", "src", "unicode_libicu.cpp")
6869
)
6970
INCLUDE_DIRS += [
7071
"/usr/local/include",
@@ -73,8 +74,16 @@
7374
os.path.join(here, "pe-parser-library", "include"),
7475
]
7576
LIBRARY_DIRS += ["/usr/lib", "/usr/local/lib"]
77+
LIBRARIES += ["icuuc"]
7678
COMPILE_ARGS = ["-std=c++17"]
7779

80+
# Add Homebrew ICU paths on macOS if ICU_ROOT is set
81+
if platform.system() == "Darwin":
82+
icu_root = os.environ.get("ICU_ROOT")
83+
if icu_root:
84+
INCLUDE_DIRS.insert(0, os.path.join(icu_root, "include"))
85+
LIBRARY_DIRS.insert(0, os.path.join(icu_root, "lib"))
86+
7887
extension_mod = Extension(
7988
"pepy",
8089
define_macros=[("PEPARSE_VERSION", f'"{VERSION}"')],
@@ -83,6 +92,7 @@
8392
language="c++",
8493
include_dirs=INCLUDE_DIRS,
8594
library_dirs=LIBRARY_DIRS,
95+
libraries=LIBRARIES,
8696
)
8797

8898
setup(

0 commit comments

Comments
 (0)