diff --git a/.github/workflows/cibuildwheel.yaml b/.github/workflows/cibuildwheel.yaml new file mode 100644 index 000000000..d6246f050 --- /dev/null +++ b/.github/workflows/cibuildwheel.yaml @@ -0,0 +1,123 @@ +# SPDX-FileCopyrightText: 2025 geisserml +# SPDX-FileCopyrightText: 2025 wojiushixiaobai <296015668@qq.com> +# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +# NOTE: This workflow is currently written with a dynamic matrix. +# Another option would be to extract a reusable "build one" workflow and declare an individual job for each target here. + +name: Build with cibuildwheel +on: + workflow_dispatch: + inputs: + cibw_py_ver: + default: 'cp38' + type: string + linux_main: + default: true + type: boolean + linux_ibm: + default: true + type: boolean + linux_emulated: + default: false + type: boolean + linux_musl: + default: true + type: boolean + +permissions: {} + +jobs: + + prepare_matrix: + name: Determine build matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + + steps: + - name: Run python script that outputs the build matrix + id: set-matrix + shell: python + env: + LINUX_MAIN: ${{ inputs.linux_main && 1 || 0 }} + LINUX_IBM: ${{ inputs.linux_ibm && 1 || 0 }} + LINUX_EMULATED: ${{ inputs.linux_emulated && 1 || 0 }} + LINUX_MUSL: ${{ inputs.linux_musl && 1 || 0 }} + run: | + import os, sys, json + + LINUX_MAIN = bool(int( os.environ["LINUX_MAIN"] )) + LINUX_IBM = bool(int( os.environ["LINUX_IBM"] )) + LINUX_EMULATED = bool(int( os.environ["LINUX_EMULATED"] )) + LINUX_MUSL = bool(int( os.environ["LINUX_MUSL"] )) + + matrix = [] + images = ["manylinux"] + if LINUX_MUSL: + images.append("musllinux") + + def job(os, arch, image, emulated=False): + matrix.append(dict( + image=image, os=os, arch=arch, emulated=emulated + )) + + def linux_job(os, arch, emulated=False, images=images): + for image in images: + job(os, arch, image, emulated) + + if LINUX_MAIN: + linux_job("ubuntu-24.04", "x86_64") + linux_job("ubuntu-24.04-arm", "aarch64") + if LINUX_IBM: + # XXX will become native as soon as we get access to IBM's self-hosted runners + linux_job("ubuntu-24.04", "ppc64le", True) # False + linux_job("ubuntu-24.04", "s390x", True) # False + if LINUX_EMULATED: + linux_job("ubuntu-24.04", "loongarch64", True) + linux_job("ubuntu-24.04", "riscv64", True) + if LINUX_MUSL: + # pdfium-binaries don't currently build armv7l for musl (but they do for glibc) + linux_job("ubuntu-24.04", "armv7l", True, images=("musllinux", )) + + matrix_json = json.dumps(matrix) + print(matrix_json, file=sys.stderr) + with open(os.environ["GITHUB_OUTPUT"], 'a') as output_fh: + print(f"matrix={matrix_json}", file=output_fh) + + build_wheels: + name: Build ${{ matrix.arch }} ${{ matrix.image }} on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + needs: prepare_matrix + + strategy: + fail-fast: false + matrix: + include: ${{ fromJSON(needs.prepare_matrix.outputs.matrix) }} + + steps: + + - name: Check out the repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up QEMU + if: ${{ matrix.emulated }} + uses: docker/setup-qemu-action@v3 + + # Reminder: most configuration is in pyproject.toml so we can use TOML overrides + - name: Build wheels + uses: pypdfium2-team/cibuildwheel@v3.1.3 + env: + # Will be tagged as not Python-specific by our setup.py. inputs.cibw_py_ver only controls the version used at build time.
Could also use `*`, then cibuildwheel would build with the oldest supported version, and walk through the others but skip because a compatible wheel is around already. + CIBW_BUILD: "${{ inputs.cibw_py_ver }}-${{ matrix.image }}_${{ matrix.arch }}" + CIBW_ARCHS: ${{ matrix.arch }} + with: + output-dir: wheelhouse + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + path: ./wheelhouse/*.whl + name: cibw-${{ matrix.image }}-${{ matrix.arch }} diff --git a/.github/workflows/conda.yaml b/.github/workflows/conda.yaml index bb424f8f6..0d947705e 100644 --- a/.github/workflows/conda.yaml +++ b/.github/workflows/conda.yaml @@ -15,7 +15,7 @@ on: default: 'latest' type: string new_only: - # only with package == "raw", ignored otherwise (actually the default should be false in that case, but I don't know if GH supports dynamic defaults depending on other inputs) + # only with package == "raw", ignored otherwise (actually the default should be false in that case, but don't know if GH supports dynamic defaults depending on other inputs) default: true type: boolean test: @@ -95,6 +95,7 @@ jobs: fail-fast: false matrix: # NOTE On GH actions, macOS <=13 is Intel, whereas macOS >=14 will be ARM64 + # Can't test 'windows-11-arm' because setup-miniconda doesn't support it AOTW os: ['ubuntu-latest', 'ubuntu-24.04-arm', 'macos-13', 'macos-latest', 'windows-latest'] py: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 0173c9639..d3214d03c 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -118,8 +118,16 @@ jobs: fail-fast: false matrix: # NOTE On GH actions, macOS <=13 is Intel, whereas macOS >=14 will be ARM64 - os: ['ubuntu-latest', 'ubuntu-24.04-arm', 'macos-13', 'macos-latest', 'windows-latest'] + os: ['ubuntu-latest', 'ubuntu-24.04-arm', 'macos-13', 'macos-latest', 'windows-latest', 'windows-11-arm'] py: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] + exclude: + # not supported by setup-python action + - os: windows-11-arm + py: '3.8' + - os: windows-11-arm + py: '3.9' + - os: windows-11-arm + py: '3.10' include: - os: ubuntu-latest wheel: dist/*manylinux_*_x86_64*.whl @@ -131,13 +139,13 @@ jobs: wheel: dist/*macosx_*_arm64*.whl - os: windows-latest wheel: dist/*win_amd64.whl + - os: windows-11-arm + wheel: dist/*win_arm64.whl runs-on: ${{ matrix.os }} steps: - - uses: extractions/setup-just@v3 - - name: Set up Python uses: actions/setup-python@v5 with: @@ -172,7 +180,7 @@ jobs: WHEEL: ${{ matrix.wheel }} - name: Run Test Suite - run: just test + run: python3 -m pytest tests/ publish: diff --git a/.github/workflows/test_release.yaml b/.github/workflows/test_release.yaml index 6ab25fe67..3b3403b1f 100644 --- a/.github/workflows/test_release.yaml +++ b/.github/workflows/test_release.yaml @@ -21,15 +21,21 @@ jobs: fail-fast: false matrix: # NOTE On GH actions, macOS <=13 is Intel, whereas macOS >=14 will be ARM64 - os: ['ubuntu-latest', 'ubuntu-24.04-arm', 'macos-13', 'macos-latest', 'windows-latest'] + os: ['ubuntu-latest', 'ubuntu-24.04-arm', 'macos-13', 'macos-latest', 'windows-latest', 'windows-11-arm'] py: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] + exclude: + # not supported by setup-python action + - os: windows-11-arm + py: '3.8' + - os: windows-11-arm + py: '3.9' + - os: windows-11-arm + py: '3.10' runs-on: ${{ matrix.os }} steps: - - uses: extractions/setup-just@v3 - - name: Set up Python uses: actions/setup-python@v5 with: @@ -54,4 +60,4 @@ jobs: run: python3 -m pip install 
-U -r req/converters.txt -r req/test.txt - name: Run tests - run: just test + run: python3 -m pytest tests/ diff --git a/.github/workflows/test_setup.yaml b/.github/workflows/test_setup.yaml index b77fe0d48..bdcee453d 100644 --- a/.github/workflows/test_setup.yaml +++ b/.github/workflows/test_setup.yaml @@ -18,15 +18,19 @@ jobs: fail-fast: false matrix: # NOTE On GH actions, macOS <=13 is Intel, whereas macOS >=14 will be ARM64 - os: ['ubuntu-latest', 'ubuntu-24.04-arm', 'macos-13', 'macos-latest', 'windows-latest'] + os: ['ubuntu-latest', 'ubuntu-24.04-arm', 'macos-13', 'macos-latest', 'windows-latest', 'windows-11-arm'] py: ['3.9', '3.10', '3.11', '3.12', '3.13'] + exclude: + # not supported by setup-python action + - os: windows-11-arm + py: '3.9' + - os: windows-11-arm + py: '3.10' runs-on: ${{ matrix.os }} steps: - - uses: extractions/setup-just@v3 - # AOTW, the slsa-verifier GH action does not support anything but Ubuntu x86_64. - name: slsa-verifier if: ${{ startsWith(matrix.os, 'ubuntu') && !endsWith(matrix.os, '-arm') }} @@ -61,7 +65,7 @@ jobs: run: python3 -m pip install -v --no-build-isolation -e . - name: Build docs - run: just docs-build + run: python3 -m sphinx -b html docs/source docs/build/html - name: Run test suite - run: just test + run: python3 -m pytest tests/ diff --git a/.github/workflows/test_sourcebuild.yaml b/.github/workflows/test_sourcebuild.yaml index cf6af8cac..a2386772d 100644 --- a/.github/workflows/test_sourcebuild.yaml +++ b/.github/workflows/test_sourcebuild.yaml @@ -19,8 +19,9 @@ jobs: fail-fast: false matrix: # On GH actions, macOS <=13 is Intel, whereas macOS >=14 will be ARM64 - # Google's toolchain doesn't seem to run on Linux arm64 natively. The toolchain-free build (or cross-compilation from x86_64) should work, though. - os: ['ubuntu-latest', 'macos-13', 'macos-latest', 'windows-latest'] # 'ubuntu-24.04-arm' + # Google's toolchain doesn't seem to run on Linux/Windows arm64 natively. The toolchain-free build (or cross-compilation from x86_64) should work, though. + # 'ubuntu-24.04-arm', 'windows-11-arm' + os: ['ubuntu-latest', 'macos-13', 'macos-latest', 'windows-latest'] build_mode: ['toolchained'] include: - os: 'ubuntu-latest' diff --git a/README.md b/README.md index 37565436c..f9b4acd5a 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ This project comes with two scripts to automate the build process: `build_toolch - `build_toolchained` is based on the build instructions in pdfium's Readme, and uses Google's toolchain (this means foreign binaries and sysroots). This results in a heavy checkout process that may take a lot of time and space. By default, this script will use vendored libraries, but you can also pass `--use-syslibs` to try to use system libraries. An advantage of the toolchain is its powerful cross-compilation support (including symbol reversioning). - `build_native` is an attempt to address some shortcomings of the toolchained build (mainly a bloated checkout process, and lack of portability). It is tailored towards native compilation, and uses system tools and libraries (including the system's GCC compiler), which must be installed by the caller beforehand. This script should theoretically work on arbitrary Linux architectures. As a drawback, this process is not supported or even documented upstream, so it might be hard to maintain. -You can also set `PDFIUM_PLATFORM` to `sourcebuild-native` or `sourcebuild-toolchained` to trigger either build script through setup. 
+You can also set `PDFIUM_PLATFORM` to `sourcebuild-native` or `sourcebuild-toolchained` to trigger either build script through setup, and pass command-line flags with `$BUILD_PARAMS`. However, for simplicity, both scripts/subtargets share just `sourcebuild` as staging directory. Dependencies: @@ -160,7 +160,7 @@ PDFIUM_PLATFORM="sourcebuild" python -m pip install -v . Or for the native build, on Ubuntu 24.04, you could do e.g.: ```bash # Install dependencies -sudo apt-get install generate-ninja ninja-build libfreetype-dev liblcms2-dev libjpeg-dev libopenjp2-7-dev libpng-dev zlib1g-dev libicu-dev libtiff-dev libglib2.0-dev +sudo apt-get install generate-ninja ninja-build libfreetype-dev liblcms2-dev libjpeg-dev libopenjp2-7-dev libpng-dev libtiff-dev zlib1g-dev libicu-dev libglib2.0-dev ``` ```bash # Build with GCC @@ -180,9 +180,50 @@ python ./setupsrc/pypdfium2_setup/build_native.py --compiler clang PDFIUM_PLATFORM="sourcebuild" python -m pip install -v . ``` +Note, on *some* platforms, you might also need symlinks for GCC, e.g.: +```bash +PREFIX=$(python ./utils/get_gcc_prefix.py) # in pypdfium2 dir +GCC_DIR="/usr" # or e.g. /opt/rh/gcc-toolset-14/root +sudo ln -s $GCC_DIR/bin/gcc $GCC_DIR/bin/$PREFIX-gcc +sudo ln -s $GCC_DIR/bin/g++ $GCC_DIR/bin/$PREFIX-g++ +sudo ln -s $GCC_DIR/bin/nm $GCC_DIR/bin/$PREFIX-nm +sudo ln -s $GCC_DIR/bin/readelf $GCC_DIR/bin/$PREFIX-readelf +``` + > [!TIP] > By default, the build scripts will create separate DLLs for vendored dependency libraries (e.g. `abseil`). However, if you want to bundle everything into a single DLL, pass `--single-lib`. +> [!NOTE] +> The native sourcebuild currently supports Linux (or similar). +> macOS and Windows are not handled, as we do not have access to these systems, and working over CI did not turn out feasible – use the toolchain-based build for now. +> Community help / pull requests to extend platform support would be welcome. + +##### cibuildwheel + +The native sourcebuild can be run through cibuildwheel. For targets configured in our [`pyproject.toml`](./pyproject.toml), the basic invocation is as simple as p.ex. +```bash +CIBW_BUILD="cp311-manylinux_x86_64" cibuildwheel +``` + +See also our [cibuildwheel workflow](.github/workflows/cibuildwheel.yaml). +For more options, see the [upstream documentation](https://cibuildwheel.pypa.io/en/stable/options). + +Note that, for Linux, cibuildwheel requires Docker. On the author's version of Fedora, it can be installed as follows: +```bash +sudo dnf in moby-engine # this provides the docker command +sudo systemctl start docker +sudo systemctl enable docker +sudo usermod -aG docker $USER +# then reboot (re-login might also suffice) +``` +For other ways of installing Docker, refer to the cibuildwheel docs ([Setup](https://cibuildwheel.pypa.io/en/stable/setup/), [Platforms](https://cibuildwheel.pypa.io/en/stable/platforms/)) and the links therein. + +> [!WARNING] +> cibuildwheel copies the project directory into a container, not taking `.gitignore` rules into account. +> Thus, it is advisable to make a fresh checkout of pypdfium2 before running cibuildwheel. +> In particular, a toolchained checkout of pdfium within pypdfium2 is problematic, and will cause a halt on the `Copying project into container...` step. +> For development, make sure the fresh checkout is in sync with the working copy. + ##### Android (Termux) The native build may also work on Android with Termux in principle. 
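To illustrate the fresh-checkout advice from the warning above, a local run could look roughly like this (the clone directory name and the Python/arch tag are only examples; note that CI uses the pypdfium2-team cibuildwheel fork, so a plain PyPI install of cibuildwheel may not cover all targets):
```bash
# clone into a clean directory, so no ignored build artifacts get copied into the container
git clone https://github.com/pypdfium2-team/pypdfium2.git pypdfium2-cibw
cd pypdfium2-cibw
python -m pip install cibuildwheel
# mirrors the basic invocation shown in the README's cibuildwheel section
CIBW_BUILD="cp311-manylinux_x86_64" cibuildwheel
```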
@@ -310,7 +351,7 @@ Disclaimer: As it is hard to keep up with constantly evolving setup code, it is + If unset or `auto`, the host platform is detected and a corresponding binary will be selected. + If an explicit platform identifier (e.g. `linux_x64`, `darwin_arm64`, ...), binaries for the requested platform will be used.[^platform_ids] + If `system-search`, look for and bind against system-provided pdfium instead of embedding a binary. If just `system`, consume existing bindings from `data/system/`. - + If `sourcebuild`, binary and bindings will be taken from `data/sourcebuild/`, assuming a prior run of the native or toolchained build scripts. `sourcebuild-native` or `sourcebuild-toolchained` can also be used to trigger either build through setup. However, triggering on the caller side is preferred as this allows to pass custom options. + + If `sourcebuild`, binary and bindings will be taken from `data/sourcebuild/`, assuming a prior run of the native or toolchained build scripts. `sourcebuild-native` or `sourcebuild-toolchained` can also be used to trigger either build through setup (use `$BUILD_PARAMS` to pass custom options). + If `sdist`, no platform-specific files will be included, so as to create a source distribution. * `$PYPDFIUM_MODULES=[raw,helpers]` defines the modules to include. Metadata adapts dynamically. @@ -954,7 +995,6 @@ Additionally, one doc build can also be hosted on [GitHub Pages](https://pypdfiu It is implemented with a CI workflow, which is supposed to be triggered automatically on release. This provides us with full control over build env and used commands, whereas RTD may be less liberal in this regard. - ### Testing pypdfium2 contains a small test suite to verify the library's functionality. It is written with [pytest](https://github.com/pytest-dev/pytest/): @@ -984,10 +1024,28 @@ find . -name '*.pdf' -exec bash -c "echo \"{}\" && pypdfium2 toc \"{}\"" \; [^testing_corpora]: For instance, one could use the testing corpora of open-source PDF libraries (pdfium, pikepdf/ocrmypdf, mupdf/ghostscript, tika/pdfbox, pdfjs, ...) +### Adding a new workflow + +When writing a new workflow, it is usually desirable to test in a branch first before merging into main. +However, new workflows from branches cannot be dispatched from the GitHub Actions panel yet. That's why you'll want to use the [`gh`](https://cli.github.com/) command-line tool, as follows: +```bash +gh workflow run $WORKFLOW_NAME.yaml --ref $MY_BRANCH +``` +If inputs are needed, JSON can be used: +```bash +echo '{"my_json_info":1, "my_var":"hello"}' | gh workflow run $WORKFLOW_NAME.yaml --ref $MY_BRANCH --json +# real-world example +echo '{"cibw_py_ver":"cp38", "linux_main":"true", "linux_ibm":"false", "linux_emulated":"false", "linux_musl":"true"}' | gh workflow run cibuildwheel.yaml --ref cibuildwheel --json +``` +You should pass the complete set of fields here, as defaults might not be recognized with this form of dispatch. + +> [!IMPORTANT] +> You need to be in the pypdfium2 directory for this to work. Otherwise, the request will be silently ignored. + ### Release workflow The release process is fully automated using Python scripts and scheduled release workflows. -You may also trigger the workflow manually using the GitHub Actions panel or the [`gh`](https://cli.github.com/) command-line tool. +You may also trigger the workflow manually from the GitHub Actions panel or similar.
Python release scripts are located in the folder `setupsrc/pypdfium2_setup`, along with custom setup code: * `update.py` downloads binaries. @@ -1038,10 +1096,10 @@ If something went wrong with commit or tag, you can still revert the changes: # perform an interactive rebase to change history (substitute $N_COMMITS with the number of commits to drop or modify) git rebase -i HEAD~$N_COMMITS git push --force -# delete local tag (substitute $TAGNAME accordingly) -git tag -d $TAGNAME -# delete remote tag +# delete remote tag (substitute $TAGNAME accordingly) git push --delete origin $TAGNAME +# delete local tag +git tag -d $TAGNAME ``` Faulty PyPI releases may be yanked using the web interface. diff --git a/REUSE.toml b/REUSE.toml index 9b5a96eb4..f74801916 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -134,6 +134,31 @@ SPDX-FileCopyrightText = [ ] SPDX-License-Identifier = "BSD-3-Clause OR Apache-2.0" +[[annotations]] +path = "pdfium_patches/cibuildwheel.patch" +precedence = "aggregate" +SPDX-FileCopyrightText = [ + "2025 wojiushixiaobai <296015668@qq.com>", + "2025 geisserml ", +] +SPDX-License-Identifier = "BSD-3-Clause OR Apache-2.0" + +[[annotations]] +path = [ + "pdfium_patches/bigendian.patch", + "pdfium_patches/bigendian_test.patch", +] +precedence = "aggregate" +SPDX-FileCopyrightText = [ + "2025 Christian Heimes ", + "2025 geisserml ", +] +SPDX-License-Identifier = "LicenseRef-Ignore" +SPDX-FileComment = ''' +Derived from Red Hat packaging repository: https://github.com/tiran/libpdfium/blob/d99370b3ac7f0c9cd7222be2dfab2c7b648f2e9e/0001-bigendian.patch +The repo does not have a license file of its own, but presumably the patches are made available under pdfium's license, or at least an open-source license. +''' + [[annotations]] path = [ "docs/build/.gitkeep", diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 10bb803c8..c4cb63f8b 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -47,6 +47,7 @@ Android `arm64_v8a`, `armeabi_v7a`, `x86_64`, `x86` and iOS `arm64` device and `arm64`, `x86_64` simulators are now handled in setup and should implicitly download the right pdfium-binaries. Provided on a best effort basis, and largely untested. Testers/feedback welcome. - pypdfium2's setup is now also capable of producing wheels for these platforms, but they will not actually be included in releases at this time. (Once Termux ships Python 3.13, we may want to publish Android `arm64_v8a` and maybe `armeabi_v7a` wheels, but we do not intend to provide wheels for simulators.) - iOS will not actually work yet, as the PEP indicates binaries ought to be moved to a special Frameworks location for permission reasons, in which case you'd also have to patch pypdfium2's library search. We cannot do anything about this yet without access to a device or clearer instructions. Community help would be appreciated here. +- Added draft cibuildwheel workflow and configuration, as a second footing for the project (based on the native sourcebuild, see below). In the future, this may allow us to fill some Linux architecture gaps the *quick & dirty* way with emulation, or maybe with new native runners. Many thanks to `wojiushixiaobai` for providing the initial workflow and helpful pointers. *Setup* - When pdfium binaries are downloaded implicitly on setup or `emplace.py` is run, we now pin the pdfium version by default. This is to prevent possible API breakage when pypdfium2 is installed from source.
It should also make the `git` dependency optional on default setup. `update.py` and `craft.py` continue to default to the latest pdfium-binaries version. @@ -67,7 +68,7 @@ *Project* - Replaced the bash `./run` file with a [`justfile`](https://github.com/casey/just). Note that the runfile previously did not fail fast and propagate errors, which is potentially dangerous for a release workflow. This had been fixed on the runfile in v5.0.0b1 before introducing the justfile. -- CI: Added Linux aarch64 (GH now provides free runners) and Python 3.13 to the test matrix. +- CI: Extended test matrices by Linux and Windows ARM64 (GH now provides free runners) and new Python versions. - Merged `tests_old/` back into `tests/`. - Migrated from deprecated `.reuse/dep5` to more visible `REUSE.toml`. Removed non-standard `.reuse/dep5-wheel`. - Docs: Improved logic when to include the unreleased version warning and upcoming changelog. diff --git a/pdfium_patches/bigendian.patch b/pdfium_patches/bigendian.patch new file mode 100644 index 000000000..022ec484b --- /dev/null +++ b/pdfium_patches/bigendian.patch @@ -0,0 +1,40 @@ +diff --git a/core/fxcrt/cfx_seekablestreamproxy.cpp b/core/fxcrt/cfx_seekablestreamproxy.cpp +index 33cc528..95dc752 100644 +--- a/core/fxcrt/cfx_seekablestreamproxy.cpp ++++ b/core/fxcrt/cfx_seekablestreamproxy.cpp +@@ -89,11 +89,19 @@ void SwapByteOrder(pdfium::span str) { + + } // namespace + ++#if defined(ARCH_CPU_LITTLE_ENDIAN) + #define BOM_UTF8_MASK 0x00FFFFFF + #define BOM_UTF8 0x00BFBBEF + #define BOM_UTF16_MASK 0x0000FFFF + #define BOM_UTF16_BE 0x0000FFFE + #define BOM_UTF16_LE 0x0000FEFF ++#else ++#define BOM_UTF8_MASK 0xFFFFFF00 ++#define BOM_UTF8 0xEFBBBF00 ++#define BOM_UTF16_MASK 0xFFFF0000 ++#define BOM_UTF16_BE 0xFEFF0000 ++#define BOM_UTF16_LE 0xFFFE0000 ++#endif + + CFX_SeekableStreamProxy::CFX_SeekableStreamProxy( + const RetainPtr& stream) +@@ -188,9 +196,15 @@ size_t CFX_SeekableStreamProxy::ReadBlock(pdfium::span buffer) { + size_t bytes_read = + ReadData(pdfium::as_writable_bytes(buffer).first(bytes_to_read)); + size_t elements = bytes_read / sizeof(uint16_t); ++#if defined(ARCH_CPU_LITTLE_ENDIAN) + if (code_page_ == FX_CodePage::kUTF16BE) { + SwapByteOrder(fxcrt::reinterpret_span(buffer).first(elements)); + } ++#else ++ if (code_page_ == FX_CodePage::kUTF16LE) { ++ SwapByteOrder(fxcrt::reinterpret_span(buffer).first(elements)); ++ } ++#endif + UTF16ToWChar(buffer.first(elements)); + return elements; + } diff --git a/pdfium_patches/bigendian_test.patch b/pdfium_patches/bigendian_test.patch new file mode 100644 index 000000000..9a7310d0e --- /dev/null +++ b/pdfium_patches/bigendian_test.patch @@ -0,0 +1,29 @@ +--- a/core/fxcrt/binary_buffer_unittest.cpp ++++ b/core/fxcrt/binary_buffer_unittest.cpp +@@ -3,6 +3,7 @@ + // found in the LICENSE file. + + #include "core/fxcrt/binary_buffer.h" ++#include "core/fxcrt/byteorder.h" + + #include + #include +@@ -122,7 +123,7 @@ + // Assumes little endian. + TEST(BinaryBuffer, AppendUint16) { + BinaryBuffer buffer; +- buffer.AppendUint16(0x4321); ++ buffer.AppendUint16(FromLE16(0x4321)); + EXPECT_EQ(2u, buffer.GetSize()); + EXPECT_EQ(2u, buffer.GetLength()); + EXPECT_EQ(0x21u, buffer.GetSpan()[0]); +@@ -132,7 +133,7 @@ + // Assumes little endian. 
+ TEST(BinaryBuffer, AppendUint32) { + BinaryBuffer buffer; +- buffer.AppendUint32(0x87654321); ++ buffer.AppendUint32(FromLE32(0x87654321)); + EXPECT_EQ(4u, buffer.GetSize()); + EXPECT_EQ(4u, buffer.GetLength()); + EXPECT_EQ(0x21u, buffer.GetSpan()[0]); + diff --git a/pdfium_patches/cibuildwheel.patch b/pdfium_patches/cibuildwheel.patch new file mode 100644 index 000000000..39eebd215 --- /dev/null +++ b/pdfium_patches/cibuildwheel.patch @@ -0,0 +1,29 @@ +diff --git a/core/fxcodec/jpx/cjpx_decoder.cpp b/core/fxcodec/jpx/cjpx_decoder.cpp +index 4650609..1eadfd5 100644 +--- a/core/fxcodec/jpx/cjpx_decoder.cpp ++++ b/core/fxcodec/jpx/cjpx_decoder.cpp +@@ -493,9 +493,11 @@ bool CJPX_Decoder::Init(pdfium::span src_data, + } + + // For https://crbug.com/42270564 ++ /* OpenJPEG 2.5 API + if (!strict_mode) { + CHECK(opj_decoder_set_strict_mode(codec_.get(), false)); + } ++ */ + + opj_image_t* pTempImage = nullptr; + if (!opj_read_header(stream_.get(), codec_.get(), &pTempImage)) { +diff --git a/core/fxge/cfx_face.cpp b/core/fxge/cfx_face.cpp +index fa02d5b..bfa21d3 100644 +--- a/core/fxge/cfx_face.cpp ++++ b/core/fxge/cfx_face.cpp +@@ -659,7 +659,7 @@ int CFX_Face::GetCharIndex(uint32_t code) { + } + + int CFX_Face::GetNameIndex(const char* name) { +- return FT_Get_Name_Index(GetRec(), name); ++ return FT_Get_Name_Index(GetRec(), const_cast(name)); + } + + FX_RECT CFX_Face::GetCharBBox(uint32_t code, int glyph_index) { diff --git a/pyproject.toml b/pyproject.toml index fd2d6b40f..9690c1016 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,3 +10,85 @@ requires = [ "wheel !=0.38.0, !=0.38.1", "ctypesgen @ git+https://github.com/pypdfium2-team/ctypesgen@pypdfium2", ] + +[tool.cibuildwheel] +build-frontend = "build" +test-command = [ + "python -m pypdfium2 --version", + "python {project}/conda/raw/minitest.py", +] + +[[tool.cibuildwheel.overrides]] +# run actual test suite on platforms that have pillow and numpy prebuilds on PyPI +select = "manylinux_{x86_64,aarch64}" +test-requires = ["pytest", "pillow", "numpy"] +inherit.test-command = "append" +test-command = [ + "python -m pytest {project}/tests", +] + +[tool.cibuildwheel.environment] +PDFIUM_PLATFORM = "sourcebuild-native" +BUILD_PARAMS = "--vendor icu" + +[tool.cibuildwheel.linux] +before-all = [ + "dnf -y install ninja-build freetype-devel lcms2-devel libjpeg-devel openjpeg2-devel libpng-devel libtiff-devel zlib-devel glib2-devel", + "sh -c 'dnf -y install gn || true'", # not manylinux_{riscv64,loongarch64} +] + +[[tool.cibuildwheel.overrides]] +select = "*-musllinux*" +before-all = [ + "apk add gn ninja-build freetype-dev lcms2-dev jpeg-dev openjpeg-dev libpng-dev tiff-dev zlib-dev glib-dev", +] + +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_{aarch64,loongarch64}" +inherit.before-all = "append" +before-all = [ + "export PREFIX=$(python {package}/utils/get_gcc_prefix.py)", + "ln -s /opt/rh/gcc-toolset-14/root/bin/gcc /opt/rh/gcc-toolset-14/root/bin/$PREFIX-gcc", + "ln -s /opt/rh/gcc-toolset-14/root/bin/g++ /opt/rh/gcc-toolset-14/root/bin/$PREFIX-g++", + "ln -s /opt/rh/gcc-toolset-14/root/bin/nm /opt/rh/gcc-toolset-14/root/bin/$PREFIX-nm", + "ln -s /opt/rh/gcc-toolset-14/root/bin/readelf /opt/rh/gcc-toolset-14/root/bin/$PREFIX-readelf", +] + +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_riscv64" +inherit.before-all = "append" +before-all = [ + "export PREFIX=$(python {package}/utils/get_gcc_prefix.py)", + "ln -s /usr/bin/gcc /usr/bin/$PREFIX-gcc", + "ln -s /usr/bin/g++ /usr/bin/$PREFIX-g++", + "ln -s /usr/bin/nm 
/usr/bin/$PREFIX-nm", + "ln -s /usr/bin/readelf /usr/bin/$PREFIX-readelf", +] + +[[tool.cibuildwheel.overrides]] +select = "*-musllinux_{aarch64,armv7l,riscv64,loongarch64}" +inherit.before-all = "append" +before-all = [ + "export PREFIX=$(python {package}/utils/get_gcc_prefix.py)", + "ln -s /usr/bin/gcc /usr/bin/$PREFIX-gcc", + "ln -s /usr/bin/g++ /usr/bin/$PREFIX-g++", + "ln -s /usr/bin/nm /usr/bin/$PREFIX-nm", + "ln -s /usr/bin/readelf /usr/bin/$PREFIX-readelf", +] + +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_loongarch64" +inherit.before-all = "append" +before-all = [ + "curl -L 'https://github.com/loong64/gn/releases/download/2024.12/gn-linux-loong64.tar.gz' | tar xz -C /usr/local/bin", +] + +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_riscv64" +inherit.before-all = "append" +before-all = [ + "export GN_ARCHIVE='./gn-linux-riscv64.zip'", + "curl -L -o '$GN_ARCHIVE' 'https://chrome-infra-packages.appspot.com/dl/gn/gn/linux-riscv64/+/latest'", + "unzip -j '$GN_ARCHIVE' gn -d /usr/local/bin", + "unlink '$GN_ARCHIVE'", +] diff --git a/setupsrc/pypdfium2_setup/base.py b/setupsrc/pypdfium2_setup/base.py index ca6a1b9ee..5baa6c6e4 100644 --- a/setupsrc/pypdfium2_setup/base.py +++ b/setupsrc/pypdfium2_setup/base.py @@ -37,7 +37,7 @@ def cached_property(func): PlatSpec_V8Sym = "-v8" BindSpec_EnvVar = "PDFIUM_BINDINGS" -IS_CI = bool(os.getenv("GITHUB_ACTIONS")) +IS_CI = bool(os.getenv("GITHUB_ACTIONS")) or bool(int(os.getenv("CIBUILDWHEEL", 0))) USE_REFBINDINGS = os.getenv(BindSpec_EnvVar) == "reference" or not any((shutil.which("ctypesgen"), IS_CI)) ModulesSpec_EnvVar = "PYPDFIUM_MODULES" @@ -268,8 +268,7 @@ def to_full(self, v_short): @cached_property def pinned(self): - # comments are not permitted in JSON, so the reason for the post_pdfium pin (if set) goes here: - # 7309 is the latest tested version. 6996 is too old because we use FPDFFormObj_RemoveObject() which first arrived in 7191. 
+ # comments are not permitted in JSON, so the reason for the post_pdfium pin (if set) goes here: (currently no post_pdfium pin) record = read_json(AR_RecordFile) return record["post_pdfium"] or record["pdfium"] diff --git a/setupsrc/pypdfium2_setup/build_native.py b/setupsrc/pypdfium2_setup/build_native.py index 848d0a8bf..f6caee780 100644 --- a/setupsrc/pypdfium2_setup/build_native.py +++ b/setupsrc/pypdfium2_setup/build_native.py @@ -14,22 +14,24 @@ sys.path.insert(0, str(Path(__file__).parents[1])) from pypdfium2_setup.base import * +IS_CIBUILDWHEEL = bool(int( os.environ.get("CIBUILDWHEEL", 0) )) + PDFIUM_URL = "https://pdfium.googlesource.com/pdfium" _CR_PREFIX = "https://chromium.googlesource.com/" DEPS_URLS = dict( build = _CR_PREFIX + "chromium/src/build", abseil = _CR_PREFIX + "chromium/src/third_party/abseil-cpp", fast_float = _CR_PREFIX + "external/github.com/fastfloat/fast_float", + catapult = _CR_PREFIX + "catapult", # android + icu = _CR_PREFIX + "chromium/deps/icu", # for cibuildwheel gtest = _CR_PREFIX + "external/github.com/google/googletest", test_fonts = _CR_PREFIX + "chromium/src/third_party/test_fonts", - catapult = _CR_PREFIX + "catapult", ) SOURCES_DIR = ProjectDir / "sbuild" / "native" PDFIUM_DIR = SOURCES_DIR / "pdfium" PDFIUM_3RDPARTY = PDFIUM_DIR / "third_party" Compiler = Enum("Compiler", "gcc clang") -RESET_REPOS = False DefaultConfig = { "is_debug": False, @@ -57,10 +59,6 @@ "use_libcxx_modules": False, } -if sys.platform.startswith("darwin"): - DefaultConfig["mac_deployment_target"] = "10.13.0" - DefaultConfig["use_system_xcode"] = True - IS_ANDROID = Host.system == SysNames.android if IS_ANDROID: DefaultConfig.update({ @@ -88,10 +86,10 @@ log(f"Warning: Unknown Android CPU {raw_cpu}") -def _get_repo(url, target_dir, rev, depth=1): +def _get_repo(url, target_dir, rev, reset=False, depth=1): if target_dir.exists(): - if RESET_REPOS and target_dir.name in ("pdfium", "build"): + if reset: log(f"Resetting {target_dir.name} as per --reset option.") run_cmd(["git", "reset", "--hard"], cwd=target_dir) return True @@ -112,16 +110,18 @@ def _get_repo(url, target_dir, rev, depth=1): DEPS_RE = r"\s*'{key}': '(\w+)'" -DEPS_FIELDS = ("build", "abseil", "fast_float") -class _DeferredClass: +class _DeferredInfo: + + def __init__(self, deps_fields): + self.deps_fields = deps_fields @cached_property # included from base.py def deps(self): # TODO get a proper parser for the DEPS file format? 
deps_content = (PDFIUM_DIR/"DEPS").read_text() result = {} - for field in DEPS_FIELDS: + for field in self.deps_fields: field_re = DEPS_RE.format(key=f"{field}_revision") match = re.search(field_re, deps_content) assert match, f"Could not find {field!r} in DEPS file" @@ -129,12 +129,10 @@ def deps(self): log(f"Found DEPS revisions:\n{result}") return result -_Deferred = _DeferredClass() - -def _fetch_dep(name, target_dir): +def _fetch_dep(info, name, target_dir, reset=False): # parse out DEPS revisions only when we actually need them - return _get_repo(DEPS_URLS[name], target_dir, rev=lambda: _Deferred.deps[name]) + return _get_repo(DEPS_URLS[name], target_dir, rev=lambda: info.deps[name], reset=reset) def autopatch(file, pattern, repl, is_regex, exp_count=None): @@ -154,12 +152,13 @@ def autopatch_dir(dir, globexpr, pattern, repl, is_regex, exp_count=None): autopatch(file, pattern, repl, is_regex, exp_count) -def get_sources(short_ver, with_tests, compiler, clang_path, single_lib): +def get_sources(deps_info, short_ver, with_tests, compiler, clang_path, single_lib, reset, vendor_deps): assert not IGNORE_FULLVER full_ver, pdfium_rev, chromium_rev = handle_sbuild_vers(short_ver) - do_patches = _get_repo(PDFIUM_URL, PDFIUM_DIR, rev=pdfium_rev) + # pass through reset only for the repositories we actually patch + do_patches = _get_repo(PDFIUM_URL, PDFIUM_DIR, rev=pdfium_rev, reset=reset) if do_patches: autopatch_dir( PDFIUM_DIR/"public"/"cpp", "*.h", @@ -185,13 +184,21 @@ def get_sources(short_ver, with_tests, compiler, clang_path, single_lib): "#if 1 // defined(COMPONENT_BUILD)", is_regex=False, exp_count=1, ) + if sys.byteorder == "big": + git_apply_patch(PatchDir/"bigendian.patch", cwd=PDFIUM_DIR) + if with_tests: + git_apply_patch(PatchDir/"bigendian_test.patch", cwd=PDFIUM_DIR) - do_patches = _fetch_dep("build", PDFIUM_DIR/"build") + do_patches = _fetch_dep(deps_info, "build", PDFIUM_DIR/"build", reset=reset) if do_patches: - # Work around error about path_exists() being undefined + # siso.patch: work around error about path_exists() being undefined git_apply_patch(PatchDir/"siso.patch", cwd=PDFIUM_DIR/"build") if IS_ANDROID: + # fix linkage step git_apply_patch(PatchDir/"android_build.patch", cwd=PDFIUM_DIR/"build") + if IS_CIBUILDWHEEL: + # compatibility patch for older system libraries from container + git_apply_patch(PatchDir/"cibuildwheel.patch", cwd=PDFIUM_DIR) if compiler is Compiler.gcc: # https://crbug.com/402282789 git_apply_patch(PatchDir/"ffp_contract.patch", cwd=PDFIUM_DIR/"build") @@ -212,27 +219,30 @@ def get_sources(short_ver, with_tests, compiler, clang_path, single_lib): get_shimheaders_tool(PDFIUM_DIR, rev=chromium_rev) - _fetch_dep("abseil", PDFIUM_3RDPARTY/"abseil-cpp") - _fetch_dep("fast_float", PDFIUM_3RDPARTY/"fast_float"/"src") + _fetch_dep(deps_info, "abseil", PDFIUM_3RDPARTY/"abseil-cpp") + _fetch_dep(deps_info, "fast_float", PDFIUM_3RDPARTY/"fast_float"/"src") if IS_ANDROID: - _fetch_dep("catapult", PDFIUM_3RDPARTY/"catapult") + _fetch_dep(deps_info, "catapult", PDFIUM_3RDPARTY/"catapult") + if "icu" in vendor_deps: + _fetch_dep(deps_info, "icu", PDFIUM_3RDPARTY/"icu") if with_tests: - _fetch_dep("gtest", PDFIUM_3RDPARTY/"googletest"/"src") - _fetch_dep("test_fonts", PDFIUM_3RDPARTY/"test_fonts") + _fetch_dep(deps_info, "gtest", PDFIUM_3RDPARTY/"googletest"/"src") + _fetch_dep(deps_info, "test_fonts", PDFIUM_3RDPARTY/"test_fonts") return full_ver -def prepare(config_dict, build_dir): +def prepare(config_dict, build_dir, vendor_deps): # Create an empty gclient 
config (PDFIUM_DIR/"build"/"config"/"gclient_args.gni").touch(exist_ok=True) - # Unbundle ICU - # alternatively, we could call build/linux/unbundle/replace_gn_files.py --system-libraries icu - (PDFIUM_3RDPARTY/"icu").mkdir(exist_ok=True) - shutil.copyfile( - PDFIUM_DIR/"build"/"linux"/"unbundle"/"icu.gn", - PDFIUM_3RDPARTY/"icu"/"BUILD.gn" - ) + if "icu" not in vendor_deps: + # Unbundle ICU + # alternatively, we could call build/linux/unbundle/replace_gn_files.py --system-libraries icu + (PDFIUM_3RDPARTY/"icu").mkdir(exist_ok=True) + shutil.copyfile( + PDFIUM_DIR/"build"/"linux"/"unbundle"/"icu.gn", + PDFIUM_3RDPARTY/"icu"/"BUILD.gn" + ) # Create target dir (or reuse existing) and write build config mkdir(build_dir) # Remove existing libraries from the build dir, to avoid packing unnecessary DLLs when a single_lib build is done after a component build. This also ensures we really built a new DLL in the end. @@ -266,15 +276,12 @@ def test(build_dir): def _get_clang_ver(clang_path): from packaging.version import Version - try_libpaths = [ - clang_path/"lib"/"clang", - clang_path/"lib64"/"clang", - ] - libpath = next(filter(Path.exists, try_libpaths)) - candidates = (Version(p.name) for p in libpath.iterdir() if re.fullmatch(r"[\d\.]+", p.name)) - version = max(candidates) - return version.major - + output = run_cmd([str(clang_path/"bin"/"clang"), "--version"], capture=True, cwd=None) + log(output) + version = re.search(r"version ([\d\.]+)", output).group(1) + version = Version(version).major + log(f"Determined clang version {version!r}") + return version def setup_compiler(config, compiler, clang_path): if compiler is Compiler.gcc: @@ -291,18 +298,12 @@ def setup_compiler(config, compiler, clang_path): assert False, f"Unhandled compiler {compiler}" -def main(build_ver=None, with_tests=False, n_jobs=None, compiler=None, clang_path=None, single_lib=False, reset=False): - - # q&d: use a global to expose the `reset` setting to _get_repo(), easier than handing it down through a lot of functions - global RESET_REPOS, DEPS_FIELDS - RESET_REPOS = reset - if IS_ANDROID: - DEPS_FIELDS += ("catapult", ) - if with_tests: - DEPS_FIELDS += ("gtest", "test_fonts") +def main(build_ver=None, with_tests=False, n_jobs=None, compiler=None, clang_path=None, single_lib=False, reset=False, vendor_deps=None): if build_ver is None: build_ver = SBUILD_NATIVE_PIN + if vendor_deps is None: + vendor_deps = set() if compiler is None: if shutil.which("gcc"): compiler = Compiler.gcc @@ -319,10 +320,22 @@ def main(build_ver=None, with_tests=False, n_jobs=None, compiler=None, clang_pat if single_lib: config["is_component_build"] = False + deps_fields = ["build", "abseil", "fast_float"] + if IS_ANDROID: + deps_fields.append("catapult") + if "icu" in vendor_deps: + deps_fields.append("icu") + if with_tests: + deps_fields += ("gtest", "test_fonts") + + deps_info = _DeferredInfo(deps_fields) + mkdir(SOURCES_DIR) - full_ver = get_sources(build_ver, with_tests, compiler, clang_path, single_lib) + full_ver = get_sources( + deps_info, build_ver, with_tests, compiler, clang_path, single_lib, reset, vendor_deps + ) setup_compiler(config, compiler, clang_path) - prepare(config, build_dir) + prepare(config, build_dir, vendor_deps) build(with_tests, build_dir, n_jobs) if with_tests: test(build_dir) @@ -373,9 +386,21 @@ def parse_args(argv): action = "store_true", help = "Whether to create a single DLL that bundles the dependency libraries. Otherwise, separate DLLs will be used. 
Note, the corresponding patch will only be applied if pdfium is downloaded anew or reset, else the existing state is used.", ) + # The --vendor option is provided for cibuildwheel clients: + # - libicudata pulled in from the system via `auditwheel repair` is quite big. Using vendored ICU reduces wheel size by about 10 MB (compressed). + # - libc++ is used but not pulled in by auditwheel. This appears to be ABI-unsafe (although the wheels seem to work across different hosts according to downstream feedback), so we may want to add that in the future. Actually, options to use system libc++ are deprecated upstream anyway. + parser.add_argument( + "--vendor", + dest = "vendor_deps", + nargs = "+", + action = "extend", + help = "Dependencies to vendor. Note, this only supports libraries where there is a specific reason to vendor despite the native build. Currently this means 'icu' only ('libc++' may be added in the future). For an exhaustive vendored build, use build_toolchained.py" + ) args = parser.parse_args(argv) if args.compiler: args.compiler = Compiler[args.compiler] + if args.vendor_deps: + args.vendor_deps = set(args.vendor_deps) return args diff --git a/setupsrc/pypdfium2_setup/emplace.py b/setupsrc/pypdfium2_setup/emplace.py index da1b88799..7be73d7cd 100644 --- a/setupsrc/pypdfium2_setup/emplace.py +++ b/setupsrc/pypdfium2_setup/emplace.py @@ -4,6 +4,7 @@ import os import sys +import shlex import argparse import traceback from pathlib import Path @@ -77,10 +78,14 @@ def stage_platfiles(pl_name, sub_target, pdfium_ver, flags): elif pl_name == ExtPlats.sourcebuild: if flags: log(f"sourcebuild: flags {flags!r} are not handled (will be discarded).") - if sub_target == "native": - build_native.main(build_ver=pdfium_ver) - elif sub_target == "toolchained": - build_toolchained.main(build_ver=pdfium_ver) + + build_params = shlex.split( os.getenv("BUILD_PARAMS", "") ) + builder = dict(native=build_native, toolchained=build_toolchained).get(sub_target) + if builder: + build_params = vars(builder.parse_args(build_params)) + build_params.update(dict(build_ver=pdfium_ver)) + log(build_params) + builder.main(**build_params) else: _end_subtargets(sub_target, pdfium_ver) diff --git a/utils/get_gcc_prefix.py b/utils/get_gcc_prefix.py new file mode 100644 index 000000000..3417a1ea3 --- /dev/null +++ b/utils/get_gcc_prefix.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: 2025 geisserml +# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +# Simple shim for use within cibuildwheel + +import subprocess + +proc = subprocess.run(["uname", "-m"], stdout=subprocess.PIPE) +arch = proc.stdout.decode().strip() +if arch == "loongarch64": + prefix = f"{arch}-unknown-linux-gnu" +elif arch == "armv7l": + prefix = f"arm-linux-gnueabihf" +else: + prefix = f"{arch}-linux-gnu" + +print(prefix)
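As a usage sketch for the `$BUILD_PARAMS` pass-through in `emplace.py` (the flag combination below is illustrative; `--vendor icu` and `--single-lib` are existing `build_native.py` options):
```bash
# trigger the native sourcebuild through setup, forwarding extra flags to build_native.py
PDFIUM_PLATFORM="sourcebuild-native" BUILD_PARAMS="--vendor icu --single-lib" python -m pip install -v .
```
The variable is split with `shlex` and fed through the builder's own argument parser, so anything accepted by `build_native.py --help` should work here.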