diff --git a/.ci/docker/manywheel/Dockerfile_ppc64le b/.ci/docker/manywheel/Dockerfile_ppc64le
new file mode 100755
index 000000000000..936d5037d74c
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile_ppc64le
@@ -0,0 +1,92 @@
+# Use the manylinux_2_28 base image for ppc64le
+FROM quay.io/pypa/manylinux_2_28_ppc64le AS base
+
+# Language variables
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+ENV LANGUAGE=C.UTF-8
+
+ARG DEVTOOLSET_VERSION=13
+
+# Create symbolic links for Python 3.12
+RUN ln -sf /opt/python/cp312-cp312/bin/python3.12 /usr/bin/python3 && \
+    ln -sf /opt/python/cp312-cp312/bin/python3.12 /usr/bin/python
+
+# Install required system dependencies
+RUN yum -y install epel-release && \
+    yum -y update && \
+    yum install -y \
+        sudo \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        make \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        yasm \
+        less \
+        zstd \
+        libgomp \
+        gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
+        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+        gcc-toolset-${DEVTOOLSET_VERSION}-binutils \
+        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+        cmake \
+        ninja-build \
+        rust \
+        cargo \
+        llvm-devel \
+        libzstd-devel \
+        python3.12-devel \
+        python3.12-setuptools \
+        python3.12-pip \
+        python3-virtualenv \
+        python3.12-pyyaml \
+        python3.12-numpy \
+        python3.12-wheel \
+        python3.12-cryptography \
+        blas-devel \
+        openblas-devel \
+        lapack-devel \
+        atlas-devel \
+        libjpeg-devel \
+        libxslt-devel \
+        libxml2-devel \
+        openssl-devel \
+        valgrind
+
+
+# Ensure the correct Python version is used
+ENV PATH=/opt/python/cp312-cp312/bin:$PATH
+# Add gcc-toolset to the path
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# Configure git to avoid safe directory issues
+RUN git config --global --add safe.directory "*"
+
+# Install required Python packages
+RUN pip install --upgrade pip
+RUN pip install typing_extensions pyyaml setuptools
+
+# Install test dependencies
+RUN dnf install -y \
+    protobuf-devel \
+    protobuf-c-devel \
+    protobuf-lite-devel \
+    wget \
+    patch
+
+# Set default entrypoint
+ENTRYPOINT []
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh
index 0601d7605d84..8b65df9d2b62 100755
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@@ -65,6 +65,13 @@ case ${GPU_ARCH_TYPE} in
         DOCKER_GPU_BUILD_ARG=""
         MANY_LINUX_VERSION="s390x"
         ;;
+    cpu-ppc64le)
+        TARGET=base
+        DOCKER_TAG=ppc64le
+        GPU_IMAGE=redhat/ubi9
+        DOCKER_GPU_BUILD_ARG=""
+        MANY_LINUX_VERSION="ppc64le"
+        ;;
     cuda)
        TARGET=cuda_final
        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
@@ -121,8 +128,8 @@ fi
 (
     set -x
 
     # Only activate this if in CI
-    if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
+    if [ "$(uname -m)" != "s390x" ] && [ "$(uname -m)" != "ppc64le" ] && [ -v CI ]; then
         # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
         # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
diff --git a/.ci/docker/manywheel/build_scripts/build.sh b/.ci/docker/manywheel/build_scripts/build.sh
index e2cb1c7f27cd..34ea62cc2099 100644
--- a/.ci/docker/manywheel/build_scripts/build.sh
+++ b/.ci/docker/manywheel/build_scripts/build.sh
@@ -20,7 +20,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
 # the final image after compiling Python
 PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel"
-if [ "$(uname -m)" != "s390x" ] ; then
+if [ "$(uname -m)" != "s390x" ] && [ "$(uname -m)" != "ppc64le" ] ; then
     PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} db4-devel"
 else
     PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} libdb-devel"
diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
index dfc4e0fab927..4592011dd0d3 100755
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@@ -231,7 +231,7 @@ fi
-# Do not change workspace permissions for ROCm and s390x CI jobs
+# Do not change workspace permissions for ROCm, s390x, and ppc64le CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
-if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
+if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *ppc64le* && -d /var/lib/jenkins/workspace ]]; then
   # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
   WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
   cleanup_workspace() {
@@ -275,8 +275,10 @@ else
   # XLA test build fails when WERROR=1
   # set only when building other architectures
   # or building non-XLA tests.
+  # ppc64le builds fail when WERROR=1
   if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
-        "$BUILD_ENVIRONMENT" != *xla* ]]; then
+        "$BUILD_ENVIRONMENT" != *xla* &&
+        "$BUILD_ENVIRONMENT" != *ppc64le* ]]; then
     if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
       # Install numpy-2.0.2 for builds which are backward compatible with 1.X
       python -mpip install numpy==2.0.2
@@ -399,6 +401,6 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
   python tools/stats/export_test_times.py
 fi
-# don't do this for bazel or s390x as they don't use sccache
-if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
+# don't do this for bazel, s390x, or ppc64le as they don't use sccache
+if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *ppc64le* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
   print_sccache_stats
 fi
diff --git a/.github/scripts/ppc64le-ci/README.md b/.github/scripts/ppc64le-ci/README.md
new file mode 100755
index 000000000000..8676799de341
--- /dev/null
+++ b/.github/scripts/ppc64le-ci/README.md
@@ -0,0 +1,66 @@
+# Configuring the builder.
+
+## Install prerequisites.
+
+```
+$ sudo apt install podman podman-docker jq
+```
+
+## Add services.
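+
+Install the `actions-runner@.service` template unit, which runs one ephemeral runner container per configured instance name: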
+
+```
+$ sudo cp self-hosted-builder/*.service /etc/systemd/system/
+$ sudo systemctl daemon-reload
+```
+
+## Rebuild the image
+
+First build the ppc64le builder image `docker.io/pytorch/manylinuxppc64le-builder`,
+using the following commands:
+
+```
+$ cd ~
+$ git clone https://github.com/pytorch/pytorch
+$ cd pytorch
+$ git submodule update --init --recursive
+$ GPU_ARCH_TYPE=cpu-ppc64le "$(pwd)/.ci/docker/manywheel/build.sh" manylinuxppc64le-builder
+$ docker image tag localhost/pytorch/manylinuxppc64le-builder docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le
+$ docker image save -o ~/manywheel-ppc64le.tar docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le
+```
+
+The next step is to build the `actions-runner` image:
+
+```
+## Clone the gaplib repo (https://github.com/anup-kodlekere/gaplib.git) and copy runner-sdk-8.ppc64le.patch from gaplib/build-files into pytorch/.github/scripts/ppc64le-ci/self-hosted-builder
+
+$ cd self-hosted-builder
+$ sudo docker build \
+      --pull \
+      -f actions-runner.Dockerfile \
+      --build-arg RUNNERPATCH="runner-sdk-8.ppc64le.patch" \
+      -t iiilinuxibmcom/actions-runner.<name> \
+      .
+```
+
+Now prepare all the necessary files for runner registration:
+
+```
+$ sudo mkdir -p /etc/actions-runner/<name>
+$ sudo chmod 755 /etc/actions-runner/<name>
+$ sudo /bin/cp <github_app_private_key_file> /etc/actions-runner/<name>/key_private.pem
+$ sudo echo <github_app_id> | sudo tee /etc/actions-runner/<name>/appid.env
+$ sudo echo <github_app_install_id> | sudo tee /etc/actions-runner/<name>/installid.env
+$ sudo echo NAME=<runner_name> | sudo tee /etc/actions-runner/<name>/env
+$ sudo echo ORG=<github_org> | sudo tee -a /etc/actions-runner/<name>/env
+$ cd self-hosted-builder
+$ sudo /bin/cp helpers/*.sh /usr/local/bin/
+$ sudo chmod 755 /usr/local/bin/app_token.sh /usr/local/bin/gh_token_generator.sh
+```
+
+## Autostart the runner.
+
+```
+$ sudo systemctl enable --now actions-runner@<name>
+```
\ No newline at end of file
diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile
new file mode 100755
index 000000000000..f1589d7edf9a
--- /dev/null
+++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile
@@ -0,0 +1,110 @@
+# Self-Hosted IBM Power GitHub Actions Runner.
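+#
+# On start-up the runner registers itself as an ephemeral runner and launches
+# the manywheel builder container through podman-docker; see
+# fs/usr/bin/actions-runner and fs/usr/bin/entrypoint.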
+
+FROM ubuntu:22.04
+
+# Set non-interactive mode for apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Fix sources to point to ports.ubuntu.com for ppc64le
+RUN echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy main restricted universe multiverse" > /etc/apt/sources.list && \
+    echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
+    echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \
+    echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-security main restricted universe multiverse" >> /etc/apt/sources.list
+
+# Update the system and install base packages
+RUN apt-get update -o Acquire::Retries=5 -o Acquire::http::Timeout="10" && \
+    apt-get -y install --no-install-recommends \
+        build-essential \
+        curl \
+        sudo \
+        jq \
+        gnupg-agent \
+        iptables \
+        ca-certificates \
+        software-properties-common \
+        vim \
+        zip \
+        python3 \
+        python3-pip && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Switch to iptables-legacy
+RUN update-alternatives --set iptables /usr/sbin/iptables-legacy && \
+    update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy
+
+
+# Install Podman and podman-docker (Docker compatibility)
+RUN apt-get update && apt-get install -y podman podman-docker && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Install dotnet SDK and other dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        wget \
+        git \
+        dotnet-sdk-8.0 \
+        cmake \
+        make \
+        automake \
+        autoconf \
+        m4 \
+        libtool && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+
+# Setup user and permissions
+RUN useradd -c "Action Runner" -m runner && \
+    usermod -L runner && \
+    echo "runner ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/runner && \
+    groupadd podman || true && \
+    usermod -aG podman runner
+
+# Configure Podman cgroup manager
+RUN mkdir -p /etc/containers && \
+    printf '[engine]\ncgroup_manager = "cgroupfs"\n' > /etc/containers/containers.conf
+
+# Add and configure GitHub Actions runner
+ARG RUNNERREPO="https://github.com/actions/runner"
+ARG RUNNERPATCH
+# dotnet SDK major version stamped into src/global.json below (assumed 8, to match dotnet-sdk-8.0 above)
+ARG SDK=8
+
+ADD ${RUNNERPATCH} /tmp/runner.patch
+
+RUN git clone -q ${RUNNERREPO} /tmp/runner && \
+    cd /tmp/runner && \
+    git checkout main -b build && \
+    git apply /tmp/runner.patch && \
+    sed -i'' -e /version/s/8......\"$/${SDK}.0.100\"/ src/global.json
+
+RUN cd /tmp/runner/src && \
+    ./dev.sh layout && \
+    ./dev.sh package && \
+    ./dev.sh test && \
+    rm -rf /root/.dotnet /root/.nuget
+
+RUN mkdir -p /opt/runner && \
+    tar -xf /tmp/runner/_package/*.tar.gz -C /opt/runner && \
+    chown -R runner:runner /opt/runner && \
+    su - runner -c "/opt/runner/config.sh --version"
+
+RUN rm -rf /tmp/runner /tmp/runner.patch
+
+# Copy custom scripts and set permissions
+COPY fs/ /
+RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint
+
+# Switch to the runner user
+USER runner
+
+# Set working directory
+WORKDIR /opt/runner
+
+COPY --chown=runner:runner manywheel-ppc64le.tar /opt/runner/manywheel-ppc64le.tar
+
+# Define entry point and command
+ENTRYPOINT ["/usr/bin/entrypoint"]
+CMD ["/usr/bin/actions-runner"]
+
diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service
new file mode 100755
index 000000000000..bd1a636cef3c
--- /dev/null
+++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service
@@ -0,0 +1,31 @@
+[Unit]
+Description=Self-Hosted IBM Power GitHub Actions Runner
+StartLimitIntervalSec=0
+
+[Service]
+Type=simple
+Restart=always
+
+# Cleanup stale containers
+ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i
+ExecStartPre=-/usr/local/bin/gh_token_generator.sh /etc/actions-runner/%i/appid.env /etc/actions-runner/%i/installid.env /etc/actions-runner/%i/key_private.pem /etc/actions-runner/%i/ghtoken.env
+ExecStartPre=-/usr/local/bin/gh_cat_token.sh /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt
+
+ExecStart=/usr/bin/docker run \
+              --env-file=/etc/actions-runner/%i/env \
+              --volume /etc/actions-runner/%i/ghtoken.txt:/run/runner_secret \
+              --init \
+              --interactive \
+              --name=actions-runner.%i \
+              --rm \
+              --privileged \
+              --log-driver=journald \
+              iiilinuxibmcom/actions-runner.%i
+ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1"
+ExecStop=/bin/sh -c "docker wait actions-runner.%i"
+ExecStop=/bin/sh -c "docker rm actions-runner.%i"
+
+ExecStop=/usr/bin/env rm -f /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt
+
+[Install]
+WantedBy=multi-user.target
diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner
new file mode 100755
index 000000000000..e60c9e7314d7
--- /dev/null
+++ b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+set -e -u
+
+# First import the prebuilt manywheel builder image, if present
+if [ -f ./manywheel-ppc64le.tar ] ; then
+    docker image load --input manywheel-ppc64le.tar
+    docker image tag docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le-main
+    rm -f manywheel-ppc64le.tar
+fi
+
+token_file=registration-token.json
+
+# Fetch GitHub access token
+if [ ! -f /run/runner_secret ]; then
+    echo "Error: Access token file not found at /run/runner_secret."
+    exit 1
+fi
+
+
+ACCESS_TOKEN="$(cat /run/runner_secret)"
+
+# Generate registration token
+curl \
+    -X POST \
+    -H "Accept: application/vnd.github.v3+json" \
+    -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+    "https://api.github.com/orgs/${ORG}/actions/runners/registration-token" \
+    -o "$token_file"
+
+unset ACCESS_TOKEN
+
+sudo umount /run/runner_secret
+sudo rm -f /run/runner_secret
+
+# Register as an ephemeral runner:
+# it runs one job, then stops and unregisters
+registration_token=$(jq --raw-output .token "$token_file")
+
+./config.sh \
+    --unattended \
+    --ephemeral \
+    --url "https://github.com/${ORG}" \
+    --token "${registration_token}" \
+    --name "${NAME}" \
+    --no-default-labels \
+    --labels self-hosted,linux.ppc64le
+
+unset registration_token
+rm -f "$token_file"
+
+# Run one job.
+./run.sh
+
+echo "Ephemeral runner workflow completed."
diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/entrypoint b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/entrypoint
new file mode 100755
index 000000000000..14f6c84ca602
--- /dev/null
+++ b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/entrypoint
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+#
+# Container entrypoint that waits for all spawned processes.
+#
+
+set -e -u
+
+# Create a FIFO and start reading from its read end.
+tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
+trap 'rm -r "$tempdir"' EXIT
+done="$tempdir/pipe"
+mkfifo "$done"
+cat "$done" & waiter=$!
+
+# Start the workload. Its descendants will inherit the FIFO's write end.
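+# "|| status=$?" records the workload's exit status without tripping "set -e",
+# so the FIFO wait below still runs when the workload fails.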
+status=0
+if [ "$#" -eq 0 ]; then
+    bash 9>"$done" || status=$?
+else
+    "$@" 9>"$done" || status=$?
+fi
+
+# When the workload and all of its descendants exit, the FIFO's write end will
+# be closed and `cat "$done"` will exit. Wait until it happens. This is needed
+# in order to handle SelfUpdater, which the workload may start in background
+# before exiting.
+wait "$waiter"
+
+exit "$status"
diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/app_token.sh b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/app_token.sh
new file mode 100755
index 000000000000..cecde970b84b
--- /dev/null
+++ b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/app_token.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Request an ACCESS_TOKEN to be used by a GitHub App.
+# Environment variables that need to be set:
+# * APP_ID, the GitHub App's ID
+# * INSTALL_ID, the GitHub App's installation ID
+# * APP_PRIVATE_KEY, path to the GitHub App's private key file in PEM format
+#
+# https://github.com/orgs/community/discussions/24743#discussioncomment-3245300
+#
+
+set -o pipefail
+
+set -e  # Exit on error
+
+# Generate JWT
+header='{"alg":"RS256","typ":"JWT"}'
+payload="{\"iat\":$(date +%s),\"exp\":$(( $(date +%s) + 600 )),\"iss\":${APP_ID}}"
+
+header_base64=$(echo -n "$header" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n')
+payload_base64=$(echo -n "$payload" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n')
+
+signature=$(echo -n "${header_base64}.${payload_base64}" | \
+    openssl dgst -sha256 -sign "${APP_PRIVATE_KEY}" | \
+    openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n')
+
+generated_jwt="${header_base64}.${payload_base64}.${signature}"
+
+API_VERSION=v3
+API_HEADER="Accept: application/vnd.github+json"
+
+auth_header="Authorization: Bearer ${generated_jwt}"
+
+app_installations_response=$(curl -sX POST \
+    -H "${auth_header}" \
+    -H "${API_HEADER}" \
+    --url "https://api.github.com/app/installations/${INSTALL_ID}/access_tokens" \
+    )
+
+echo "$app_installations_response" | jq --raw-output '.token'
diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_cat_token.sh b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_cat_token.sh
new file mode 100755
index 000000000000..2274e5a13c74
--- /dev/null
+++ b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_cat_token.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+TOKEN_FILE=$1
+OUTPUT_FILE=$2
+
+echo "Starting gh_cat_token.sh with TOKEN_FILE=${TOKEN_FILE}, OUTPUT_FILE=${OUTPUT_FILE}"
+
+# Validate inputs
+if [[ ! -r "${TOKEN_FILE}" ]]; then
+    echo "Error: Token file '${TOKEN_FILE}' does not exist or is not readable."
+ exit 1 +fi + +# Write the token to the output file +cat "${TOKEN_FILE}" > "${OUTPUT_FILE}" +echo "Token written to ${OUTPUT_FILE}" diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_token_generator.sh b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_token_generator.sh new file mode 100755 index 000000000000..1feee26eb2c1 --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_token_generator.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +SCRIPT_DIR=$(dirname "$0") +APP_ID=$1 +INSTALL_ID=$2 +APP_PRIVATE_KEY=$3 +DST_FILE="$4" + +ACCESS_TOKEN="$(APP_ID="$(<"${APP_ID}")" INSTALL_ID="$(<"${INSTALL_ID}")" APP_PRIVATE_KEY="${APP_PRIVATE_KEY}" "${SCRIPT_DIR}/app_token.sh")" +echo "${ACCESS_TOKEN}" > "${DST_FILE}" diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index ef9e2acb993d..7f6132fd4d7a 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -107,7 +107,7 @@ jobs: steps: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -122,11 +122,11 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' - name: configure aws credentials uses: aws-actions/configure-aws-credentials@v3 - if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} + if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' }} with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-build @@ -135,13 +135,13 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} - name: Use following to pull public copy of the image id: print-ghcr-mirror - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' env: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash @@ -151,7 +151,7 @@ jobs: - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -179,7 +179,7 @@ jobs: - name: Download pytest cache uses: ./.github/actions/pytest-cache-download continue-on-error: true - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 
'linux-ppc64le-binary-manywheel' with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} @@ -203,6 +203,7 @@ jobs: TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }} + DOCKER_IMAGE_PPC64LE: ${{ inputs.docker-image-name }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} @@ -222,23 +223,43 @@ jobs: # since some steps are skipped on s390x, if they are necessary, run them here env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + + elif [[ ${BUILD_ENVIRONMENT} == *"ppc64le"* ]]; then + JENKINS_USER="" + USED_IMAGE="${DOCKER_IMAGE_PPC64LE}" + # ensure that docker container cleanly exits in 12 hours + # if for some reason cleanup action doesn't stop container + # when job is cancelled + DOCKER_SHELL_CMD="sleep 12h" + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + else JENKINS_USER="--user jenkins" USED_IMAGE="${DOCKER_IMAGE}" DOCKER_SHELL_CMD= fi + if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then MAX_JOBS="$(nproc --ignore=2)" else MAX_JOBS="${MAX_JOBS_OVERRIDE}" fi - # Leaving 1GB for the runner and other things - TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) - # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap - # comes from https://github.com/pytorch/test-infra/pull/6058 - TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + # Compute memory limits, but skip setting them for ppc64le + if [[ ${BUILD_ENVIRONMENT} != *"ppc64le"* ]]; then + + # Leaving 1GB for the runner and other things + TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) + # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap + # comes from https://github.com/pytorch/test-infra/pull/6058 + TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + MEMORY_FLAGS="--memory=${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g --memory-swap=${TOTAL_MEMORY_WITH_SWAP}g" + else + MEMORY_FLAGS="" + fi + # detached container should get cleaned up by teardown_ec2_linux # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty @@ -262,13 +283,12 @@ jobs: -e HUGGING_FACE_HUB_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e USE_SPLIT_BUILD \ - --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ - --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --tty \ --detach \ + ${MEMORY_FLAGS} \ ${JENKINS_USER} \ -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ -w /var/lib/jenkins/workspace \ @@ -287,7 +307,9 @@ jobs: - name: Store PyTorch Build Artifacts on S3 uses: seemethere/upload-artifact-s3@v5 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' + + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' + with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -295,9 +317,9 
@@ jobs: path: artifacts.zip s3-bucket: ${{ inputs.s3-bucket }} - - name: Store PyTorch Build Artifacts for s390x + - name: Store PyTorch Build Artifacts for s390x and ppc64le uses: actions/upload-artifact@v4 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && (inputs.build-environment == 'linux-s390x-binary-manywheel' || inputs.build-environment == 'linux-ppc64le-binary-manywheel') with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -305,7 +327,7 @@ jobs: path: artifacts.zip - name: Upload sccache stats - if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' uses: ./.github/actions/upload-sccache-stats with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -313,12 +335,12 @@ jobs: - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' - name: Cleanup docker - if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel' + if: always() && (inputs.build-environment == 'linux-s390x-binary-manywheel' || inputs.build-environment == 'linux-ppc64le-binary-manywheel' ) shell: bash run: | - # on s390x stop the container for clean worker stop + # on s390x and ppc64le stop the container for clean worker stop docker stop -a || true docker kill -a || true diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml new file mode 100755 index 000000000000..e5875982c187 --- /dev/null +++ b/.github/workflows/ppc64le.yml @@ -0,0 +1,22 @@ +name: ppc64le + +on: + push: + branches: + - main + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + linux-manylinux-2_28-py3-cpu-ppc64le-build: + if: github.repository_owner == 'pytorch' + name: linux-manylinux-2_28-py3-cpu-ppc64le-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-ppc64le-binary-manywheel + docker-image-name: pytorch/manylinuxppc64le-builder:cpu-ppc64le-main + runner: linux.ppc64le + secrets: inherit \ No newline at end of file