Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions .ci/docker/manywheel/Dockerfile_ppc64le
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Base image: manylinux_2_28 built for ppc64le
FROM quay.io/pypa/manylinux_2_28_ppc64le AS base

# Locale settings
ENV LC_ALL=C.UTF-8 \
    LANG=C.UTF-8 \
    LANGUAGE=C.UTF-8

# gcc-toolset release used for the compiler packages and the PATH entries below
ARG DEVTOOLSET_VERSION=13

# Make /usr/bin/python3 and /usr/bin/python resolve to the bundled CPython 3.12
RUN py312=/opt/python/cp312-cp312/bin/python3.12 && \
    ln -sf "${py312}" /usr/bin/python3 && \
    ln -sf "${py312}" /usr/bin/python

# Install required system dependencies.
# Steps, all in a single layer:
#   1. enable the EPEL repository,
#   2. update the packages shipped with the base image,
#   3. install the build toolchain (gcc-toolset-${DEVTOOLSET_VERSION}, cmake, ninja,
#      rust), the Python 3.12 packages and the numeric/XML/SSL development headers,
#   4. drop the package-manager cache so it is not baked into the image layer
#      (cleaning up in a later RUN would not shrink the image).
RUN yum -y install epel-release && \
    yum -y update && \
    yum install -y \
        sudo \
        autoconf \
        automake \
        bison \
        bzip2 \
        curl \
        diffutils \
        file \
        git \
        make \
        patch \
        perl \
        unzip \
        util-linux \
        wget \
        which \
        xz \
        yasm \
        less \
        zstd \
        libgomp \
        gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
        gcc-toolset-${DEVTOOLSET_VERSION}-binutils \
        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
        cmake \
        ninja-build \
        rust \
        cargo \
        llvm-devel \
        libzstd-devel \
        python3.12-devel \
        python3.12-setuptools \
        python3.12-pip \
        python3-virtualenv \
        python3.12-pyyaml \
        python3.12-numpy \
        python3.12-wheel \
        python3.12-cryptography \
        blas-devel \
        openblas-devel \
        lapack-devel \
        atlas-devel \
        libjpeg-devel \
        libxslt-devel \
        libxml2-devel \
        openssl-devel \
        valgrind && \
    yum clean all


# Ensure the correct Python version is used
ENV PATH=/opt/python/cp312-cp312/bin:$PATH
# Put the gcc-toolset compilers and their runtime libraries ahead of the system ones
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

# Configure git to avoid safe directory issues
RUN git config --global --add safe.directory "*"

# Install required Python packages.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir typing_extensions pyyaml setuptools

# Install test dependencies.
# wget and patch are already installed with the system dependencies above, so only
# the protobuf development packages are needed here. The dnf cache is removed in the
# same layer so it does not end up in the image.
RUN dnf install -y \
        protobuf-devel \
        protobuf-c-devel \
        protobuf-lite-devel && \
    dnf clean all

# Clear any inherited entrypoint and start an interactive shell by default
ENTRYPOINT []
CMD ["/bin/bash"]
13 changes: 11 additions & 2 deletions .ci/docker/manywheel/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ case ${GPU_ARCH_TYPE} in
DOCKER_GPU_BUILD_ARG=""
MANY_LINUX_VERSION="s390x"
;;
# CPU-only ppc64le (IBM Power) image: tagged `ppc64le`, with an empty GPU build argument.
# NOTE(review): Dockerfile_ppc64le hard-codes its FROM image, so GPU_IMAGE looks unused
# for this target — confirm against the docker build invocation.
# NOTE(review): MANY_LINUX_VERSION presumably selects Dockerfile_ppc64le — confirm.
cpu-ppc64le)
TARGET=base
DOCKER_TAG=ppc64le
GPU_IMAGE=redhat/ubi9
DOCKER_GPU_BUILD_ARG=""
MANY_LINUX_VERSION="ppc64le"
;;
cuda)
TARGET=cuda_final
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
Expand Down Expand Up @@ -121,8 +128,10 @@ fi
(
set -x

# Only activate this if in CI
if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then


# Skip on s390x and ppc64le hosts; only run when the CI variable is set.
# `[` is an ordinary command, so `&&` cannot appear inside a single `[ ... ]`:
# each comparison needs its own bracket pair, joined with `&&` outside the brackets.
if [ "$(uname -m)" != "s390x" ] && [ "$(uname -m)" != "ppc64le" ] && [ -v CI ]; then

# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/manywheel/build_scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
# the final image after compiling Python
PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel"

if [ "$(uname -m)" != "s390x" ] ; then
# s390x and ppc64le take the libdb-devel branch; every other architecture uses db4-devel.
# Each comparison needs its own `[ ... ]`: writing `&&` inside a single bracket pair
# makes the test fail with a syntax error, which would send every architecture to `else`.
if [ "$(uname -m)" != "s390x" ] && [ "$(uname -m)" != "ppc64le" ] ; then
PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} db4-devel"
else
PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} libdb-devel"
Expand Down
8 changes: 5 additions & 3 deletions .ci/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ fi

# Do not change workspace permissions for ROCm, s390x and ppc64le CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *ppc64le* && -d /var/lib/jenkins/workspace ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
Expand Down Expand Up @@ -275,8 +275,10 @@ else
# XLA test build fails when WERROR=1
# set only when building other architectures
# or building non-XLA tests.
# ppc64le builds fail when WERROR=1
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
"$BUILD_ENVIRONMENT" != *xla* ]]; then
"$BUILD_ENVIRONMENT" != *xla* &&
"$BUILD_ENVIRONMENT" != *ppc64le* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
python -mpip install numpy==2.0.2
Expand Down Expand Up @@ -399,6 +401,6 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
python tools/stats/export_test_times.py
fi
# don't do this for bazel or s390x as they don't use sccache
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *ppc64le* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
print_sccache_stats
fi
63 changes: 63 additions & 0 deletions .github/scripts/ppc64le-ci/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Configuring the builder.

## Install prerequisites.

```
$ sudo apt install podman podman-docker jq
```
## Add services.

```
$ sudo cp self-hosted-builder/*.service /etc/systemd/system/
$ sudo systemctl daemon-reload
```

## Rebuild the image

First, build the ppc64le builder image `docker.io/pytorch/manylinuxppc64le-builder`
using the following commands:

```
$ cd ~
$ git clone https://github.com/pytorch/pytorch
$ cd pytorch
$ git submodule update --init --recursive
$ GPU_ARCH_TYPE=cpu-ppc64le "$(pwd)/.ci/docker/manywheel/build.sh" manylinuxppc64le-builder
$ docker image tag localhost/pytorch/manylinuxppc64le-builder docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le
$ docker image save -o ~/manywheel-ppc64le.tar docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le
```

The next step is to build the `actions-runner` image:

```
## Clone the gaplib repo (https://github.com/anup-kodlekere/gaplib.git) and copy runner-sdk-8.ppc64le.patch from gaplib/build-files into pytorch/.github/scripts/ppc64le-ci/self-hosted-builder

$ cd self-hosted-builder
$ sudo docker build \
--pull \
-f actions-runner.Dockerfile \
--build-arg RUNNERPATCH="runner-sdk-8.ppc64le.patch" \
-t iiilinuxibmcom/actions-runner.<name> \
.
```

Now prepare all necessary files for runner registration:

```
$ sudo mkdir -p /etc/actions-runner/<name>
$ sudo chmod 755 /etc/actions-runner/<name>
$ sudo /bin/cp <github_app_private_key_file> /etc/actions-runner/<name>/key_private.pem
$ sudo echo <github_app_id> | sudo tee /etc/actions-runner/<name>/appid.env
$ sudo echo <github_app_install_id> | sudo tee /etc/actions-runner/<name>/installid.env
$ sudo echo NAME=<worker_name> | sudo tee /etc/actions-runner/<name>/env
$ sudo echo ORG=<github_org> | sudo tee -a /etc/actions-runner/<name>/env
$ cd self-hosted-builder
$ sudo /bin/cp helpers/*.sh /usr/local/bin/
$ sudo chmod 755 /usr/local/bin/app_token.sh /usr/local/bin/gh_token_generator.sh
```

## Autostart the runner.

```
$ sudo systemctl enable --now actions-runner@$NAME
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Self-hosted GitHub Actions runner image for IBM Power (ppc64le).
FROM ubuntu:22.04

# Keep apt from prompting during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Replace the apt sources with the ports.ubuntu.com mirrors for ppc64el.
# printf reuses the format string once per suite, producing one `deb` line each.
RUN printf 'deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports %s main restricted universe multiverse\n' \
        jammy \
        jammy-updates \
        jammy-backports \
        jammy-security \
        > /etc/apt/sources.list

# Refresh the package index (with retries and a short HTTP timeout) and install the
# base tooling; the apt lists are removed in the same layer.
RUN apt-get update \
        -o Acquire::Retries=5 \
        -o Acquire::http::Timeout=10 \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        gnupg-agent \
        iptables \
        jq \
        python3 \
        python3-pip \
        software-properties-common \
        sudo \
        vim \
        zip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Select the legacy iptables / ip6tables backends
RUN update-alternatives --set iptables /usr/sbin/iptables-legacy \
    && update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy


# Podman plus the podman-docker package (Docker compatibility)
RUN apt-get update \
    && apt-get install -y podman podman-docker \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# .NET SDK and the remaining build tools
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        cmake \
        dotnet-sdk-8.0 \
        git \
        libtool \
        m4 \
        make \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*


# Setup user and permissions:
#   - create the `runner` account with a home directory and lock its password,
#   - grant it passwordless sudo through a sudoers drop-in (mode 0440, the
#     conventional permission for sudoers files),
#   - make sure a `podman` group exists and add `runner` to it.
# `groupadd -f` succeeds when the group already exists. The previous
# `groupadd podman || true` sat in the middle of an `&&` chain, so the `|| true`
# also hid failures of every command before it.
RUN useradd -c "Action Runner" -m runner && \
    usermod -L runner && \
    echo "runner ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/runner && \
    chmod 0440 /etc/sudoers.d/runner && \
    groupadd -f podman && \
    usermod -aG podman runner

# Configure Podman cgroup manager.
# printf expands \n in every shell (echo only does so in some), and the build
# already runs as root, so no sudo/tee is needed to write the file.
RUN mkdir -p /etc/containers && \
    printf '[engine]\ncgroup_manager = "cgroupfs"\n' > /etc/containers/containers.conf

# Add and configure GitHub Actions runner
#   RUNNERREPO  - git repository the runner is built from
#   RUNNERPATCH - path (inside the build context) of the patch applied on top of `main`
#   SDK         - .NET SDK major version written into src/global.json; defaults to 8 to
#                 match the dotnet-sdk-8.0 package installed earlier. It was previously
#                 undeclared, which made the sed below write the version as ".0.100".
ARG RUNNERREPO="https://github.com/actions/runner"
ARG RUNNERPATCH
ARG SDK=8

# The patch is a plain local file, so COPY is sufficient (no ADD extraction/fetching)
COPY ${RUNNERPATCH} /tmp/runner.patch

# Clone, patch, build, test and unpack the runner in ONE layer so that removing the
# source tree and the root user's .NET/NuGet caches at the end really keeps them out
# of the image.
RUN git clone -q ${RUNNERREPO} /tmp/runner && \
    cd /tmp/runner && \
    git checkout main -b build && \
    git apply /tmp/runner.patch && \
    # Rewrite the 8.x.yyy SDK version in global.json to ${SDK}.0.100
    sed -i -e "/version/s/8......\"\$/${SDK}.0.100\"/" src/global.json && \
    cd /tmp/runner/src && \
    ./dev.sh layout && \
    ./dev.sh package && \
    ./dev.sh test && \
    mkdir -p /opt/runner && \
    tar -xf /tmp/runner/_package/*.tar.gz -C /opt/runner && \
    chown -R runner:runner /opt/runner && \
    # Smoke test: the unpacked runner must start as the unprivileged user
    su - runner -c "/opt/runner/config.sh --version" && \
    rm -rf /tmp/runner /tmp/runner.patch /root/.dotnet /root/.nuget

# Overlay the contents of fs/ onto the image root, then mark the two wrapper
# scripts as executable.
# NOTE(review): /usr/bin/actions-runner and /usr/bin/entrypoint are presumably
# provided by fs/ — confirm.
COPY fs/ /
RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint

# Everything from here on (including the container's main process) runs as the
# unprivileged runner user
USER runner

# Default working directory: the unpacked runner installation
WORKDIR /opt/runner

# Ship the builder image archive with the runner, owned by the runner user.
# NOTE(review): presumably the archive produced by `docker image save` in the
# README — it must be present in the build context.
COPY --chown=runner:runner manywheel-ppc64le.tar /opt/runner/manywheel-ppc64le.tar

# Define entry point and command: the entrypoint wrapper receives the runner
# launcher as its default argument
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]

31 changes: 31 additions & 0 deletions .github/scripts/ppc64le-ci/self-hosted-builder/[email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Template unit: %i is the instance name and selects both the configuration
# directory /etc/actions-runner/%i and the image iiilinuxibmcom/actions-runner.%i.
[Unit]
Description=Self-Hosted IBM power Github Actions Runner
# Disable start rate limiting so that Restart=always can retry indefinitely
StartLimitIntervalSec=0

[Service]
Type=simple
Restart=always

# Cleanup stale containers (the leading "-" ignores failure, e.g. when none exists)
ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i
# Produce ghtoken.env from the app id, installation id and private key, then
# derive ghtoken.txt from it; ghtoken.txt is mounted into the container below.
ExecStartPre=-/usr/local/bin/gh_token_generator.sh /etc/actions-runner/%i/appid.env /etc/actions-runner/%i/installid.env /etc/actions-runner/%i/key_private.pem /etc/actions-runner/%i/ghtoken.env
ExecStartPre=-/usr/local/bin/gh_cat_token.sh /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt

ExecStart=/usr/bin/docker run \
--env-file=/etc/actions-runner/%i/env \
--volume /etc/actions-runner/%i/ghtoken.txt:/run/runner_secret \
--init \
--interactive \
--name=actions-runner.%i \
--rm \
--privileged \
--log-driver=journald \
iiilinuxibmcom/actions-runner.%i

# Stop sequence: interrupt the processes in the container, wait for it to exit,
# remove it. Because the container is started with --rm it may already be gone when
# these run, so each docker command is prefixed with "-": a failing ExecStop command
# would otherwise prevent the following ones, including the token cleanup, from
# running.
ExecStop=-/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1"
ExecStop=-/bin/sh -c "docker wait actions-runner.%i"
ExecStop=-/bin/sh -c "docker rm actions-runner.%i"

# Always remove the generated token files
ExecStop=/usr/bin/env rm -f /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt

[Install]
WantedBy=multi-user.target
Loading