Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions .ci/docker/manywheel/Dockerfile_ppc64le
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Build image for PyTorch on ppc64le, based on Red Hat UBI 9.5.
FROM registry.access.redhat.com/ubi9/ubi:9.5

# Install the build toolchain. EPEL is enabled first, presumably because some
# of the packages below are not shipped in the UBI repositories (TODO confirm
# which ones). python3-pip is listed explicitly because /usr/bin/pip3 is
# symlinked to /usr/bin/pip further down.
RUN dnf install -y \
        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
    dnf install -y \
        cargo \
        cmake \
        gcc-toolset-13 \
        git \
        ninja-build \
        python3 \
        python3-devel \
        python3-pip \
        rust \
        zip && \
    dnf clean all

# Put gcc-toolset-13 on the default search paths so every later RUN step (and
# the final container) uses it without sourcing /opt/rh/gcc-toolset-13/enable.
ENV PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" \
    MANPATH="/opt/rh/gcc-toolset-13/root/usr/share/man" \
    INFOPATH="/opt/rh/gcc-toolset-13/root/usr/share/info" \
    PCP_DIR="/opt/rh/gcc-toolset-13/root" \
    LD_LIBRARY_PATH="/opt/rh/gcc-toolset-13/root/usr/lib64:/opt/rh/gcc-toolset-13/root/usr/lib"

# Expose the system Python 3 and pip under the unversioned names used by the
# build script.
RUN ln -sf /usr/bin/python3 /usr/bin/python && \
    ln -sf /usr/bin/pip3 /usr/bin/pip

# Install Python build dependencies before copying the full source tree so
# this layer stays cached until requirements.txt changes.
# --no-cache-dir keeps pip's download cache out of the image layers.
COPY requirements.txt .
RUN pip install --no-cache-dir wheel setuptools pyyaml typing_extensions expecttest

# Alternative kept for reference (not needed while the ENV block above is set):
#RUN source /opt/rh/gcc-toolset-13/enable && pip install -r requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the PyTorch source code into the container
COPY . /workspace/pytorch

WORKDIR /workspace/pytorch

# Ensure submodules are initialized and updated (requires .git to be part of
# the build context).
RUN git submodule update --init --recursive

# Install the build script: strip Windows (CRLF) line endings, then make it
# executable. sed is used directly because dos2unix is not installed in this
# image.
COPY .github/scripts/ppc64le-build.sh /ppc64le-build.sh
RUN sed -i 's/\r$//' /ppc64le-build.sh && \
    chmod +x /ppc64le-build.sh


34 changes: 34 additions & 0 deletions .github/scripts/ppc64le-build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
#
# Builds the PyTorch wheel from /workspace/pytorch, installs it and runs
# test/test_utils.py.
#
# Exit codes:
#   0 - build, install and tests all succeeded
#   1 - the source directory is missing, or the build/install failed
#   2 - the install succeeded but the tests failed

# Environment variables
PACKAGE_NAME=pytorch
# NOTE(review): PACKAGE_VERSION is not referenced anywhere else in this script.
PACKAGE_VERSION=${PACKAGE_VERSION:-v2.6.0}

# Stop if the source tree is missing; otherwise the `rm -rf` below would run
# in whatever directory the script happened to be started from.
if ! cd "/workspace/$PACKAGE_NAME"; then
    echo "Error: cannot change directory to /workspace/$PACKAGE_NAME" >&2
    exit 1
fi

# Clean up old artifacts
rm -rf build/ dist/ torch.egg-info/

# Build and install PyTorch wheel
if ! (MAX_JOBS=$(nproc) python setup.py bdist_wheel && pip install dist/*.whl); then
    echo "------------------$PACKAGE_NAME:install_fails-------------------------------------"
    exit 1
fi

# register PrivateUse1HooksInterface
# NOTE(review): the exit status of these four runs is not checked; presumably
# they only need to execute before the full test run below -- confirm.
python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_bfloat16
python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float16
python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float32
python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float64

# Run the full test file from the parent directory.
cd ..
pip install pytest pytest-xdist

if ! pytest "$PACKAGE_NAME/test/test_utils.py"; then
    echo "------------------$PACKAGE_NAME:install_success_but_test_fails---------------------"
    exit 2
else
    echo "------------------$PACKAGE_NAME:install_and_test_both_success-------------------------"
    exit 0
fi
50 changes: 50 additions & 0 deletions .github/scripts/ppc64le-ci/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Configuring the builder.

## Install prerequisites.

```
Install Docker
```
## Clone pytorch repository

## Add services.

```
$ sudo cp self-hosted-builder/*.service /etc/systemd/system/
$ sudo systemctl daemon-reload
```
The next step is to build the `actions-runner` image:

```
# Clone the gaplib repo (https://github.com/anup-kodlekere/gaplib.git) and copy the runner-sdk-8.ppc64le.patch file from gaplib/build-files into pytorch/.github/scripts/ppc64le-ci/self-hosted-builder

$ cd self-hosted-builder
$ sudo docker build \
--pull \
-f actions-runner.Dockerfile \
--build-arg RUNNERPATCH="runner-sdk-8.ppc64le.patch" \
-t iiilinuxibmcom/actions-runner.<name> \
.
```

Now prepare all necessary files for runner registration:

```
$ sudo mkdir -p /etc/actions-runner/<name>
$ sudo chmod 755 /etc/actions-runner/<name>
$ sudo /bin/cp <github_app_private_key_file> /etc/actions-runner/<name>/key_private.pem
$ sudo echo <github_app_id> | sudo tee /etc/actions-runner/<name>/appid.env
$ sudo echo <github_app_install_id> | sudo tee /etc/actions-runner/<name>/installid.env
$ sudo echo NAME=<worker_name> | sudo tee /etc/actions-runner/<name>/env
$ sudo echo OWNER=<github_owner> | sudo tee -a /etc/actions-runner/<name>/env
$ sudo echo REPO=pytorch | sudo tee -a /etc/actions-runner/<name>/env
$ cd self-hosted-builder
$ sudo /bin/cp helpers/*.sh /usr/local/bin/
$ sudo chmod 755 /usr/local/bin/app_token.sh /usr/local/bin/gh_token_generator.sh
```

## Autostart the runner.

```
$ sudo systemctl enable --now actions-runner@$NAME
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Self-Hosted IBM Power Github Actions Runner.
FROM ubuntu:22.04

# Keep apt from prompting during package installation.
ENV DEBIAN_FRONTEND=noninteractive

# Replace the default apt sources with ports.ubuntu.com entries for ppc64el.
RUN printf '%s\n' \
        "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy main restricted universe multiverse" \
        "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-updates main restricted universe multiverse" \
        "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-backports main restricted universe multiverse" \
        "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-security main restricted universe multiverse" \
        > /etc/apt/sources.list

# Refresh the package index (with retries and a short HTTP timeout) and
# install the base tooling.
RUN apt-get update -o Acquire::Retries=5 -o Acquire::http::Timeout="10" && \
    apt-get -y install --no-install-recommends \
        build-essential \
        curl \
        sudo \
        jq \
        gnupg-agent \
        iptables \
        ca-certificates \
        software-properties-common \
        vim \
        zip \
        python3 \
        python3-pip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Select the legacy iptables backend for both IPv4 and IPv6.
RUN update-alternatives --set iptables /usr/sbin/iptables-legacy && \
    update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy

# Register Docker's apt repository (key + source list) and install Docker.
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg && \
    echo "deb [arch=ppc64el signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list && \
    apt-get update && \
    apt-get install -y docker-ce docker-ce-cli containerd.io && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the .NET SDK and the native build tools used to build the runner.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        wget \
        git \
        dotnet-sdk-8.0 \
        cmake \
        make \
        automake \
        autoconf \
        m4 \
        libtool && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


# Setup user and permissions: create the `runner` user with a locked password
# and passwordless sudo, and add it to the `docker` group.
#
# The group creation is wrapped in its own subshell. In a flat
# `a && b && groupadd docker || true && c` chain, `|| true` would also swallow
# a failure of any command before it (useradd, usermod, the sudoers write).
# The group is only created when it does not already exist, so a real
# groupadd error still fails the build.
#
# NOTE(review): the last line only has an effect if /var/run/docker.sock exists
# while the image is being built; presumably it is a no-op in a normal build
# and the socket is bind-mounted at run time -- confirm.
RUN useradd -c "Action Runner" -m runner && \
    usermod -L runner && \
    echo "runner ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/runner && \
    (getent group docker > /dev/null || groupadd docker) && \
    usermod -aG docker runner && \
    (test -S /var/run/docker.sock && chmod 660 /var/run/docker.sock && chgrp docker /var/run/docker.sock || true)


# Add and configure GitHub Actions runner
#
# Build arguments:
#   RUNNERREPO  - git URL of the actions/runner sources to build.
#   RUNNERPATCH - patch applied on top of the runner sources (required; it has
#                 no default).
#   SDK         - major .NET SDK version written into src/global.json. It was
#                 previously undeclared, so the sed below expanded it to an
#                 empty string and produced the invalid version ".0.100".
#                 Defaults to 8 to match the dotnet-sdk-8.0 package.
ARG RUNNERREPO="https://github.com/actions/runner"
ARG RUNNERPATCH
ARG SDK=8

ADD ${RUNNERPATCH} /tmp/runner.patch

# Clone the runner, apply the patch on a local `build` branch and pin the SDK
# version in global.json to ${SDK}.0.100.
RUN git clone -q ${RUNNERREPO} /tmp/runner && \
    cd /tmp/runner && \
    git checkout main -b build && \
    git apply /tmp/runner.patch && \
    sed -i'' -e /version/s/8......\"$/${SDK}.0.100\"/ src/global.json

# Build, package and test the runner, then drop the per-user .NET/NuGet caches.
RUN cd /tmp/runner/src && \
    ./dev.sh layout && \
    ./dev.sh package && \
    ./dev.sh test && \
    rm -rf /root/.dotnet /root/.nuget

# Unpack the packaged runner into /opt/runner, hand it to the `runner` user and
# check that it starts by printing its version.
RUN mkdir -p /opt/runner && \
    tar -xf /tmp/runner/_package/*.tar.gz -C /opt/runner && \
    chown -R runner:runner /opt/runner && \
    su - runner -c "/opt/runner/config.sh --version"

# Remove the build tree and the patch from the final filesystem.
RUN rm -rf /tmp/runner /tmp/runner.patch

# Copy custom scripts and set permissions.
# 755 (not 777): the scripts must be readable and executable by the `runner`
# user, but nothing needs them to be world-writable.
COPY fs/ /
RUN chmod 755 /usr/bin/actions-runner /usr/bin/entrypoint

# Switch to the runner user
USER runner

# Set working directory
WORKDIR /opt/runner

# Define entry point and command: the entrypoint wrapper receives the command
# (by default /usr/bin/actions-runner) as its arguments.
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]

32 changes: 32 additions & 0 deletions .github/scripts/ppc64le-ci/self-hosted-builder/[email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# systemd template unit that runs one GitHub Actions runner container per
# instance. %i expands to the instance name (the part after "@" in the unit
# name); it selects the config directory /etc/actions-runner/%i, the container
# name and the image tag.
[Unit]
Description=Self-Hosted IBM power Github Actions Runner
# 0 disables start rate limiting, so Restart=always can keep restarting.
StartLimitIntervalSec=0

[Service]
Type=simple
Restart=always

# Cleanup stale containers
# The leading "-" makes systemd ignore a failure of each pre-start command.
ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i
# Presumably generates ghtoken.env from the app id, installation id and private
# key, and then derives ghtoken.txt from it -- the helper scripts are not shown
# here, confirm against them.
ExecStartPre=-/usr/local/bin/gh_token_generator.sh /etc/actions-runner/%i/appid.env /etc/actions-runner/%i/installid.env /etc/actions-runner/%i/key_private.pem /etc/actions-runner/%i/ghtoken.env
ExecStartPre=-/usr/local/bin/gh_cat_token.sh /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt

# Run the runner container in the foreground. ghtoken.txt is mounted as
# /run/runner_secret, the host Docker socket is shared with the container, and
# the container is removed when it exits (--rm).
ExecStart=/usr/bin/docker run \
--env-file=/etc/actions-runner/%i/env \
--volume /etc/actions-runner/%i/ghtoken.txt:/run/runner_secret \
--volume /var/run/docker.sock:/var/run/docker.sock \
--init \
--interactive \
--name=actions-runner.%i \
--rm \
--privileged \
--log-driver=journald \
iiilinuxibmcom/actions-runner.%i
# Stop sequence: send SIGINT to every process in the container, wait for the
# container to exit, then remove it.
ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1"
ExecStop=/bin/sh -c "docker wait actions-runner.%i"
ExecStop=/bin/sh -c "docker rm actions-runner.%i"

# Delete the generated token files from the host.
ExecStop=/usr/bin/env rm -f /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env bash

# Abort on the first failing command and on any use of an unset variable.
set -eu

# File that temporarily holds the registration-token API response.
token_file=registration-token.json

# Always run cleanup when the script exits, whether it succeeded or failed.
trap cleanup EXIT

# Function to clean up and unregister the runner.
#
# Installed as the EXIT trap, so every step is best-effort: a failure here must
# not abort the trap or change how the script exits.
#
# Reads:   token_file, OWNER, REPO, NAME and the access token stored in
#          /run/runner_secret.
# Effects: deletes the temporary JSON files and, if a runner named NAME is
#          still registered for OWNER/REPO, deletes it through the GitHub API.
cleanup() {
    echo "Cleaning up temporary files..."
    rm -f "$token_file" "runner-id.json"

    echo "Unregistering the runner from GitHub..."
    # The secret may be missing (the script exits early in that case); skip the
    # API calls instead of failing inside the trap.
    if [ ! -r /run/runner_secret ]; then
        echo "Warning: /run/runner_secret is not readable; skipping unregistration."
        return 0
    fi
    ACCESS_TOKEN="$(cat /run/runner_secret)"

    # Look up the runner id by name.
    #  - per_page=100 requests the largest page instead of the default one.
    #  - NAME is passed with --arg instead of being spliced into the jq program.
    #  - `.runners[]?` yields nothing (rather than an error) when the response
    #    has no runner list, e.g. for an API error body.
    runner_id=$(curl -s \
        -H "Accept: application/vnd.github.v3+json" \
        -H "Authorization: Bearer ${ACCESS_TOKEN}" \
        "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners?per_page=100" | \
        jq --raw-output --arg name "${NAME}" '.runners[]? | select(.name == $name) | .id') || runner_id=""

    if [ -n "$runner_id" ]; then
        # --fail turns an HTTP error into a non-zero exit status so success is
        # only reported when the DELETE was actually accepted.
        if curl -s --fail \
            -X DELETE \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${ACCESS_TOKEN}" \
            "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners/$runner_id"; then
            echo "Runner unregistered successfully."
        else
            echo "Warning: Failed to unregister runner ${NAME} (id $runner_id)."
        fi
    else
        echo "Warning: Runner ID for ${NAME} not found. It may already be removed."
    fi

    unset ACCESS_TOKEN runner_id
}

# Fetch GitHub access token
if [ ! -f /run/runner_secret ]; then
    echo "Error: Access token file not found at /run/runner_secret."
    exit 1
fi

ACCESS_TOKEN="$(cat /run/runner_secret)"

# Generate registration token.
# --fail makes curl exit non-zero on an HTTP error (e.g. an expired access
# token), so the script stops here instead of parsing an error body below.
curl \
    --fail --silent --show-error \
    -X POST \
    -H "Accept: application/vnd.github.v3+json" \
    -H "Authorization: Bearer ${ACCESS_TOKEN}" \
    "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners/registration-token" \
    -o "$token_file"

unset ACCESS_TOKEN

# register runner as ephemeral runner
# it does one job, stops and unregisters
# `// empty` turns a missing or null .token into an empty string rather than
# the literal text "null", which would otherwise be passed to config.sh.
registration_token=$(jq --raw-output '.token // empty' "$token_file")
if [ -z "$registration_token" ]; then
    echo "Error: No registration token found in the GitHub API response."
    exit 1
fi

./config.sh \
    --unattended \
    --ephemeral \
    --url "https://github.com/${OWNER}/${REPO}" \
    --token "${registration_token}" \
    --name "${NAME}" \
    --no-default-labels \
    --labels self-hosted,linux.ppc64le

# The token is no longer needed once the runner is configured.
unset registration_token
rm -f "$token_file"

# Run one job.
./run.sh

echo "Ephemeral runner workflow completed."
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

#
# Container entrypoint that waits for all spawned processes.
#
# Usage: entrypoint [command [args...]]
#   Runs the given command, or an interactive `bash` when no arguments are
#   passed, and exits with that command's exit status -- but only after every
#   process that inherited file descriptor 9 from it has exited as well.
#

set -e -u

# Create a FIFO and start reading from its read end.
# The temporary directory holding the FIFO is removed when this script exits.
tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
trap 'rm -r "$tempdir"' EXIT
done="$tempdir/pipe"
mkfifo "$done"
# Background reader; its PID is kept so it can be waited for below.
cat "$done" & waiter=$!

# Start the workload. Its descendants will inherit the FIFO's write end.
# The write end is opened as file descriptor 9 of the workload. `|| status=$?`
# records a non-zero exit status without tripping `set -e`.
status=0
if [ "$#" -eq 0 ]; then
bash 9>"$done" || status=$?
else
"$@" 9>"$done" || status=$?
fi

# When the workload and all of its descendants exit, the FIFO's write end will
# be closed and `cat "$done"` will exit. Wait until it happens. This is needed
# in order to handle SelfUpdater, which the workload may start in background
# before exiting.
wait "$waiter"

# Propagate the workload's exit status to the caller.
exit "$status"
Loading