Skip to content

Commit 728043e

Browse files
authored
Test torchprime from PyTorch/XLA (#9152)
1 parent 37ed591 commit 728043e

File tree

5 files changed

+254
-0
lines changed

5 files changed

+254
-0
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
name: torchprime E2E tests
description: |
  This workflow builds a docker image with the PyTorch/XLA wheels and then
  triggers a torchprime (https://github.com/AI-Hypercomputer/torchprime)
  E2E test using that docker image. It is intended to catch performance
  regressions and API breaking changes in PyTorch/XLA pull requests.

# Reusable workflow: it only runs when another workflow invokes it.
on:
  workflow_call:
    inputs:
      # Upper bound on the job's wall-clock time.
      timeout-minutes:
        description: Timeout in minutes for the job run
        type: number
        required: false
        default: 80
      # Passed as a string; the steps compare it against 'true'.
      has_code_changes:
        description: Whether to run full workflow or not
        type: string
        required: false
        default: "true"
    secrets:
      # Token of the `torchxlabot2` user, which has access to the torchprime
      # repo. It is what triggers the torchprime E2E test workflow.
      # Manage it under "Settings > Secrets and variables > Actions" in the
      # repo.
      TORCH_XLA_BOT_TOKEN:
        required: true
      GCLOUD_SERVICE_KEY:
        required: true
29+
jobs:
  # Builds a docker image from the `torch-xla-wheels` artifact, pushes it under
  # a random tag, and then triggers the torchprime E2E workflow with the image
  # URL as payload.
  #
  # Every step carries the same `has_code_changes` guard, so when the caller
  # passes anything other than 'true' the job still starts but runs no steps.
  torchprime-e2e-test:
    name: Run torchprime E2E tests
    timeout-minutes: ${{ inputs.timeout-minutes }}
    runs-on: ubuntu-22.04
    steps:
      - name: Use Docker in rootless mode
        if: inputs.has_code_changes == 'true'
        # NOTE(review): the reference after `ScribeMD/` was mangled into
        # "[email protected]" in the scraped page. The action name follows
        # from the step name; confirm the pinned version tag against the repo.
        uses: ScribeMD/rootless-docker@0.2.2

      - name: Add user to docker group
        if: inputs.has_code_changes == 'true'
        shell: bash
        run: |
          sudo usermod -aG docker "$USER"
          newgrp docker

      # Googlers: if this fails, follow go/ptxla-sa-key to debug.
      - uses: google-github-actions/auth@v2
        if: inputs.has_code_changes == 'true'
        with:
          credentials_json: '${{ secrets.GCLOUD_SERVICE_KEY }}'

      - uses: google-github-actions/setup-gcloud@v2
        if: inputs.has_code_changes == 'true'
        with:
          version: '>= 363.0.0'
          install_components: 'beta,gke-gcloud-auth-plugin'

      - name: Verify GCP setup
        if: inputs.has_code_changes == 'true'
        shell: bash
        run: gcloud info

      - name: Authenticate Docker
        if: inputs.has_code_changes == 'true'
        shell: bash
        run: gcloud auth configure-docker --quiet

      - name: Activate SA credentials
        if: inputs.has_code_changes == 'true'
        shell: bash
        # Quoted so a credentials path containing spaces stays one argument.
        run: gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS"

      # Only the `infra` directory is needed: it holds the Dockerfile and the
      # publish script used below.
      - name: Checkout infra
        if: inputs.has_code_changes == 'true'
        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            infra
          fetch-depth: 1
          path: pytorch-xla

      # Build a docker image for torchprime E2E test.
      # First download the torch-xla-wheels.
      - name: Fetch wheels
        if: inputs.has_code_changes == 'true'
        uses: actions/download-artifact@v4
        with:
          name: torch-xla-wheels
          path: /tmp/wheels/

      # Generate a 16-character random ID for the docker tag
      # (8 random bytes rendered as hex).
      - name: Generate random docker tag
        if: inputs.has_code_changes == 'true'
        id: random_tag
        shell: bash
        run: |
          echo "random_id=$(openssl rand -hex 8)" >> "$GITHUB_OUTPUT"

      # Then run docker to install the wheels and push the resulting docker
      # image. The script is sourced (`.`), so it runs in this step's shell.
      - name: Build and push docker image
        if: inputs.has_code_changes == 'true'
        id: build_docker
        shell: bash
        working-directory: pytorch-xla
        env:
          DEFAULT_CONTEXT_PATH: /tmp/wheels
          DOCKER_IMAGE_NAME: for-torchprime-ci
          DOCKER_IMAGE_TAG: ${{ steps.random_tag.outputs.random_id }}
          DOCKER_PROJECT: tpu-pytorch
        run: |
          . ./infra/ansible/publish_torchprime_e2e_test_docker.sh
          echo "docker_url=gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" >> "$GITHUB_OUTPUT"

      # Trigger torchprime E2E test workflow.
      # (Googlers only) in case of infra failure, refer to go/ptxla-torchprime-trigger
      # Refer to the same doc on the retention policy of the docker images.
      #
      # NOTE(review): the reference after `convictional/` was mangled into
      # "[email protected]" in the scraped page. The action name is inferred
      # from the inputs below; confirm the pinned version tag against the repo.
      - uses: convictional/trigger-workflow-and-wait@v1.6.5
        if: inputs.has_code_changes == 'true'
        with:
          owner: AI-Hypercomputer
          repo: torchprime
          github_token: ${{ secrets.TORCH_XLA_BOT_TOKEN }}
          workflow_file_name: e2e_test.yml
          wait_interval: 60
          ref: main
          client_payload: '{"docker_url": "${{ steps.build_docker.outputs.docker_url }}"}'

.github/workflows/_tpu_ci.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ name: TPU Integration Test
22
on:
33
workflow_call:
44
inputs:
5+
torch-commit:
6+
required: false
7+
type: string
8+
description: torch-commit
59
timeout-minutes:
610
required: false
711
type: number

.github/workflows/build_and_test.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,16 @@ jobs:
7676
has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
7777
if: github.event_name == 'push' || github.event_name == 'pull_request'
7878

79+
# Calls the reusable torchprime workflow once the wheels are built and the
# code-change check has produced its result. Runs on push and pull_request
# events only.
test-torchprime:
  name: "torchprime tests"
  needs:
    - build-torch-xla
    - check_code_changes
  if: github.event_name == 'push' || github.event_name == 'pull_request'
  uses: ./.github/workflows/_torchprime_ci.yml
  with:
    has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
    timeout-minutes: 100
  # Forward the caller's secrets to the reusable workflow.
  secrets: inherit
88+
7989
push-docs:
8090
name: "Build docs"
8191
uses: ./.github/workflows/_docs.yml
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# syntax=docker/dockerfile:1.4
2+
#
3+
# Dockerfile for building a PyTorch/XLA docker image to be used in torchprime
4+
# E2E tests (https://github.com/AI-Hypercomputer/torchprime/actions/workflows/e2e_test.yml)
5+
# triggered from PyTorch/XLA PRs.
6+
#
7+
# This Dockerfile is not used during nightly builds of PyTorch/XLA.
8+
#
9+
# This Dockerfile is also not used by torchprime when a PR is made on torchprime.
10+
# torchprime pins a PyTorch/XLA docker image for use in torchprime PR tests. However,
11+
# when running torchprime tests on PyTorch/XLA PRs, we would override that docker image
12+
# with one built by this Dockerfile.
13+
#
14+
# This Dockerfile is a simplified version of `infra/ansible/Dockerfile`. The latter is meant
15+
# to be run from Cloud Build during nightly triggers. That file would build PyTorch and
16+
# PyTorch/XLA from scratch, and then install those wheels. In contrast, this Dockerfile expects the
17+
# PyTorch and PyTorch/XLA wheels to be built already, which is the case in PR tests.
18+
# The `build-torch-xla` job in the "Build and test" action will have already built the
19+
# wheels, which the `test-torchprime` job will download and make available to this docker build.
20+
#
21+
# The docker image will be pushed to `gcr.io/tpu-pytorch/for-torchprime-ci:${random_id}`. The
22+
# ID is unique for each run of the workflow to avoid interference between concurrent runs.
23+
#
24+
# (Googlers only) Refer to go/ptxla-torchprime-trigger for information on retention policy of
25+
# the docker images.
26+
# Base image selectors. Both can be overridden with `--build-arg`.
ARG python_version=3.10
ARG debian_version=bullseye

FROM python:${python_version}-${debian_version} AS release

# Install PyTorch wheels. We expect to install three wheels. Example:
# - torch-2.8.0-cp310-cp310-linux_x86_64.whl
# - torch_xla-2.8.0+gitd4b0a48-cp310-cp310-linux_x86_64.whl
# - torchvision-0.22.0a0+966da7e-cp310-cp310-linux_x86_64.whl
# The precise names will depend on the git commit hash used at build time.
WORKDIR /tmp/wheels
COPY ./*.whl ./

RUN echo "Installing the following wheels" && ls *.whl
# --no-cache-dir keeps pip's download/build cache out of the image layer.
RUN pip install --no-cache-dir *.whl

# Install the dependencies including libtpu.
WORKDIR /ansible
RUN pip install --no-cache-dir ansible
# `ansible` is a named build context supplied with `--build-context`.
COPY --from=ansible . /ansible

# Extra playbook variables, passed through verbatim as a second `-e` argument.
ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps"

WORKDIR /

# NOTE(review): this removes the files from the final filesystem, but the
# earlier COPY layers still contain them, so the image does not get smaller.
RUN rm -rf /ansible /tmp/wheels
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/bin/bash
2+
3+
# This script builds and pushes a docker image to be used for torchprime E2E tests.
4+
#
5+
# torchprime is a reference implementation of models using PyTorch/XLA:
6+
# https://github.com/AI-Hypercomputer/torchprime.
7+
#
8+
# The purpose of building a docker image here is to trigger torchprime E2E tests
9+
# from PyTorch/XLA PRs and post-submits. The reason for running torchprime tests
10+
# on PyTorch/XLA changes is to ensure that torchprime models are not broken.
11+
# See https://github.com/AI-Hypercomputer/torchprime/issues/161 for the detailed
12+
# motivation.
13+
#
14+
# The docker image will be pushed to
15+
# `gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}`. By default, the
16+
# `torchprime-e2e-test` job in the `.github/workflows/_torchprime_ci.yml` workflow will
17+
# configure the env vars such that the image is pushed to
18+
# `gcr.io/tpu-pytorch/for-torchprime-ci:${random_id}`. The ID is unique for each run
19+
# of the workflow to avoid interference between concurrent runs.
20+
#
21+
# (Googlers only) Refer to go/ptxla-torchprime-trigger for information on retention policy of
22+
# the docker images.
23+
24+
set -ex

# Abort with a message naming the first required variable that is empty or
# unset. `${!name}` is bash indirect expansion: the value of the variable
# whose name is stored in `name`.
for required_var in \
  DEFAULT_CONTEXT_PATH \
  DOCKER_IMAGE_NAME \
  DOCKER_IMAGE_TAG \
  DOCKER_PROJECT; do
  if [ -z "${!required_var}" ]; then
    echo "ERROR: ${required_var} is not set"
    exit 1
  fi
done
# Do not leave the loop variable behind in a shell that sources this script.
unset required_var
43+
44+
# Full image reference and the Dockerfile used to build it. Both are exported.
DOCKER_URL="gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"
DOCKERFILE_PATH="infra/ansible/ptxla_docker_for_torchprime.Dockerfile"
export DOCKER_URL DOCKERFILE_PATH

echo "Building and pushing image: ${DOCKER_URL}"

# Variables consumed by `ansible-playbook` inside the docker build.
#
# Their definitions live in `infra/ansible/playbook.yaml` and
# `infra/ansible/config/vars.yaml`.
ANSIBLE_VARS_JSON=$(cat << EOM
{
  "arch": "amd64",
  "accelerator": "tpu",
  "bundle_libtpu": "0",
  "git_versioned_xla_build": true,
  "nightly_release": true
}
EOM
)
# Strip every newline and space so the JSON fits in a single --build-arg.
ANSIBLE_VARS_COMPACT=$(printf '%s\n' "$ANSIBLE_VARS_JSON" | tr -d '\n ')
63+
64+
# Collect the build options in an array so each one sits on its own line
# without backslash continuations.
docker_build_args=(
  --tag "${DOCKER_URL}"
  --build-context "ansible=infra/ansible"
  --file "${DOCKERFILE_PATH}"
  --build-arg "ansible_vars=${ANSIBLE_VARS_COMPACT}"
  --build-arg "python_version=3.10"
  --build-arg "debian_version=bullseye"
)

# The main build context is DEFAULT_CONTEXT_PATH; `ansible` is the extra
# named context.
docker build "${docker_build_args[@]}" "${DEFAULT_CONTEXT_PATH}"
unset docker_build_args

docker push "${DOCKER_URL}"

echo "Successfully pushed image: ${DOCKER_URL}"

0 commit comments

Comments
 (0)