Test torchprime from PyTorch/XLA (#9152)

tengyifei · web-flow · commit 728043e66a62 · 2025-06-10T03:44:50.000Z
diff --git a/.github/workflows/_torchprime_ci.yml b/.github/workflows/_torchprime_ci.yml
@@ -0,0 +1,115 @@
+name: torchprime E2E tests
+description: |
+  This workflow builds a docker image with the PyTorch/XLA wheels and then
+  triggers a torchprime (https://github.com/AI-Hypercomputer/torchprime)
+  E2E test using that docker image. It is intended to catch performance
+  regressions and API breaking changes in PyTorch/XLA pull requests.
+on:
+  workflow_call:
+    inputs:
+      timeout-minutes:
+        required: false
+        type: number
+        description: Timeout in minutes for the job run
+        default: 80
+      has_code_changes:
+        required: false
+        type: string
+        description: Whether to run full workflow or not
+        default: 'true'
+    secrets:
+      # This is a token for the `torchxlabot2` user, which has access to the torchprime repo.
+      # It is used to trigger the torchprime E2E test workflow.
+      # The token should be managed in the "Settings > Secrets and variables > Actions"
+      # section of the repo.
+      TORCH_XLA_BOT_TOKEN:
+        required: true
+      GCLOUD_SERVICE_KEY:
+        required: true
+jobs:
+  torchprime-e2e-test:
+    name: Run torchprime E2E tests
+    timeout-minutes: ${{ inputs.timeout-minutes }}
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Use Docker in rootless mode
+        if: inputs.has_code_changes == 'true'
+        uses: ScribeMD/rootless-docker@0.2.2
+      - name: Add user to docker group
+        if: inputs.has_code_changes == 'true'
+        run: |
+          sudo usermod -aG docker "$USER"
+          newgrp docker
+        shell: bash
+      # Googlers: if this fails, follow go/ptxla-sa-key to debug.
+      - uses: google-github-actions/auth@v2
+        if: inputs.has_code_changes == 'true'
+        with:
+          credentials_json: '${{ secrets.GCLOUD_SERVICE_KEY }}'
+      - uses: google-github-actions/setup-gcloud@v2
+        if: inputs.has_code_changes == 'true'
+        with:
+          version: '>= 363.0.0'
+          install_components: 'beta,gke-gcloud-auth-plugin'
+      - name: Verify GCP setup
+        if: inputs.has_code_changes == 'true'
+        run: gcloud info
+        shell: bash
+      - name: Authenticate Docker
+        if: inputs.has_code_changes == 'true'
+        run: gcloud auth configure-docker --quiet
+        shell: bash
+      - name: Activate SA credentials
+        if: inputs.has_code_changes == 'true'
+        run: gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
+        shell: bash
+      - name: Checkout infra
+        if: inputs.has_code_changes == 'true'
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            infra
+          fetch-depth: 1
+          path: pytorch-xla
+      # Build a docker image for torchprime E2E test
+      # First download the torch-xla-wheels
+      - name: Fetch wheels
+        if: inputs.has_code_changes == 'true'
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /tmp/wheels/
+      # Generate a 16-character random ID for the docker tag
+      - name: Generate random docker tag
+        if: inputs.has_code_changes == 'true'
+        id: random_tag
+        shell: bash
+        run: |
+          echo "random_id=$(openssl rand -hex 8)" >> "$GITHUB_OUTPUT"
+      # Build and push the image; the script is sourced so the DOCKER_URL it exports is visible below.
+      - name: Build and push docker image
+        if: inputs.has_code_changes == 'true'
+        id: build_docker
+        shell: bash
+        working-directory: pytorch-xla
+        run: |
+          . ./infra/ansible/publish_torchprime_e2e_test_docker.sh
+          echo "docker_url=${DOCKER_URL:?}" >> "$GITHUB_OUTPUT"
+        env:
+          DEFAULT_CONTEXT_PATH: /tmp/wheels
+          DOCKER_IMAGE_NAME: for-torchprime-ci
+          DOCKER_IMAGE_TAG: ${{ steps.random_tag.outputs.random_id }}
+          DOCKER_PROJECT: tpu-pytorch
+      # Trigger the torchprime E2E test workflow.
+      # (Googlers only) In case of infra failure, refer to go/ptxla-torchprime-trigger.
+      # Refer to the same doc for the retention policy of the docker images.
+      - uses: convictional/trigger-workflow-and-wait@v1.6.5
+        if: inputs.has_code_changes == 'true'
+        with:
+          owner: AI-Hypercomputer
+          repo: torchprime
+          github_token: ${{ secrets.TORCH_XLA_BOT_TOKEN }}
+          workflow_file_name: e2e_test.yml
+          wait_interval: 60
+          ref: main
+          client_payload: '{"docker_url": "${{ steps.build_docker.outputs.docker_url }}"}'
diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml
@@ -2,6 +2,10 @@ name: TPU Integration Test
 on:
   workflow_call:
     inputs:
+      torch-commit:
+        required: false
+        type: string
+        description: torch-commit
       timeout-minutes:
         required: false
         type: number
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -76,6 +76,16 @@ jobs:
       has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
     if: github.event_name == 'push' || github.event_name == 'pull_request'
 
+  test-torchprime:
+    name: "torchprime tests"
+    uses: ./.github/workflows/_torchprime_ci.yml
+    needs: [build-torch-xla, check_code_changes]
+    with:
+      timeout-minutes: 100  # overrides the called workflow's default of 80
+      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
+    if: github.event_name == 'push' || github.event_name == 'pull_request'
+    secrets: inherit  # the called workflow requires TORCH_XLA_BOT_TOKEN and GCLOUD_SERVICE_KEY
+
   push-docs:
     name: "Build docs"
     uses: ./.github/workflows/_docs.yml
diff --git a/infra/ansible/ptxla_docker_for_torchprime.Dockerfile b/infra/ansible/ptxla_docker_for_torchprime.Dockerfile
@@ -0,0 +1,52 @@
+# syntax=docker/dockerfile:1.4
+#
+# Dockerfile for building a PyTorch/XLA docker image to be used in torchprime
+# E2E tests (https://github.com/AI-Hypercomputer/torchprime/actions/workflows/e2e_test.yml)
+# triggered from PyTorch/XLA PRs.
+#
+# This Dockerfile is not used during nightly builds of PyTorch/XLA.
+#
+# This Dockerfile is also not used by torchprime when a PR is made on torchprime.
+# torchprime pins a PyTorch/XLA docker image for use in torchprime PR tests. However,
+# when running torchprime tests on PyTorch/XLA PRs, we would override that docker image
+# with one built by this Dockerfile.
+#
+# This Dockerfile is a simplified version of `infra/ansible/Dockerfile`. The latter is meant
+# to be run from Cloud Build during nightly triggers. That file would build PyTorch and
+# PyTorch/XLA from scratch, and then install those wheels. In contrast, this Dockerfile expects the
+# PyTorch and PyTorch/XLA wheels to be built already, which is the case in PR tests.
+# The `build-torch-xla` job in the "Build and test" action will have already built the
+# wheels, which the `test-torchprime` job will download and make available to this docker build.
+#
+# The docker image will be pushed to `gcr.io/tpu-pytorch/for-torchprime-ci:${random_id}`. The
+# ID is unique for each run of the workflow to avoid interference between concurrent runs.
+#
+# (Googlers only) Refer to go/ptxla-torchprime-trigger for information on retention policy of
+# the docker images.
+ARG python_version=3.10
+ARG debian_version=bullseye
+
+FROM python:${python_version}-${debian_version} AS release
+
+# Install PyTorch wheels. We expect to install three wheels. Example:
+# - torch-2.8.0-cp310-cp310-linux_x86_64.whl
+# - torch_xla-2.8.0+gitd4b0a48-cp310-cp310-linux_x86_64.whl
+# - torchvision-0.22.0a0+966da7e-cp310-cp310-linux_x86_64.whl
+# The precise names will depend on the git commit hash used at build time.
+WORKDIR /tmp/wheels
+COPY ./*.whl ./
+
+RUN echo "Installing the following wheels" && ls *.whl
+RUN pip install --no-cache-dir *.whl
+
+# Install the dependencies including libtpu; skip pip's cache to keep the layers small.
+WORKDIR /ansible
+RUN pip install --no-cache-dir ansible
+COPY --from=ansible . /ansible
+
+ARG ansible_vars
+RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps"
+
+WORKDIR /
+
+RUN rm -rf /ansible /tmp/wheels
diff --git a/infra/ansible/publish_torchprime_e2e_test_docker.sh b/infra/ansible/publish_torchprime_e2e_test_docker.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+# This script builds and pushes a docker image to be used for torchprime E2E tests.
+#
+# torchprime is a reference implementation of models using PyTorch/XLA:
+# https://github.com/AI-Hypercomputer/torchprime.
+#
+# The purpose of building a docker image here is to trigger torchprime E2E tests
+# from PyTorch/XLA PRs and post-submits. The reason for running torchprime tests
+# on PyTorch/XLA changes is to ensure that torchprime models are not broken.
+# See https://github.com/AI-Hypercomputer/torchprime/issues/161 for the detailed
+# motivation.
+#
+# The docker image will be pushed to
+# `gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}`. By default, the
+# `torchprime-e2e-test` job in the `.github/workflows/_torchprime_ci.yml` workflow will
+# configure the env vars such that the image is pushed to
+# `gcr.io/tpu-pytorch/for-torchprime-ci:${random_id}`. The ID is unique for each run
+# of the workflow to avoid interference between concurrent runs.
+#
+# (Googlers only) Refer to go/ptxla-torchprime-trigger for information on retention policy of
+# the docker images.
+
+set -ex
+
+# Check required environment variables; diagnostics go to stderr.
+if [ -z "${DEFAULT_CONTEXT_PATH}" ]; then
+  echo "ERROR: DEFAULT_CONTEXT_PATH is not set" >&2
+  exit 1
+fi
+if [ -z "${DOCKER_IMAGE_NAME}" ]; then
+  echo "ERROR: DOCKER_IMAGE_NAME is not set" >&2
+  exit 1
+fi
+if [ -z "${DOCKER_IMAGE_TAG}" ]; then
+  echo "ERROR: DOCKER_IMAGE_TAG is not set" >&2
+  exit 1
+fi
+if [ -z "${DOCKER_PROJECT}" ]; then
+  echo "ERROR: DOCKER_PROJECT is not set" >&2
+  exit 1
+fi
+
+export DOCKER_URL="gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"
+export DOCKERFILE_PATH="infra/ansible/ptxla_docker_for_torchprime.Dockerfile"
+
+echo "Building and pushing image: ${DOCKER_URL}"
+
+# Define ansible vars used in the docker file by `ansible-playbook`.
+#
+# See `infra/ansible/playbook.yaml` and `infra/ansible/config/vars.yaml`
+# for definition of the variables.
+read -r -d '' ANSIBLE_VARS_JSON << EOM || { exit_code=$?; [[ $exit_code -eq 1 ]]; }
+{
+  "arch": "amd64",
+  "accelerator": "tpu",
+  "bundle_libtpu": "0",
+  "git_versioned_xla_build": true,
+  "nightly_release": true
+}
+EOM
+ANSIBLE_VARS_COMPACT=$(echo "$ANSIBLE_VARS_JSON" | tr -d '\n ')  # strip newlines and spaces in one pass
+
+docker build -t "${DOCKER_URL}" \
+    --build-context ansible=infra/ansible \
+    "${DEFAULT_CONTEXT_PATH}" \
+    -f "${DOCKERFILE_PATH}" \
+    --build-arg ansible_vars="${ANSIBLE_VARS_COMPACT}" \
+    --build-arg python_version=3.10 \
+    --build-arg debian_version=bullseye
+docker push "${DOCKER_URL}"
+
+echo "Successfully pushed image: ${DOCKER_URL}"