Skip to content
39 changes: 26 additions & 13 deletions .github/workflows/integration_test_8gpu_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ name: 8 GPU Model Tests
on:
push:
branches: [ main ]
tags:
- ciflow/8gpu/*
paths-ignore:
- 'torchtitan/experiments/**'
pull_request:
Expand All @@ -21,18 +23,27 @@ defaults:
run:
shell: bash -l -eo pipefail {0}

# Workflow-level token permissions: allow jobs to request an OIDC ID token
# (`id-token: write`) and grant read-only access to repository contents.
# NOTE(review): presumably the OIDC token is needed by the reusable
# linux_job_v2 workflow called below — confirm against pytorch/test-infra.
permissions:
id-token: write
contents: read

jobs:
# Step 1: Dynamically compute the matrix based on conditions
# Delegates to the repository-local reusable workflow set-matrix.yaml; the
# build-test job waits on this job through `needs: set-matrix`.
set-matrix:
uses: ./.github/workflows/set-matrix.yaml

# Step 2: Use the dynamic matrix in the build-test job
# Runs the 8-GPU model and flux integration tests through the shared
# pytorch/test-infra linux_job_v2 reusable workflow once set-matrix finishes.
# The runner, GPU arch, docker image and pip index URL are read from `matrix.*`.
# NOTE(review): no `strategy.matrix` is visible for this job in this view;
# confirm it is defined (presumably from the set-matrix job's outputs),
# otherwise the `matrix.*` expressions below would have nothing to read.
build-test:
needs: set-matrix
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g5.48xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.6"
# This image is faster to clone than the default, but it lacks CC needed by triton
# (1m25s vs 2m37s).
docker-image: torchtitan-ubuntu-20.04-clang12
runner: ${{ matrix.runner }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
docker-image: ${{ matrix.docker-image }}
repository: pytorch/torchtitan
upload-artifact: outputs
timeout: 45
script: |
set -eux

Expand Up @@ -46,12 +57,14 @@ jobs:

pip config --user set global.progress_bar off

python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}

USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}

USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"

mkdir artifacts-to-be-uploaded
python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8
python -m tests.integration_tests.flux artifacts-to-be-uploaded/flux --ngpu 8
rm -rf artifacts-to-be-uploaded/*/checkpoint
rm -rf artifacts-to-be-uploaded/flux/*/inference_results/
python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite models $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
python -m tests.integration_tests.flux --gpu_arch_type ${{ matrix.gpu-arch-type }} $RUNNER_TEMP/artifacts-to-be-uploaded/flux --ngpu 8
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/flux/*/inference_results/
Loading