[parked] Use gpu runner in CI #488
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: display name and trigger.
name: Unit Test

# Triggered by pull_request events (no branch or path filters are set).
on:
  pull_request:

jobs:
  # Runs the unit tests through the pytorch/test-infra reusable workflow on the
  # runner selected by the matrix entry below. Everything under `with:` is an
  # input forwarded to that reusable workflow.
  build-and-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    secrets: inherit  # Pass all secrets
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: 4xlargegpu
            runs-on: linux.g5.4xlarge.nvidia.gpu
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.6"
    # adapted from torchtitan/.github/workflows/integration_test_8gpu_h100.yaml
    with:
      timeout: 120
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      upload-artifact: "coverage-report"
      script: |
        set -eux
        conda create -n testenv python=3.10 --yes
        conda activate testenv
        echo "python version: $(python --version)"
        echo "python path: $(which python)"
        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"
        # Test the existence of secrets and abort if they are not available.
        # xtrace is paused so `set -x` cannot print the expanded secret, and
        # `:-` keeps `set -u` from aborting before the explicit message is shown.
        set +x
        [[ "${SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE:-}" != "" ]] || (echo "SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE is empty" && exit 1)
        set -x
        # Add github fingerprint to known hosts
        mkdir -p ~/.ssh
        cat .github/workflows/github_fingerprints.txt >> ~/.ssh/known_hosts
        python -m pip config --user set global.progress_bar off
        # Upgrade pip
        python -m pip install --upgrade pip
        # Install pytorch
        python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cu126
        # Install monarch
        python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci
        # Install torchstore
        eval "$(ssh-agent -s)"
        # Write the key with `>` (not `>>`) so a pre-existing file can never be
        # appended to; xtrace stays off while the secret is on the command line.
        set +x
        echo "$SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE" > torchstore_ssh_key
        set -x
        chmod 600 torchstore_ssh_key
        # Make sure the key is valid without printing any of it to the log:
        # ssh-keygen -y exits non-zero if the private key cannot be parsed.
        ssh-keygen -y -f torchstore_ssh_key > /dev/null
        ssh-add torchstore_ssh_key
        # The agent now holds the key; do not leave a copy in the workspace.
        rm -f torchstore_ssh_key
        python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git
        # torchtitan requires tomli>=1.1.0
        python -m pip install "tomli>=1.1.0"
        # check if we have the correct version of tomli installed
        python -m pip freeze | grep tomli
        # Install torchtitan
        python -m pip install --pre torchtitan --index-url https://download.pytorch.org/whl/nightly/cu126
        # Install dependencies
        python -m pip install --no-build-isolation -e ".[dev]"
        # Run unit tests with coverage
        # NOTE(review): upload-coverage expects coverage.xml inside the
        # "coverage-report" artifact; confirm the reusable workflow collects it
        # from the working directory rather than a dedicated artifact directory.
        pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv

  # Downloads the coverage artifact produced by build-and-test and sends it to
  # Codecov. The former python-version matrix was never referenced by any step
  # and only caused the same report to be uploaded three times, so it is gone.
  upload-coverage:
    needs: build-and-test
    runs-on: ubuntu-latest
    steps:
      - name: Download coverage artifact
        uses: actions/download-artifact@v4
        with:
          name: coverage-report
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        with:
          files: coverage.xml
          token: ${{ secrets.CODECOV_TOKEN }}