Skip to content

[parked] Use gpu runner in CI #490

[parked] Use gpu runner in CI

[parked] Use gpu runner in CI #490

Workflow file for this run

# Unit-test workflow: runs the unit tests with coverage on a GPU runner for
# every pull request, then publishes the resulting coverage report to Codecov.
name: Unit Test

on:
  pull_request:

jobs:
  # Runs the test script on a GPU runner through the shared
  # pytorch/test-infra reusable Linux job.
  build-and-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    secrets: inherit # Pass all secrets
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: 4xlargegpu
            runs-on: linux.g5.4xlarge.nvidia.gpu
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.6"
    # adapted from torchtitan/.github/workflows/integration_test_8gpu_h100.yaml
    with:
      timeout: 120
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      upload-artifact: "coverage-report"
      script: |
        set -eux

        conda create -n testenv python=3.10 --yes
        conda activate testenv
        echo "python version: $(python --version)"
        echo "python path: $(which python)"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        # Abort early if the torchstore deploy key is not available.
        # xtrace is switched off while the secret is expanded so the key is
        # never echoed into the job log; ${VAR:-} keeps `set -u` from
        # masking the explicit error message when the variable is unset.
        # NOTE(review): presumably the reusable workflow exports secrets as
        # SECRET_<NAME> environment variables - confirm whether it needs a
        # `secrets-env` input to do so.
        set +x
        if [[ -z "${SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE:-}" ]]; then
          echo "SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE is empty" >&2
          exit 1
        fi
        set -x

        # Add github fingerprint to known hosts
        mkdir -p ~/.ssh
        cat .github/workflows/github_fingerprints.txt >> ~/.ssh/known_hosts

        python -m pip config --user set global.progress_bar off
        # Upgrade pip
        python -m pip install --upgrade pip
        # Install pytorch
        python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall
        # Install torchtitan
        python -m pip install -r https://raw.githubusercontent.com/pytorch/torchtitan/refs/heads/main/.ci/docker/requirements.txt
        python -m pip install --pre torchtitan --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall
        # Install monarch
        python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci

        # Install torchstore (private repository, fetched over SSH)
        eval "$(ssh-agent -s)"
        # Write the key with xtrace off and a restrictive umask. Overwrite
        # rather than append so a leftover file cannot corrupt the key.
        set +x
        (umask 077 && printf '%s\n' "$SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE" > torchstore_ssh_key)
        set -x
        chmod 600 torchstore_ssh_key
        # Validate the key without printing any of it: deriving the public
        # key fails (and, under `set -e`, aborts the job) if it is malformed.
        ssh-keygen -y -f torchstore_ssh_key > /dev/null
        ssh-add torchstore_ssh_key
        # The agent now holds the key; do not leave it in the workspace.
        rm -f torchstore_ssh_key
        python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git

        # Install dependencies
        python -m pip install --no-build-isolation -e ".[dev]"

        # Run unit tests with coverage
        # NOTE(review): coverage.xml is written to the working directory -
        # confirm the reusable workflow's `upload-artifact` step collects it
        # from there.
        pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv

  # Publishes the coverage report produced by build-and-test to Codecov.
  # Runs once: nothing in this job varies per Python version, so a version
  # matrix would only repeat the same upload.
  upload-coverage:
    needs: build-and-test
    runs-on: ubuntu-latest
    steps:
      - name: Download coverage artifact
        uses: actions/download-artifact@v4
        with:
          name: coverage-report
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        with:
          files: coverage.xml
          token: ${{ secrets.CODECOV_TOKEN }}