# Install enroot for gpu unit tests #290
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow-level settings for the GPU unit-test pipeline: display name,
# triggers, run de-duplication, token permissions and the default shell.
name: GPU Tests

on:
  schedule:
    # Runs at midnight every day (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * *'
  push:
    branches: [main]
  # No filters: runs for every pull request event of the default types.
  pull_request:
  # Allows the workflow to be started manually.
  workflow_dispatch:

concurrency:
  # On main the group name ends in the unique run number, so runs on main
  # never share a group and are never cancelled. On any other ref the group
  # is keyed by the ref, so a newer run for the same branch/PR cancels the
  # one still in progress.
  group: gpu-test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

permissions:
  # id-token: write lets jobs request an OIDC token.
  # NOTE(review): nothing in this file shows what consumes it; presumably
  # the self-hosted runner infrastructure. Confirm before removing.
  id-token: write
  contents: read

defaults:
  run:
    # Login shell (-l) so profile scripts run in every step (the conda setup
    # action relies on this to activate its environment); -e and pipefail
    # make a step fail on the first failing command or pipeline stage.
    shell: bash -l -eo pipefail {0}
jobs:
  # Installs the project and its pinned GPU dependencies into a conda env
  # named "test", runs the unit tests with coverage, and uploads the report.
  gpu_test:
    # Skip the job in forks: it only runs when the repository owner is
    # the meta-pytorch organization.
    if: github.repository_owner == 'meta-pytorch'
    runs-on: linux.g5.12xlarge.nvidia.gpu
    strategy:
      matrix:
        # Quoted so YAML keeps it as the string "3.10" (not the float 3.1).
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v4

      - name: Install enroot
        run: |
          version="3.5.0"
          # Install dependencies.
          # Best effort: try yum, then apt-get, and carry on even if both
          # fail (the trailing `|| true` deliberately swallows the error).
          yum install -y jq squashfs-tools parallel || apt-get install -y jq squashfs-tools parallel || true
          # Download pre-built packages
          curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/v${version}/enroot-${version}-1.x86_64.rpm
          curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/v${version}/enroot+caps-${version}-1.x86_64.rpm
          # Install packages
          rpm -ivh enroot-${version}-1.x86_64.rpm enroot+caps-${version}-1.x86_64.rpm
          # Remove the downloaded installers from the workspace.
          rm -f enroot*.rpm

      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          # Later steps refer to this env by name via $CONDA/envs/test.
          activate-environment: test
          python-version: ${{ matrix.python-version }}

      - name: Update pip
        run: python -m pip install --upgrade pip

      - name: Install pinned torch nightly
        run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129

      - name: Download and install vLLM and its dependencies
        # TODO: this honestly could not be hackier if I tried
        # Installs the requirements listed in the repo first, then a pinned
        # pre-built vLLM wheel from the PyTorch "preview/forge" index.
        run: |
          python -m pip install -r .github/packaging/vllm_reqs.txt
          python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge

      - name: Install Monarch
        # Uses `python -m pip` like every other install step so the package
        # is guaranteed to land in the interpreter of the active env.
        run: python -m pip install torchmonarch==0.1.0rc1

      - name: Install torchtitan and torchstore
        run: |
          python -m pip install git+https://github.com/pytorch/torchtitan.git
          python -m pip install git+https://github.com/meta-pytorch/torchstore.git

      - name: Install dependencies
        # Editable install of this repo with its "dev" extra.
        # --no-build-isolation builds against the packages already installed
        # above instead of a fresh isolated build environment.
        run: python -m pip install --no-build-isolation -e ".[dev]"

      - name: Run unit tests with coverage
        # TODO add all tests
        run: |
          # Preload the conda env's libpython into the test process.
          # NOTE(review): the "3.10" here is hard-coded and must track
          # matrix.python-version; the reason for the preload is not visible
          # in this file, presumably native extensions need it. Confirm.
          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
          # LD_LIBRARY_PATH is a colon-separated list of directories, so it
          # must point at the env's lib directory, not at the .so file.
          # Any value already set is kept after it.
          export LD_LIBRARY_PATH="$CONDA/envs/test/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
          pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv

      - name: Upload Coverage to Codecov
        uses: codecov/codecov-action@v3