# Workflow file for the run "Install enroot for gpu unit tests" (#310).
# Note: the page this file was copied from warned that it may contain hidden
# or bidirectional Unicode text that can be interpreted or compiled differently
# than what appears below. To review, open the file in an editor that reveals
# hidden Unicode characters.
# GPU unit-test workflow.
#
# Installs enroot on the GPU runner, smoke-tests it, builds a conda
# environment with the pinned torch / vLLM / Monarch stack, then runs
# tests/unit_tests under coverage and uploads the report to Codecov.
name: GPU Tests

on:
  schedule:
    # Runs at midnight every day
    - cron: '0 0 * * *'
  push:
    branches: [main]
  pull_request:
  workflow_dispatch:

# On main the group key contains the run number, so every run gets its own
# group and nothing is cancelled. On any other ref the key is the ref itself,
# so a newer run cancels the one still in progress for that ref.
concurrency:
  group: gpu-test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

permissions:
  # NOTE(review): nothing in this file consumes an OIDC token directly;
  # presumably the runner infrastructure needs it. Confirm before removing.
  id-token: write
  contents: read

defaults:
  run:
    # Login shell (-l) so profile scripts are sourced in every step; -e and
    # pipefail make a step fail on the first failing command or pipeline stage.
    shell: bash -l -eo pipefail {0}

jobs:
  gpu_test:
    # Only run in the upstream organisation, not in forks.
    if: github.repository_owner == 'meta-pytorch'
    runs-on: linux.g5.12xlarge.nvidia.gpu
    strategy:
      matrix:
        # Quoted so YAML does not read 3.10 as the float 3.1.
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v4

      - name: Install enroot
        run: |
          version="3.5.0"
          release_url="https://github.com/NVIDIA/enroot/releases/download/v${version}"
          enroot_rpm="enroot-${version}-1.el8.x86_64.rpm"
          enroot_caps_rpm="enroot+caps-${version}-1.el8.x86_64.rpm"
          enroot_root="${HOME}/.local/share/enroot"

          # Install dependencies
          # - jq, squashfs-tools, parallel: core enroot dependencies
          # - curl, pigz, zstd: for image import operations
          sudo dnf install -y jq squashfs-tools parallel curl pigz zstd

          # Download pre-built packages
          curl -fSsL -O "${release_url}/${enroot_rpm}"
          curl -fSsL -O "${release_url}/${enroot_caps_rpm}"

          # Install packages, then remove exactly the two files downloaded
          # above (no glob, so nothing else in the workspace can be deleted).
          sudo rpm -ivh "${enroot_rpm}" "${enroot_caps_rpm}"
          rm -f "${enroot_rpm}" "${enroot_caps_rpm}"

          # Create user-writable directories for enroot
          mkdir -p "${enroot_root}"/{runtime,cache,data,temp}

          # Create a system-wide config for enroot (since GITHUB_ENV doesn't
          # work for spawned processes). The heredoc delimiter is unquoted on
          # purpose so ${enroot_root} is expanded before the file is written.
          sudo mkdir -p /etc/enroot
          sudo tee /etc/enroot/enroot.conf > /dev/null <<EOF
          ENROOT_RUNTIME_PATH ${enroot_root}/runtime
          ENROOT_CACHE_PATH ${enroot_root}/cache
          ENROOT_DATA_PATH ${enroot_root}/data
          ENROOT_TEMP_PATH ${enroot_root}/temp
          EOF

          # Also export the same paths to the later steps of this job
          # (GITHUB_ENV does not affect the step that writes it).
          {
            echo "ENROOT_RUNTIME_PATH=${enroot_root}/runtime"
            echo "ENROOT_CACHE_PATH=${enroot_root}/cache"
            echo "ENROOT_DATA_PATH=${enroot_root}/data"
            echo "ENROOT_TEMP_PATH=${enroot_root}/temp"
          } >> "$GITHUB_ENV"

      - name: Test enroot installation
        run: |
          # Verify enroot is installed and check version
          enroot version

          # Test importing a simple image
          enroot import -o /tmp/test_alpine.sqsh docker://alpine:latest
          ls -lh /tmp/test_alpine.sqsh

          # Create and test container
          enroot create --name test_alpine /tmp/test_alpine.sqsh
          enroot start test_alpine echo "Hello from enroot!"

          # Clean up the container and the imported image
          enroot remove -f test_alpine
          rm -f /tmp/test_alpine.sqsh
          echo "Enroot test successful!"

      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}

      - name: Update pip
        run: python -m pip install --upgrade pip

      - name: Install pinned torch nightly
        run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129

      - name: Download and install vLLM and its dependencies
        # TODO: this honestly could not be hackier if I tried
        run: |
          python -m pip install -r .github/packaging/vllm_reqs.txt
          python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge

      - name: Install Monarch
        # `python -m pip` (not bare `pip`) to match every other install step.
        run: python -m pip install torchmonarch==0.1.0rc1

      - name: Install torchtitan and torchstore
        run: |
          python -m pip install git+https://github.com/pytorch/torchtitan.git
          python -m pip install git+https://github.com/meta-pytorch/torchstore.git

      - name: Install dependencies
        run: python -m pip install --no-build-isolation -e ".[dev]"

      - name: Run unit tests with coverage
        # TODO add all tests
        run: |
          # Library directory of the conda env created by "Setup conda env"
          # (activate-environment: test).
          conda_lib="$CONDA/envs/test/lib"

          # Preload the env's libpython; the version follows the matrix entry
          # instead of being hard-coded.
          export LD_PRELOAD="${conda_lib}/libpython${{ matrix.python-version }}.so.1.0"

          # LD_LIBRARY_PATH takes directories, not files: point it at the lib
          # directory and keep whatever was already set.
          export LD_LIBRARY_PATH="${conda_lib}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

          pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv

      - name: Upload Coverage to Codecov
        uses: codecov/codecov-action@v3