Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/unittest/llm/scripts_llm/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ dependencies:
- scipy
- hydra-core
- transformers
- accelerate
- datasets
- vllm
# Note: SGLang is installed in a separate workflow (test-linux-llm-sglang.yml)
# due to Triton version conflicts with vLLM
- mcp
- langdetect
7 changes: 7 additions & 0 deletions .github/unittest/llm/scripts_llm/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,10 @@ export PATH="$HOME/.deno/bin:$PATH"
# Verify installations
uvx --version || echo "Warning: uvx not installed"
deno --version || echo "Warning: Deno not installed"

# Pre-download models for LLM tests to avoid timeout during test execution
printf "* Pre-downloading models for LLM tests\n"
python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B'); AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B')"

# Note: SGLang tests are run in a separate workflow (test-linux-llm-sglang.yml)
# due to Triton version conflicts between vLLM and SGLang.
5 changes: 4 additions & 1 deletion .github/unittest/llm/scripts_llm/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,16 @@ json_report_args="--json-report --json-report-file=${json_report_dir}/test-resul
# Run pytest with:
# - --runslow: Run slow tests that would otherwise skip
# - --ignore: Exclude tests requiring unavailable dependencies (mlgym not on PyPI)
# - --ignore: Exclude SGLang tests (run in separate workflow due to Triton conflicts)
# - --timeout: 5 minute timeout per test to prevent hangs
# Note: Removed --isolate (too slow - each test in subprocess adds huge overhead)
# Note: Removed --error-for-skips as many LLM tests use pytest.skip for optional dependencies
# Note: Removed --exitfirst to run all tests and collect all failures
pytest test/llm ${json_report_args} -vvv --instafail --durations 600 --capture no --timeout=300 \
--runslow \
--ignore=test/llm/libs/test_mlgym.py
--ignore=test/llm/libs/test_mlgym.py \
--ignore=test/llm/test_sglang.py \
--ignore=test/llm/test_sglang_updaters.py

# Upload test results with metadata for flaky tracking
python .github/unittest/helpers/upload_test_results.py || echo "Warning: Failed to process test results for flaky tracking"
136 changes: 136 additions & 0 deletions .github/unittest/llm/scripts_sglang/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/usr/bin/env bash

# Install script for SGLang tests.
# Populates the uv-managed environment created by setup_env.sh with PyTorch,
# tensordict, torchrl and SGLang. vLLM is deliberately NOT installed here to
# avoid Triton version conflicts between vLLM and SGLang.
#
# Required environment variables:
#   TORCH_VERSION - "nightly" or "stable"
#   CU_VERSION    - "cpu" or a CUDA wheel tag such as "cu129"
#   RELEASE       - 0 to install tensordict from git, anything else from PyPI

set -e

root_dir="$(git rev-parse --show-toplevel)"
env_dir="${root_dir}/env"

# Ensure uv (installed by setup_env.sh under ~/.local/bin) is on PATH.
# uv also ships the `uvx` launcher used by the MCP tool-execution tests.
export PATH="$HOME/.local/bin:$PATH"

# Activate the environment created by setup_env.sh
source "${env_dir}/bin/activate"

# Sync submodules so the editable torchrl install below sees all sources
git submodule sync && git submodule update --init --recursive

# ============================================================================================ #
# ================================ Install dependencies ====================================== #
# ============================================================================================ #

printf "* Installing base dependencies\n"
uv pip install \
  hypothesis \
  future \
  cloudpickle \
  pytest \
  pytest-cov \
  pytest-mock \
  pytest-instafail \
  pytest-rerunfailures \
  pytest-json-report \
  pytest-asyncio \
  pytest-timeout \
  expecttest \
  pyyaml \
  scipy \
  hydra-core

# ============================================================================================ #
# ================================ PyTorch Installation ====================================== #
# ============================================================================================ #

printf "* Installing PyTorch with %s\n" "${CU_VERSION}"
if [[ "$TORCH_VERSION" == "nightly" ]]; then
  if [[ "${CU_VERSION:-}" == "cpu" ]]; then
    uv pip install --upgrade --pre torch torchvision "numpy<2.0.0" --index-url https://download.pytorch.org/whl/nightly/cpu
  else
    uv pip install --upgrade --pre torch torchvision "numpy<2.0.0" --index-url "https://download.pytorch.org/whl/nightly/${CU_VERSION}"
  fi
elif [[ "$TORCH_VERSION" == "stable" ]]; then
  if [[ "${CU_VERSION:-}" == "cpu" ]]; then
    uv pip install --upgrade torch torchvision "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cpu
  else
    uv pip install --upgrade torch torchvision "numpy<2.0.0" --index-url "https://download.pytorch.org/whl/${CU_VERSION}"
  fi
else
  # TORCH_VERSION must be "nightly" or "stable"; anything else is a config error.
  printf "Failed to install pytorch\n"
  exit 1
fi

# ============================================================================================ #
# ================================ TensorDict Installation =================================== #
# ============================================================================================ #

printf "* Installing tensordict\n"
# Install tensordict dependencies first (pyvers is required but --no-deps skips it)
uv pip install cloudpickle packaging importlib_metadata orjson "pyvers>=0.1.0,<0.2.0"
uv pip install "pybind11[global]" ninja
if [[ "$RELEASE" == 0 ]]; then
  uv pip install --no-build-isolation --no-deps git+https://github.com/pytorch/tensordict.git
else
  uv pip install --no-deps tensordict
fi

# smoke test
python -c "import tensordict"

# ============================================================================================ #
# ================================ TorchRL Installation ====================================== #
# ============================================================================================ #

printf "* Installing torchrl\n"
uv pip install -e . --no-build-isolation --no-deps

# smoke test
python -c "import torchrl"

# ============================================================================================ #
# ================================ SGLang Installation ======================================= #
# ============================================================================================ #

printf "* Installing SGLang dependencies\n"
uv pip install transformers accelerate datasets

# Install system dependencies required by SGLang
# libnuma is required by sgl_kernel
printf "* Installing system dependencies for SGLang\n"
apt-get update && apt-get install -y libnuma-dev

# Install SGLang with all extras
# Note: We do NOT install vLLM here to avoid Triton version conflicts
printf "* Installing SGLang\n"
uv pip install "sglang[all]"

# Install sgl_kernel separately to ensure it's properly installed
printf "* Installing sgl_kernel\n"
uv pip install --upgrade sgl_kernel

# Install MCP dependencies for tool execution tests
printf "* Installing MCP dependencies (uvx, Deno)\n"

# Install Deno (required by mcp-run-python)
curl -fsSL https://deno.land/install.sh | sh
export PATH="$HOME/.deno/bin:$PATH"

# Install mcp
uv pip install mcp langdetect

# Verify installations (warnings only — do not fail the job here, matching
# the checks in scripts_llm/install.sh)
uvx --version || echo "Warning: uvx not installed"
deno --version || echo "Warning: Deno not installed"

# Pre-download models for LLM tests to avoid timeout during test execution
printf "* Pre-downloading models for LLM tests\n"
python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B'); AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B')"

printf "* SGLang installation complete\n"

# Show installed versions for debugging
printf "* Installed versions:\n"
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
python -c "import sglang; print(f'SGLang: {sglang.__version__}')" || echo "SGLang version check failed"
python -c "import triton; print(f'Triton: {triton.__version__}')" || echo "Triton version check failed"
7 changes: 7 additions & 0 deletions .github/unittest/llm/scripts_sglang/post_process.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

# Post-processing for SGLang tests.
# Placeholder hook: there is currently nothing to clean up after the run,
# so this only reports completion.

set -e

printf '%s\n' "SGLang tests complete"
40 changes: 40 additions & 0 deletions .github/unittest/llm/scripts_sglang/run_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

# Run SGLang-specific tests only.
# Assumes setup_env.sh and install.sh have already populated ${root_dir}/env.

set -e

root_dir="$(git rev-parse --show-toplevel)"
env_dir="${root_dir}/env"

# Activate environment
source "${env_dir}/bin/activate"

# Build tooling for tests that compile extensions at runtime.
apt-get update && apt-get install -y git gcc cmake
# NOTE(review): swig3.0 is not installed by the apt line above, so this
# symlink is a no-op unless the base image ships it — confirm whether swig
# is actually needed for the SGLang tests.
ln -s /usr/bin/swig3.0 /usr/bin/swig 2>/dev/null || true

export PYTORCH_TEST_WITH_SLOW='1'
export LAZY_LEGACY_OP=False

# Dump the torch environment for debugging CI failures.
python -m torch.utils.collect_env
# Avoid error: "fatal: unsafe repository"
git config --global --add safe.directory '*'

# JSON report for flaky test tracking
json_report_dir="${RUNNER_ARTIFACT_DIR:-${root_dir}}"
json_report_args="--json-report --json-report-file=${json_report_dir}/test-results-sglang.json --json-report-indent=2"

# Run only SGLang-related tests.
# The glob picks up all sglang test files that exist; if none match, the
# literal pattern is passed to pytest and collection fails (which is desired).
# ${json_report_args} is intentionally unquoted: it holds several flags that
# must word-split into separate arguments.
pytest test/llm/test_sglang*.py \
  ${json_report_args} \
  -vvv \
  --instafail \
  --durations 600 \
  --capture no \
  --timeout=600 \
  --runslow

# Upload test results with metadata for flaky tracking
python .github/unittest/helpers/upload_test_results.py || echo "Warning: Failed to process test results for flaky tracking"
40 changes: 40 additions & 0 deletions .github/unittest/llm/scripts_sglang/setup_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

# This script sets up the environment for SGLang tests using uv.
# SGLang has different Triton requirements than vLLM, so we need a separate
# environment (and a separate workflow job) from the vLLM tests.
#
# Required environment variables:
#   PYTHON_VERSION - Python version for the uv virtual environment, e.g. "3.12"

set -e
export DEBIAN_FRONTEND=noninteractive
export TZ=UTC

apt-get update
apt-get install -yq --no-install-recommends git wget unzip curl patchelf
# Avoid error: "fatal: unsafe repository"
git config --global --add safe.directory '*'

# Cleanup APT cache
apt-get clean && rm -rf /var/lib/apt/lists/*

root_dir="$(git rev-parse --show-toplevel)"
env_dir="${root_dir}/env"

cd "${root_dir}"

# Install uv if not present
if ! command -v uv &> /dev/null; then
  printf "* Installing uv\n"
  curl -LsSf https://astral.sh/uv/install.sh | sh
  export PATH="$HOME/.local/bin:$PATH"
fi

# Create virtual environment using uv.
# Pass the version as a printf argument, never inside the format string.
printf '* Creating virtual environment with Python %s\n' "${PYTHON_VERSION}"
if [ ! -d "${env_dir}" ]; then
  uv venv "${env_dir}" --python "${PYTHON_VERSION}"
fi

# Activate environment so subsequent scripts sourced in the same shell see it
source "${env_dir}/bin/activate"

printf "* Environment setup complete\n"
44 changes: 41 additions & 3 deletions .github/workflows/test-linux-llm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ permissions:
contents: read

jobs:
unittests:
# Job 1: vLLM tests (uses conda + pip)
# Runs all LLM tests EXCEPT SGLang tests
unittests-vllm:
if: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && contains(join(github.event.pull_request.labels.*.name, ', '), 'llm/')) }}
strategy:
matrix:
Expand All @@ -30,8 +32,6 @@ jobs:
with:
repository: pytorch/rl
runner: "linux.g6.4xlarge.experimental.nvidia.gpu"
# gpu-arch-type: cuda
# gpu-arch-version: "11.7"
docker-image: "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel"
timeout: 60
script: |
Expand All @@ -55,3 +55,41 @@ jobs:
bash .github/unittest/llm/scripts_llm/install.sh
bash .github/unittest/llm/scripts_llm/run_test.sh
bash .github/unittest/llm/scripts_llm/post_process.sh

# Job 2: SGLang tests (uses uv, separate from vLLM due to Triton version conflicts)
# SGLang requires a different Triton version than vLLM, so we run it in a separate job
unittests-sglang:
if: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && contains(join(github.event.pull_request.labels.*.name, ', '), 'llm/')) }}
strategy:
matrix:
python_version: ["3.12"]
cuda_arch_version: ["12.9"]
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
repository: pytorch/rl
runner: "linux.g6.4xlarge.experimental.nvidia.gpu"
docker-image: "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel"
timeout: 60
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
export RELEASE=1
export TORCH_VERSION=stable
else
export RELEASE=0
export TORCH_VERSION=nightly
fi

set -euo pipefail
export PYTHON_VERSION="3.12"
export CU_VERSION="cu129"
export TAR_OPTIONS="--no-same-owner"
export UPLOAD_CHANNEL="nightly"
export TF_CPP_MIN_LOG_LEVEL=0
export TD_GET_DEFAULTS_TO_NONE=1

# Use SGLang-specific scripts that use uv and don't install vLLM
# This avoids Triton version conflicts between vLLM and SGLang
bash .github/unittest/llm/scripts_sglang/setup_env.sh
bash .github/unittest/llm/scripts_sglang/install.sh
bash .github/unittest/llm/scripts_sglang/run_test.sh
bash .github/unittest/llm/scripts_sglang/post_process.sh
Loading
Loading