Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/unittest/llm/scripts_llm/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ dependencies:
- scipy
- hydra-core
- transformers
- accelerate
- datasets
- vllm
# Note: SGLang is installed in a separate workflow (test-linux-llm-sglang.yml)
# due to Triton version conflicts with vLLM
- mcp
- langdetect
7 changes: 7 additions & 0 deletions .github/unittest/llm/scripts_llm/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,10 @@ export PATH="$HOME/.deno/bin:$PATH"
# Verify installations
uvx --version || echo "Warning: uvx not installed"
deno --version || echo "Warning: Deno not installed"

# Pre-download models for LLM tests to avoid timeout during test execution
printf "* Pre-downloading models for LLM tests\n"
python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B'); AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B')"

# Note: SGLang tests are run in a separate workflow (test-linux-llm-sglang.yml)
# due to Triton version conflicts between vLLM and SGLang.
5 changes: 4 additions & 1 deletion .github/unittest/llm/scripts_llm/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,16 @@ json_report_args="--json-report --json-report-file=${json_report_dir}/test-resul
# Run pytest with:
# - --runslow: Run slow tests that would otherwise skip
# - --ignore: Exclude tests requiring unavailable dependencies (mlgym not on PyPI)
# - --ignore: Exclude SGLang tests (run in separate workflow due to Triton conflicts)
# - --timeout: 5 minute timeout per test to prevent hangs
# Note: Removed --isolate (too slow - each test in subprocess adds huge overhead)
# Note: Removed --error-for-skips as many LLM tests use pytest.skip for optional dependencies
# Note: Removed --exitfirst to run all tests and collect all failures
pytest test/llm ${json_report_args} -vvv --instafail --durations 600 --capture no --timeout=300 \
--runslow \
--ignore=test/llm/libs/test_mlgym.py
--ignore=test/llm/libs/test_mlgym.py \
--ignore=test/llm/test_sglang.py \
--ignore=test/llm/test_sglang_updaters.py

# Upload test results with metadata for flaky tracking
python .github/unittest/helpers/upload_test_results.py || echo "Warning: Failed to process test results for flaky tracking"
136 changes: 136 additions & 0 deletions .github/unittest/llm/scripts_sglang/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/usr/bin/env bash

# Install script for SGLang tests.
# Populates the uv-managed environment created by setup_env.sh with PyTorch,
# tensordict, torchrl and SGLang. vLLM is deliberately NOT installed here to
# avoid Triton version conflicts between vLLM and SGLang.
#
# Required environment variables:
#   TORCH_VERSION - "nightly" or "stable"
#   CU_VERSION    - "cpu" or a CUDA wheel tag such as "cu129"
#   RELEASE       - 0 to install tensordict from git, anything else from PyPI

set -e

root_dir="$(git rev-parse --show-toplevel)"
env_dir="${root_dir}/env"

# Ensure uv (installed by setup_env.sh under ~/.local/bin) is on PATH.
# uv also ships the `uvx` launcher used by the MCP tool-execution tests.
export PATH="$HOME/.local/bin:$PATH"

# Activate the environment created by setup_env.sh
source "${env_dir}/bin/activate"

# Sync submodules so the editable torchrl install below sees all sources
git submodule sync && git submodule update --init --recursive

# ============================================================================================ #
# ================================ Install dependencies ====================================== #
# ============================================================================================ #

printf "* Installing base dependencies\n"
uv pip install \
  hypothesis \
  future \
  cloudpickle \
  pytest \
  pytest-cov \
  pytest-mock \
  pytest-instafail \
  pytest-rerunfailures \
  pytest-json-report \
  pytest-asyncio \
  pytest-timeout \
  expecttest \
  pyyaml \
  scipy \
  hydra-core

# ============================================================================================ #
# ================================ PyTorch Installation ====================================== #
# ============================================================================================ #

printf "* Installing PyTorch with %s\n" "${CU_VERSION}"
if [[ "$TORCH_VERSION" == "nightly" ]]; then
  if [[ "${CU_VERSION:-}" == "cpu" ]]; then
    uv pip install --upgrade --pre torch torchvision "numpy<2.0.0" --index-url https://download.pytorch.org/whl/nightly/cpu
  else
    uv pip install --upgrade --pre torch torchvision "numpy<2.0.0" --index-url "https://download.pytorch.org/whl/nightly/${CU_VERSION}"
  fi
elif [[ "$TORCH_VERSION" == "stable" ]]; then
  if [[ "${CU_VERSION:-}" == "cpu" ]]; then
    uv pip install --upgrade torch torchvision "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cpu
  else
    uv pip install --upgrade torch torchvision "numpy<2.0.0" --index-url "https://download.pytorch.org/whl/${CU_VERSION}"
  fi
else
  # TORCH_VERSION must be "nightly" or "stable"; anything else is a config error.
  printf "Failed to install pytorch\n"
  exit 1
fi

# ============================================================================================ #
# ================================ TensorDict Installation =================================== #
# ============================================================================================ #

printf "* Installing tensordict\n"
# Install tensordict dependencies first (pyvers is required but --no-deps skips it)
uv pip install cloudpickle packaging importlib_metadata orjson "pyvers>=0.1.0,<0.2.0"
uv pip install "pybind11[global]" ninja
if [[ "$RELEASE" == 0 ]]; then
  uv pip install --no-build-isolation --no-deps git+https://github.com/pytorch/tensordict.git
else
  uv pip install --no-deps tensordict
fi

# smoke test
python -c "import tensordict"

# ============================================================================================ #
# ================================ TorchRL Installation ====================================== #
# ============================================================================================ #

printf "* Installing torchrl\n"
uv pip install -e . --no-build-isolation --no-deps

# smoke test
python -c "import torchrl"

# ============================================================================================ #
# ================================ SGLang Installation ======================================= #
# ============================================================================================ #

printf "* Installing SGLang dependencies\n"
uv pip install transformers accelerate datasets

# Install system dependencies required by SGLang
# libnuma is required by sgl_kernel
printf "* Installing system dependencies for SGLang\n"
apt-get update && apt-get install -y libnuma-dev

# Install SGLang with all extras
# Note: We do NOT install vLLM here to avoid Triton version conflicts
printf "* Installing SGLang\n"
uv pip install "sglang[all]"

# Install sgl_kernel separately to ensure it's properly installed
printf "* Installing sgl_kernel\n"
uv pip install --upgrade sgl_kernel

# Install MCP dependencies for tool execution tests
printf "* Installing MCP dependencies (uvx, Deno)\n"

# Install Deno (required by mcp-run-python)
curl -fsSL https://deno.land/install.sh | sh
export PATH="$HOME/.deno/bin:$PATH"

# Install mcp
uv pip install mcp langdetect

# Verify installations (warnings only — do not fail the job here, matching
# the checks in scripts_llm/install.sh)
uvx --version || echo "Warning: uvx not installed"
deno --version || echo "Warning: Deno not installed"

# Pre-download models for LLM tests to avoid timeout during test execution
printf "* Pre-downloading models for LLM tests\n"
python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B'); AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B')"

printf "* SGLang installation complete\n"

# Show installed versions for debugging
printf "* Installed versions:\n"
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
python -c "import sglang; print(f'SGLang: {sglang.__version__}')" || echo "SGLang version check failed"
python -c "import triton; print(f'Triton: {triton.__version__}')" || echo "Triton version check failed"
7 changes: 7 additions & 0 deletions .github/unittest/llm/scripts_sglang/post_process.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

# Post-processing for SGLang tests.
# Placeholder hook: there is currently nothing to clean up after the run,
# so this only reports completion.

set -e

printf '%s\n' "SGLang tests complete"
40 changes: 40 additions & 0 deletions .github/unittest/llm/scripts_sglang/run_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

# Run SGLang-specific tests only.
# Assumes setup_env.sh and install.sh have already populated ${root_dir}/env.

set -e

root_dir="$(git rev-parse --show-toplevel)"
env_dir="${root_dir}/env"

# Activate environment
source "${env_dir}/bin/activate"

# Build tooling for tests that compile extensions at runtime.
apt-get update && apt-get install -y git gcc cmake
# NOTE(review): swig3.0 is not installed by the apt line above, so this
# symlink is a no-op unless the base image ships it — confirm whether swig
# is actually needed for the SGLang tests.
ln -s /usr/bin/swig3.0 /usr/bin/swig 2>/dev/null || true

export PYTORCH_TEST_WITH_SLOW='1'
export LAZY_LEGACY_OP=False

# Dump the torch environment for debugging CI failures.
python -m torch.utils.collect_env
# Avoid error: "fatal: unsafe repository"
git config --global --add safe.directory '*'

# JSON report for flaky test tracking
json_report_dir="${RUNNER_ARTIFACT_DIR:-${root_dir}}"
json_report_args="--json-report --json-report-file=${json_report_dir}/test-results-sglang.json --json-report-indent=2"

# Run only SGLang-related tests.
# The glob picks up all sglang test files that exist; if none match, the
# literal pattern is passed to pytest and collection fails (which is desired).
# ${json_report_args} is intentionally unquoted: it holds several flags that
# must word-split into separate arguments.
pytest test/llm/test_sglang*.py \
  ${json_report_args} \
  -vvv \
  --instafail \
  --durations 600 \
  --capture no \
  --timeout=600 \
  --runslow

# Upload test results with metadata for flaky tracking
python .github/unittest/helpers/upload_test_results.py || echo "Warning: Failed to process test results for flaky tracking"
40 changes: 40 additions & 0 deletions .github/unittest/llm/scripts_sglang/setup_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

# This script sets up the environment for SGLang tests using uv.
# SGLang has different Triton requirements than vLLM, so we need a separate
# environment (and a separate workflow job) from the vLLM tests.
#
# Required environment variables:
#   PYTHON_VERSION - Python version for the uv virtual environment, e.g. "3.12"

set -e
export DEBIAN_FRONTEND=noninteractive
export TZ=UTC

apt-get update
apt-get install -yq --no-install-recommends git wget unzip curl patchelf
# Avoid error: "fatal: unsafe repository"
git config --global --add safe.directory '*'

# Cleanup APT cache
apt-get clean && rm -rf /var/lib/apt/lists/*

root_dir="$(git rev-parse --show-toplevel)"
env_dir="${root_dir}/env"

cd "${root_dir}"

# Install uv if not present
if ! command -v uv &> /dev/null; then
  printf "* Installing uv\n"
  curl -LsSf https://astral.sh/uv/install.sh | sh
  export PATH="$HOME/.local/bin:$PATH"
fi

# Create virtual environment using uv.
# Pass the version as a printf argument, never inside the format string.
printf '* Creating virtual environment with Python %s\n' "${PYTHON_VERSION}"
if [ ! -d "${env_dir}" ]; then
  uv venv "${env_dir}" --python "${PYTHON_VERSION}"
fi

# Activate environment so subsequent scripts sourced in the same shell see it
source "${env_dir}/bin/activate"

printf "* Environment setup complete\n"
44 changes: 41 additions & 3 deletions .github/workflows/test-linux-llm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ permissions:
contents: read

jobs:
unittests:
# Job 1: vLLM tests (uses conda + pip)
# Runs all LLM tests EXCEPT SGLang tests
unittests-vllm:
if: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && contains(join(github.event.pull_request.labels.*.name, ', '), 'llm/')) }}
strategy:
matrix:
Expand All @@ -30,8 +32,6 @@ jobs:
with:
repository: pytorch/rl
runner: "linux.g6.4xlarge.experimental.nvidia.gpu"
# gpu-arch-type: cuda
# gpu-arch-version: "11.7"
docker-image: "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel"
timeout: 60
script: |
Expand All @@ -55,3 +55,41 @@ jobs:
bash .github/unittest/llm/scripts_llm/install.sh
bash .github/unittest/llm/scripts_llm/run_test.sh
bash .github/unittest/llm/scripts_llm/post_process.sh

# Job 2: SGLang tests (uses uv, separate from vLLM due to Triton version conflicts)
# SGLang requires a different Triton version than vLLM, so we run it in a separate job
unittests-sglang:
if: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && contains(join(github.event.pull_request.labels.*.name, ', '), 'llm/')) }}
strategy:
matrix:
python_version: ["3.12"]
cuda_arch_version: ["12.9"]
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
repository: pytorch/rl
runner: "linux.g6.4xlarge.experimental.nvidia.gpu"
docker-image: "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel"
timeout: 60
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
export RELEASE=1
export TORCH_VERSION=stable
else
export RELEASE=0
export TORCH_VERSION=nightly
fi

set -euo pipefail
export PYTHON_VERSION="3.12"
export CU_VERSION="cu129"
export TAR_OPTIONS="--no-same-owner"
export UPLOAD_CHANNEL="nightly"
export TF_CPP_MIN_LOG_LEVEL=0
export TD_GET_DEFAULTS_TO_NONE=1

# Use SGLang-specific scripts that use uv and don't install vLLM
# This avoids Triton version conflicts between vLLM and SGLang
bash .github/unittest/llm/scripts_sglang/setup_env.sh
bash .github/unittest/llm/scripts_sglang/install.sh
bash .github/unittest/llm/scripts_sglang/run_test.sh
bash .github/unittest/llm/scripts_sglang/post_process.sh
Loading
Loading