rl/.github/workflows/test-linux-llm.yml at ff77695f2d88459c5299e70b96c7e7be08f359ac · pytorch/rl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
name: LLM Tests on Linux

on:
  pull_request:
  push:
    branches:
      - nightly
      - main
      - release/*
  workflow_dispatch:
  workflow_call:

concurrency:
  # Documentation suggests ${{ github.head_ref }}, but that's only available on pull_request/pull_request_target triggers, so using ${{ github.ref }}.
  # On master, we want all builds to complete even if merging happens faster to make it easier to discover at which point something broke.
  group: test-linux-llm-${{ github.ref == 'refs/heads/main' && format('ci-master-{0}', github.sha) || format('ci-{0}', github.ref) }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

jobs:
  # Job 1: vLLM tests (uses conda + pip)
  # Runs all LLM tests EXCEPT SGLang tests
  unittests-vllm:
    if: ${{ github.event_name == 'push' || github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(join(github.event.pull_request.labels.*.name, ', '), 'llm/')) }}
    strategy:
      matrix:
        python_version: ["3.12"]
        cuda_arch_version: ["12.9"]
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      repository: pytorch/rl
      runner: "linux.g6.4xlarge.experimental.nvidia.gpu"
      docker-image: "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel"
      timeout: 60
      script: |
        if [[ "${{ github.ref }}" =~ release/* ]]; then
          export RELEASE=1
          export TORCH_VERSION=stable
        else
          export RELEASE=0
          export TORCH_VERSION=nightly
        fi

        set -euo pipefail
        export PYTHON_VERSION="3.12"
        export CU_VERSION="cu129"
        export TAR_OPTIONS="--no-same-owner"
        export UPLOAD_CHANNEL="nightly"
        export TF_CPP_MIN_LOG_LEVEL=0
        export TD_GET_DEFAULTS_TO_NONE=1

        bash .github/unittest/llm/scripts_llm/setup_env.sh
        bash .github/unittest/llm/scripts_llm/install.sh
        bash .github/unittest/llm/scripts_llm/run_test.sh
        bash .github/unittest/llm/scripts_llm/post_process.sh

  # Job 2: SGLang tests (uses uv, separate from vLLM due to Triton version conflicts)
  # SGLang requires a different Triton version than vLLM, so we run it in a separate job
  unittests-sglang:
    if: ${{ github.event_name == 'push' || github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(join(github.event.pull_request.labels.*.name, ', '), 'llm/')) }}
    strategy:
      matrix:
        python_version: ["3.12"]
        cuda_arch_version: ["12.9"]
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      repository: pytorch/rl
      runner: "linux.g6.4xlarge.experimental.nvidia.gpu"
      docker-image: "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel"
      timeout: 60
      script: |
        if [[ "${{ github.ref }}" =~ release/* ]]; then
          export RELEASE=1
          export TORCH_VERSION=stable
        else
          export RELEASE=0
          export TORCH_VERSION=nightly
        fi

        set -euo pipefail
        export PYTHON_VERSION="3.12"
        export CU_VERSION="cu129"
        export TAR_OPTIONS="--no-same-owner"
        export UPLOAD_CHANNEL="nightly"
        export TF_CPP_MIN_LOG_LEVEL=0
        export TD_GET_DEFAULTS_TO_NONE=1

        # Use SGLang-specific scripts that use uv and don't install vLLM
        # This avoids Triton version conflicts between vLLM and SGLang
        bash .github/unittest/llm/scripts_sglang/setup_env.sh
        bash .github/unittest/llm/scripts_sglang/install.sh
        bash .github/unittest/llm/scripts_sglang/run_test.sh
        bash .github/unittest/llm/scripts_sglang/post_process.sh