diff --git a/.github/workflows/sglang-tests.yml b/.github/workflows/sglang-tests.yml
deleted file mode 100644
index dc5cabc991..0000000000
--- a/.github/workflows/sglang-tests.yml
+++ /dev/null
@@ -1,100 +0,0 @@
-name: Third party SGLang tests
-
-on:
-  workflow_dispatch:
-    inputs:
-      runner_label:
-        description: Runner label, keep empty for default
-        type: string
-        default: ""
-      use_pyenv_python:
-        description: Use Python built with pyenv
-        type: boolean
-        default: false
-  schedule:
-    # About midnight PST Sunday (UTC-8)
-    - cron: "5 10 * * SUN"
-
-
-# Cancels in-progress PR runs when the PR is updated. Manual runs are never cancelled.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && github.run_id || github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
-
-permissions: read-all
-
-env:
-  PYTHON_VERSION: "3.10"
-  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
-
-jobs:
-  build:
-    name: SGLang tests
-    runs-on:
-      - linux
-      - ${{ inputs.runner_label || 'rolling' }}
-    timeout-minutes: 720
-    defaults:
-      run:
-        shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
-    steps:
-      - name: Print inputs
-        run: |
-          cat <> $GITHUB_ENV
-
-      - name: Install SGLang
-        id: install
-        run: |
-          git clone https://github.com/sgl-project/sglang.git
-          cd sglang
-          git apply ../benchmarks/third_party/sglang/sglang-fix.patch
-          pip install "./python[dev_xpu]"
-
-      - name: Setup PyTorch
-        uses: ./.github/actions/setup-pytorch
-
-      - name: Setup Triton
-        uses: ./.github/actions/setup-triton
-
-      - name: Run SGLANG tests
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
-        run: |
-          ./scripts/test-triton.sh --sglang --skip-pip-install --skip-pytorch-install
-
-      - name: Upload test report
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-reports
-          path: reports
diff --git a/.github/workflows/third-party-tests.yml b/.github/workflows/third-party-tests.yml
index 41a38b5c3f..2dac6d56ba 100644
--- a/.github/workflows/third-party-tests.yml
+++ b/.github/workflows/third-party-tests.yml
@@ -1,4 +1,4 @@
-name: Third party tests [liger-kernels, vllm]
+name: Third party tests [liger-kernels, vllm, sglang]
 
 on:
   workflow_dispatch:
@@ -28,12 +28,12 @@ env:
   TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
 
 jobs:
-  build:
-    name: Third party tests [liger-kernels, vllm]
+  small-tests:
+    name: Third party tests [vllm, sglang]
     runs-on:
       - linux
       - ${{ inputs.runner_label || 'max1550' }}
-    timeout-minutes: 720
+    timeout-minutes: 120
     defaults:
       run:
         shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
@@ -47,14 +47,7 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v5
 
-      - name: Install Python
-        if: ${{ !(inputs.use_pyenv_python || false) }}
-        uses: actions/setup-python@v6
-        with:
-          python-version: ${{ env.PYTHON_VERSION }}
-
       - name: Install Python (from pyenv) ${{ inputs.python_version }}
-        if: ${{ inputs.use_pyenv_python }}
        uses: ./.github/actions/setup-pyenv-python
        with:
          python-version: ${{ env.PYTHON_VERSION }}
@@ -71,28 +64,78 @@ jobs:
       - name: Setup PyTorch
         uses: ./.github/actions/setup-pytorch
 
-      - name: Build Triton wheels
-        uses: ./.github/actions/setup-triton
-        with:
-          command: DEBUG=1 python -m build --wheel --no-isolation
-
-      - name: Install Triton
+      - name: Setup Triton
         id: install
-        run: |
-          pip install dist/*.whl
+        uses: ./.github/actions/setup-triton
 
       - name: Create reports dir
         run: |
           mkdir reports
           echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
 
+      - name: Run SGLANG tests
+        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
+        run: |
+          ./scripts/test-triton.sh --sglang --skip-pip-install --skip-pytorch-install
+
       - name: Run VLLM tests
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         run: |
           ./scripts/test-triton.sh --vllm --skip-pip-install --skip-pytorch-install
 
-      - name: Run Liger-Kernel tests
+      - name: Upload test report
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-main-reports
+          path: reports
+
+  # We run all tests for Liger, so it's slow and we test it separately
+  liger:
+    name: Liger testing
+    runs-on:
+      - linux
+      - ${{ inputs.runner_label || 'max1550' }}
+    timeout-minutes: 120
+    defaults:
+      run:
+        shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
+    steps:
+      - name: Print inputs
+        run: |
+          cat <> $GITHUB_ENV
+
+      - name: Run Liger-Kernel tests
         run: |
           ./scripts/test-triton.sh --liger --skip-pip-install --skip-pytorch-install
 
@@ -100,5 +143,5 @@ jobs:
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         uses: actions/upload-artifact@v4
         with:
-          name: test-reports
+          name: test-liger-reports
           path: reports
diff --git a/benchmarks/third_party/sglang/sglang-fix.patch b/benchmarks/third_party/sglang/sglang-fix.patch
index 9b9d38dc43..b3769b6385 100644
--- a/benchmarks/third_party/sglang/sglang-fix.patch
+++ b/benchmarks/third_party/sglang/sglang-fix.patch
@@ -1,9 +1,9 @@
-diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
-index bc2affa1..8ef91e66 100644
---- a/python/sglang/srt/utils.py
-+++ b/python/sglang/srt/utils.py
-@@ -228,6 +228,22 @@ def is_flashinfer_available():
-     return importlib.util.find_spec("flashinfer") is not None and is_cuda()
+diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py
+index 7c2f573e4..8023cd6be 100644
+--- a/python/sglang/srt/utils/common.py
++++ b/python/sglang/srt/utils/common.py
+@@ -155,12 +155,44 @@ def is_cpu() -> bool:
+     return os.getenv("SGLANG_USE_CPU_ENGINE", "0") == "1" and is_host_cpu_x86()
 
 +def auto_detect_device():
 +    """
 +    Infer the device type based on the current environment.
 +    """
 +    if is_cuda_alike():
 +        return "cuda"
 +    elif is_xpu():
 +        return "xpu"
 +    elif is_hpu():
 +        return "hpu"
 +    elif is_npu():
 +        return "npu"
 +    else:
 +        return "cpu"
 +
 +
- _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
-     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
- )
+ def get_cuda_version():
+     if torch.version.cuda:
+         return tuple(map(int, torch.version.cuda.split(".")))
+     return (0, 0)
+
+
++def auto_detect_device():
++    """
++    Infer the device type based on the current environment.
++    """
++    if is_cuda_alike():
++        return "cuda"
++    elif is_xpu():
++        return "xpu"
++    elif is_hpu():
++        return "hpu"
++    elif is_npu():
++        return "npu"
++    else:
++        return "cpu"
++
++
+ def _check(cc_major):
+     if not is_cuda():
+         return False
 diff --git a/test/srt/test_triton_attention_kernels.py b/test/srt/test_triton_attention_kernels.py
-index 47eb16a9..cce70fb9 100644
+index 16c107006..03b9411fa 100644
 --- a/test/srt/test_triton_attention_kernels.py
 +++ b/test/srt/test_triton_attention_kernels.py
-@@ -16,8 +16,11 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+@@ -18,8 +18,11 @@ from sglang.srt.layers.attention.triton_ops.extend_attention import (
+ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
      context_attention_fwd,
  )
- from sglang.test.test_utils import CustomTestCase
 +from sglang.srt.utils import auto_detect_device
-
+ from sglang.test.test_utils import CustomTestCase
 
 +device = auto_detect_device()
 +
-
- class TestTritonAttention(CustomTestCase):
-     def _set_all_seeds(self, seed):
-@@ -37,24 +40,24 @@ class TestTritonAttention(CustomTestCase):
+
+ def extend_attention_fwd_torch(
+     q: torch.Tensor,  # [extend_tokens, H_Q, D]
+@@ -114,24 +117,24 @@ class TestTritonAttention(CustomTestCase):
          dtype = torch.bfloat16
 
          b_seq_len_prefix = torch.randint(
-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
         )
         b_seq_len_extend = torch.randint(
-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
         )
         b_seq_len = b_seq_len_prefix + b_seq_len_extend
 
-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
         b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
-        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
         b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
         kv_indices = torch.zeros(
-            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
+            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device=device
         )
 
         for i in range(B):
-@@ -65,15 +68,15 @@ class TestTritonAttention(CustomTestCase):
+@@ -142,15 +145,15 @@ class TestTritonAttention(CustomTestCase):
         total_token_num = torch.sum(b_seq_len).item()
         extend_token_num = torch.sum(b_seq_len_extend).item()
         k_buffer = torch.empty(
-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+            (total_token_num, H_KV, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
         v_buffer = torch.empty(
-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+            (total_token_num, H_KV, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
 
-        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
         for i in range(B):
             extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
             extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
-@@ -86,20 +89,20 @@ class TestTritonAttention(CustomTestCase):
+@@ -163,20 +166,20 @@ class TestTritonAttention(CustomTestCase):
             extend_start_in_buffer:extend_end_in_buffer
         ]
         q_extend[extend_start:extend_end] = torch.empty(
-            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
+            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
-        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
 
         custom_mask = None
-@@ -123,9 +126,9 @@ class TestTritonAttention(CustomTestCase):
+@@ -200,9 +203,9 @@ class TestTritonAttention(CustomTestCase):
         b_seq_mask_len = b_seq_len_extend * b_seq_len
         custom_mask = torch.ones(
-            (b_seq_mask_len.sum().item(),), dtype=torch.bool, device="cuda"
+            (b_seq_mask_len.sum().item(),), dtype=torch.bool, device=device
         )
         mask_indptr[1 : B + 1] = torch.cumsum(b_seq_mask_len[:B], dim=0)
         for i in range(B):
             causal_mask = (
+@@ -263,22 +266,22 @@ class TestTritonAttention(CustomTestCase):
+         dtype = torch.bfloat16
+ 
+         b_seq_len_prefix = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len_extend = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len = b_seq_len_prefix + b_seq_len_extend
+ 
+-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+-        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+ 
+-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+         kv_indices = torch.zeros(
+-            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
++            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device=device
+         )
+ 
+         for i in range(B):
+@@ -289,15 +292,15 @@ class TestTritonAttention(CustomTestCase):
+         total_token_num = torch.sum(b_seq_len).item()
+         extend_token_num = torch.sum(b_seq_len_extend).item()
+         k_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+         v_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+-        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+         for i in range(B):
+             extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+             extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+@@ -310,19 +313,19 @@ class TestTritonAttention(CustomTestCase):
+             extend_start_in_buffer:extend_end_in_buffer
+         ]
+         q_extend[extend_start:extend_end] = torch.empty(
+-            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
++            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+         o_extend_triton = torch.empty(
+-            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
++            (extend_token_num, H_Q, D), dtype=dtype, device=device
+         )
+         o_extend_torch = torch.empty(
+-            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
++            (extend_token_num, H_Q, D), dtype=dtype, device=device
+         )
+ 
+         b_seq_len_extend = b_seq_len - b_seq_len_prefix
+         max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+-        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+ 
+         extend_attention_fwd(
-@@ -187,14 +190,14 @@ class TestTritonAttention(CustomTestCase):
+@@ -373,14 +376,14 @@ class TestTritonAttention(CustomTestCase):
         max_seq_len = max(seq_lens)
 
         # Create random input tensors
-        q = torch.randn(sum(seq_lens), H_Q, D, dtype=dtype, device="cuda")
-        k = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device="cuda")
-        v = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device="cuda")
-        o = torch.zeros(sum(seq_lens), H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(sum(seq_lens), H_Q, D, dtype=dtype, device=device)
+        k = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device=device)
+        v = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device=device)
+        o = torch.zeros(sum(seq_lens), H_Q, D, dtype=dtype, device=device)
 
-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
-        b_seq_len = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+        b_seq_len = torch.tensor(seq_lens, dtype=torch.int32, device=device)
 
         context_attention_fwd(
             q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=is_causal
-@@ -232,33 +235,33 @@ class TestTritonAttention(CustomTestCase):
+@@ -418,33 +421,33 @@ class TestTritonAttention(CustomTestCase):
         total_tokens = B * seq_len
         sm_scale = 1.0 / (D**0.5)
         max_kv_splits = 8
-        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
-        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        o = torch.zeros(B, H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(B, H_Q, D, dtype=dtype, device=device)
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        o = torch.zeros(B, H_Q, D, dtype=dtype, device=device)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(
-            torch.full((B,), seq_len, dtype=torch.int32, device="cuda"), dim=0
+            torch.full((B,), seq_len, dtype=torch.int32, device=device), dim=0
         )
-        kv_indices = torch.arange(total_tokens, device="cuda")
+        kv_indices = torch.arange(total_tokens, device=device)
 
         attn_logits = torch.empty(
             (B, H_Q, max_kv_splits, D),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse = torch.empty(
-            (B, H_Q, max_kv_splits), dtype=torch.float32, device="cuda"
+            (B, H_Q, max_kv_splits), dtype=torch.float32, device=device
         )
 
         decode_attention_fwd(
-@@ -296,34 +299,34 @@ class TestTritonAttention(CustomTestCase):
+@@ -482,34 +485,34 @@ class TestTritonAttention(CustomTestCase):
         total_tokens = B * seq_len
         sm_scale = 1.0 / (D**0.5)
         max_kv_splits = 8
-        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
-        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        o = torch.zeros(B, H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(B, H_Q, D, dtype=dtype, device=device)
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        o = torch.zeros(B, H_Q, D, dtype=dtype, device=device)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(
-            torch.full((B,), seq_len, dtype=torch.int32, device="cuda"), dim=0
+            torch.full((B,), seq_len, dtype=torch.int32, device=device), dim=0
         )
-        kv_indices = torch.arange(total_tokens, device="cuda")
+        kv_indices = torch.arange(total_tokens, device=device)
 
         attn_logits = torch.empty(
             (B, H_Q, max_kv_splits, D),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse = torch.empty(
-            (B, H_Q, max_kv_splits), dtype=torch.float32, device="cuda"
+            (B, H_Q, max_kv_splits), dtype=torch.float32, device=device
         )
 
         decode_attention_fwd_normal(
-@@ -343,12 +346,12 @@ class TestTritonAttention(CustomTestCase):
+@@ -529,12 +532,12 @@ class TestTritonAttention(CustomTestCase):
         attn_logits1 = torch.empty(
             (B, H_Q, max_kv_splits, D_V),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse1 = torch.empty(
-            (B, H_Q, max_kv_splits), dtype=torch.float32, device="cuda"
+            (B, H_Q, max_kv_splits), dtype=torch.float32, device=device
         )
         decode_attention_fwd_grouped(
+@@ -578,23 +581,23 @@ class TestTritonAttention(CustomTestCase):
+         dtype = torch.bfloat16
+ 
+         b_seq_len_prefix = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len_extend = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len = b_seq_len_prefix + b_seq_len_extend
+ 
+-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+-        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+ 
+         # Setup prefix KV indices
+-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+         kv_indices = torch.zeros(
+-            (b_seq_len_prefix.sum().item(),), dtype=torch.int64, device="cuda"
++            (b_seq_len_prefix.sum().item(),), dtype=torch.int64, device=device
+         )
+ 
+         for i in range(B):
+@@ -605,15 +608,15 @@ class TestTritonAttention(CustomTestCase):
+         total_token_num = torch.sum(b_seq_len).item()
+         extend_token_num = torch.sum(b_seq_len_extend).item()
+         k_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+         v_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+-        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+ 
+         for i in range(B):
+             extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+@@ -627,16 +630,16 @@ class TestTritonAttention(CustomTestCase):
+             extend_start_in_buffer:extend_end_in_buffer
+         ]
+         q_extend[extend_start:extend_end] = torch.empty(
+-            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
++            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+         # Setup for extend attention
+         max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+-        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+ 
+         # Run 2-stage kernel
+-        o_regular = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        o_regular = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+         extend_attention_fwd(
+             q_extend,
+             k_extend,
+@@ -658,9 +661,9 @@ class TestTritonAttention(CustomTestCase):
+             total_token_num - extend_token_num,
+             total_token_num,
+             dtype=torch.int64,
+-            device="cuda",
++            device=device,
+         )
+-        extend_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        extend_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+         extend_start_loc[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+ 
+         unified_kv_indptr, unified_kv_indices, prefix_lens = build_unified_kv_indices(
+@@ -673,7 +676,7 @@ class TestTritonAttention(CustomTestCase):
+         )
+ 
+         # Run unified kernel
+-        o_unified = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        o_unified = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+         extend_attention_fwd_unified(
+             q_extend,
+             o_unified,
+@@ -716,7 +719,6 @@ class TestTritonAttention(CustomTestCase):
+         """Test build_unified_kv_indices correctness."""
+         B = 4
+         dtype = torch.int64
+-        device = "cuda"
+ 
+         # Setup test data
+         prefix_lens = torch.tensor([10, 20, 15, 25], dtype=torch.int32, device=device)
diff --git a/benchmarks/third_party/sglang/sglang-pin.txt b/benchmarks/third_party/sglang/sglang-pin.txt
new file mode 100644
index 0000000000..8f8517ba4b
--- /dev/null
+++ b/benchmarks/third_party/sglang/sglang-pin.txt
@@ -0,0 +1 @@
+d6fee73d1f593bd6754cd2550775fd2e54aeae60
diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh
index 0bdc5de7ad..bc347f4a00 100755
--- a/scripts/test-triton.sh
+++ b/scripts/test-triton.sh
@@ -30,6 +30,7 @@ TEST:
     --liger
     --vllm
     --install-vllm
+    --install-sglang
 OPTION:
     --unskip
@@ -74,6 +75,7 @@
 TEST_SGLANG=false
 TEST_LIGER=false
 TEST_VLLM=false
 INSTALL_VLLM=false
+INSTALL_SGLANG=false
 TEST_TRITON_KERNELS=false
 VENV=false
 TRITON_TEST_REPORTS=false
@@ -190,6 +192,11 @@ while (( $# != 0 )); do
       TEST_DEFAULT=false
       shift
       ;;
+    --install-sglang)
+      INSTALL_SGLANG=true
+      TEST_DEFAULT=false
+      shift
+      ;;
     --sglang)
       TEST_SGLANG=true
       TEST_DEFAULT=false
@@ -589,26 +596,41 @@ run_inductor_tests() {
   grep AlbertForMaskedLM inductor_log.csv | grep -q ,pass,
 }
 
-run_sglang_tests() {
-  echo "***************************************************"
-  echo "******    Running SGLang Triton tests       ******"
-  echo "***************************************************"
+run_sglang_install() {
+  echo "************************************************"
+  echo "******    Installing SGLang                ****"
+  echo "************************************************"
 
   if ! [ -d "./sglang" ]; then
     git clone https://github.com/sgl-project/sglang.git
   fi
-  cd sglang
 
   if ! pip list | grep "sglang" ; then
-    git apply $TRITON_PROJ/benchmarks/third_party/sglang/sglang-fix.patch
-    pip install "./python[dev_xpu]"
-
-    # SGLang installation breaks the default PyTorch and Triton versions, so we need to reinstall them.
-    $SCRIPTS_DIR/install-pytorch.sh --force-reinstall
-    $SCRIPTS_DIR/compile-triton.sh --triton
+    cd sglang
+    git checkout "$(<../benchmarks/third_party/sglang/sglang-pin.txt)"
+    git apply ../benchmarks/third_party/sglang/sglang-fix.patch
+
+    # This is how SGLang currently expects the target platform to be selected
+    cp python/pyproject_xpu.toml python/pyproject.toml
+    # Remove all torch-related requirements to avoid reinstalling triton & torch.
+    # Remove sgl-kernel: it is unused here and currently broken in this environment, probably because of the newer torch.
+    # Remove timm because it depends on torchvision, which depends on torch==2.9.
+    sed -i '/pytorch\|torch\|sgl-kernel\|timm/d' python/pyproject.toml
+    cat python/pyproject.toml
+    pip install -e "./python"
+    cd ..
   fi
 
-  pip install pytest pytest-xdist
+  pip install pytest pytest-cov pytest-xdist
+}
+
+run_sglang_tests() {
+  echo "***************************************************"
+  echo "******    Running SGLang Triton tests       ******"
+  echo "***************************************************"
+
+  run_sglang_install
+  cd sglang
 
   run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} test/srt/test_triton_attention_kernels.py
 }
@@ -648,6 +670,7 @@ run_vllm_install() {
   cd vllm-xpu-kernels
   git checkout "$(<../benchmarks/third_party/vllm/vllm-kernels-pin.txt)"
   sed -i '/pytorch\|torch/d' requirements.txt
+  sed -i '/pytorch\|torch/d' pyproject.toml
   pip install -r requirements.txt
   VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e .
   cd ..
@@ -672,7 +695,7 @@ run_vllm_tests() {
 
 run_triton_kernels_tests() {
   echo "***************************************************"
-  echo "******    Running Triton Kernels tests      ******"
+  echo "******    Running Triton Kernels tests      *******"
   echo "***************************************************"
 
   cd $TRITON_PROJ/python/triton_kernels/tests
@@ -745,6 +768,9 @@ test_triton() {
   if [ "$TEST_INDUCTOR" == true ]; then
     run_inductor_tests
   fi
+  if [ "$INSTALL_SGLANG" == true ]; then
+    run_sglang_install
+  fi
   if [ "$TEST_SGLANG" == true ]; then
     run_sglang_tests
   fi