diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 884e715f..14e5df33 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -77,12 +77,20 @@ from torch.func import functional_call
 from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
-from triton.runtime.cache import (
-    FileCacheManager,
-    default_cache_dir,
-    default_dump_dir,
-    default_override_dir,
-)
+try:
+    from triton.runtime.cache import (
+        FileCacheManager,
+        default_cache_dir,
+        default_dump_dir,
+        default_override_dir,
+    )
+except ImportError:
+    from triton.runtime.cache import FileCacheManager
+    from triton.knobs import cache as tt_cache
+
+    default_cache_dir = lambda: tt_cache.dir
+    default_dump_dir = lambda: tt_cache.dump_dir
+    default_override_dir = lambda: tt_cache.override_dir
 
 logger = logging.getLogger(__name__)
 
@@ -156,6 +164,18 @@ def is_xpu() -> bool:
 def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()
 
+def infer_device():
+    """
+    Infer the device type based on the current environment.
+    """
+    if is_cuda_alike():
+        return "cuda"
+    elif is_xpu():
+        return "xpu"
+    elif is_hpu():
+        return "hpu"
+    else:
+        return "cpu"
 
 def is_flashinfer_available():
     """
diff --git a/test/srt/test_triton_attention_kernels.py b/test/srt/test_triton_attention_kernels.py
index 47eb16a9..9d6a0af0 100644
--- a/test/srt/test_triton_attention_kernels.py
+++ b/test/srt/test_triton_attention_kernels.py
@@ -16,8 +16,11 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
     context_attention_fwd,
 )
 from sglang.test.test_utils import CustomTestCase
+from sglang.srt.utils import infer_device
 
 
+device = infer_device()
+
 class TestTritonAttention(CustomTestCase):
 
     def _set_all_seeds(self, seed):
@@ -37,24 +40,24 @@ class TestTritonAttention(CustomTestCase):
         dtype = torch.bfloat16
 
         b_seq_len_prefix = torch.randint(
-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
         )
         b_seq_len_extend = torch.randint(
-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
         )
         b_seq_len = b_seq_len_prefix + b_seq_len_extend
         max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
 
-        b_req_idx = torch.arange(B, dtype=torch.int32, device="cuda")
-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_req_idx = torch.arange(B, dtype=torch.int32, device=device)
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
         b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
-        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
         b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
         kv_indices = torch.zeros(
-            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
+            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device=device
         )
 
         for i in range(B):
@@ -65,15 +68,15 @@ class TestTritonAttention(CustomTestCase):
         total_token_num = torch.sum(b_seq_len).item()
         extend_token_num = torch.sum(b_seq_len_extend).item()
         k_buffer = torch.empty(
-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+            (total_token_num, H_KV, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
         v_buffer = torch.empty(
-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+            (total_token_num, H_KV, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
 
-        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
         for i in range(B):
             extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
             extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
@@ -86,20 +89,20 @@ class TestTritonAttention(CustomTestCase):
                 extend_start_in_buffer:extend_end_in_buffer
             ]
             q_extend[extend_start:extend_end] = torch.empty(
-                (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
+                (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device
             ).normal_(mean=0.1, std=0.2)
 
-        o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
         o_extend_mask = torch.empty(
-            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+            (extend_token_num, H_Q, D), dtype=dtype, device=device
         )
         o_redundant = torch.empty(
-            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+            (extend_token_num, H_Q, D), dtype=dtype, device=device
         )
 
         b_seq_len_extend = b_seq_len - b_seq_len_prefix
         max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
-        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
 
         custom_mask = None
@@ -123,9 +126,9 @@ class TestTritonAttention(CustomTestCase):
 
         b_seq_mask_len = b_seq_len_extend * b_seq_len
         custom_mask = torch.ones(
-            (b_seq_mask_len.sum().item(),), dtype=torch.bool, device="cuda"
+            (b_seq_mask_len.sum().item(),), dtype=torch.bool, device=device
         )
-        mask_indptr = torch.zeros((B + 1,), dtype=torch.int64, device="cuda")
+        mask_indptr = torch.zeros((B + 1,), dtype=torch.int64, device=device)
         mask_indptr[1 : B + 1] = torch.cumsum(b_seq_mask_len[:B], dim=0)
         for i in range(B):
             causal_mask = (
@@ -187,14 +190,14 @@ class TestTritonAttention(CustomTestCase):
         max_seq_len = max(seq_lens)
 
         # Create random input tensors
-        q = torch.randn(sum(seq_lens), num_heads, head_dim, device="cuda")
-        k = torch.randn(sum(seq_lens), num_heads, head_dim, device="cuda")
-        v = torch.randn(sum(seq_lens), num_heads, head_dim, device="cuda")
-        o = torch.zeros(sum(seq_lens), num_heads, head_dim, device="cuda")
+        q = torch.randn(sum(seq_lens), num_heads, head_dim, device=device)
+        k = torch.randn(sum(seq_lens), num_heads, head_dim, device=device)
+        v = torch.randn(sum(seq_lens), num_heads, head_dim, device=device)
+        o = torch.zeros(sum(seq_lens), num_heads, head_dim, device=device)
 
         # Create b_start_loc and b_seq_len tensors
-        b_start_loc = torch.tensor([0, seq_lens[0]], device="cuda")
-        b_seq_len = torch.tensor(seq_lens, device="cuda")
+        b_start_loc = torch.tensor([0, seq_lens[0]], device=device)
+        b_seq_len = torch.tensor(seq_lens, device=device)
 
         context_attention_fwd(
             q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=is_causal
@@ -232,33 +235,33 @@ class TestTritonAttention(CustomTestCase):
         total_tokens = B * seq_len
         sm_scale = 1.0 / (D**0.5)
         max_kv_splits = 8
-        num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
+        num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device=device)
 
         # q represents the new token being generated, one per batch
-        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(B, H_Q, D, dtype=dtype, device=device)
 
         # k_buffer and v_buffer represent all previous tokens
-        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
 
         # o will have the same shape as q
-        o = torch.zeros(B, H_Q, D, dtype=dtype, device="cuda")
+        o = torch.zeros(B, H_Q, D, dtype=dtype, device=device)
 
-        b_seq_len = torch.full((B,), seq_len, device="cuda")
+        b_seq_len = torch.full((B,), seq_len, device=device)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len[:B], dim=0)
-        kv_indices = torch.arange(total_tokens, device="cuda")
+        kv_indices = torch.arange(total_tokens, device=device)
 
         attn_logits = torch.empty(
             (B, H_Q, max_kv_splits, D),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse = torch.empty(
             (B, H_Q, max_kv_splits),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
 
         decode_attention_fwd(
@@ -296,34 +299,34 @@ class TestTritonAttention(CustomTestCase):
         total_tokens = B * seq_len
         sm_scale = 1.0 / (D**0.5)
         max_kv_splits = 8
-        num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
+        num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device=device)
 
         # q represents the new token being generated, one per batch
-        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(B, H_Q, D, dtype=dtype, device=device)
 
         # k_buffer and v_buffer represent all previous tokens
-        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        v_buffer = torch.randn(total_tokens, H_KV, D_V, dtype=dtype, device="cuda")
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        v_buffer = torch.randn(total_tokens, H_KV, D_V, dtype=dtype, device=device)
 
         # o will have the same shape as q
-        o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
-        o_grouped = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+        o = torch.zeros(B, H_Q, D_V, dtype=dtype, device=device)
+        o_grouped = torch.zeros(B, H_Q, D_V, dtype=dtype, device=device)
 
-        b_seq_len = torch.full((B,), seq_len, device="cuda")
+        b_seq_len = torch.full((B,), seq_len, device=device)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len[:B], dim=0)
-        kv_indices = torch.arange(total_tokens, device="cuda")
+        kv_indices = torch.arange(total_tokens, device=device)
 
         attn_logits = torch.empty(
             (B, H_Q, max_kv_splits, D_V),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse = torch.empty(
             (B, H_Q, max_kv_splits),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
 
         decode_attention_fwd_normal(
@@ -343,12 +346,12 @@ class TestTritonAttention(CustomTestCase):
         attn_logits1 = torch.empty(
             (B, H_Q, max_kv_splits, D_V),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse1 = torch.empty(
             (B, H_Q, max_kv_splits, D_V),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
 
         decode_attention_fwd_grouped(