diff --git a/.github/workflows/sglang-tests.yml b/.github/workflows/sglang-tests.yml
deleted file mode 100644
index dc5cabc991..0000000000
--- a/.github/workflows/sglang-tests.yml
+++ /dev/null
@@ -1,100 +0,0 @@
-name: Third party SGLang tests
-
-on:
-  workflow_dispatch:
-    inputs:
-      runner_label:
-        description: Runner label, keep empty for default
-        type: string
-        default: ""
-      use_pyenv_python:
-        description: Use Python built with pyenv
-        type: boolean
-        default: false
-  schedule:
-    # About midnight PST Sunday (UTC-8)
-    - cron: "5 10 * * SUN"
-
-
-# Cancels in-progress PR runs when the PR is updated. Manual runs are never cancelled.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && github.run_id || github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
-
-permissions: read-all
-
-env:
-  PYTHON_VERSION: "3.10"
-  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
-
-jobs:
-  build:
-    name: SGLang tests
-    runs-on:
-      - linux
-      - ${{ inputs.runner_label || 'rolling' }}
-    timeout-minutes: 720
-    defaults:
-      run:
-        shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
-    steps:
-      - name: Print inputs
-        run: |
-          cat <> $GITHUB_ENV
-
-      - name: Install SGLang
-        id: install
-        run: |
-          git clone https://github.com/sgl-project/sglang.git
-          cd sglang
-          git apply ../benchmarks/third_party/sglang/sglang-fix.patch
-          pip install "./python[dev_xpu]"
-
-      - name: Setup PyTorch
-        uses: ./.github/actions/setup-pytorch
-
-      - name: Setup Triton
-        uses: ./.github/actions/setup-triton
-
-      - name: Run SGLANG tests
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
-        run: |
-          ./scripts/test-triton.sh --sglang --skip-pip-install --skip-pytorch-install
-
-      - name: Upload test report
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-reports
-          path: reports
diff --git a/.github/workflows/third-party-tests.yml b/.github/workflows/third-party-tests.yml
index 41a38b5c3f..2dac6d56ba 100644
--- a/.github/workflows/third-party-tests.yml
+++ b/.github/workflows/third-party-tests.yml
@@ -1,4 +1,4 @@
-name: Third party tests [liger-kernels, vllm]
+name: Third party tests [liger-kernels, vllm, sglang]
 
 on:
   workflow_dispatch:
@@ -28,12 +28,12 @@ env:
   TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
 
 jobs:
-  build:
-    name: Third party tests [liger-kernels, vllm]
+  small-tests:
+    name: Third party tests [vllm, sglang]
     runs-on:
       - linux
       - ${{ inputs.runner_label || 'max1550' }}
-    timeout-minutes: 720
+    timeout-minutes: 120
     defaults:
       run:
         shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
@@ -47,14 +47,7 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v5
 
-      - name: Install Python
-        if: ${{ !(inputs.use_pyenv_python || false) }}
-        uses: actions/setup-python@v6
-        with:
-          python-version: ${{ env.PYTHON_VERSION }}
-
       - name: Install Python (from pyenv) ${{ inputs.python_version }}
-        if: ${{ inputs.use_pyenv_python }}
        uses: ./.github/actions/setup-pyenv-python
        with:
          python-version: ${{ env.PYTHON_VERSION }}
@@ -71,28 +64,78 @@ jobs:
       - name: Setup PyTorch
         uses: ./.github/actions/setup-pytorch
 
-      - name: Build Triton wheels
-        uses: ./.github/actions/setup-triton
-        with:
-          command: DEBUG=1 python -m build --wheel --no-isolation
-
-      - name: Install Triton
+      - name: Setup Triton
         id: install
-        run: |
-          pip install dist/*.whl
+        uses: ./.github/actions/setup-triton
 
       - name: Create reports dir
         run: |
           mkdir reports
           echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
 
+      - name: Run SGLANG tests
+        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
+        run: |
+          ./scripts/test-triton.sh --sglang --skip-pip-install --skip-pytorch-install
+
       - name: Run VLLM tests
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         run: |
           ./scripts/test-triton.sh --vllm --skip-pip-install --skip-pytorch-install
 
-      - name: Run Liger-Kernel tests
+      - name: Upload test report
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-main-reports
+          path: reports
+
+  # We run all tests for Liger, so it's slow and we test it separately
+  liger:
+    name: Liger testing
+    runs-on:
+      - linux
+      - ${{ inputs.runner_label || 'max1550' }}
+    timeout-minutes: 120
+    defaults:
+      run:
+        shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
+    steps:
+      - name: Print inputs
+        run: |
+          cat <> $GITHUB_ENV
+
+      - name: Run Liger-Kernel tests
         run: |
           ./scripts/test-triton.sh --liger --skip-pip-install --skip-pytorch-install
 
@@ -100,5 +143,5 @@ jobs:
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         uses: actions/upload-artifact@v4
         with:
-          name: test-reports
+          name: test-liger-reports
           path: reports
diff --git a/benchmarks/third_party/sglang/sglang-fix.patch b/benchmarks/third_party/sglang/sglang-fix.patch
index 9b9d38dc43..b3769b6385 100644
--- a/benchmarks/third_party/sglang/sglang-fix.patch
+++ b/benchmarks/third_party/sglang/sglang-fix.patch
@@ -1,9 +1,9 @@
-diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
-index bc2affa1..8ef91e66 100644
---- a/python/sglang/srt/utils.py
-+++ b/python/sglang/srt/utils.py
-@@ -228,6 +228,22 @@ def is_flashinfer_available():
-     return importlib.util.find_spec("flashinfer") is not None and is_cuda()
+diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py
+index 7c2f573e4..8023cd6be 100644
+--- a/python/sglang/srt/utils/common.py
++++ b/python/sglang/srt/utils/common.py
+@@ -155,12 +155,44 @@ def is_cpu() -> bool:
+     return os.getenv("SGLANG_USE_CPU_ENGINE", "0") == "1" and is_host_cpu_x86()
 
 +def auto_detect_device():
 +    """
 +    Infer the device type based on the current environment.
 +    """
 +    if is_cuda_alike():
 +        return "cuda"
 +    elif is_xpu():
 +        return "xpu"
 +    elif is_hpu():
 +        return "hpu"
 +    elif is_npu():
 +        return "npu"
 +    else:
 +        return "cpu"
 +
 +
- _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
-     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
- )
+ def get_cuda_version():
+     if torch.version.cuda:
+         return tuple(map(int, torch.version.cuda.split(".")))
+     return (0, 0)
+
+
++def auto_detect_device():
++    """
++    Infer the device type based on the current environment.
++    """
++    if is_cuda_alike():
++        return "cuda"
++    elif is_xpu():
++        return "xpu"
++    elif is_hpu():
++        return "hpu"
++    elif is_npu():
++        return "npu"
++    else:
++        return "cpu"
++
++
+ def _check(cc_major):
+     if not is_cuda():
+         return False
 diff --git a/test/srt/test_triton_attention_kernels.py b/test/srt/test_triton_attention_kernels.py
-index 47eb16a9..cce70fb9 100644
+index 16c107006..03b9411fa 100644
 --- a/test/srt/test_triton_attention_kernels.py
 +++ b/test/srt/test_triton_attention_kernels.py
-@@ -16,8 +16,11 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+@@ -18,8 +18,11 @@ from sglang.srt.layers.attention.triton_ops.extend_attention import (
+ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
      context_attention_fwd,
  )
- from sglang.test.test_utils import CustomTestCase
 +from sglang.srt.utils import auto_detect_device
-
+ from sglang.test.test_utils import CustomTestCase
 
 +device = auto_detect_device()
 +
-
- class TestTritonAttention(CustomTestCase):
-     def _set_all_seeds(self, seed):
-@@ -37,24 +40,24 @@ class TestTritonAttention(CustomTestCase):
+
+ def extend_attention_fwd_torch(
+     q: torch.Tensor,  # [extend_tokens, H_Q, D]
+@@ -114,24 +117,24 @@ class TestTritonAttention(CustomTestCase):
          dtype = torch.bfloat16
 
          b_seq_len_prefix = torch.randint(
-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
         )
         b_seq_len_extend = torch.randint(
-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
         )
         b_seq_len = b_seq_len_prefix + b_seq_len_extend
 
-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
         b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
-        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
         b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
         kv_indices = torch.zeros(
-            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
+            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device=device
         )
 
         for i in range(B):
-@@ -65,15 +68,15 @@ class TestTritonAttention(CustomTestCase):
+@@ -142,15 +145,15 @@ class TestTritonAttention(CustomTestCase):
         total_token_num = torch.sum(b_seq_len).item()
         extend_token_num = torch.sum(b_seq_len_extend).item()
         k_buffer = torch.empty(
-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+            (total_token_num, H_KV, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
         v_buffer = torch.empty(
-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+            (total_token_num, H_KV, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
 
-        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
         for i in range(B):
             extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
             extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
-@@ -86,20 +89,20 @@ class TestTritonAttention(CustomTestCase):
+@@ -163,20 +166,20 @@ class TestTritonAttention(CustomTestCase):
             extend_start_in_buffer:extend_end_in_buffer
         ]
         q_extend[extend_start:extend_end] = torch.empty(
-            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
+            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device
         ).normal_(mean=0.1, std=0.2)
-        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
 
         custom_mask = None
-@@ -123,9 +126,9 @@ class TestTritonAttention(CustomTestCase):
+@@ -200,9 +203,9 @@ class TestTritonAttention(CustomTestCase):
         b_seq_mask_len = b_seq_len_extend * b_seq_len
         custom_mask = torch.ones(
-            (b_seq_mask_len.sum().item(),), dtype=torch.bool, device="cuda"
+            (b_seq_mask_len.sum().item(),), dtype=torch.bool, device=device
         )
         mask_indptr[1 : B + 1] = torch.cumsum(b_seq_mask_len[:B], dim=0)
         for i in range(B):
             causal_mask = (
+@@ -263,22 +266,22 @@ class TestTritonAttention(CustomTestCase):
+         dtype = torch.bfloat16
+ 
+         b_seq_len_prefix = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len_extend = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len = b_seq_len_prefix + b_seq_len_extend
+ 
+-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+-        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+ 
+-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+         kv_indices = torch.zeros(
+-            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
++            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device=device
+         )
+ 
+         for i in range(B):
+@@ -289,15 +292,15 @@ class TestTritonAttention(CustomTestCase):
+         total_token_num = torch.sum(b_seq_len).item()
+         extend_token_num = torch.sum(b_seq_len_extend).item()
+         k_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+         v_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+-        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+         for i in range(B):
+             extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+             extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+@@ -310,19 +313,19 @@ class TestTritonAttention(CustomTestCase):
+             extend_start_in_buffer:extend_end_in_buffer
+         ]
+         q_extend[extend_start:extend_end] = torch.empty(
+-            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
++            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+         o_extend_triton = torch.empty(
+-            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
++            (extend_token_num, H_Q, D), dtype=dtype, device=device
+         )
+         o_extend_torch = torch.empty(
+-            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
++            (extend_token_num, H_Q, D), dtype=dtype, device=device
+         )
+ 
+         b_seq_len_extend = b_seq_len - b_seq_len_prefix
+         max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+-        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+ 
+         extend_attention_fwd(
-@@ -187,14 +190,14 @@ class TestTritonAttention(CustomTestCase):
+@@ -373,14 +376,14 @@ class TestTritonAttention(CustomTestCase):
         max_seq_len = max(seq_lens)
 
         # Create random input tensors
-        q = torch.randn(sum(seq_lens), H_Q, D, dtype=dtype, device="cuda")
-        k = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device="cuda")
-        v = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device="cuda")
-        o = torch.zeros(sum(seq_lens), H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(sum(seq_lens), H_Q, D, dtype=dtype, device=device)
+        k = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device=device)
+        v = torch.randn(sum(seq_lens), H_KV, D, dtype=dtype, device=device)
+        o = torch.zeros(sum(seq_lens), H_Q, D, dtype=dtype, device=device)
 
-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
-        b_seq_len = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+        b_seq_len = torch.tensor(seq_lens, dtype=torch.int32, device=device)
 
         context_attention_fwd(
             q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=is_causal
-@@ -232,33 +235,33 @@ class TestTritonAttention(CustomTestCase):
+@@ -418,33 +421,33 @@ class TestTritonAttention(CustomTestCase):
         total_tokens = B * seq_len
         sm_scale = 1.0 / (D**0.5)
         max_kv_splits = 8
-        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
-        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        o = torch.zeros(B, H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(B, H_Q, D, dtype=dtype, device=device)
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        o = torch.zeros(B, H_Q, D, dtype=dtype, device=device)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(
-            torch.full((B,), seq_len, dtype=torch.int32, device="cuda"), dim=0
+            torch.full((B,), seq_len, dtype=torch.int32, device=device), dim=0
         )
-        kv_indices = torch.arange(total_tokens, device="cuda")
+        kv_indices = torch.arange(total_tokens, device=device)
 
         attn_logits = torch.empty(
             (B, H_Q, max_kv_splits, D),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse = torch.empty(
-            (B, H_Q, max_kv_splits), dtype=torch.float32, device="cuda"
+            (B, H_Q, max_kv_splits), dtype=torch.float32, device=device
         )
 
         decode_attention_fwd(
-@@ -296,34 +299,34 @@ class TestTritonAttention(CustomTestCase):
+@@ -482,34 +485,34 @@ class TestTritonAttention(CustomTestCase):
         total_tokens = B * seq_len
         sm_scale = 1.0 / (D**0.5)
         max_kv_splits = 8
-        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
-        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
-        o = torch.zeros(B, H_Q, D, dtype=dtype, device="cuda")
+        q = torch.randn(B, H_Q, D, dtype=dtype, device=device)
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        o = torch.zeros(B, H_Q, D, dtype=dtype, device=device)
 
-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
         kv_indptr[1 : B + 1] = torch.cumsum(
-            torch.full((B,), seq_len, dtype=torch.int32, device="cuda"), dim=0
+            torch.full((B,), seq_len, dtype=torch.int32, device=device), dim=0
         )
-        kv_indices = torch.arange(total_tokens, device="cuda")
+        kv_indices = torch.arange(total_tokens, device=device)
 
         attn_logits = torch.empty(
             (B, H_Q, max_kv_splits, D),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse = torch.empty(
-            (B, H_Q, max_kv_splits), dtype=torch.float32, device="cuda"
+            (B, H_Q, max_kv_splits), dtype=torch.float32, device=device
         )
 
         decode_attention_fwd_normal(
-@@ -343,12 +346,12 @@ class TestTritonAttention(CustomTestCase):
+@@ -529,12 +532,12 @@ class TestTritonAttention(CustomTestCase):
         attn_logits1 = torch.empty(
             (B, H_Q, max_kv_splits, D_V),
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         attn_lse1 = torch.empty(
-            (B, H_Q, max_kv_splits), dtype=torch.float32, device="cuda"
+            (B, H_Q, max_kv_splits), dtype=torch.float32, device=device
         )
         decode_attention_fwd_grouped(
+@@ -578,23 +581,23 @@ class TestTritonAttention(CustomTestCase):
+         dtype = torch.bfloat16
+ 
+         b_seq_len_prefix = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len_extend = torch.randint(
+-            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
++            1, N_CTX // 2, (B,), dtype=torch.int32, device=device
+         )
+         b_seq_len = b_seq_len_prefix + b_seq_len_extend
+ 
+-        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+-        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
+         b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+ 
+         # Setup prefix KV indices
+-        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+         kv_indices = torch.zeros(
+-            (b_seq_len_prefix.sum().item(),), dtype=torch.int64, device="cuda"
++            (b_seq_len_prefix.sum().item(),), dtype=torch.int64, device=device
+         )
+ 
+         for i in range(B):
+@@ -605,15 +608,15 @@ class TestTritonAttention(CustomTestCase):
+         total_token_num = torch.sum(b_seq_len).item()
+         extend_token_num = torch.sum(b_seq_len_extend).item()
+         k_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+         v_buffer = torch.empty(
+-            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
++            (total_token_num, H_KV, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+-        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+-        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
++        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+ 
+         for i in range(B):
+             extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+@@ -627,16 +630,16 @@ class TestTritonAttention(CustomTestCase):
+             extend_start_in_buffer:extend_end_in_buffer
+         ]
+         q_extend[extend_start:extend_end] = torch.empty(
+-            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
++            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device
+         ).normal_(mean=0.1, std=0.2)
+ 
+         # Setup for extend attention
+         max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+-        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
++        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+         qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+ 
+         # Run 2-stage kernel
+-        o_regular = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        o_regular = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+         extend_attention_fwd(
+             q_extend,
+             k_extend,
+@@ -658,9 +661,9 @@ class TestTritonAttention(CustomTestCase):
+             total_token_num - extend_token_num,
+             total_token_num,
+             dtype=torch.int64,
+-            device="cuda",
++            device=device,
+         )
+-        extend_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
++        extend_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+         extend_start_loc[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+ 
+         unified_kv_indptr, unified_kv_indices, prefix_lens = build_unified_kv_indices(
+@@ -673,7 +676,7 @@ class TestTritonAttention(CustomTestCase):
+         )
+ 
+         # Run unified kernel
+-        o_unified = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
++        o_unified = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+         extend_attention_fwd_unified(
+             q_extend,
+             o_unified,
+@@ -716,7 +719,6 @@ class TestTritonAttention(CustomTestCase):
+         """Test build_unified_kv_indices correctness."""
+         B = 4
+         dtype = torch.int64
+-        device = "cuda"
+ 
+         # Setup test data
+         prefix_lens = torch.tensor([10, 20, 15, 25], dtype=torch.int32, device=device)
diff --git a/benchmarks/third_party/sglang/sglang-pin.txt b/benchmarks/third_party/sglang/sglang-pin.txt
new file mode 100644
index 0000000000..8f8517ba4b
--- /dev/null
+++ b/benchmarks/third_party/sglang/sglang-pin.txt
@@ -0,0 +1 @@
+d6fee73d1f593bd6754cd2550775fd2e54aeae60
diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh
index 0bdc5de7ad..bc347f4a00 100755
--- a/scripts/test-triton.sh
+++ b/scripts/test-triton.sh
@@ -30,6 +30,7 @@ TEST:
     --liger
     --vllm
     --install-vllm
+    --install-sglang
 OPTION:
     --unskip
@@ -74,6 +75,7 @@
 TEST_SGLANG=false
 TEST_LIGER=false
 TEST_VLLM=false
 INSTALL_VLLM=false
+INSTALL_SGLANG=false
 TEST_TRITON_KERNELS=false
 VENV=false
 TRITON_TEST_REPORTS=false
@@ -190,6 +192,11 @@ while (( $# != 0 )); do
       TEST_DEFAULT=false
       shift
       ;;
+    --install-sglang)
+      INSTALL_SGLANG=true
+      TEST_DEFAULT=false
+      shift
+      ;;
     --sglang)
       TEST_SGLANG=true
       TEST_DEFAULT=false
@@ -589,26 +596,41 @@ run_inductor_tests() {
   grep AlbertForMaskedLM inductor_log.csv | grep -q ,pass,
 }
 
-run_sglang_tests() {
-  echo "***************************************************"
-  echo "******    Running SGLang Triton tests       ******"
-  echo "***************************************************"
+run_sglang_install() {
+  echo "************************************************"
+  echo "******    Installing SGLang                ****"
+  echo "************************************************"
 
   if ! [ -d "./sglang" ]; then
     git clone https://github.com/sgl-project/sglang.git
   fi
-  cd sglang
 
   if ! pip list | grep "sglang" ; then
-    git apply $TRITON_PROJ/benchmarks/third_party/sglang/sglang-fix.patch
-    pip install "./python[dev_xpu]"
-
-    # SGLang installation breaks the default PyTorch and Triton versions, so we need to reinstall them.
-    $SCRIPTS_DIR/install-pytorch.sh --force-reinstall
-    $SCRIPTS_DIR/compile-triton.sh --triton
+    cd sglang
+    git checkout "$(<../benchmarks/third_party/sglang/sglang-pin.txt)"
+    git apply ../benchmarks/third_party/sglang/sglang-fix.patch
+
+    # This is how SGLang currently expects the target platform to be selected
+    cp python/pyproject_xpu.toml python/pyproject.toml
+    # Remove all torch-related requirements to avoid reinstalling triton & torch.
+    # Remove sgl-kernel: it is unused here and currently broken in this environment, probably because of the newer torch.
+    # Remove timm because it depends on torchvision, which depends on torch==2.9.
+    sed -i '/pytorch\|torch\|sgl-kernel\|timm/d' python/pyproject.toml
+    cat python/pyproject.toml
+    pip install -e "./python"
+    cd ..
   fi
 
-  pip install pytest pytest-xdist
+  pip install pytest pytest-cov pytest-xdist
+}
+
+run_sglang_tests() {
+  echo "***************************************************"
+  echo "******    Running SGLang Triton tests       ******"
+  echo "***************************************************"
+
+  run_sglang_install
+  cd sglang
 
   run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} test/srt/test_triton_attention_kernels.py
 }
@@ -648,6 +670,7 @@ run_vllm_install() {
   cd vllm-xpu-kernels
   git checkout "$(<../benchmarks/third_party/vllm/vllm-kernels-pin.txt)"
   sed -i '/pytorch\|torch/d' requirements.txt
+  sed -i '/pytorch\|torch/d' pyproject.toml
   pip install -r requirements.txt
   VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e .
   cd ..
@@ -672,7 +695,7 @@ run_vllm_tests() {
 
 run_triton_kernels_tests() {
   echo "***************************************************"
-  echo "******    Running Triton Kernels tests      ******"
+  echo "******    Running Triton Kernels tests      *******"
   echo "***************************************************"
 
   cd $TRITON_PROJ/python/triton_kernels/tests
@@ -745,6 +768,9 @@ test_triton() {
   if [ "$TEST_INDUCTOR" == true ]; then
     run_inductor_tests
   fi
+  if [ "$INSTALL_SGLANG" == true ]; then
+    run_sglang_install
+  fi
   if [ "$TEST_SGLANG" == true ]; then
     run_sglang_tests
   fi