 import torch.nn.functional as F
 from einops import rearrange, repeat
 
+import utils
+
+device = utils.get_device()
+
 apply_rotary_emb = None
 
 
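The added lines replace the per-test hard-coded `device = "cuda"` (removed in the hunks below) with a module-level `device = utils.get_device()`. The `utils` module itself is not part of this diff, so the following is only a minimal sketch of what such a helper could look like; the function name matches the call above, but the fallback order and CPU default are assumptions.

```python
# Hypothetical sketch of utils.get_device(); the real module is not shown in
# this diff, so the fallback order and CPU default are assumptions.
import torch


def get_device(device_id: int = 0) -> str:
    """Prefer CUDA, then Intel XPU, then CPU."""
    if torch.cuda.is_available():
        return f"cuda:{device_id}"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return f"xpu:{device_id}"
    return "cpu"
```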
@@ -25,11 +29,14 @@ def is_fa3_supported(device=None) -> bool:
     # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
     # And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
     # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
-    return (
+    if torch.cuda.is_available():
+        return (
         torch.cuda.get_device_capability(device)[0] == 9
         or torch.cuda.get_device_capability(device)[0] == 8
-    ) and (torch.version.cuda >= "12.3")
-
+        ) and (torch.version.cuda >= "12.3")
+    elif torch.xpu.is_available():
+        device_name = torch.xpu.get_device_properties(0).name
+        return "B580" in device_name or "e211" in device_name
 
 DISABLE_BACKWARD = True
 # For CI test, we close them to True.
@@ -551,7 +558,6 @@ def test_flash_attn_kvcache(
         pytest.skip()
     if rotary_fraction == 0.0 and has_rotary_seqlens:
         pytest.skip()
-    device = "cuda"
     # set seed
     torch.random.manual_seed(0)
     batch_size = 5
@@ -1077,7 +1083,6 @@ def test_flash_attn_varlen_output(
 ):
     from sgl_kernel.flash_attn import flash_attn_varlen_func
 
-    device = "cuda"
     # set seed
     torch.random.manual_seed(seqlen_q + seqlen_k + d + int(causal) * 2 + int(local))
     # batch_size = 40
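Both `test_flash_attn_kvcache` and `test_flash_attn_varlen_output` drop their local `device = "cuda"` assignment, so tensor construction inside the tests presumably resolves against the module-level `device` set from `utils.get_device()` at import time, i.e. something of the form below (shapes and dtype are illustrative, not the tests' actual parametrization):

```python
# Illustrative only: shapes/dtype come from the test's parametrization; the
# point is that `device` is now the module-level utils.get_device() result.
q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=torch.float16)
```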