
Commit 39c7bd0

align the interface
1 parent e8dd142 commit 39c7bd0


4 files changed, +25 -26 lines changed


python/sgl_kernel/flash_attn.py

Lines changed: 12 additions & 6 deletions
@@ -3,10 +3,10 @@
 import torch
 import torch.nn as nn
 
-try:
-    from sgl_kernel import flash_ops
-except:
-    raise ImportError("Can not import sgl_kernel. Please check your installation.")
+# try:
+#     from sgl_kernel import flash_ops
+# except:
+#     raise ImportError("Can not import sgl_kernel. Please check your installation.")
 
 
 def is_fa3_supported(device=None) -> bool:
@@ -18,10 +18,16 @@ def is_fa3_supported(device=None) -> bool:
     # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
     # And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
    # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
-    return (
+    if torch.cuda.is_available():
+        return (
         torch.cuda.get_device_capability(device)[0] == 9
         or torch.cuda.get_device_capability(device)[0] == 8
-    ) and (torch.version.cuda >= "12.3")
+        ) and (torch.version.cuda >= "12.3")
+    elif torch.xpu.is_available():
+        device_name = torch.xpu.get_device_properties(0).name
+        return "B580" in device_name or "e211" in device_name
+    else:
+        return False
 
 
 def maybe_contiguous(x):
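
With this change, is_fa3_supported no longer assumes a CUDA build: it probes CUDA first, then Intel XPU (gating on the B580 / e211 device names), and otherwise returns False. A minimal usage sketch, assuming a pytest-style test module; the skip message and module layout are illustrative, not taken from the repository:

import pytest
import torch

from sgl_kernel.flash_attn import is_fa3_supported

# Skip the whole module on hardware the FA3 kernel was not built for,
# e.g. a CPU-only box or an unsupported XPU part.
if not is_fa3_supported():
    pytest.skip("FA3 kernels are unsupported on this device", allow_module_level=True)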

src/sycl/chunked_prefill.cpp

Lines changed: 8 additions & 17 deletions
@@ -133,7 +133,6 @@ struct Flash_fwd_params {
 
   // Local window size
   int window_size_left, window_size_right;
-  int attention_chunk;
 
   // Pointer to the RNG seed (idx 0) and offset (idx 1).
   uint64_t* rng_state;
@@ -541,14 +540,13 @@ std::vector<at::Tensor> mha_fwd(
     std::optional<const at::Tensor>& rotary_cos_,     // seqlen_ro x (rotary_dim / 2)
     std::optional<const at::Tensor>& rotary_sin_,     // seqlen_ro x (rotary_dim / 2)
     std::optional<const at::Tensor>& seqlens_rotary_, // b
-    // std::optional<at::Tensor> &q_descale_, // (b, h_k), not (b, h)
-    // std::optional<at::Tensor> &k_descale_, // (b, h_k)
-    // std::optional<at::Tensor> &v_descale_, // (b, h_k)
-    std::optional<double> softmax_scale_,
+    std::optional<at::Tensor>& q_descale_, // (b, h_k), not (b, h)
+    std::optional<at::Tensor>& k_descale_, // (b, h_k)
+    std::optional<at::Tensor>& v_descale_, // (b, h_k)
+    const float softmax_scale_,
     bool is_causal,
     int window_size_left,
     int window_size_right,
-    int attention_chunk,
     float const softcap,
     bool const is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
     std::optional<at::Tensor>& scheduler_metadata_, // (b + 1)
@@ -619,10 +617,8 @@ std::vector<at::Tensor> mha_fwd(
   int const total_k = !is_varlen_k ? batch_size * k.size(1) : k.size(0);
   int const num_heads_k = k.size(-2);
   int const batch_size_k = !paged_KV ? (!is_varlen_k ? k.size(0) : cu_seqlens_k.size(0) - 1) : page_table.size(0);
-  double softmax_scale = 1.0 / sqrt(double(head_size));
-  if (softmax_scale_.has_value()) {
-    softmax_scale = softmax_scale_.value();
-  }
+  float softmax_scale = softmax_scale_;
+
   if (!kv_batch_idx_.has_value()) {
     TORCH_CHECK(batch_size == batch_size_k, "batch_size must be equal to batch_size_k");
   }
@@ -791,8 +787,8 @@ std::vector<at::Tensor> mha_fwd(
 
   // Causal is the special case where window_size_right == 0 and window_size_left < 0.
   // Local is the more general case where window_size_right >= 0 or window_size_left >= 0.
-  params.is_causal = window_size_left < 0 && window_size_right == 0 && attention_chunk == 0;
-  params.is_local = (window_size_left >= 0 || window_size_right >= 0 || attention_chunk >= 1) && !params.is_causal;
+  params.is_causal = window_size_left < 0 && window_size_right == 0;
+  params.is_local = (window_size_left >= 0 || window_size_right >= 0) && !params.is_causal;
 
   // TODO: check this
   if (window_size_left < 0) {
@@ -801,13 +797,8 @@ std::vector<at::Tensor> mha_fwd(
   if (window_size_right < 0) {
     window_size_right = seqlen_q - 1;
   }
-  if (attention_chunk > 0) {
-    window_size_left = std::min(window_size_left, attention_chunk - 1);
-    window_size_right = std::min(window_size_right, attention_chunk - 1);
-  }
   params.window_size_left = window_size_left;
   params.window_size_right = window_size_right;
-  params.attention_chunk = attention_chunk;
 
   params.total_q = total_q;
   params.total_k = total_k;
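
Because mha_fwd now takes softmax_scale_ as a plain const float and no longer falls back to 1/sqrt(head_size) when the value is absent, the default has to be computed before crossing into C++. A hedged sketch of that defaulting on the Python side, assuming a wrapper that forwards softmax_scale to the op; the helper name is illustrative:

import math
from typing import Optional

import torch


def resolve_softmax_scale(q: torch.Tensor, softmax_scale: Optional[float]) -> float:
    # The SYCL kernel no longer derives 1/sqrt(head_dim) itself, so a
    # missing scale has to be filled in before calling into the op.
    if softmax_scale is None:
        softmax_scale = 1.0 / math.sqrt(q.shape[-1])
    return float(softmax_scale)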

src/torch_extension_sycl.cc

Lines changed: 1 addition & 2 deletions
@@ -16,10 +16,9 @@ limitations under the License.
 #include <torch/all.h>
 #include <torch/library.h>
 
-#include "sgl_kernel_torch_shim.h"
-
 #include "sgl_flash_kernel_ops.h"
 #include "sgl_kernel_ops.h"
+#include "sgl_kernel_torch_shim.h"
 
 TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
   /*

tests/test_flash_attention.py

Lines changed: 4 additions & 1 deletion
@@ -17,7 +17,10 @@
 
 def is_hopper():
     # Only Hopper supports different V headdim
-    return torch.cuda.get_device_properties(0).major >= 9
+    if torch.cuda.is_available():
+        return torch.cuda.get_device_properties(0).major >= 9
+    else:
+        return False
 
 
 def is_fa3_supported(device=None) -> bool:
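
The guarded is_hopper() now simply reports False on machines without CUDA instead of raising. A short sketch of how such a helper is typically consumed when parametrizing tests; the head-dim values and test name below are illustrative, not from the test file:

import pytest
import torch


def is_hopper():
    # Only Hopper supports a different V head dim.
    if torch.cuda.is_available():
        return torch.cuda.get_device_properties(0).major >= 9
    else:
        return False


# Exercise a distinct V head dim only where the hardware allows it.
@pytest.mark.parametrize("d_v", [64, 128] if is_hopper() else [128])
def test_v_headdim_choice(d_v):
    assert d_v in (64, 128)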
