
Commit a094121: "fix device lost"
Parent: 39c7bd0

File tree: 6 files changed (+150, -209 lines)


include/sgl_flash_kernel_ops.h

Lines changed: 1 addition & 6 deletions
```diff
@@ -53,18 +53,13 @@ std::vector<at::Tensor> mha_fwd(
     std::optional<const at::Tensor>&
         v_new_, // (b, s_k_new, h_k, dv) or (total_k_new, h_k, dv) if there is cu_seqlens_k_new
     std::optional<const at::Tensor>& q_v_, // (b, s_q, h, dv) or (total_q_new, h, dv) if there is cu_seqlens_q
-    std::optional<at::Tensor>& out_, // (b, s_q, h, dv) or (total_q, h, dv) if there is cu_seqlens_q
     std::optional<const at::Tensor>& cu_seqlens_q_, // b+1
     std::optional<const at::Tensor>& cu_seqlens_k_, // b+1
     std::optional<const at::Tensor>& cu_seqlens_k_new_, // b+1
-    std::optional<const at::Tensor>&
-        seqused_q_, // b. If given, only this many elements of each batch element's queries and outputs are used.
-    std::optional<const at::Tensor>&
-        seqused_k_, // b. If given, only this many elements of each batch element's keys are used.
     std::optional<int> max_seqlen_q_,
-    // TODO: check if we need max_seqlen_k
     std::optional<int> max_seqlen_k_,
     std::optional<const at::Tensor>& page_table_, // (b_k, max_num_pages_per_seq)
+    std::optional<const at::Tensor>& num_pages_, // (b_k, )
     std::optional<const at::Tensor>& kv_batch_idx_, // b. indices to index into the KV cache
     std::optional<const at::Tensor>& leftpad_k_, // b
     std::optional<const at::Tensor>& rotary_cos_, // seqlen_ro x (rotary_dim / 2)
```
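The header change removes the `out_` output buffer and the per-sequence `seqused_q_`/`seqused_k_` overrides from `mha_fwd`, leaving the `cu_seqlens_*` offset tensors as the single way to express actual sequence lengths, and adds a `num_pages_` tensor of shape `(b_k,)` for the paged KV cache. As a minimal sketch of the `cu_seqlens` convention (toy lengths, not the kernel itself): per-sequence lengths become `b+1` cumulative offsets, so sequence `i` occupies `[cu_seqlens[i], cu_seqlens[i+1])` in the packed layout.

```python
import torch

# Toy illustration of the cu_seqlens convention that replaces the removed
# seqused_q_/seqused_k_ arguments (hypothetical lengths, b = 3 sequences).
seqlens = torch.tensor([3, 5, 2], dtype=torch.int32)
cu_seqlens = torch.concat(
    (torch.zeros(1, dtype=torch.int32), torch.cumsum(seqlens, 0))
).to(torch.int32)
print(cu_seqlens)  # tensor([ 0,  3,  8, 10], dtype=torch.int32), b+1 entries

# The per-sequence lengths are recoverable as the first difference:
assert torch.equal(cu_seqlens[1:] - cu_seqlens[:-1], seqlens)
```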

pyproject.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -31,6 +31,7 @@ exclude = [
 
 [tool.scikit-build]
 cmake.build-type = "Release"
+build-dir = "build"
 minimum-version = "build-system.requires"
 
 wheel.py-api = "cp39"
```
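Note on the scikit-build change: when `build-dir` is unset, scikit-build-core runs each build in a throwaway temporary directory; pinning it to `build` keeps the CMake cache in-tree, so repeated local builds can reuse it instead of configuring from scratch.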

python/sgl_kernel/flash_attn.py

Lines changed: 18 additions & 5 deletions
```diff
@@ -177,22 +177,35 @@ def flash_attn_with_kvcache(
     rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
     rotary_seqlens = maybe_contiguous(rotary_seqlens)
 
+    if cu_seqlens_q == None:  # !is_varlen_q
+        cu_seqlens_q = torch.arange(0, q.size(0)+1, dtype=torch.int, device=q.device) * q.size(1)
+        max_seqlen_q = q.size(1)
+        q = q.view(-1, q.size(-2), q.size(-1)).contiguous()
+    if cu_seqlens_k_new is None and k is not None:  # !is_varlen_k_new
+        cu_seqlens_k_new = torch.arange(0, k.size(0)+1, dtype=torch.int, device=k.device)
+    elif k is None:
+        cu_seqlens_k_new = torch.zeros_like(cu_seqlens_q, dtype=torch.int32, device=q.device)
+    if cache_seqlens is not None:
+        max_seqlen_k = cache_seqlens.max().item()
+        assert cache_seqlens.size(0) + 1 == cu_seqlens_q.size(0)
+        page_size = k_cache.size(1)
+        num_pages_per_seq = (cache_seqlens + page_size - 1) // page_size
+        cu_seqlens_k = torch.concat((torch.zeros(1, dtype=torch.int32, device=cache_seqlens.device), torch.cumsum(cache_seqlens, 0))).to(torch.int32)
+
     out, softmax_lse, *rest = torch.ops.sgl_kernel.fwd.default(
         q,
         k_cache,
         v_cache,
         k,
         v,
         qv,
-        None,  # out
         cu_seqlens_q,
-        None,  # cu_seqlens_k
+        cu_seqlens_k,
         cu_seqlens_k_new,
-        None,  # seqused_q
-        cache_seqlens,
         max_seqlen_q,
-        None,  # max_seqlen_k
+        max_seqlen_k,
         page_table,
+        num_pages_per_seq,
         cache_batch_idx,
         cache_leftpad,
         rotary_cos,
```
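The wrapper now does this normalization on the host before the op call: a fixed-length `(b, s_q, h, d)` batch is flattened into varlen form with an arithmetic-progression `cu_seqlens_q`, and the paged-KV metadata (`cu_seqlens_k`, `num_pages_per_seq`) is derived from `cache_seqlens`. A self-contained sketch of that arithmetic with made-up sizes (it does not call the kernel):

```python
import torch

# Made-up sizes: b = 2 cached sequences, page_size = k_cache.size(1) = 16.
cache_seqlens = torch.tensor([19, 64], dtype=torch.int32)
page_size = 16

# Ceiling division gives the number of KV-cache pages each sequence touches,
# mirroring num_pages_per_seq = (cache_seqlens + page_size - 1) // page_size:
num_pages_per_seq = (cache_seqlens + page_size - 1) // page_size
print(num_pages_per_seq)  # tensor([2, 4], dtype=torch.int32)

# cu_seqlens_k is the b+1 exclusive prefix sum of the cached lengths:
cu_seqlens_k = torch.concat(
    (torch.zeros(1, dtype=torch.int32), torch.cumsum(cache_seqlens, 0))
).to(torch.int32)
print(cu_seqlens_k)  # tensor([ 0, 19, 83], dtype=torch.int32)

# For a fixed-length (b, s_q, ...) query batch, cu_seqlens_q is just an
# arithmetic progression with stride s_q, as in the diff above:
b, s_q = 2, 4
cu_seqlens_q = torch.arange(0, b + 1, dtype=torch.int) * s_q
print(cu_seqlens_q)  # tensor([0, 4, 8], dtype=torch.int32)
```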
