Skip to content

Commit d2e195a

Browse files
committed
enable causal
1 parent 8a3ddea commit d2e195a

File tree

3 files changed

+9
-15
lines changed

3 files changed

+9
-15
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable headers only mode in cutla
3838
FetchContent_Declare(
3939
repo-cutlass-sycl
4040
GIT_REPOSITORY https://github.com/sunjiweiswift/cutlass-sycl.git
41-
GIT_TAG 742d127cf5ee75cc6db4eac32c8b72f00c53d0fe
41+
GIT_TAG f46ae0df764a1751879ce3e22765c700b1d52eca
4242
GIT_SHALLOW OFF
4343
)
4444
FetchContent_MakeAvailable(repo-cutlass-sycl)

src/sycl/chunked_prefill.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ struct FMHAConfig {
392392
ElementOutput,
393393
GmemTiledCopyStore>;
394394
using CollectiveSoftmaxEpilogue = cutlass::flash_attention::collective::
395-
FlashChunkPrefillSoftmaxEpilogue<Causal, false, EpilogueDispatchPolicy, ElementAccumulator>;
395+
FlashChunkPrefillSoftmaxEpilogue<Causal, LocalMask, EpilogueDispatchPolicy, ElementAccumulator>;
396396

397397
using ProblemShapeRegular = cute::tuple<int, int, int, int, int, int, int, int>;
398398
using namespace cutlass::fmha::collective;
@@ -777,7 +777,7 @@ std::vector<at::Tensor> mha_fwd(
777777
params.cu_seqlens_knew = static_cast<int*>(cu_seqlens_k_new.data_ptr());
778778
}
779779
} else {
780-
TORCH_CHECK(cu_seqlens_k_new_.has_value(), "If k_new ");
780+
TORCH_CHECK(cu_seqlens_k_new_.has_value(), "cu_seqlens_k_new all zeros");
781781
params.seqlen_knew = 0;
782782
params.total_knew = 0;
783783
at::Tensor cu_seqlens_k_new = cu_seqlens_k_new_.value();

tests/test_flash_attention.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -479,8 +479,8 @@ def generate_qkv(
479479
# "causal,local",
480480
# [(False, False), (True, False)] + ([(False, True)] if not DISABLE_LOCAL else []),
481481
# )
482-
# @pytest.mark.parametrize("causal,local", [(False, False), (True, False)])
483-
@pytest.mark.parametrize("causal,local", [(False, False)])
482+
@pytest.mark.parametrize("causal,local", [(False, False), (True, False)])
483+
# @pytest.mark.parametrize("causal,local", [(True, False)])
484484
# @pytest.mark.parametrize(
485485
# "seqlen_new_eq_seqlen_q", [True, False] if not DISABLE_APPENDKV else [True]
486486
# )
@@ -566,6 +566,8 @@ def test_flash_attn_kvcache(
566566
batch_size = 5
567567
batch_size_cache = batch_size if not has_batch_idx else batch_size * 2
568568
nheads = 16
569+
if seqlen_k <= seqlen_q:
570+
seqlen_k += seqlen_q
569571
# nheads = 1
570572
# rotary_dim must be a multiple of 16, and must be <= d
571573
rotary_dim = math.floor(int(rotary_fraction * d) / 16) * 16
@@ -694,17 +696,9 @@ def test_flash_attn_kvcache(
694696
dtype_ref,
695697
)
696698
cache_seqlens = torch.randint(
697-
0 if new_kv else 1,
699+
seqlen_q,
698700
# If we don't use seqlen_q in the case of causal and rotary, cos/sin won't be long enough
699-
(
700-
(
701-
seqlen_k
702-
- (seqlen_q if (causal or local) and rotary_dim > 1 else seqlen_new)
703-
+ 1
704-
)
705-
if new_kv
706-
else (seqlen_k + 1)
707-
),
701+
seqlen_k,
708702
(batch_size,),
709703
dtype=torch.int32,
710704
device=device,

0 commit comments

Comments (0)