
Commit c394772

sunjiweiswift authored and airMeng committed
Update chunked_prefill.cpp piplinestage=2
1 parent 8b0d167 commit c394772

File tree

4 files changed: +10 -14 lines


CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable headers only mode in cutla
 FetchContent_Declare(
   repo-cutlass-sycl
   GIT_REPOSITORY https://github.com/sunjiweiswift/cutlass-sycl.git
-  GIT_TAG ab1f4b8ddfd5748e4c00317710cdbcecda58de28
+  GIT_TAG e02de57e31a20f1c5c7e472aecd322e9196b2792
   GIT_SHALLOW OFF
 )
 FetchContent_MakeAvailable(repo-cutlass-sycl)

cmake/BuildFlags.cmake

Lines changed: 4 additions & 12 deletions
@@ -113,18 +113,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
 
 
   set(AOT_TARGETS "bmg")
-  if(TORCH_XPU_ARCH_LIST)
-    set(AOT_TARGETS "${TORCH_XPU_ARCH_LIST}")
-  endif()
-  if(AOT_TARGETS STREQUAL "none")
-    set(TORCH_XPU_ARCH_LIST "" PARENT_SCOPE)
-  else()
-    set(SYCL_TARGETS_OPTION -fsycl-targets=spir64_gen)
-    set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} ${SYCL_TARGETS_OPTION})
-    set(SYCL_DEVICE_LINK_FLAGS ${SYCL_DEVICE_LINK_FLAGS} ${SYCL_TARGETS_OPTION})
-    set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device ${AOT_TARGETS}")
-    set(TORCH_XPU_ARCH_LIST ${AOT_TARGETS} PARENT_SCOPE)
-  endif()
+  set(SYCL_TARGETS_OPTION -fsycl-targets=spir64_gen)
+  set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} ${SYCL_TARGETS_OPTION})
+  set(SYCL_DEVICE_LINK_FLAGS ${SYCL_DEVICE_LINK_FLAGS} ${SYCL_TARGETS_OPTION})
+  set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device ${AOT_TARGETS}")
   message(STATUS "Compile Intel GPU AOT Targets for ${AOT_TARGETS}")
 
   set(SYCL_FLAGS ${SYCL_FLAGS} ${SYCL_KERNEL_OPTIONS})
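With the TORCH_XPU_ARCH_LIST override and the "none" escape hatch removed, the spir64_gen ahead-of-time path is now taken unconditionally for the default AOT_TARGETS ("bmg"). For a standalone translation unit, these options assemble roughly the icpx invocation sketched in the comment below; the file and the exact flag spelling are illustrative assumptions, not taken from this repo:

// aot_demo.cpp -- illustrative only. The CMake above amounts to an
// ahead-of-time compile along the lines of (device name comes from
// AOT_TARGETS; exact backend flags depend on the oneAPI toolchain):
//   icpx -fsycl -fsycl-targets=spir64_gen -Xs "-device bmg" aot_demo.cpp
// producing Gen ISA at build time instead of JIT compilation at run time.
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  int out = 0;
  {
    sycl::buffer<int, 1> b(&out, sycl::range<1>(1));
    q.submit([&](sycl::handler& h) {
      sycl::accessor a(b, h, sycl::write_only);
      h.single_task([=] { a[0] = 42; });
    });
  }  // buffer destruction waits for the kernel and copies the result back
  return out == 42 ? 0 : 1;
}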

src/sycl/chunked_prefill.cpp

Lines changed: 1 addition & 1 deletion
@@ -863,7 +863,7 @@ std::vector<at::Tensor> mha_fwd(
   at::Tensor out_accum, softmax_lse_accum;
   auto outaccum_type = at::ScalarType::Float;
 
-  constexpr int PipelineStages = 0;
+  constexpr int PipelineStages = 2;
   if (params.is_causal) {
     switch (params.d) {
       case 64:
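Raising PipelineStages from 0 to 2 enables a two-stage software pipeline in the kernel mainloop (the stage count named in the commit title): the tile loads for iteration i+1 can be issued while iteration i is still computing, which is presumably also why the cutlass-sycl pin moves in the same commit. A minimal generic sketch of the idea, with hypothetical names that are not from chunked_prefill.cpp:

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical illustration, not the kernel in this repo: a two-stage
// software pipeline over tiles. With two stages, the load of tile i+1
// is issued before the compute on tile i, so the two can overlap on
// hardware with asynchronous copies; 0 or 1 stages degenerates to a
// serial load-then-compute loop.
template <int PipelineStages>
float pipelined_sum(const float* data, std::size_t n, std::size_t tile) {
  static_assert(PipelineStages == 2, "sketch shows the two-stage case");
  std::vector<float> buf[2];  // one staging buffer per pipeline stage
  const std::size_t num_tiles = (n + tile - 1) / tile;
  auto load = [&](std::size_t t, std::vector<float>& dst) {
    const std::size_t base = t * tile;
    const std::size_t len = std::min(tile, n - base);
    dst.assign(data + base, data + base + len);
  };
  float acc = 0.0f;
  if (num_tiles == 0) return acc;
  load(0, buf[0]);                        // prologue: fill stage 0
  for (std::size_t t = 0; t < num_tiles; ++t) {
    if (t + 1 < num_tiles)
      load(t + 1, buf[(t + 1) % 2]);      // issue next tile's load early
    for (float x : buf[t % 2]) acc += x;  // compute on the current tile
  }
  return acc;
}

int main() {
  std::vector<float> v(1000, 1.0f);
  return pipelined_sum<2>(v.data(), v.size(), 128) == 1000.0f ? 0 : 1;
}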

tests/test_flash_attention.py

Lines changed: 4 additions & 0 deletions
@@ -1022,6 +1022,10 @@ def _generate_block_kvcache(
 
 
 # @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float8_e4m3fn])
+@pytest.mark.skipif(
+    True,
+    reason="flash_attn at sgl-kernel-xpu only supports paged cache",
+)
 @pytest.mark.parametrize(
     "dtype", [torch.bfloat16] + ([torch.float8_e4m3fn] if not DISABLE_FP8 else [])
 )
