
Commit 8b73e5a

[BENCHMARK] Integrate CUTLASS's FlashAttention into our Triton benchmarking (#4513)
This PR integrates CUTLASS's FlashAttention (FWD) into our Triton benchmarking. The commits are organized into four parts:

1. Restructure CUTLASS directory: the CUTLASS benchmark directory is restructured to use a single main file that includes specific headers for each kernel. At the moment, we support both GEMM and FA. Some headers shared between the two have been moved to the main `.cpp` file.
2. Disable XeTLA check_close: as discussed with @mfrancepillois, running the FA benchmark takes too long due to check_close. Therefore, this PR disables XeTLA's result checking by default for FlashAttention. However, CUTLASS results are still validated against PyTorch (an illustrative sketch of such a check follows the commit header below).
3. Add CUTLASS FA forward kernel: the forward kernel for CUTLASS FlashAttention is added. The backward mode currently only sets the benchmark's expected values to NaN to clearly indicate that it is not supported.
4. The CI is updated to run the CUTLASS FA forward benchmark. The backward mode for CUTLASS is not triggered by the CI.

---------

Signed-off-by: Jefferson Le Quellec <[email protected]>
1 parent bb5a79a commit 8b73e5a
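Note on item 2: the benchmark harness's actual PyTorch validation code is not part of this excerpt. The sketch below is only an illustration of what "validated against PyTorch" can look like for the non-causal forward pass; the function name, shapes, and tolerances are assumptions, not code from this commit.

// Illustrative only: a naive ATen reference for non-causal attention, compared
// against a CUTLASS output tensor O. Names and tolerances are assumptions.
#include <ATen/ATen.h>
#include <cmath>

bool check_against_torch_reference(const at::Tensor &Q, const at::Tensor &K,
                                   const at::Tensor &V, const at::Tensor &O,
                                   int HeadSizeQK) {
  const float sm_scale = 1.0f / std::sqrt(static_cast<float>(HeadSizeQK));
  // scores = (Q K^T) * sm_scale, computed in float for accuracy.
  at::Tensor scores =
      at::matmul(Q.to(at::kFloat), K.to(at::kFloat).transpose(-2, -1)) * sm_scale;
  at::Tensor ref = at::matmul(at::softmax(scores, /*dim=*/-1), V.to(at::kFloat));
  // Loose tolerances, since the kernel accumulates in a different order.
  return at::allclose(O.to(at::kFloat), ref, /*rtol=*/1e-2, /*atol=*/1e-2);
}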

File tree

11 files changed: +487 −196 lines

.github/workflows/triton-benchmarks.yml

Lines changed: 2 additions & 0 deletions
@@ -277,6 +277,7 @@ jobs:
             source ../../scripts/capture-hw-details.sh
             python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark flash-attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
             python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark flash-attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
+            python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-cutlass-report.csv --benchmark flash-attn --compiler cutlass --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
 
       - name: Run Triton FA bwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
@@ -302,6 +303,7 @@ jobs:
             source ../../scripts/capture-hw-details.sh
             python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-triton-report.csv --benchmark flash-attn-tensor-desc --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
             python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-xetla-report.csv --benchmark flash-attn-tensor-desc --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
+            python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-cutlass-report.csv --benchmark flash-attn-tensor-desc --compiler cutlass --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
 
       - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}

benchmarks/cmake/FindCUTLASSLibrary.cmake

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ if (NOT CUTLASSLibrary_FOUND)
 
 set(CUTLASSLibrary_INCLUDE_DIR "${CUTLASSLibrary_SOURCE_DIR}/include" CACHE INTERNAL "CUTLASSLibrary_SOURCE_DIR")
 set(CUTLASSLibrary_INCLUDE_TOOL_DIR "${CUTLASSLibrary_SOURCE_DIR}/tools/util/include" CACHE INTERNAL "CUTLASSLibrary_SOURCE_DIR")
+set(CUTLASSLibrary_INCLUDE_APPLICATION_DIR "${CUTLASSLibrary_SOURCE_DIR}/applications" CACHE INTERNAL "CUTLASSLibrary_SOURCE_DIR")
 
 find_package_handle_standard_args(
   CUTLASSLibrary
benchmarks/cutlass_kernel/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -16,6 +16,9 @@ target_compile_options(cutlass_kernel PRIVATE "-DSYCL_INTEL_TARGET")
 target_link_options(cutlass_kernel PRIVATE ${CUTLASS_KERNEL_FLAGS})
 target_link_libraries(cutlass_kernel PUBLIC ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})
 
-target_include_directories(cutlass_kernel PUBLIC "${CUTLASSLibrary_INCLUDE_DIR}" "${CUTLASSLibrary_INCLUDE_TOOL_DIR}")
+target_include_directories(cutlass_kernel PUBLIC "${CUTLASSLibrary_INCLUDE_DIR}" "${CUTLASSLibrary_INCLUDE_TOOL_DIR}" "${CUTLASSLibrary_INCLUDE_APPLICATION_DIR}")
+
+add_subdirectory(gemm)
+add_subdirectory(attention)
 
 install(TARGETS cutlass_kernel LIBRARY DESTINATION .)
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+target_include_directories(cutlass_kernel PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
Lines changed: 246 additions & 0 deletions
@@ -0,0 +1,246 @@

#include "flash_attention_v2/collective/fmha_fusion.hpp"
#include "flash_attention_v2/collective/xe_flash_attn_prefill_epilogue.hpp"
#include "flash_attention_v2/collective/xe_flash_attn_prefill_mma.hpp"
#include "flash_attention_v2/collective/xe_flash_attn_prefill_softmax_epilogue.hpp"
#include "flash_attention_v2/kernel/tile_scheduler.hpp"
#include "flash_attention_v2/kernel/xe_flash_attn_prefill.hpp"

#include "cutlass/gemm/dispatch_policy.hpp"

#include <exception>
#include <iostream>

////////////////////////////////////////////////////////////////////////////////
// PRIVATE FUNCTION
////////////////////////////////////////////////////////////////////////////////

template <typename FMHA> static auto run(typename FMHA::Params params) -> void {
  cute::dim3 const block = FMHA::get_block_shape();
  cute::dim3 const grid = FMHA::get_grid_shape(params);

  int smem_size = FMHA::SharedStorageSize;

  const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
  const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);

#if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
  using namespace syclcompat::experimental;
  auto event = launch<cutlass::device_kernel<FMHA>>(
      launch_policy{
          sycl_grid, sycl_block,
          local_mem_size{static_cast<std::size_t>(smem_size)},
          kernel_properties{
              sycl_exp::sub_group_size<FMHA::DispatchPolicy::SubgroupSize>}},
      params);
#else
  syclcompat::experimental::launch_properties launch_props{
      sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size),
  };
  syclcompat::experimental::kernel_properties kernel_props{
      sycl::ext::oneapi::experimental::sub_group_size<
          FMHA::DispatchPolicy::SubgroupSize>};
  syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block,
                                                 launch_props, kernel_props};
  auto event = syclcompat::experimental::launch<cutlass::device_kernel<FMHA>>(
      policy, params);
#endif

  EventManager::getInstance().addEvent(event);
}

template <bool Causal, typename TileShapeQK, typename TileShapePV,
          typename TileShapeOutput, typename SubgroupLayout, int PipelineStages>
static auto attention_run(const at::Tensor &Q, const at::Tensor &K,
                          const at::Tensor &V, at::Tensor &O, int Batch,
                          int NumHeadsQ, int NumHeadsKV, int SeqLengthQO,
                          int SeqLengthKV, int HeadSizeQK, int HeadSizeVO,
                          float sm_scale) -> int {
  RECORD_FUNCTION("cutlass fa", {});

  using ElementAccumulator = float;
  using ElementInputQ = cutlass::half_t;
  using ElementInputKV = cutlass::half_t;
  using ElementOutput = float;

  using LayoutQ = cutlass::layout::RowMajor;
  using LayoutK = cutlass::layout::ColumnMajor;
  using LayoutV = cutlass::layout::RowMajor;
  using LayoutO = cutlass::layout::RowMajor;

  using GEMMDispatchPolicy =
      cutlass::gemm::MainloopIntelXeXMX16<PipelineStages>;
  using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;

  using MMAOperation = cute::XE_8x16x16_F32F16F16F32_TT;

  using GmemTiledCopyQ = cute::XE_2D_U16x8x32_LD_N;
  using GmemTiledCopyK = cute::XE_2D_U16x16x16_LD_T;
  using GmemTiledCopyV = cute::XE_2D_U16x16x32_LD_V;
  using GmemTiledCopyStore = cute::XE_2D_U32x8x16_ST_N;

  using ProblemShapeType = cute::tuple<int, int, int, int, int, int, int>;

  /// MAIN LOOP ///

  using CollectiveMainloop =
      cutlass::flash_attention::collective::FlashPrefillMma<
          GEMMDispatchPolicy, ProblemShapeType, ElementInputQ,
          cutlass::gemm::TagToStrideA_t<LayoutQ>, ElementInputKV,
          cutlass::gemm::TagToStrideB_t<LayoutK>, ElementInputKV,
          cutlass::gemm::TagToStrideB_t<LayoutV>, MMAOperation, TileShapeQK,
          TileShapePV, SubgroupLayout,
          GmemTiledCopyQ, // Q
          GmemTiledCopyK, // K
          GmemTiledCopyV, // V,
          Causal>;

  /// EPILOGUE LOOP ///

  using CollectiveSoftmaxEpilogue =
      cutlass::flash_attention::collective::FlashPrefillSoftmaxEpilogue<
          Causal, EpilogueDispatchPolicy, ElementAccumulator>;
  using CollectiveEpilogue =
      cutlass::flash_attention::collective::FlashPrefillEpilogue<
          EpilogueDispatchPolicy, MMAOperation, TileShapeOutput, SubgroupLayout,
          ElementAccumulator, cutlass::gemm::TagToStrideC_t<LayoutO>,
          ElementOutput, GmemTiledCopyStore>;

  /// FA ///

  using FMHAPrefillKernel = cutlass::flash_attention::kernel::FMHAPrefill<
      ProblemShapeType, CollectiveMainloop, CollectiveSoftmaxEpilogue,
      CollectiveEpilogue>;

  /// FA INVOCATION ///

  try {
    /// Buffer Initialization
    const cutlass::half_t *_Q =
        static_cast<const cutlass::half_t *>(Q.data_ptr());
    const cutlass::half_t *_K =
        static_cast<const cutlass::half_t *>(K.data_ptr());
    const cutlass::half_t *_V =
        static_cast<const cutlass::half_t *>(V.data_ptr());
    const float *_O = static_cast<const float *>(O.data_ptr());

    /// Problem size
    using ProblemShapeType = typename FMHAPrefillKernel::ProblemShape;
    ProblemShapeType problem_size =
        ProblemShapeType{Batch, NumHeadsQ, NumHeadsKV, SeqLengthQO,
                         SeqLengthKV, HeadSizeQK, HeadSizeVO};

    /// Stride
    using StrideQ = typename FMHAPrefillKernel::StrideQ;
    using StrideK = typename FMHAPrefillKernel::StrideK;
    using StrideV = typename FMHAPrefillKernel::StrideV;
    using StrideO = typename FMHAPrefillKernel::StrideO;
    StrideQ stride_Q = cutlass::make_cute_packed_stride(
        StrideQ{},
        cute::make_shape(SeqLengthQO, HeadSizeQK, Batch * NumHeadsQ));
    StrideK stride_K = cutlass::make_cute_packed_stride(
        StrideK{},
        cute::make_shape(SeqLengthKV, HeadSizeQK, Batch * NumHeadsKV));
    StrideV stride_V = cutlass::make_cute_packed_stride(
        StrideV{},
        cute::make_shape(HeadSizeVO, SeqLengthKV, Batch * NumHeadsKV));
    StrideO stride_O = cutlass::make_cute_packed_stride(
        StrideO{},
        cute::make_shape(SeqLengthQO, HeadSizeVO, Batch * NumHeadsQ));

    static cutlass::KernelHardwareInfo hw_info;
    if (hw_info.sm_count == 0) {
      hw_info.sm_count =
          cutlass::KernelHardwareInfo::query_device_multiprocessor_count(0);
      CUTLASS_TRACE_HOST(
          "Query result for SM count per device: " << hw_info.sm_count);
    }

    typename FMHAPrefillKernel::Arguments arguments = {
        cutlass::gemm::GemmUniversalMode::kGemm,
        problem_size,
        {_Q, stride_Q, _K, stride_K, _V, stride_V},
        {sm_scale},
        {_O, stride_O},
        hw_info};

    size_t workspace_size = FMHAPrefillKernel::get_workspace_size(arguments);
    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
    auto workspace_ptr = workspace.get();

    if (!FMHAPrefillKernel::can_implement(arguments)) {
      std::cout << "Invalid Problem Size: " << Batch << 'x' << NumHeadsQ << 'x'
                << SeqLengthQO << 'x' << SeqLengthKV << 'x' << HeadSizeQK << 'x'
                << HeadSizeVO << (Causal ? "xCausal" : "xNonCausal")
                << std::endl;
      return -1;
    }

    CUTLASS_CHECK(
        FMHAPrefillKernel::initialize_workspace(arguments, workspace_ptr));
    auto params =
        FMHAPrefillKernel::to_underlying_arguments(arguments, workspace_ptr);
    run<FMHAPrefillKernel>(params);

    syclcompat::wait();

  } catch (std::exception &e) {
    std::cerr << "Runtime error: " << e.what() << std::endl;
    return -1;
  } catch (...) {
    std::cerr << "Unexpected error" << std::endl;
    return -1;
  }

  return 0;
}

////////////////////////////////////////////////////////////////////////////////
// PUBLIC FUNCTION
////////////////////////////////////////////////////////////////////////////////

using FARunPtr = int (*)(const at::Tensor &Q, const at::Tensor &K,
                         const at::Tensor &V, at::Tensor &O, int Batch,
                         int NumHeadsQ, int NumHeadsKV, int SeqLengthQO,
                         int SeqLengthKV, int HeadSizeQK, int HeadSizeVO,
                         float sm_scale);

auto attention(const at::Tensor &Q, const at::Tensor &K, const at::Tensor &V,
               at::Tensor &O, int Batch, int NumHeadsQ, int NumHeadsKV,
               int SeqLengthQO, int SeqLengthKV, int HeadSizeQK, int HeadSizeVO,
               bool Causal, float sm_scale) -> int {
  constexpr int PipelineStages = 2;
  FARunPtr f = nullptr;

  if (HeadSizeVO == 64) {
    using ShapeQK = cute::Shape<cute::_128, cute::_64, cute::_64>;
    using ShapePV = cute::Shape<cute::_128, cute::_32, cute::_64>;
    using ShapeOutPut = cute::Shape<cute::_128, cute::_64, cute::_64>;
    using SubgroupLayout =
        cute::Layout<cute::Shape<cute::_8, cute::_1, cute::_1>,
                     cute::Stride<cute::_1, cute::_1, cute::_1>>;

    f = Causal ? attention_run<true, ShapeQK, ShapePV, ShapeOutPut,
                               SubgroupLayout, PipelineStages>
               : attention_run<false, ShapeQK, ShapePV, ShapeOutPut,
                               SubgroupLayout, PipelineStages>;

  } else if (HeadSizeVO == 128) {
    using ShapeQK = cute::Shape<cute::_128, cute::_64, cute::_64>;
    using ShapePV = cute::Shape<cute::_128, cute::_32, cute::_64>;
    using ShapeOutPut = cute::Shape<cute::_128, cute::_128, cute::_64>;
    using SubgroupLayout =
        cute::Layout<cute::Shape<cute::_16, cute::_1, cute::_1>,
                     cute::Stride<cute::_1, cute::_1, cute::_1>>;

    f = Causal ? attention_run<true, ShapeQK, ShapePV, ShapeOutPut,
                               SubgroupLayout, PipelineStages>
               : attention_run<false, ShapeQK, ShapePV, ShapeOutPut,
                               SubgroupLayout, PipelineStages>;
  } else {
    std::cerr << "Unsupported HeadSizeVO: " << HeadSizeVO << std::endl;
    return -1;
  }

  return f(Q, K, V, O, Batch, NumHeadsQ, NumHeadsKV, SeqLengthQO, SeqLengthKV,
           HeadSizeQK, HeadSizeVO, sm_scale);
}
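A possible host-side call into the `attention()` entry point above, shown only as a sketch: the tensor shapes, the XPU device choice, and the sm_scale convention (1/sqrt(HeadSizeQK), the usual FlashAttention scaling) are assumptions about how the benchmark binds to this function, not code from this commit.

// Hypothetical invocation of attention(); assumes linking against the
// cutlass_kernel target and a PyTorch build with XPU support.
#include <ATen/ATen.h>
#include <cmath>

int run_attention_example() {
  const int Batch = 4, NumHeadsQ = 32, NumHeadsKV = 32;
  const int SeqLengthQO = 1024, SeqLengthKV = 1024;
  const int HeadSizeQK = 64, HeadSizeVO = 64;

  // Q/K/V in half precision, O in float, matching the kernel's element types.
  auto opts_half = at::TensorOptions().dtype(at::kHalf).device(at::kXPU);
  auto opts_float = at::TensorOptions().dtype(at::kFloat).device(at::kXPU);

  // Assumed packed layout: heads folded into the batch dimension.
  at::Tensor Q = at::randn({Batch * NumHeadsQ, SeqLengthQO, HeadSizeQK}, opts_half);
  at::Tensor K = at::randn({Batch * NumHeadsKV, SeqLengthKV, HeadSizeQK}, opts_half);
  at::Tensor V = at::randn({Batch * NumHeadsKV, SeqLengthKV, HeadSizeVO}, opts_half);
  at::Tensor O = at::zeros({Batch * NumHeadsQ, SeqLengthQO, HeadSizeVO}, opts_float);

  // Softmax scaling applied inside the kernel; 1/sqrt(HeadSizeQK) is assumed here.
  const float sm_scale = 1.0f / std::sqrt(static_cast<float>(HeadSizeQK));

  return attention(Q, K, V, O, Batch, NumHeadsQ, NumHeadsKV, SeqLengthQO,
                   SeqLengthKV, HeadSizeQK, HeadSizeVO, /*Causal=*/false,
                   sm_scale);
}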
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-dee33709bdc0cc579df49f251da894d4546b2624
+dd43242ea2f3e08e73a73153f00a5dbe5a31c41c
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+target_include_directories(cutlass_kernel PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
