diff --git a/CMakeLists.txt b/CMakeLists.txt index 8faee83..6a18ec0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,7 +171,7 @@ if(VLLM_GPU_LANG STREQUAL "SYCL") set(CUTLASS_ENABLE_HEADERS_ONLY "ON" CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "9baca2cff3a28590fcd03e55515e2d91ff2cbc8b" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "f58e2df1951b4f99c21be64d4fcd500742a41c59" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided FetchContent_Declare( diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/default_gemm_universal.h b/csrc/xpu/cutlass_kernels/collective/gemm/default_gemm_universal.h deleted file mode 100644 index f2743bf..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/default_gemm_universal.h +++ /dev/null @@ -1,306 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights - *reserved. SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -/*! \file - \brief - Default kernel-level GEMM definitions combine threadblock-scoped matrix - multiply-add with the appropriate threadblock-scoped epilogue. - - Note, CUTLASS epilogues universally target row-major outputs. Column-major - outputs are accommodated by exchanging A and B operands and assuming - transposed layouts. Partial specializations here choose - 'device::GemmTransposed' to implement this functionality. 
- -*/ - -#pragma once - -#include "cutlass/cutlass.h" - -#include "cutlass/complex.h" -#include "cutlass/layout/matrix.h" -#include "cutlass/numeric_types.h" - -#include "gemm_universal_k.h" -#include "cutlass/gemm/kernel/gemm_universal_streamk.h" -#include "cutlass/gemm/kernel/default_gemm.h" -#include "cutlass/gemm/kernel/default_gemm_complex.h" - -#include "cutlass/layout/permute.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace kernel { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template < - /// Element type for A matrix operand - typename ElementA_, - /// Layout type for A matrix operand - typename LayoutA_, - /// Complex elementwise transformation on A operand - ComplexTransform TransformA, - /// Access granularity of A matrix in units of elements - int kAlignmentA, - /// Element type for B matrix operand - typename ElementB_, - /// Layout type for B matrix operand - typename LayoutB_, - /// Complex elementwise transformation on B operand - ComplexTransform TransformB, - /// Access granularity of B matrix in units of elements - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC_, - /// Layout type for C and D matrix operands - typename LayoutC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Operator class tag - typename OperatorClass, - /// Tag indicating architecture to tune for - typename ArchTag, - /// Threadblock-level tile size (concept: GemmShape) - typename ThreadblockShape, - /// Warp-level tile size (concept: GemmShape) - typename WarpShape, - /// Instruction tile size (concept: GemmShape) - typename InstructionShape, - /// Epilogue output operator - typename EpilogueOutputOp, - /// Threadblock-level swizzling operator - typename ThreadblockSwizzle, - /// Number of stages used in the pipelined mainloop - int Stages, - /// Operation performed by GEMM - typename Operator, - /// Use zfill or predicate for out-of-bound cp.async - SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, - /// Gather operand A by using an index array - bool GatherA = false, - /// Gather operand B by using an index array - bool GatherB = false, - /// Scatter result D by using an index array - bool ScatterD = false, - /// Permute result D - typename PermuteDLayout = layout::NoPermute, - /// Permute operand A - typename PermuteALayout_ = layout::NoPermute, - /// Permute operand B - typename PermuteBLayout_ = layout::NoPermute, - /// - typename Enable = void> -struct DefaultGemmUniversal; - -///////////////////////////////////////////////////////////////////////////////////////////////// -// -// Real-valued GEMM kernels -// - -template < - /// Element type for A matrix operand - typename ElementA, - /// Layout type for A matrix operand - typename LayoutA, - /// Access granularity of A matrix in units of elements - int kAlignmentA, - /// Element type for B matrix operand - typename ElementB, - /// Layout type for B matrix operand - typename LayoutB, - /// Access granularity of B matrix in units of elements - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC, - /// Layout type for C and D matrix operands - typename LayoutC, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Operator class tag - typename OperatorClass, - /// Tag indicating architecture to tune for - typename ArchTag, - /// 
Threadblock-level tile size (concept: GemmShape) - typename ThreadblockShape, - /// Warp-level tile size (concept: GemmShape) - typename WarpShape, - /// Warp-level tile size (concept: GemmShape) - typename InstructionShape, - /// Epilogue output operator - typename EpilogueOutputOp, - /// Threadblock-level swizzling operator - typename ThreadblockSwizzle, - /// Number of stages used in the pipelined mainloop - int Stages, - /// Operation performed by GEMM - typename Operator, - /// Use zfill or predicate for out-of-bound cp.async - SharedMemoryClearOption SharedMemoryClear, - /// Gather operand A by using an index array - bool GatherA, - /// Gather operand B by using an index array - bool GatherB, - /// Scatter result D by using an index array - bool ScatterD, - /// Permute result D - typename PermuteDLayout, - /// Permute operand A - typename PermuteALayout, - /// Permute operand B - typename PermuteBLayout> -struct DefaultGemmUniversal< - ElementA, LayoutA, - ComplexTransform::kNone, // transform A - kAlignmentA, ElementB, LayoutB, - ComplexTransform::kNone, // transform B - kAlignmentB, ElementC, LayoutC, ElementAccumulator, OperatorClass, ArchTag, - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - ThreadblockSwizzle, Stages, Operator, SharedMemoryClear, GatherA, GatherB, - ScatterD, PermuteDLayout, PermuteALayout, PermuteBLayout, - typename platform::enable_if< - !cutlass::is_complex::value>::type> { - using DefaultGemmKernel = typename kernel::DefaultGemm< - ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC, - LayoutC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, - WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, - true, Operator, SharedMemoryClear, GatherA, GatherB, ScatterD, - PermuteDLayout, PermuteALayout, PermuteBLayout>::GemmKernel; - - /// Universal kernel without StreamkFeature member type - template - class SelectBase - : public kernel::GemmUniversal {}; - - /// Universal kernel with StreamkFeature member type - template - class SelectBase - : public kernel::GemmUniversalStreamk< - typename DefaultGemmKernel::Mma, - typename DefaultGemmKernel::Epilogue, SwizzleT> {}; - - /// Select kernel by ThreadblockSwizzle's support for StreamkFeature - using GemmKernel = SelectBase; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// -// Complex-valued GEMM kernels -// - -template < - /// Element type for A matrix operand - typename ElementA, - /// Layout type for A matrix operand - typename LayoutA, - /// Complex elementwise transformation on A operand - ComplexTransform TransformA, - /// Access granularity of A matrix in units of elements - int kAlignmentA, - /// Element type for B matrix operand - typename ElementB, - /// Layout type for B matrix operand - typename LayoutB, - /// Complex elementwise transformation on B operand - ComplexTransform TransformB, - /// Access granularity of B matrix in units of elements - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC, - /// Layout type for C and D matrix operands - typename LayoutC, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Operator class tag - typename OperatorClass, - /// Tag indicating architecture to tune for - typename ArchTag, - /// Threadblock-level tile size (concept: GemmShape) - typename ThreadblockShape, - /// Warp-level tile size (concept: GemmShape) - typename WarpShape, - /// Warp-level tile size (concept: 
GemmShape) - typename InstructionShape, - /// Epilogue output operator - typename EpilogueOutputOp, - /// Threadblock-level swizzling operator - typename ThreadblockSwizzle, - /// Number of stages used in the pipelined mainloop - int Stages, - /// Operation performed by GEMM - typename Operator, - /// Use zfill or predicate for out-of-bound cp.async - SharedMemoryClearOption SharedMemoryClear> -struct DefaultGemmUniversal< - ElementA, LayoutA, TransformA, kAlignmentA, ElementB, LayoutB, TransformB, - kAlignmentB, ElementC, LayoutC, ElementAccumulator, OperatorClass, ArchTag, - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - ThreadblockSwizzle, Stages, Operator, SharedMemoryClear, false, false, - false, layout::NoPermute, layout::NoPermute, layout::NoPermute, - typename platform::enable_if< - cutlass::is_complex::value>::type> { - using DefaultGemmKernel = typename kernel::DefaultGemmComplex< - ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, - ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, - InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, - TransformA, TransformB, Operator, false>::GemmKernel; - - /// Universal kernel without StreamkFeature member type - template - class SelectBase - : public kernel::GemmUniversal {}; - - /// Universal kernel with StreamkFeature member type - template - class SelectBase - : public kernel::GemmUniversalStreamk< - typename DefaultGemmKernel::Mma, - typename DefaultGemmKernel::Epilogue, SwizzleT> {}; - - /// Select kernel by ThreadblockSwizzle's support for StreamkFeature - using GemmKernel = SelectBase; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace gemm -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.h deleted file mode 100644 index 411f673..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.h +++ /dev/null @@ -1,366 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights - *reserved. SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief -*/ - -#pragma once - -#include "cutlass/arch/mma.h" -#include "cutlass/cutlass.h" -#include "cutlass/numeric_types.h" -#include "cutlass/arch/arch.h" -#include "cutlass/device_kernel.h" - -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" -#include "gemm_universal_k.h" - -#include "default_gemm_universal.h" -#include "cutlass/gemm/device/default_gemm_configuration.h" -#include "gemm_universal_base.h" - -#include "cutlass/layout/permute.h" - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace device { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/*! - GemmUniversal is a stateful, reusable GEMM handle. Once initialized for a - given GEMM computation (problem geometry and data references), it can be - reused across different GEMM problems having the geometry. (Once initialized, - details regarding problem geometry and references to workspace memory cannot - be updated.) - - The universal GEMM accommodates serial reductions, parallel reductions, - batched strided, and batched array variants. -*/ -template < - /// Element type for A matrix operand - typename ElementA_, - /// Layout type for A matrix operand - typename LayoutA_, - /// Element type for B matrix operand - typename ElementB_, - /// Layout type for B matrix operand - typename LayoutB_, - /// Element type for C and D matrix operands - typename ElementC_, - /// Layout type for C and D matrix operands - typename LayoutC_, - /// Element type for internal accumulation - typename ElementAccumulator_ = ElementC_, - /// Operator class tag - typename OperatorClass_ = arch::OpClassSimt, - /// Tag indicating architecture to tune for. This is the minimum SM that - /// supports the intended feature. The device kernel can be built - /// targeting any SM larger than this number. 
- typename ArchTag_ = arch::Sm70, - /// Threadblock-level tile size (concept: GemmShape) - typename ThreadblockShape_ = typename DefaultGemmConfiguration< - OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::ThreadblockShape, - /// Warp-level tile size (concept: GemmShape) - typename WarpShape_ = typename DefaultGemmConfiguration< - OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::WarpShape, - /// Instruction-level tile size (concept: GemmShape) - typename InstructionShape_ = typename DefaultGemmConfiguration< - OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::InstructionShape, - /// Epilogue output operator - typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< - OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::EpilogueOutputOp, - /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_ = - threadblock::GemmIdentityThreadblockSwizzle<>, - /// Number of stages used in the pipelined mainloop - int Stages = - DefaultGemmConfiguration::kStages, - /// Access granularity of A matrix in units of elements - int AlignmentA = - DefaultGemmConfiguration::kAlignmentA, - /// Access granularity of B matrix in units of elements - int AlignmentB = - DefaultGemmConfiguration::kAlignmentB, - /// Operation performed by GEMM - typename Operator_ = typename DefaultGemmConfiguration< - OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::Operator, - /// Complex elementwise transformation on A operand - ComplexTransform TransformA = ComplexTransform::kNone, - /// Complex elementwise transformation on B operand - ComplexTransform TransformB = ComplexTransform::kNone, - /// Gather operand A by using an index array - bool GatherA = false, - /// Gather operand B by using an index array - bool GatherB = false, - /// Scatter result D by using an index array - bool ScatterD = false, - /// Permute result D - typename PermuteDLayout_ = layout::NoPermute, - /// Permute operand A - typename PermuteALayout_ = layout::NoPermute, - /// Permute operand B - typename PermuteBLayout_ = layout::NoPermute> -class GemmUniversal - : public GemmUniversalBase::GemmKernel> { - public: - using ElementAccumulator = ElementAccumulator_; - using OperatorClass = OperatorClass_; - using ArchTag = ArchTag_; - using ThreadblockShape = ThreadblockShape_; - using WarpShape = WarpShape_; - using InstructionShape = InstructionShape_; - using EpilogueOutputOp = EpilogueOutputOp_; - using ThreadblockSwizzle = ThreadblockSwizzle_; - using Operator = Operator_; - using PermuteDLayout = PermuteDLayout_; - using PermuteALayout = PermuteALayout_; - using PermuteBLayout = PermuteBLayout_; - static int const kStages = Stages; - static int const kAlignmentA = AlignmentA; - static int const kAlignmentB = AlignmentB; - static int const kAlignmentC = EpilogueOutputOp::kCount; - static ComplexTransform const kTransformA = TransformA; - static ComplexTransform const kTransformB = TransformB; - - using Base = GemmUniversalBase::GemmKernel>; - - using Arguments = typename Base::Arguments; - using GemmKernel = typename Base::GemmKernel; -}; - -//////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for column-major output exchanges problem size and -/// operand. 
-template < - /// Element type for A matrix operand - typename ElementA_, - /// Layout type for A matrix operand - typename LayoutA_, - /// Element type for B matrix operand - typename ElementB_, - /// Layout type for B matrix operand - typename LayoutB_, - /// Element type for C and D matrix operands - typename ElementC_, - /// Element type for internal accumulation - typename ElementAccumulator_, - /// Operator class tag - typename OperatorClass_, - /// Tag indicating architecture to tune for. This is the minimum SM that - /// supports the intended feature. The device kernel can be built - /// targeting any SM larger than this number. - typename ArchTag_, - /// Threadblock-level tile size (concept: GemmShape) - typename ThreadblockShape_, - /// Warp-level tile size (concept: GemmShape) - typename WarpShape_, - /// Instruction-level tile size (concept: GemmShape) - typename InstructionShape_, - /// Epilogue output operator - typename EpilogueOutputOp_, - /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_, - /// Number of stages used in the pipelined mainloop - int Stages, - /// Access granularity of A matrix in units of elements - int AlignmentA, - /// Access granularity of B matrix in units of elements - int AlignmentB, - /// Operation performed by GEMM - typename Operator_, - /// Complex elementwise transformation on A operand - ComplexTransform TransformA, - /// Complex elementwise transformation on B operand - ComplexTransform TransformB, - /// Gather operand A by using an index array - bool GatherA, - /// Gather operand B by using an index array - bool GatherB, - /// Scatter result D by using an index array - bool ScatterD, - /// Permute result D - typename PermuteDLayout_, - /// Permute operand A - typename PermuteALayout_, - /// Permute operand B - typename PermuteBLayout_> -class GemmUniversal< - ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, - layout::ColumnMajor, // partially specialized on LayoutC - ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, - WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, - Stages, AlignmentA, AlignmentB, Operator_, TransformA, TransformB, GatherA, - GatherB, ScatterD, PermuteDLayout_, PermuteALayout_, PermuteBLayout_> { - public: - using ElementA = ElementA_; - using LayoutA = LayoutA_; - using TensorRefA = TensorRef; - using ElementB = ElementB_; - using LayoutB = LayoutB_; - using TensorRefB = TensorRef; - using ElementC = ElementC_; - using LayoutC = layout::ColumnMajor; - using TensorRefC = TensorRef; - using TensorRefD = TensorRef; - using ElementAccumulator = ElementAccumulator_; - using OperatorClass = OperatorClass_; - using ArchTag = ArchTag_; - using ThreadblockShape = ThreadblockShape_; - using WarpShape = WarpShape_; - using InstructionShape = InstructionShape_; - using EpilogueOutputOp = EpilogueOutputOp_; - using ThreadblockSwizzle = ThreadblockSwizzle_; - using Operator = Operator_; - using PermuteDLayout = PermuteDLayout_; - using PermuteALayout = PermuteALayout_; - using PermuteBLayout = PermuteBLayout_; - static int const kStages = Stages; - static int const kAlignmentA = AlignmentA; - static int const kAlignmentB = AlignmentB; - static ComplexTransform const kTransformA = TransformA; - static ComplexTransform const kTransformB = TransformB; - - using UnderlyingOperator = typename GemmUniversal< - ElementB, typename layout::LayoutTranspose::type, ElementA, - typename layout::LayoutTranspose::type, ElementC, - layout::RowMajor, ElementAccumulator, OperatorClass, 
ArchTag, - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, Operator, - kTransformB, kTransformA, GatherB, GatherA, ScatterD, PermuteDLayout, - PermuteBLayout, PermuteALayout>::Base; - - using GemmKernel = typename UnderlyingOperator::GemmKernel; - static int const kAlignmentC = EpilogueOutputOp::kCount; - - /// Argument structure - using Arguments = typename UnderlyingOperator::Arguments; - - private: - UnderlyingOperator underlying_operator_; - - public: - /// Constructs the GEMM. - GemmUniversal() {} - - /// Helper to construct a transposed equivalent for the underlying GEMM - /// operator - static Arguments to_underlying_arguments(Arguments const& args) { - return args.transposed_problem(); - } - - /// Determines whether the GEMM can execute the given problem. - static Status can_implement(Arguments const& args) { - return UnderlyingOperator::can_implement(to_underlying_arguments(args)); - } - - /// Gets the workspace size - static size_t get_workspace_size(Arguments const& args) { - return UnderlyingOperator::get_workspace_size( - to_underlying_arguments(args)); - } - - /// Computes the grid shape - static dim3 get_grid_shape(Arguments const& args) { - return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args)); - } - - /// Computes the maximum number of active blocks per multiprocessor - static int maximum_active_blocks(int smem_capacity = -1) { - return UnderlyingOperator::maximum_active_blocks(smem_capacity); - } - - /// Initializes GEMM state from arguments. - Status initialize(Arguments const& args, void* workspace = nullptr, - cudaStream_t stream = nullptr) { - return underlying_operator_.initialize(to_underlying_arguments(args), - workspace, stream); - } - - /// Lightweight update given a subset of arguments - Status update(Arguments const& args, void* workspace = nullptr) { - return underlying_operator_.update(to_underlying_arguments(args), - workspace); - } - - /// Runs the kernel using initialized state. - Status run(cudaStream_t stream = nullptr) { - return underlying_operator_.run(stream); - } - - /// Runs the kernel using initialized state. - Status operator()(cudaStream_t stream = nullptr) { return run(stream); } - - /// Runs the kernel using initialized state. - Status operator()(Arguments const& args, void* workspace = nullptr, - cudaStream_t stream = nullptr) { - Status status = initialize(args, workspace, stream); - - if (status == Status::kSuccess) { - status = run(stream); - } - - return status; - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace gemm -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.hpp deleted file mode 100644 index 3b59cc8..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights - *reserved. SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include "cutlass/gemm/kernel/gemm_universal_decl.h" -#include "cutlass/gemm/kernel/tile_scheduler.hpp" - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::kernel { - -// In cases where ProblemShape is not a tuple, this is used to check if the -// underlying problem shape type is aliased within or not. -// Used for dispatching GemmUniversal to 2.x API or 3.x API -template -struct IsCutlass3ArrayKernel : cute::false_type {}; - -template -struct IsCutlass3ArrayKernel< - ProblemShape, cute::void_t> - : cute::true_type {}; - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::kernel - -//////////////////////////////////////////////////////////////////////////////// -#include "xe_gemm_array_cooperative.hpp" diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_adapter.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_adapter.h deleted file mode 100644 index 0c923e8..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_adapter.h +++ /dev/null @@ -1,844 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights - *reserved. SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! - \file - \brief The universal GEMM accommodates serial reductions, parallel reductions, - batched strided, and batched array variants. -*/ - -#pragma once - -// common -#include "cutlass/cutlass.h" -#include "cutlass/device_kernel.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/detail/layout.hpp" -#include "cutlass/detail/mma.hpp" -#include "cutlass/cuda_host_adapter.hpp" - -#include "cutlass/kernel_launch.h" -#if !defined(__CUDACC_RTC__) - #include "cutlass/cluster_launch.hpp" - #include "cutlass/trace.h" -#endif // !defined(__CUDACC_RTC__) - -// 2.x -#include "gemm_universal_base.h" -#include "cutlass/gemm/kernel/gemm_transpose_operands.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" -#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h" - -// 3.x -#include "gemm_universal.hpp" - -#if defined(CUTLASS_ENABLE_SYCL) - #include "cutlass/util/sycl_event_manager.hpp" -#endif - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::device { - -//////////////////////////////////////////////////////////////////////////////// - -/*! - GemmUniversalAdapter is a stateful, reusable GEMM handle built around a kernel - of type cutlass::gemm::kernel::Gemm or cutlass::gemm::kernel::GemmUniversal. - - It manages the lifetime of the underlying `kernel::Params` struct, and exposes - APIs to create it from the host facing arguments. For power users, new static - methods are exposed in 3.x APIs that bypass the stateful methods or - args->params lowering. - - It supports kernel types that implement both the 2.x and 3.0 APIs, - however, this is done by specializing the implementation of - GemmUniversalAdapter on the two kernel API types, and thus, - GemmUniversalAdapter's behaviour might differ between the two specializations. -*/ -template -class GemmUniversalAdapter; - -//////////////////////////////////////////////////////////////////////////////// -////////////////////////////// CUTLASS 3.x API ///////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -// Work-around for some DispatchPolicy types not having a Stages member. -// In that case, the Stages value is 0. Most code should static_assert -// that the number of stages is valid. - -// Whether DispatchPolicy::Stages is valid. 
-// It should also be convertible to int, but if not, that will show up -// as a build error when GemmUniversalAdapter attempts to assign it to kStages. -template -struct has_Stages : cute::false_type {}; - -template -struct has_Stages> - : cute::true_type {}; - -template -constexpr int stages_member(DispatchPolicy) { - if constexpr (has_Stages::value) { - return DispatchPolicy::Stages; - } else { - return 0; - } -} - -} // namespace detail - -template -class GemmUniversalAdapter>::value>> { - public: - using GemmKernel = GetUnderlyingKernel_t; - using TileShape = typename GemmKernel::TileShape; - using ElementA = typename GemmKernel::ElementA; - using ElementB = typename GemmKernel::ElementB; - using ElementC = typename GemmKernel::ElementC; - using ElementD = typename GemmKernel::ElementD; - using ElementAccumulator = typename GemmKernel::ElementAccumulator; - using DispatchPolicy = typename GemmKernel::DispatchPolicy; - using CollectiveMainloop = typename GemmKernel::CollectiveMainloop; - using CollectiveEpilogue = typename GemmKernel::CollectiveEpilogue; - - // Map back to 2.x type as best as possible - using LayoutA = - gemm::detail::StrideToLayoutTagA_t; - using LayoutB = - gemm::detail::StrideToLayoutTagB_t; - using LayoutC = - gemm::detail::StrideToLayoutTagC_t; - using LayoutD = - gemm::detail::StrideToLayoutTagC_t; - - static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; - - static ComplexTransform const kTransformA = - cute::is_same_v - ? ComplexTransform::kConjugate - : ComplexTransform::kNone; - static ComplexTransform const kTransformB = - cute::is_same_v - ? ComplexTransform::kConjugate - : ComplexTransform::kNone; - - // Legacy: Assume MultiplyAdd only since we do not use this tag type in 3.0 - using MathOperator = cutlass::arch::OpMultiplyAdd; - - using OperatorClass = cutlass::detail::get_operator_class_t< - typename CollectiveMainloop::TiledMma>; - - using ArchTag = typename GemmKernel::ArchTag; - - // NOTE: Assume identity swizzle for now - using ThreadblockSwizzle = - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - - // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape - using ThreadblockShape = cutlass::gemm::GemmShape(TileShape{}), - cute::size<1>(TileShape{}), - cute::size<2>(TileShape{})>; - - using ClusterShape = cutlass::gemm::GemmShape< - cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}), - cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), - cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})>; - - // Instruction shape is easy too, since we get that directly from our - // TiledMma's atom shape - using InstructionShape = cutlass::gemm::GemmShape< - cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), - cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), - cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>; - - // Legacy: provide a correct warp count, but no reliable warp shape - static int const kThreadCount = GemmKernel::MaxThreadsPerBlock; - - // Warp shape is not a primary API type in 3.x - // But we can best approximate it by inspecting the TiledMma - // For this, we make the assumption that we always have 4 warps along M, and - // rest along N, none along K We also always round up the warp count to 4 if - // the tiled mma is smaller than 128 threads - static constexpr int WarpsInMma = cute::max( - 4, CUTE_STATIC_V(cute::size(typename GemmKernel::TiledMma{})) / 32); - static constexpr int WarpsInMmaM 
= 4; - static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM); - using WarpCount = cutlass::gemm::GemmShape; - using WarpShape = - cutlass::gemm::GemmShape( - typename CollectiveMainloop::TiledMma{})) / - WarpsInMmaM, - CUTE_STATIC_V(cute::tile_size<1>( - typename CollectiveMainloop::TiledMma{})) / - WarpsInMmaN, - CUTE_STATIC_V(cute::tile_size<2>( - typename CollectiveMainloop::TiledMma{}))>; - - static int constexpr kStages = - detail::stages_member(typename CollectiveMainloop::DispatchPolicy{}); - - // Inspect TiledCopy for A and B to compute the alignment size - static int constexpr kAlignmentA = - cutlass::detail::get_alignment_count_from_gmem_tiled_copy< - typename CollectiveMainloop::GmemTiledCopyA, ElementA, - typename CollectiveMainloop::TiledMma::ValTypeA>(); - static int constexpr kAlignmentB = - cutlass::detail::get_alignment_count_from_gmem_tiled_copy< - typename CollectiveMainloop::GmemTiledCopyB, ElementB, - typename CollectiveMainloop::TiledMma::ValTypeB>(); - static int constexpr kAlignmentC = - cutlass::detail::get_alignment_count_from_gmem_tiled_copy< - typename CollectiveEpilogue::GmemTiledCopyC, ElementC>(); - static int constexpr kAlignmentD = - cutlass::detail::get_alignment_count_from_gmem_tiled_copy< - typename CollectiveEpilogue::GmemTiledCopyD, ElementD>(); - - using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp; - - // Split-K preserves splits that are 128b aligned - static int constexpr kSplitKAlignment = cute::max( - 128 / sizeof_bits::value, 128 / sizeof_bits::value); - - /// Argument structure: User API - using Arguments = typename GemmKernel::Arguments; - /// Argument structure: Kernel API - using Params = typename GemmKernel::Params; - - private: - /// Kernel API parameters object - Params params_; - - public: - /// Access the Params structure - Params const& params() const { return params_; } - - /// Determines whether the GEMM can execute the given problem. 
- static Status can_implement(Arguments const& args) { - if (GemmKernel::can_implement(args)) { - return Status::kSuccess; - } else { - return Status::kInvalid; - } - } - - /// Gets the workspace size - static size_t get_workspace_size(Arguments const& args) { - size_t workspace_bytes = 0; - if (args.mode == GemmUniversalMode::kGemmSplitKParallel) { - workspace_bytes += sizeof(int) * size_t(cute::size<0>(TileShape{})) * - size_t(cute::size<1>(TileShape{})); - } - - workspace_bytes += GemmKernel::get_workspace_size(args); - - CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); - - return workspace_bytes; - } - - /// Computes the grid shape - static dim3 get_grid_shape(Arguments const& args, void* workspace = nullptr) { - auto tmp_params = GemmKernel::to_underlying_arguments(args, workspace); - return GemmKernel::get_grid_shape(tmp_params); - } - - /// Computes the grid shape - static dim3 get_grid_shape(Params const& params) { - return GemmKernel::get_grid_shape(params); - } - - /// Computes the maximum number of active blocks per multiprocessor - static int maximum_active_blocks(int /* smem_capacity */ = -1) { - CUTLASS_TRACE_HOST("GemmUniversal::maximum_active_blocks()"); - int max_active_blocks = -1; - int smem_size = GemmKernel::SharedStorageSize; - - // first, account for dynamic smem capacity if needed - cudaError_t result; - if (smem_size >= (48 << 10)) { - CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); - result = cudaFuncSetAttribute(device_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size); - if (cudaSuccess != result) { - result = cudaGetLastError(); // to clear the error bit - CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " - << cudaGetErrorString(result)); - return -1; - } - } - - // query occupancy after setting smem size - result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, device_kernel, - GemmKernel::MaxThreadsPerBlock, smem_size); - - if (cudaSuccess != result) { - result = cudaGetLastError(); // to clear the error bit - CUTLASS_TRACE_HOST( - " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: " - << cudaGetErrorString(result)); - return -1; - } - - CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); - return max_active_blocks; - } - - /// Initializes GEMM state from arguments. - Status initialize(Arguments const& args, void* workspace = nullptr, - cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversal::initialize() - workspace " - << workspace - << ", stream: " << (stream ? "non-null" : "null")); - - // Initialize the workspace - Status status = - GemmKernel::initialize_workspace(args, workspace, stream, cuda_adapter); - if (status != Status::kSuccess) { - return status; - } - // Initialize the Params structure - params_ = GemmKernel::to_underlying_arguments(args, workspace); - // Don't set the function attributes - require the CudaHostAdapter to set - // it. 
- if constexpr (kEnableCudaHostAdapter) { - CUTLASS_ASSERT(cuda_adapter); - return Status::kSuccess; - } else { - // - // Account for dynamic smem capacity if needed - // - int smem_size = GemmKernel::SharedStorageSize; - - CUTLASS_ASSERT(cuda_adapter == nullptr); - -#if !defined(CUTLASS_ENABLE_SYCL) - if (smem_size >= (48 << 10)) { - CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); - cudaError_t result = cudaFuncSetAttribute( - device_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - if (cudaSuccess != result) { - result = cudaGetLastError(); // to clear the error bit - CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " - << cudaGetErrorString(result)); - return Status::kErrorInternal; - } - } -#endif - } - return Status::kSuccess; - } - - /// Update API is preserved in 3.0, but does not guarantee a lightweight - /// update of params. - Status update(Arguments const& args, void* workspace = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversal()::update() - workspace: " << workspace); - - size_t workspace_bytes = get_workspace_size(args); - if (workspace_bytes > 0 && nullptr == workspace) { - return Status::kErrorWorkspaceNull; - } - - params_ = GemmKernel::to_underlying_arguments(args, workspace); - return Status::kSuccess; - } - - /// Primary run() entry point API that is static allowing users to create and - /// manage their own params. Supplied params struct must be construct by - /// calling GemmKernel::to_underlying_arguments() - static Status run(Params& params, sycl::queue& stream, - CudaHostAdapter* cuda_adapter = nullptr, - bool launch_with_pdl = false) { - CUTLASS_TRACE_HOST("GemmUniversal::run()"); - dim3 const block = GemmKernel::get_block_shape(); - dim3 const grid = get_grid_shape(params); - -#if defined(CUTLASS_ENABLE_SYCL) - const syclcompat::dim3 sycl_block(block.x, block.y, block.z); - const syclcompat::dim3 sycl_grid(grid.x, grid.y, grid.z); -#endif - - // configure smem size and carveout - int smem_size = GemmKernel::SharedStorageSize; - - Status launch_result{Status::kSuccess}; - // Use extended launch API only for mainloops that use it - if constexpr (GemmKernel::ArchTag::kMinComputeCapability >= 90) { -#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST("GemmUniversal::run: Use extended launch API"); -#endif -#if !defined(CUTLASS_ENABLE_SYCL) - [[maybe_unused]] constexpr bool is_static_1x1x1 = - cute::is_static_v< - typename GemmKernel::DispatchPolicy::ClusterShape> and - cute::size(typename GemmKernel::DispatchPolicy::ClusterShape{}) == 1; - [[maybe_unused]] dim3 cluster( - cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}), - cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), - cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})); - - // Dynamic cluster support - [[maybe_unused]] dim3 fallback_cluster = dim3{0, 0, 0}; - if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 100 || - GemmKernel::ArchTag::kMinComputeCapability == 101) { - if constexpr (!cute::is_static_v< - typename GemmKernel::DispatchPolicy::ClusterShape>) { - fallback_cluster = params.hw_info.cluster_shape_fallback; - cluster = params.hw_info.cluster_shape; - } - } - - [[maybe_unused]] void* kernel_params[] = {¶ms}; - - if constexpr (kEnableCudaHostAdapter) { - // - // Use the cuda host adapter - // - CUTLASS_ASSERT(cuda_adapter); - if (cuda_adapter) { - if (launch_with_pdl) { - CUTLASS_TRACE_HOST( - "GemmUniversal::run() does not support launching with PDL and " - "a custom cuda adapter."); - return 
Status::kErrorInternal; - } - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: Launching kernel with CUDA host adapter"); - #endif - if constexpr (is_static_1x1x1) { - launch_result = cuda_adapter->launch(grid, block, smem_size, stream, - kernel_params, 0); - } else { - launch_result = - cuda_adapter->launch(grid, cluster, fallback_cluster, block, - smem_size, stream, kernel_params, 0); - } - } else { - CUTLASS_TRACE_HOST( - "GemmUniversal::run: kEnableCudaHostAdapter is true, but CUDA " - "host adapter is null"); - return Status::kErrorInternal; - } - } else { - CUTLASS_ASSERT(cuda_adapter == nullptr); - [[maybe_unused]] void const* kernel = - (void const*)device_kernel; - static constexpr bool kClusterLaunch = - GemmKernel::ArchTag::kMinComputeCapability == 90; - if constexpr (kClusterLaunch) { - if constexpr (is_static_1x1x1) { - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: Launching static 1x1x1 kernel"); - #endif - launch_result = cutlass::kernel_launch( - grid, block, smem_size, stream, params, launch_with_pdl); - if (launch_result != Status::kSuccess) { - CUTLASS_TRACE_HOST( - "GemmUniversal::run: cutlass::kernel_launch reports failure"); - } - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - else { - CUTLASS_TRACE_HOST( - "GemmUniversal::run: cutlass::kernel_launch reports success"); - } - #endif - } else { - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: Launching dynamic cluster kernel"); - #endif - launch_result = - ClusterLauncher::launch(grid, cluster, block, smem_size, stream, - kernel, kernel_params, launch_with_pdl); - } - } - - else { - if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 100 || - GemmKernel::ArchTag::kMinComputeCapability == 101 || - GemmKernel::ArchTag::kMinComputeCapability == 120) { - if constexpr (is_static_1x1x1) { - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: Launching static 1x1x1 kernel"); - #endif - launch_result = cutlass::kernel_launch( - grid, block, smem_size, stream, params, launch_with_pdl); - if (launch_result != Status::kSuccess) { - CUTLASS_TRACE_HOST( - "GemmUniversal::run: cutlass::kernel_launch reports " - "failure"); - } - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - else { - CUTLASS_TRACE_HOST( - "GemmUniversal::run: cutlass::kernel_launch reports " - "success"); - } - #endif - } else { - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: Launching kernel with fall-back " - "cluster"); - #endif - launch_result = ClusterLauncher::launch_with_fallback_cluster( - grid, cluster, fallback_cluster, block, smem_size, stream, - kernel, kernel_params, launch_with_pdl); - } - } - } - } -#endif - } else { - launch_result = Status::kSuccess; - cutlass::arch::synclog_setup(); - - if constexpr (kEnableCudaHostAdapter) { - CUTLASS_ASSERT(cuda_adapter); - if (cuda_adapter) { - void* kernel_params[] = {¶ms}; -#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: Launching kernel with CUDA host adapter"); -#endif - launch_result = cuda_adapter->launch(grid, block, smem_size, stream, - kernel_params, 0); - - } else { - CUTLASS_TRACE_HOST("GemmUniversal::run: CUDA host adapter is null"); - return Status::kErrorInternal; - } - } else { - CUTLASS_ASSERT(cuda_adapter == nullptr); -#if defined(CUTLASS_ENABLE_SYCL) - // sycl::queue q = stream; // ? 
*stream : - // syclcompat::get_default_queue(); - #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; - if constexpr (cute::is_same_v) { - auto event = launch>( - launch_policy{sycl_grid, sycl_block, - local_mem_size { - static_cast(smem_size) - }}, - q, params); - EventManager::getInstance().addEvent(event); - } else { - auto event = launch>( - launch_policy{ - sycl_grid, sycl_block, - local_mem_size{static_cast(smem_size)} - #if defined(SYCL_INTEL_TARGET) - , - kernel_properties { - sycl_exp::sub_group_size - } - #endif - }, - stream, params); - EventManager::getInstance().addEvent(event); - } - #else - #if defined(SYCL_INTEL_TARGET) - constexpr bool allow_subgroup_size_prop = true; - #else - constexpr bool allow_subgroup_size_prop = false; - #endif - auto kernel_props = [] { - constexpr bool is_device_agnostic = - cute::is_same_v; - if constexpr (!allow_subgroup_size_prop or is_device_agnostic) { - using EmptyProperties = - decltype(sycl::ext::oneapi::experimental::properties()); - return syclcompat::experimental::kernel_properties< - EmptyProperties>{}; - } else { - return syclcompat::experimental::kernel_properties{ - sycl::ext::oneapi::experimental::sub_group_size< - DispatchPolicy::SubgroupSize>}; - } - }(); - syclcompat::experimental::launch_properties launch_props{ - sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), - }; - syclcompat::experimental::launch_policy policy{ - sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = - syclcompat::experimental::launch>( - policy, stream, params); - EventManager::getInstance().addEvent(event); - #endif // !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) -#else - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: Launching kernel with cutlass::kernel_launch"); - #endif - launch_result = cutlass::kernel_launch( - grid, block, smem_size, stream, params, launch_with_pdl); - if (launch_result != Status::kSuccess) { - CUTLASS_TRACE_HOST( - "GemmUniversal::run: cutlass::kernel_launch reports failure"); - } - #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - else { - CUTLASS_TRACE_HOST( - "GemmUniversal::run: cutlass::kernel_launch reports success"); - } - #endif -#endif - } - } - - cudaError_t result = cudaGetLastError(); - if (cudaSuccess == result && Status::kSuccess == launch_result) { -#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST( - "GemmUniversal::run: cudaGetLastError reports success"); -#endif - return Status::kSuccess; - } else { - CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); - return Status::kErrorInternal; - } - } - - // - // Non-static launch overloads that first create and set the internal params - // struct of this kernel handle. - // - - /// Launches the kernel after first constructing Params internal state from - /// supplied arguments. - Status run(Arguments const& args, void* workspace, sycl::queue& stream, - CudaHostAdapter* cuda_adapter = nullptr, - bool launch_with_pdl = false) { - Status status = initialize(args, workspace, stream, cuda_adapter); - - if (Status::kSuccess == status) { - status = run(params_, stream, cuda_adapter, launch_with_pdl); - } - return status; - } - - /// Launches the kernel after first constructing Params internal state from - /// supplied arguments. 
- Status operator()(Arguments const& args, void* workspace, sycl::queue& stream, - CudaHostAdapter* cuda_adapter = nullptr, - bool launch_with_pdl = false) { - return run(args, workspace, stream, cuda_adapter, launch_with_pdl); - } - - /// Overload that allows a user to re-launch the same kernel without updating - /// internal params struct. - Status run(sycl::queue& stream, CudaHostAdapter* cuda_adapter = nullptr, - bool launch_with_pdl = false) { - return run(params_, stream, cuda_adapter, launch_with_pdl); - } - - /// Overload that allows a user to re-launch the same kernel without updating - /// internal params struct. - Status operator()(sycl::queue& stream, - CudaHostAdapter* cuda_adapter = nullptr, - bool launch_with_pdl = false) { - return run(params_, stream, cuda_adapter, launch_with_pdl); - } -}; - -//////////////////////////////////////////////////////////////////////////////// -////////////////////////////// CUTLASS 2.x API ///////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - -template -class GemmUniversalAdapter< - GemmKernel_, cute::enable_if_t>::value>> { - public: - using GemmKernel = GetUnderlyingKernel_t; - - static bool const kInternalTranspose = - !cutlass::epilogue::threadblock::detail::is_2x_evt_v< - typename GemmKernel::Epilogue> && // 2.x EVT does not require - // internal transpose - cute::is_same::value; - - using ThreadblockShape = typename GemmKernel::Mma::Shape; - using WarpShape = typename GemmKernel::WarpShape; - using InstructionShape = typename GemmKernel::InstructionShape; - - // warp-level, arch-level (instruction), math operator - using WarpMmaOperator = typename GemmKernel::Mma::Policy::Operator; - using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; - using MathOperator = typename WarpMmaOperator::MathOperator; - - // Operator class and arch tag extract bottom-up - // set it for top-level gemm device-level template - using OperatorClass = typename WarpMmaOperator::OperatorClass; - using ArchTag = typename WarpMmaOperator::ArchTag; - - // Type, layout, and complex transform deliberately exchanged with B - using MapArguments = kernel::detail::MapArguments< - typename GemmKernel::ElementA, typename GemmKernel::LayoutA, - GemmKernel::kTransformA, GemmKernel::kAlignmentA, - typename GemmKernel::ElementB, typename GemmKernel::LayoutB, - GemmKernel::kTransformB, GemmKernel::kAlignmentB, - typename GemmKernel::LayoutC, kInternalTranspose>; - - using ElementA = typename MapArguments::ElementA; - using LayoutA = typename MapArguments::LayoutA; - static ComplexTransform const kTransformA = MapArguments::kTransformA; - static int const kAlignmentA = MapArguments::kAlignmentA; - - using ElementB = typename MapArguments::ElementB; - using LayoutB = typename MapArguments::LayoutB; - static ComplexTransform const kTransformB = MapArguments::kTransformB; - static int const kAlignmentB = MapArguments::kAlignmentB; - - using ElementC = typename GemmKernel::ElementC; - using LayoutC = typename MapArguments::LayoutC; - static int const kAlignmentC = GemmKernel::kAlignmentC; - - // C and D same type for 2.x kernel - using ElementD = ElementC; - using LayoutD = LayoutC; - - using TensorRefA = TensorRef; - using TensorRefB = TensorRef; - using TensorRefC = TensorRef; - using TensorRefD = TensorRef; - - static int const kStages = GemmKernel::Mma::kStages; - - using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp; - using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; - 
using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle; - using UnderlyingOperator = GemmUniversalBase; - using Arguments = typename UnderlyingOperator::Arguments; - - private: - UnderlyingOperator underlying_operator_; - - public: - /// Constructs the GEMM. - GemmUniversalAdapter() {} - - /// Helper to construct a transposed equivalent for the underlying GEMM - /// operator - static Arguments to_underlying_arguments(Arguments const& args) { - if (kInternalTranspose) { - return args.transposed_problem(); - } else { - return args; - } - } - - /// Determines whether the GEMM can execute the given problem. - static Status can_implement(Arguments const& args, - CudaHostAdapter* cuda_adapter = nullptr) { - return UnderlyingOperator::can_implement(to_underlying_arguments(args), - cuda_adapter); - } - - /// Gets the workspace size - static size_t get_workspace_size(Arguments const& args, - CudaHostAdapter* cuda_adapter = nullptr) { - return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args), - cuda_adapter); - } - - /// Computes the grid shape - static dim3 get_grid_shape(Arguments const& args) { - return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args)); - } - - /// Computes the maximum number of active blocks per multiprocessor - static int maximum_active_blocks(int smem_capacity = -1) { - return UnderlyingOperator::maximum_active_blocks(smem_capacity); - } - - /// Initializes GEMM state from arguments. - Status initialize(Arguments const& args, void* workspace = nullptr, - cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - return underlying_operator_.initialize(to_underlying_arguments(args), - workspace, stream, cuda_adapter); - } - - /// Lightweight update given a subset of arguments. - Status update(Arguments const& args) { - return underlying_operator_.update(to_underlying_arguments(args)); - } - - /// Runs the kernel using initialized state. - Status run(cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - return underlying_operator_.run(stream, cuda_adapter); - } - - /// Runs the kernel using initialized state. - Status operator()(cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - return run(stream); - } - - /// Runs the kernel using initialized state. - Status operator()(Arguments const& args, void* workspace = nullptr, - cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - Status status = initialize(args, workspace, stream, cuda_adapter); - - if (status == Status::kSuccess) { - status = run(stream, cuda_adapter); - } - - return status; - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::device - -//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_base.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_base.h deleted file mode 100644 index b909318..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_base.h +++ /dev/null @@ -1,524 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights - *reserved. SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! - \file - \brief The universal GEMM accommodates streamk, batched strided, and batched - array variants. -*/ - -#pragma once - -#if defined(__CUDACC_RTC__) - #include -#else - #include -#endif - -#include "cutlass/cutlass.h" -#include "cutlass/numeric_types.h" -#include "cutlass/arch/arch.h" -#include "cutlass/device_kernel.h" -#include "cutlass/cuda_host_adapter.hpp" - -#include "cutlass/gemm/gemm.h" -#include "gemm_universal_k.h" - -#include "default_gemm_universal.h" -#include "cutlass/gemm/device/default_gemm_configuration.h" - -#include "cutlass/trace.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace device { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -class GemmUniversalBase { - public: - using GemmKernel = GemmKernel_; - - /// Boolean indicating whether the CudaHostAdapter is enabled - static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; - - using ThreadblockShape = typename GemmKernel::Mma::Shape; - - using ElementA = typename GemmKernel::ElementA; - using LayoutA = typename GemmKernel::LayoutA; - using TensorRefA = TensorRef; - static ComplexTransform const kTransformA = GemmKernel::kTransformA; - - using ElementB = typename GemmKernel::ElementB; - using LayoutB = typename GemmKernel::LayoutB; - using TensorRefB = TensorRef; - static ComplexTransform const kTransformB = GemmKernel::kTransformB; - - using ElementC = typename GemmKernel::ElementC; - using LayoutC = typename GemmKernel::LayoutC; - using TensorRefC = TensorRef; - using TensorRefD = TensorRef; - - /// Numerical accumulation element type - using ElementAccumulator = typename GemmKernel::Mma::ElementC; - - using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp; - using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle; - using Operator = typename GemmKernel::Operator; - - /// Argument structure - using Arguments = typename GemmKernel::Arguments; - - /// Index of the GEMM 
Kernel within the CudaHostAdapter - static int32_t const kGemmKernelIndex = 0; - - /// Kernel dynamic shared memory allocation requirement - /// Update the kernel function's shared memory configuration for the current - /// device - static constexpr size_t kSharedStorageSize = - sizeof(typename GemmKernel::SharedStorage); - - protected: - // - // Device properties (uniform across all instances of the current thread) - // - - // Device ordinal - CUTLASS_THREAD_LOCAL static int device_ordinal_; - - /// Device SM count - CUTLASS_THREAD_LOCAL static int device_sms_; - - /// Kernel SM occupancy (in thread blocks) - CUTLASS_THREAD_LOCAL static int sm_occupancy_; - - protected: - /// Initialize static thread-local members for the thread's current device, - /// if necessary. - static Status init_device_props() { - CUTLASS_TRACE_HOST("GemmUniversalBase::init_device_props()"); - - cudaError_t cudart_result; - - // Get current device ordinal - int current_ordinal; - cudart_result = cudaGetDevice(¤t_ordinal); - if (cudart_result != cudaSuccess) { - CUTLASS_TRACE_HOST(" cudaGetDevice() returned error " - << cudaGetErrorString(cudart_result)); - return Status::kErrorInternal; - } - - // Done if matches the current static member - if (current_ordinal == device_ordinal_) { - // Already initialized - return Status::kSuccess; - } - - // Update SM count member - cudart_result = cudaDeviceGetAttribute( - &device_sms_, cudaDevAttrMultiProcessorCount, current_ordinal); - if (cudart_result != cudaSuccess) { - CUTLASS_TRACE_HOST(" cudaDeviceGetAttribute() returned error " - << cudaGetErrorString(cudart_result)); - return Status::kErrorInternal; - } - - // If requires more than 48KB: configure for extended, dynamic shared memory - if constexpr (kSharedStorageSize >= (48 << 10)) { - cudart_result = cudaFuncSetAttribute( - Kernel2, cudaFuncAttributeMaxDynamicSharedMemorySize, - kSharedStorageSize); - if (cudart_result != cudaSuccess) { - CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error " - << cudaGetErrorString(cudart_result)); - return Status::kErrorInternal; - } - } - - // Update SM occupancy member - cudart_result = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - &sm_occupancy_, Kernel2, GemmKernel::kThreadCount, - kSharedStorageSize, cudaOccupancyDisableCachingOverride); - if (cudart_result != cudaSuccess) { - CUTLASS_TRACE_HOST( - " cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags() returned " - "error " - << cudaGetErrorString(cudart_result)); - return Status::kErrorInternal; - } - - // Update device ordinal member on success - device_ordinal_ = current_ordinal; - - CUTLASS_TRACE_HOST( - " " - "device_ordinal: (" - << device_ordinal_ - << "), " - "device_sms: (" - << device_sms_ - << "), " - "sm_occupancy: (" - << sm_occupancy_ - << ") " - "smem_size: (" - << kSharedStorageSize - << ") " - "GemmKernel::kThreadCount: (" - << GemmKernel::kThreadCount << ")"); - - return Status::kSuccess; - } - - protected: - // - // Instance data members - // - - /// Kernel parameters - typename GemmKernel::Params params_; - - /// Initialize params member - Status init_params(Arguments const& args, - CudaHostAdapter* cuda_adapter = nullptr) { - int32_t device_sms = 0; - int32_t sm_occupancy = 0; - - if constexpr (kEnableCudaHostAdapter) { - CUTLASS_ASSERT(cuda_adapter); - - // - // Occupancy query using CudaHostAdapter::query_occupancy(). 
- // - - if (cuda_adapter) { - Status status = cuda_adapter->query_occupancy( - &device_sms, &sm_occupancy, kGemmKernelIndex, - GemmKernel::kThreadCount, kSharedStorageSize); - - CUTLASS_ASSERT(status == Status::kSuccess); - - if (status != Status::kSuccess) { - return status; - } - } else { - return Status::kErrorInternal; - } - } else { - CUTLASS_ASSERT(cuda_adapter == nullptr); - - // Initialize static device properties, if necessary - Status result = init_device_props(); - - if (result != Status::kSuccess) { - return result; - } - - // - // Use thread-local static members for occupancy query initialized by call - // to `init_device_props()` - // - - device_sms = device_sms_; - sm_occupancy = sm_occupancy_; - } - - // Initialize params member - params_ = typename GemmKernel::Params(args, device_sms, sm_occupancy); - return Status::kSuccess; - } - - public: - //--------------------------------------------------------------------------------------------- - // Stateless API - //--------------------------------------------------------------------------------------------- - - /// Determines whether the GEMM can execute the given problem. - static Status can_implement(Arguments const& args, - CudaHostAdapter* cuda_adapter = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversalBase::can_implement()"); - - if (!kEnableCudaHostAdapter || cuda_adapter) { - dim3 grid = get_grid_shape(args, cuda_adapter); - - if (!(grid.y <= std::numeric_limits::max() && - grid.z <= std::numeric_limits::max())) { - return Status::kErrorInvalidProblem; - } - } else { - // - // With a null host adapter, a conservative grid shape is computed and - // required to conform to CUDA grid dimension limits. - // - - int64_t logicalGridM = - (int64_t(args.problem_size.m()) + ThreadblockShape::kM - 1) / - ThreadblockShape::kM; - int64_t logicalGridN = - (int64_t(args.problem_size.n()) + ThreadblockShape::kN - 1) / - ThreadblockShape::kN; - int32_t logicalGridL = args.batch_count; - - if ((int64_t(std::numeric_limits::max()) < logicalGridM) || - (int64_t(std::numeric_limits::max()) < logicalGridN) || - (int32_t(std::numeric_limits::max()) < logicalGridL)) { - return Status::kErrorInvalidProblem; - } - } - - return GemmKernel::can_implement(args); - } - - /// Returns the workspace size (in bytes) needed for the problem - /// geometry expressed by these arguments - static size_t get_workspace_size(Arguments const& args, - CudaHostAdapter* cuda_adapter = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversalBase::get_workspace_size()"); - - // Initialize parameters from args - GemmUniversalBase base; - if (base.init_params(args, cuda_adapter) != Status::kSuccess) { - return 0; - } - - // Get size from parameters - size_t workspace_bytes = base.params_.get_workspace_size(); - - CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); - return workspace_bytes; - } - - /// Returns the grid extents in thread blocks to launch - static dim3 get_grid_shape(Arguments const& args, - CudaHostAdapter* cuda_adapter = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversalBase::get_grid_shape()"); - - // Initialize parameters from args - GemmUniversalBase base; - if (base.init_params(args, cuda_adapter) != Status::kSuccess) { - return dim3(0, 0, 0); - } - - // Get dims from parameters - dim3 grid_dims = base.params_.get_grid_dims(); - - CUTLASS_TRACE_HOST(" tiled_shape: " - << base.params_.get_tiled_shape() << "\n" - << " grid_dims: {" << grid_dims << "}"); - - return grid_dims; - } - - /// Returns the maximum number of active thread blocks per multiprocessor - 
static int maximum_active_blocks(CudaHostAdapter* cuda_adapter = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversalBase::maximum_active_blocks()"); - - int32_t device_sms = 0; - int32_t sm_occupancy = 0; - - if constexpr (kEnableCudaHostAdapter) { - CUTLASS_ASSERT(cuda_adapter); - - if (cuda_adapter) { - Status status = cuda_adapter->query_occupancy( - &device_sms, &sm_occupancy, kGemmKernelIndex, - GemmKernel::kThreadCount, kSharedStorageSize); - - CUTLASS_ASSERT(status == Status::kSuccess); - - if (status != Status::kSuccess) { - return -1; - } - } else { - return -1; - } - } else { - CUTLASS_ASSERT(cuda_adapter == nullptr); - // Initialize static device properties, if necessary - if (init_device_props() != Status::kSuccess) { - return -1; - } - - sm_occupancy = sm_occupancy_; - } - - CUTLASS_TRACE_HOST(" max_active_blocks: " << sm_occupancy_); - return sm_occupancy; - } - - //--------------------------------------------------------------------------------------------- - // Stateful API - //--------------------------------------------------------------------------------------------- - - /// Initializes GEMM state from arguments and workspace memory - Status initialize(Arguments const& args, void* workspace = nullptr, - cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversalBase::initialize() - workspace " - << workspace - << ", stream: " << (stream ? "non-null" : "null")); - - // Initialize parameters from args - Status result = init_params(args, cuda_adapter); - if (result != Status::kSuccess) { - return result; - } - - // Assign and prepare workspace memory - if (args.mode == GemmUniversalMode::kGemm) { - return params_.init_workspace(workspace, stream); - } - - return Status::kSuccess; - } - - /// Lightweight update given a subset of arguments. - Status update(Arguments const& args) { - CUTLASS_TRACE_HOST("GemmUniversalBase()::update()"); - params_.update(args); - return Status::kSuccess; - } - - /// Runs the kernel using initialized state. - Status run(cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - CUTLASS_TRACE_HOST("GemmUniversalBase::run()"); - - // Configure grid and block dimensions - dim3 block(GemmKernel::kThreadCount, 1, 1); - dim3 grid = params_.get_grid_dims(); - - // Launch kernel - CUTLASS_TRACE_HOST( - " " - "grid: (" - << grid - << "), " - "block: (" - << block - << "), " - "SMEM: (" - << kSharedStorageSize << ")"); - - cutlass::arch::synclog_setup(); - - if constexpr (kEnableCudaHostAdapter) { - CUTLASS_ASSERT(cuda_adapter); - if (cuda_adapter) { - void* kernel_params[] = {¶ms_}; - return cuda_adapter->launch(grid, block, kSharedStorageSize, stream, - kernel_params, 0); - } else { - return Status::kErrorInternal; - } - } else { - CUTLASS_ASSERT(cuda_adapter == nullptr); - -#if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); - - sycl::queue q = stream ? 
*stream : syclcompat::get_default_queue(); - syclcompat::experimental::launch>( - syclcompat::experimental::launch_policy{ - sycl_grid, sycl_block, - #if defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - sycl::ext::oneapi::experimental::work_group_scratch_size( - kSharedStorageSize) - #else - syclcompat::experimental::local_mem_size{ - static_cast(kSharedStorageSize)} - #endif - }, - q, params_); -#else - Kernel2<<>>(params_); -#endif - - // Query for errors - cudaError_t result = cudaGetLastError(); - if (result != cudaSuccess) { - CUTLASS_TRACE_HOST(" grid launch failed with error " - << cudaGetErrorString(result)); - return Status::kErrorInternal; - } - } - - return Status::kSuccess; - } - - /// Runs the kernel using initialized state. - Status operator()(cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - return run(stream, cuda_adapter); - } - - /// Runs the kernel using initialized state. - Status operator()(Arguments const& args, void* workspace = nullptr, - cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - Status status = initialize(args, workspace, stream, cuda_adapter); - - if (status == Status::kSuccess) { - status = run(stream, cuda_adapter); - } - - return status; - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// -/// Static initializers -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Device ordinal -template -CUTLASS_THREAD_LOCAL int GemmUniversalBase::device_ordinal_ = -1; - -/// Device SM count -template -CUTLASS_THREAD_LOCAL int GemmUniversalBase::device_sms_ = -1; - -/// Kernel SM occupancy (in thread blocks) -template -CUTLASS_THREAD_LOCAL int GemmUniversalBase::sm_occupancy_ = -1; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace gemm -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_k.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_k.h deleted file mode 100644 index 19871ee..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_k.h +++ /dev/null @@ -1,649 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights - *reserved. SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -/*! \file - \brief -*/ - -#pragma once - -#include "cutlass/cutlass.h" - -#include "cutlass/arch/arch.h" -#include "cutlass/fast_math.h" -#include "cutlass/matrix_coord.h" -#include "cutlass/complex.h" -#include "cutlass/semaphore.h" -#include "gemm_universal.hpp" - -#include "cutlass/layout/matrix.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/kernel/params_universal_base.h" -#include "cutlass/trace.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace kernel { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -class GemmUniversal< - Mma_, Epilogue_, ThreadblockSwizzle_, void, - // 3.x kernels use the first template argument to define the ProblemShape - // We use this invariant to SFINAE dispatch against either the 2.x API or - // the 3.x API - cute::enable_if_t::value || - IsCutlass3ArrayKernel::value)>> { - public: - using Mma = Mma_; - using Epilogue = Epilogue_; - using EpilogueOutputOp = typename Epilogue::OutputOp; - using ThreadblockSwizzle = ThreadblockSwizzle_; - - using ElementA = typename Mma::IteratorA::Element; - using LayoutA = typename Mma::IteratorA::Layout; - using ElementB = typename Mma::IteratorB::Element; - using LayoutB = typename Mma::IteratorB::Layout; - using ElementC = typename Epilogue::OutputTileIterator::Element; - using LayoutC = typename Epilogue::OutputTileIterator::Layout; - - static ComplexTransform const kTransformA = Mma::kTransformA; - static ComplexTransform const kTransformB = Mma::kTransformB; - using Operator = typename Mma::Operator; - - using OperatorClass = typename Mma::Operator::OperatorClass; - using ThreadblockShape = typename Mma::Shape; - using WarpShape = typename Mma::Operator::Shape; - using InstructionShape = typename Mma::Policy::Operator::InstructionShape; - using ArchTag = typename Mma::ArchTag; - - static int const kStages = Mma::kStages; - static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; - static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; - static int const kAlignmentC = - Epilogue::OutputTileIterator::kElementsPerAccess; - - /// Warp count (concept: GemmShape) - using WarpCount = typename Mma::WarpCount; - static int const kThreadCount = 32 * WarpCount::kCount; - - /// Split-K preserves splits that are 128b aligned - static int const kSplitKAlignment = const_max( - 128 / sizeof_bits::value, 128 / sizeof_bits::value); - - // - // Structures - // - - /// Argument structure - struct Arguments : UniversalArgumentsBase { - // - // 
Data members - // - - typename EpilogueOutputOp::Params epilogue; - - void const* ptr_A; - void const* ptr_B; - void const* ptr_C; - void* ptr_D; - - int64_t batch_stride_A; - int64_t batch_stride_B; - int64_t batch_stride_C; - - typename LayoutA::Stride stride_a; - typename LayoutB::Stride stride_b; - typename LayoutC::Stride stride_c; - typename LayoutC::Stride stride_d; - - typename LayoutA::Stride::LongIndex lda; - typename LayoutB::Stride::LongIndex ldb; - typename LayoutC::Stride::LongIndex ldc; - typename LayoutC::Stride::LongIndex ldd; - - int const* ptr_gather_A_indices; - int const* ptr_gather_B_indices; - int const* ptr_scatter_D_indices; - - // - // Methods - // - - Arguments() - : ptr_A(nullptr), - ptr_B(nullptr), - ptr_C(nullptr), - ptr_D(nullptr), - ptr_gather_A_indices(nullptr), - ptr_gather_B_indices(nullptr), - ptr_scatter_D_indices(nullptr) {} - - /// constructs an arguments structure - Arguments(GemmUniversalMode mode, GemmCoord problem_size, int batch_count, - typename EpilogueOutputOp::Params epilogue, void const* ptr_A, - void const* ptr_B, void const* ptr_C, void* ptr_D, - int64_t batch_stride_A, int64_t batch_stride_B, - int64_t batch_stride_C, int64_t batch_stride_D, - typename LayoutA::Stride stride_a, - typename LayoutB::Stride stride_b, - typename LayoutC::Stride stride_c, - typename LayoutC::Stride stride_d, - int const* ptr_gather_A_indices = nullptr, - int const* ptr_gather_B_indices = nullptr, - int const* ptr_scatter_D_indices = nullptr) - : UniversalArgumentsBase(mode, problem_size, batch_count, - batch_stride_D), - epilogue(epilogue), - ptr_A(ptr_A), - ptr_B(ptr_B), - ptr_C(ptr_C), - ptr_D(ptr_D), - batch_stride_A(batch_stride_A), - batch_stride_B(batch_stride_B), - batch_stride_C(batch_stride_C), - stride_a(stride_a), - stride_b(stride_b), - stride_c(stride_c), - stride_d(stride_d), - ptr_gather_A_indices(ptr_gather_A_indices), - ptr_gather_B_indices(ptr_gather_B_indices), - ptr_scatter_D_indices(ptr_scatter_D_indices) { - lda = 0; - ldb = 0; - ldc = 0; - ldd = 0; - CUTLASS_TRACE_HOST( - "GemmUniversal::Arguments::Arguments() - problem_size: " - << problem_size); - } - - /// constructs an arguments structure - Arguments(GemmUniversalMode mode, GemmCoord problem_size, int batch_count, - typename EpilogueOutputOp::Params epilogue, void const* ptr_A, - void const* ptr_B, void const* ptr_C, void* ptr_D, - int64_t batch_stride_A, int64_t batch_stride_B, - int64_t batch_stride_C, int64_t batch_stride_D, - typename LayoutA::Stride::LongIndex lda, - typename LayoutB::Stride::LongIndex ldb, - typename LayoutC::Stride::LongIndex ldc, - typename LayoutC::Stride::LongIndex ldd, - int const* ptr_gather_A_indices = nullptr, - int const* ptr_gather_B_indices = nullptr, - int const* ptr_scatter_D_indices = nullptr) - : UniversalArgumentsBase(mode, problem_size, batch_count, - batch_stride_D), - epilogue(epilogue), - ptr_A(ptr_A), - ptr_B(ptr_B), - ptr_C(ptr_C), - ptr_D(ptr_D), - batch_stride_A(batch_stride_A), - batch_stride_B(batch_stride_B), - batch_stride_C(batch_stride_C), - lda(lda), - ldb(ldb), - ldc(ldc), - ldd(ldd), - ptr_gather_A_indices(ptr_gather_A_indices), - ptr_gather_B_indices(ptr_gather_B_indices), - ptr_scatter_D_indices(ptr_scatter_D_indices) { - stride_a = make_Coord(lda); - stride_b = make_Coord(ldb); - stride_c = make_Coord(ldc); - stride_d = make_Coord(ldd); - CUTLASS_TRACE_HOST( - "GemmUniversal::Arguments::Arguments() - problem_size: " - << problem_size); - } - - /// Returns arguments for the transposed problem - Arguments transposed_problem() 
const { - Arguments args(*this); - - std::swap(args.problem_size.m(), args.problem_size.n()); - std::swap(args.ptr_A, args.ptr_B); - std::swap(args.lda, args.ldb); - std::swap(args.stride_a, args.stride_b); - std::swap(args.batch_stride_A, args.batch_stride_B); - std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices); - - return args; - } - }; - - // - // Structure for precomputing values in host memory and passing to kernels - // - - /// Parameters structure - struct Params - : UniversalParamsBase { - using ParamsBase = - UniversalParamsBase; - - // - // Data members - // - - typename Mma::IteratorA::Params params_A; - typename Mma::IteratorB::Params params_B; - typename Epilogue::OutputTileIterator::Params params_C; - typename Epilogue::OutputTileIterator::Params params_D; - - typename EpilogueOutputOp::Params output_op; - - void* ptr_A; - void* ptr_B; - void* ptr_C; - void* ptr_D; - - int64_t batch_stride_A; - int64_t batch_stride_B; - int64_t batch_stride_C; - - int* ptr_gather_A_indices; - int* ptr_gather_B_indices; - int* ptr_scatter_D_indices; - - // - // Host dispatch API - // - - /// Default constructor - Params() = default; - - /// Constructor - Params(Arguments const& args, /// GEMM application arguments - int device_sms, /// Number of SMs on the device - int sm_occupancy) /// Kernel SM occupancy (in thread blocks) - : ParamsBase(args, device_sms, sm_occupancy), - params_A(args.lda - ? make_Coord_with_padding(args.lda) - : args.stride_a), - params_B(args.ldb - ? make_Coord_with_padding(args.ldb) - : args.stride_b), - params_C(args.ldc - ? make_Coord_with_padding(args.ldc) - : args.stride_c), - params_D(args.ldd - ? make_Coord_with_padding(args.ldd) - : args.stride_d), - output_op(args.epilogue), - ptr_A(const_cast(args.ptr_A)), - ptr_B(const_cast(args.ptr_B)), - ptr_C(const_cast(args.ptr_C)), - ptr_D(args.ptr_D), - batch_stride_A(args.batch_stride_A), - batch_stride_B(args.batch_stride_B), - batch_stride_C(args.batch_stride_C), - ptr_gather_A_indices(const_cast(args.ptr_gather_A_indices)), - ptr_gather_B_indices(const_cast(args.ptr_gather_B_indices)), - ptr_scatter_D_indices(const_cast(args.ptr_scatter_D_indices)) {} - - /// Lightweight update given a subset of arguments. - void update(Arguments const& args) { - CUTLASS_TRACE_HOST("GemmUniversal::Params::update()"); - - // Update input/output pointers - ptr_A = const_cast(args.ptr_A); - ptr_B = const_cast(args.ptr_B); - ptr_C = const_cast(args.ptr_C); - ptr_D = args.ptr_D; - - batch_stride_A = args.batch_stride_A; - batch_stride_B = args.batch_stride_B; - batch_stride_C = args.batch_stride_C; - this->batch_stride_D = args.batch_stride_D; - - ptr_gather_A_indices = const_cast(args.ptr_gather_A_indices); - ptr_gather_B_indices = const_cast(args.ptr_gather_B_indices); - ptr_scatter_D_indices = const_cast(args.ptr_scatter_D_indices); - - output_op = args.epilogue; - } - }; - - /// Shared memory storage structure - union SharedStorage { - typename Mma::SharedStorage main_loop; - typename Epilogue::SharedStorage epilogue; - }; - - public: - // - // Host dispatch API - // - - /// Determines whether kernel satisfies alignment - static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) { - CUTLASS_TRACE_HOST("GemmUniversal::can_implement()"); - - static int const kAlignmentA = - (cute::is_same>::value) ? 32 - : (cute::is_same>::value) - ? 64 - : Mma::IteratorA::AccessType::kElements; - static int const kAlignmentB = - (cute::is_same>::value) ? 32 - : (cute::is_same>::value) - ? 
64 - : Mma::IteratorB::AccessType::kElements; - static int const kAlignmentC = - (cute::is_same>::value) ? 32 - : (cute::is_same>::value) - ? 64 - : Epilogue::OutputTileIterator::kElementsPerAccess; - - bool isAMisaligned = false; - bool isBMisaligned = false; - bool isCMisaligned = false; - - if (cute::is_same::value) { - isAMisaligned = problem_size.k() % kAlignmentA; - } else if (cute::is_same::value) { - isAMisaligned = problem_size.m() % kAlignmentA; - } else if (cute::is_same>::value || - cute::is_same>::value) { - isAMisaligned = problem_size.k() % kAlignmentA; - } - - if (cute::is_same::value) { - isBMisaligned = problem_size.n() % kAlignmentB; - } else if (cute::is_same::value) { - isBMisaligned = problem_size.k() % kAlignmentB; - } else if (cute::is_same>::value || - cute::is_same>::value) { - isBMisaligned = problem_size.k() % kAlignmentB; - } - - if (cute::is_same::value) { - isCMisaligned = problem_size.n() % kAlignmentC; - } else if (cute::is_same::value) { - isCMisaligned = problem_size.m() % kAlignmentC; - } else if (cute::is_same>::value || - cute::is_same>::value) { - isCMisaligned = problem_size.n() % kAlignmentC; - } - - if (isAMisaligned) { - CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for A operand"); - return Status::kErrorMisalignedOperand; - } - - if (isBMisaligned) { - CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for B operand"); - return Status::kErrorMisalignedOperand; - } - - if (isCMisaligned) { - CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for C operand"); - return Status::kErrorMisalignedOperand; - } - - CUTLASS_TRACE_HOST(" returning kSuccess"); - - return Status::kSuccess; - } - - static Status can_implement(Arguments const& args) { - return can_implement(args.problem_size); - } - - public: - // - // Device-only API - // - - // Factory invocation - CUTLASS_DEVICE - static void invoke(Params const& params, SharedStorage& shared_storage) { - GemmUniversal op; - op(params, shared_storage); - } - - /// Executes one GEMM - CUTLASS_DEVICE - void operator()(Params const& params, SharedStorage& shared_storage) { - ThreadblockSwizzle threadblock_swizzle; - run_with_swizzle(params, shared_storage, threadblock_swizzle); - } - - /// Executes one GEMM with an externally-provided swizzling function - CUTLASS_DEVICE - void run_with_swizzle(Params const& params, SharedStorage& shared_storage, - ThreadblockSwizzle& threadblock_swizzle) { - cutlass::gemm::GemmCoord threadblock_tile_offset = - threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); - - // Early exit if CTA is out of range - if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || - params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { - return; - } - - int offset_k = 0; - int problem_size_k = params.problem_size.k(); - - ElementA* ptr_A = static_cast(params.ptr_A); - ElementB* ptr_B = static_cast(params.ptr_B); - - // - // Fetch pointers based on mode. 
- // - if (params.mode == GemmUniversalMode::kGemm || - params.mode == GemmUniversalMode::kGemmSplitKParallel) { - if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) { - problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; - } - - offset_k = threadblock_tile_offset.k() * params.gemm_k_size; - } else if (params.mode == GemmUniversalMode::kBatched) { - ptr_A += threadblock_tile_offset.k() * params.batch_stride_A; - ptr_B += threadblock_tile_offset.k() * params.batch_stride_B; - } else if (params.mode == GemmUniversalMode::kArray) { - ptr_A = static_cast( - params.ptr_A)[threadblock_tile_offset.k()]; - ptr_B = static_cast( - params.ptr_B)[threadblock_tile_offset.k()]; - } - - syncthreads(); - - // Compute initial location in logical coordinates - cutlass::MatrixCoord tb_offset_A{ - threadblock_tile_offset.m() * Mma::Shape::kM, - offset_k, - }; - - cutlass::MatrixCoord tb_offset_B{ - offset_k, threadblock_tile_offset.n() * Mma::Shape::kN}; - - // Compute position within threadblock - int thread_idx = ThreadIdxX(); - - // Construct iterators to A and B operands - typename Mma::IteratorA iterator_A( - params.params_A, ptr_A, {params.problem_size.m(), problem_size_k}, - thread_idx, tb_offset_A, params.ptr_gather_A_indices); - - typename Mma::IteratorB iterator_B( - params.params_B, ptr_B, {problem_size_k, params.problem_size.n()}, - thread_idx, tb_offset_B, params.ptr_gather_B_indices); - - // Broadcast the warp_id computed by lane 0 to ensure dependent code - // is compiled as warp-uniform. - int warp_idx = canonical_warp_idx_sync(); - - int lane_idx = ThreadIdxX() % 32; - - // - // Main loop - // - - // Construct thread-scoped matrix multiply - Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); - - typename Mma::FragmentC accumulators; - - accumulators.clear(); - - // Compute threadblock-scoped matrix multiply-add - int gemm_k_iterations = - (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK; - - // Compute threadblock-scoped matrix multiply-add - mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); - - // - // Epilogue - // - - EpilogueOutputOp output_op(params.output_op); - - // - // Masked tile iterators constructed from members - // - - threadblock_tile_offset = - threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); - - // assume identity swizzle - MatrixCoord threadblock_offset( - threadblock_tile_offset.m() * Mma::Shape::kM, - threadblock_tile_offset.n() * Mma::Shape::kN); - - int block_idx = threadblock_tile_offset.m() + - threadblock_tile_offset.n() * params.grid_tiled_shape.m(); - - ElementC* ptr_C = static_cast(params.ptr_C); - ElementC* ptr_D = static_cast(params.ptr_D); - - // - // Fetch pointers based on mode. - // - - // Construct the semaphore. - Semaphore semaphore(params.semaphore + block_idx, thread_idx); - - if (params.mode == GemmUniversalMode::kGemm) { - // If performing a reduction via split-K, fetch the initial - // synchronization - if (params.grid_tiled_shape.k() > 1) { - // Fetch the synchronization lock initially but do not block. 
- semaphore.fetch(); - - // Indicate which position in a serial reduction the output operator is - // currently updating - output_op.set_k_partition(threadblock_tile_offset.k(), - params.grid_tiled_shape.k()); - } - } else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) { - ptr_D += threadblock_tile_offset.k() * params.batch_stride_D; - } else if (params.mode == GemmUniversalMode::kBatched) { - ptr_C += threadblock_tile_offset.k() * params.batch_stride_C; - ptr_D += threadblock_tile_offset.k() * params.batch_stride_D; - } else if (params.mode == GemmUniversalMode::kArray) { - ptr_C = static_cast( - params.ptr_C)[threadblock_tile_offset.k()]; - ptr_D = static_cast( - params.ptr_D)[threadblock_tile_offset.k()]; - } - - // Tile iterator loading from source tensor. - typename Epilogue::OutputTileIterator iterator_C( - params.params_C, ptr_C, params.problem_size.mn(), thread_idx, - threadblock_offset, params.ptr_scatter_D_indices); - - // Tile iterator writing to destination tensor. - typename Epilogue::OutputTileIterator iterator_D( - params.params_D, ptr_D, params.problem_size.mn(), thread_idx, - threadblock_offset, params.ptr_scatter_D_indices); - - Epilogue epilogue(shared_storage.epilogue, thread_idx, warp_idx, lane_idx); - - // Wait on the semaphore - this latency may have been covered by iterator - // construction - if (params.mode == GemmUniversalMode::kGemm && - params.grid_tiled_shape.k() > 1) { - // For subsequent threadblocks, the source matrix is held in the 'D' - // tensor. - if (threadblock_tile_offset.k()) { - iterator_C = iterator_D; - } - - semaphore.wait(threadblock_tile_offset.k()); - } - - // Execute the epilogue operator to update the destination tensor. - epilogue(output_op, iterator_D, accumulators, iterator_C); - - // - // Release the semaphore - // - - if (params.mode == GemmUniversalMode::kGemm && - params.grid_tiled_shape.k() > 1) { - int lock = 0; - if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { - // The final threadblock resets the semaphore for subsequent grids. - lock = 0; - } else { - // Otherwise, the semaphore is incremented - lock = threadblock_tile_offset.k() + 1; - } - - semaphore.release(lock); - } - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace gemm -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_epilogue.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_epilogue.hpp deleted file mode 100644 index bd49242..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_epilogue.hpp +++ /dev/null @@ -1,562 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Functor performing elementwise operations used by epilogues. -*/ - -#pragma once - -#include -#include "cutlass/cutlass.h" -#include "cutlass/epilogue/dispatch_policy.hpp" -// #include "cutlass/epilogue/collective/collective_epilogue.hpp" -#include "cutlass/epilogue/collective/detail.hpp" -#include "cutlass/epilogue/fusion/callbacks.hpp" -#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" -#include "cutlass/epilogue/fusion/xe_visitor_softmax.hpp" -#include "cutlass/detail/layout.hpp" - -#include "cute/tensor.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::epilogue::collective { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -class CollectiveEpilogue { - static_assert(cutlass::detail::dependent_false, - "Could not find an epilogue specialization."); -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::epilogue::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace epilogue { -namespace collective { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -class CollectiveEpilogue { - public: - // - // Type Aliases - // - using DispatchPolicy = IntelXeXMX16Group; - using CtaTileMNK = CtaTileMNK_; - using FusionCallbacks = FusionCallbacks_; - using ElementC = ElementC_; - using ElementAccumulator = ElementC_; - using StrideC = StrideC_; - using InternalStrideC = cute::remove_pointer_t; - using ElementD = ElementD_; - using StrideD = StrideD_; - using InternalStrideD = cute::remove_pointer_t; - using CopyOpG2R = CopyOpG2R_; - using SmemLayoutAtomC = SmemLayoutAtomC_; - using CopyOpS2R = CopyOpS2R_; - using CopyOpR2G = CopyOpR2G_; - using SmemLayoutAtomD = SmemLayoutAtomD_; - using CopyOpR2S = CopyOpR2S_; - - using ThreadEpilogueOp = - typename fusion::FusionCallbacksTraits::Operation; - using GmemTiledCopyC = CopyOpG2R; - using GmemTiledCopyD = cute::conditional_t && - not cute::is_void_v, - CopyOpR2G, XE_2D_U32x8x16_ST_N>; - using ElementOutput = ElementD; - using ElementCompute = ElementAccumulator; - using 
ElementSource = typename FusionCallbacks::ElementSource; - using ElementScalar = typename FusionCallbacks::ElementScalar; - static constexpr FloatRoundStyle RoundStyle = - FloatRoundStyle::round_to_nearest; - - static_assert( - cute::is_same_v< - typename FusionCallbacks::Operation, - fusion::LinearCombination>, - "Only Linear Combination Epilogue is supported for Grouped GEMM at the " - "moment."); - - static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; - - static_assert(cute::rank(CtaTileMNK{}) == 3, - "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); - static_assert(cute::rank(InternalStrideC{}) == 3, - "StrideC must be rank-3: [M, N, L]"); - static_assert(cute::rank(InternalStrideD{}) == 3, - "StrideD must be rank-3: [M, N, L]"); - - static_assert(std::is_same_v, - "Copy operation to shared memory is not supported"); - static_assert(std::is_same_v, - "Copy operation to shared memory is not supported"); - static_assert(std::is_same_v, - "Copy operation to shared memory is not supported"); - static_assert(std::is_same_v, - "Copy operation to shared memory is not supported"); - - using CopyThreadShape = Shape<_1, Int>; - using Trait_C = Copy_Traits; - using XE_Copy_C = decltype(make_tiled_copy( - Copy_Atom{}, Layout{}, - make_layout( - shape_div(typename Trait_C::BlockShape{}, CopyThreadShape{})))); - using Trait_D = Copy_Traits; - using XE_Copy_D = decltype(make_tiled_copy( - Copy_Atom{}, Layout{}, - make_layout( - shape_div(typename Trait_D::BlockShape{}, CopyThreadShape{})))); - - private: - // constexpr static bool is_source_supported = not cute::is_void_v; - constexpr static bool is_source_supported = false; - constexpr static bool is_destination_supported = - not cute::is_void_v && not cute::is_void_v; - - public: - using EmptyType = cute::tuple<>; - using SmemCStorage = EmptyType; - using SmemDStorage = EmptyType; - - struct TensorStorageImpl : cute::tuple { - using FusionStorage = typename FusionCallbacks::SharedStorage; - FusionStorage thread; - }; - - struct SharedStorage { - using TensorStorage = TensorStorageImpl; - - TensorStorage tensors; - }; - using TensorStorage = typename SharedStorage::TensorStorage; - - using TensorC = - decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), - make_shape(0, 0, 0), InternalStrideC{})); //(m, n) - using TensorD = - decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), - make_shape(0, 0, 0), InternalStrideD{})); //(m, n) - using EpilogueTensors = cute::tuple; - - // Host side epilogue arguments - struct Arguments { - typename FusionCallbacks::Arguments thread{}; - ElementC const** ptr_C; - StrideC dC; - ElementD** ptr_D; - StrideD dD; - }; - - // Device side epilogue params - struct Params { - typename FusionCallbacks::Params thread{}; - XE_Copy_C xe_load_c; - XE_Copy_D xe_store_d; - ElementC const** ptr_C; - StrideC dC; - ElementD** ptr_D; - StrideD dD; - }; - - // - // Methods - // - - template - static constexpr Params to_underlying_arguments( - ProblemShape const& problem_shape, Arguments const& args, - [[maybe_unused]] void* workspace) { - // Optionally append 1s until problem shape is rank-4 in case its is only - // rank-3 (MNK) - auto problem_shape_MNL = repeat_like( - typename ProblemShape::UnderlyingProblemShape{}, int32_t(1)); - auto [M, N, L] = problem_shape_MNL; - - XE_Copy_C xe_load_c = {}; - if constexpr (is_source_supported) { - ElementC const* ptr_C_first_batch = - reinterpret_cast(args.ptr_C); - TensorC mC_mnl = - make_tensor(make_gmem_ptr(ptr_C_first_batch), - make_layout(make_shape(M, N, L), 
InternalStrideC{})); - xe_load_c = {xe_load_c.with(mC_mnl)}; - } - - XE_Copy_D xe_store_d = {}; - if constexpr (is_destination_supported) { - ElementD* ptr_D_first_batch = reinterpret_cast(args.ptr_D); - TensorD mD_mnl = - make_tensor(make_gmem_ptr(ptr_D_first_batch), - make_layout(make_shape(M, N, L), InternalStrideD{})); - xe_store_d = {xe_store_d.with(mD_mnl)}; - } - - return {FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, - workspace), - xe_load_c, - xe_store_d, - args.ptr_C, - args.dC, - args.ptr_D, - args.dD}; - } - - template - static size_t get_workspace_size(ProblemShape const& problem_shape, - Arguments const& args) { - return 0; - } - - template - static cutlass::Status initialize_workspace( - ProblemShape const& problem_shape, Arguments const& args, void* workspace, - cudaStream_t stream, CudaHostAdapter* cuda_adapter = nullptr) { - return Status::kSuccess; - } - - template - static bool can_implement(ProblemShape problem_shape, Arguments const& args) { - constexpr int copy_alignment_bits = 128; - constexpr int batch_alignment_bits = 512; - - bool implementable = true; - bool fusion_implementable = true; - - for (int i = 0; i < problem_shape.groups(); ++i) { - auto problem_shape_MNKL = - append<4>(problem_shape.get_host_problem_shape(i), 1); - auto [M, N, K, L] = problem_shape_MNKL; - - if constexpr (is_destination_supported) { - constexpr int min_aligned_elements_D = - copy_alignment_bits / sizeof_bits::value; - implementable &= - cutlass::detail::check_alignment( - cute::make_shape(M, N, L), InternalStrideD{}); - if (L > 1) { - constexpr int min_batch_aligned_elements_D = - batch_alignment_bits / sizeof_bits::value; - implementable &= - get<2>(InternalStrideD{}) % min_batch_aligned_elements_D == 0; - } - } - - if constexpr (is_source_supported) { - constexpr int min_aligned_elements_C = - copy_alignment_bits / sizeof_bits::value; - implementable &= - cutlass::detail::check_alignment( - cute::make_shape(M, N, L), InternalStrideC{}); - if (L > 1) { - constexpr int min_batch_aligned_elements_C = - batch_alignment_bits / sizeof_bits::value; - implementable &= - get<2>(InternalStrideC{}) % min_batch_aligned_elements_C == 0; - } - } - - fusion_implementable = - fusion_implementable && - FusionCallbacks::can_implement(problem_shape_MNKL, args.thread); - } - - if (!implementable) { - CUTLASS_TRACE_HOST( - " CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment " - "requirements for XE 2D copy.\n"); - } - - if (!fusion_implementable) { - CUTLASS_TRACE_HOST( - " CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements " - "for FusionCallbacks.\n"); - } - - return implementable && fusion_implementable; - } - - CUTLASS_HOST_DEVICE - CollectiveEpilogue(Params const& params_, - TensorStorage const& shared_storage_) - : params(params_), - fusion_callbacks(params_.thread, shared_storage_.thread) {} - - CUTLASS_DEVICE - bool is_producer_load_needed() const { - return fusion_callbacks.is_producer_load_needed(); - } - - template - CUTLASS_DEVICE void operator()(ProblemShapeMNKL problem_shape_mnkl, - TileShapeMNK tile_shape_MNK, - TileCoordMNKL tile_coord_mnkl, - Accumulator accumulators, TiledMma tiled_mma, - int thread_idx, - LoadStoreTensor const& load_store_tensors) { - (void)tiled_mma; - using namespace cute; - - static_assert(cute::rank(CtaTileMNK{}) == 3, - "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); - static_assert(cute::rank(InternalStrideC{}) == 3, - "StrideC must be rank-3: [M, N, L]"); - static_assert(cute::rank(InternalStrideD{}) == 
3, - "StrideD must be rank-3: [M, N, L]"); - - using MmaAtomShape = typename TiledMma::AtomShape_MNK; - static constexpr auto BLK_M = get<0>(CtaTileMNK{}); - static constexpr auto BLK_N = get<1>(CtaTileMNK{}); - static constexpr auto BLK_K = get<2>(CtaTileMNK{}); - // static_assert(is_same_v, - // "assertion fail"); - static constexpr auto ATOM_M = - get<1>(typename TiledMma::ThrLayoutVMNK{}.shape()); - static constexpr auto ATOM_N = - get<2>(typename TiledMma::ThrLayoutVMNK{}.shape()); - static constexpr auto ATOM_K = - get<3>(typename TiledMma::ThrLayoutVMNK{}.shape()); - - static_assert( - BLK_M % ATOM_M == 0 && BLK_N % ATOM_N == 0 && BLK_K % ATOM_K == 0, - "expected CTATileMNK to be evenly divided by TiledMma::ThrLayoutVMNK"); - static constexpr auto SG_M = BLK_M / ATOM_M; - static constexpr auto SG_N = BLK_N / ATOM_N; - static constexpr auto SG_K = BLK_K / ATOM_K; - using SubgroupTileShape = - Shape; - - static constexpr int FragsM = - get<0>(SubgroupTileShape{}) / - get<0>(MmaAtomShape()); // A frags per sub_group - static constexpr int FragsN = - get<1>(SubgroupTileShape{}) / - get<1>(MmaAtomShape()); // B frags per sub_group - - static constexpr int FragmentSize = - (get<0>(MmaAtomShape()) * get<1>(MmaAtomShape())) / SubgroupSize; - - // Indexing variables - auto [M, N, K, L] = problem_shape_mnkl; - auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; - auto m_sg = get_sub_group_id() / ATOM_N; - auto n_sg = get_sub_group_id() % ATOM_N; - - // Get the layout and reconstruct the MN mapping equivalent to the old - // get_layoutS_MN() - auto layoutS_TV = params.xe_store_d.get_layoutS_TV(); - auto mn_shape = shape(typename decltype(params.xe_store_d)::Tiler_MN{}); - auto layoutS_MN = right_inverse(layoutS_TV).with_shape(mn_shape); - using EpilogueTile = decltype(layoutS_MN.shape()); - - auto sg_local_m_coord = get_sub_group_id() / ATOM_N; - auto sg_local_n_coord = get_sub_group_id() % ATOM_N; - - auto sg_m_coord = m_coord * ATOM_M + sg_local_m_coord; - auto sg_n_coord = n_coord * ATOM_N + sg_local_n_coord; - auto sg_coord = make_coord(sg_m_coord, sg_n_coord, k_coord, l_coord); - - bool is_C_load_needed = - is_source_supported && fusion_callbacks.is_C_load_needed(); - - // Represent the full output tensor - Tensor mD_mnl = cute::get_xe_tensor(make_shape(M, N, L)); - - // Tile the output tensor per WG and select the tile for current WG - Tensor g_wg_D = - local_tile(mD_mnl, take<0, 2>(CtaTileMNK{}), - make_coord(m_coord, n_coord, l_coord)); // (BLK_M,BLK_N) - - // Tile the output tensor per SG and select tile for the current SG - Tensor gD = local_tile(g_wg_D, take<0, 2>(SubgroupTileShape{}), - make_coord(m_sg, n_sg)); // (SG_M,SG_N) - - auto thread_xe_store_d = params.xe_store_d.get_thread_slice(thread_idx); - Tensor tCgD = thread_xe_store_d.partition_D(gD); - - Tensor trC = - make_tensor(Shape>{}); - Tensor trD_compute = - make_tensor(Shape>{}); - - // Because Sm90 uses shared memory, they are not tied to using the same - // accumulator values for MMA and Epilogue. But because we are operating - // directly in the accumulators, we need to be sure that we are operating on - // the same values. 
- ThrCopy thread_g2r = params.xe_load_c.get_slice(thread_idx); - - // OOB predication for tile quantization "residue" - // Absolute coordinate tensors (dynamic) - Tensor mD_crd = make_identity_tensor(make_shape(M, N)); // (M,N) - Tensor cD = local_tile(mD_crd, take<0, 2>(SubgroupTileShape{}), - make_coord(sg_m_coord, sg_n_coord)); - Tensor cD_mn = local_tile(mD_crd, take<0, 2>(CtaTileMNK{}), - make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) - Tensor tRS_cD_mn = thread_g2r.partition_S( - flat_divide(cD_mn, EpilogueTile{})); // (G2R,G2R_M,G2R_N,EPI_M,EPI_N) - Tensor tRS_cD = - make_coord_tensor(tRS_cD_mn.layout()); // (G2R,G2R_M,G2R_N,EPI_M,EPI_N) - - // Get the fusion callbacks - // Arguments passed here relate to sub-group tiles, rather than CTA - // (work-group) tiles - constexpr bool RefSrc = true; - auto residue_mn = make_coord(M, N); // TODO(Codeplay): this is not correct - auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ - problem_shape_mnkl, - SubgroupTileShape{}, - sg_coord, - tiled_mma, - EpilogueTile{}, - params.xe_store_d, - cD, - residue_mn, - tRS_cD, - residue_mn, - trC, - thread_idx, - }; - auto cst_callbacks = - fusion_callbacks.template get_consumer_store_callbacks( - cst_args); - - cst_callbacks.begin(); - - auto acc_frag = recast>(accumulators); - auto trD_compute_frag = - recast>(trD_compute); - - Tensor trD = make_tensor(Shape>{}); - auto trD_frag = recast>(trD); - - constexpr int ValuesLoaded = FragsM * FragsN * FragmentSize * SubgroupSize * - ATOM_M * ATOM_N * ATOM_K; - constexpr int MN = get<0>(CtaTileMNK{}) * get<1>(CtaTileMNK{}); - static_assert( - ValuesLoaded == MN, - "the total elements loaded by all threads should be the same as MxN"); - - auto synchronize = [&]() {}; - CUTLASS_PRAGMA_UNROLL - for (int epi_n = 0; epi_n < FragsN; epi_n++) { - CUTLASS_PRAGMA_UNROLL - for (int epi_m = 0; epi_m < FragsM; epi_m++) { - if (is_C_load_needed) { - // coordinates for C and D are the same - copy(params.xe_load_c.with(get<0>(load_store_tensors)), - tCgD(_, epi_m, epi_n), trC); - } - - cst_callbacks.previsit(epi_m, epi_n, 0, is_C_load_needed); - - auto acc_frag_mn = acc_frag(_, epi_m, epi_n); - - CUTLASS_PRAGMA_UNROLL - for (int epi_v = 0; epi_v < size<0>(trD_compute_frag); ++epi_v) { - trD_compute_frag(epi_v) = - cst_callbacks.visit(acc_frag_mn(epi_v), epi_v, epi_m, epi_n); - } - cst_callbacks.reduce(nullptr, synchronize, epi_m, epi_n, - (epi_m == FragsM - 1 && epi_n == FragsN - 1), - trD_compute_frag); - - if constexpr (is_destination_supported) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(trD_compute_frag); ++i) { - trD_frag(i) = - cutlass::NumericArrayConverter{}( - trD_compute_frag(i)); - } - copy(params.xe_store_d.with(get<1>(load_store_tensors)), trD, - tCgD(_, epi_m, epi_n)); - } - } - } - - cst_callbacks.end(); - } - - template - CUTLASS_DEVICE auto update_tensor_shape_stride( - int32_t const& next_group, ProblemShape_MNKL const& problem_shape_mnkl) { - auto [M, N, K, L] = problem_shape_mnkl; - - TensorC mC_mnl; - TensorD mD_mnl; - if constexpr (is_source_supported) { - ElementC const* ptr_C_curr_batch = - reinterpret_cast(params.ptr_C[next_group]); - mC_mnl = - make_tensor(make_gmem_ptr(ptr_C_curr_batch), - make_layout(make_shape(M, N, L), params.dC[next_group])); - } - - if constexpr (is_destination_supported) { - ElementD* ptr_D_curr_batch = - reinterpret_cast(params.ptr_D[next_group]); - mD_mnl = - make_tensor(make_gmem_ptr(ptr_D_curr_batch), - make_layout(make_shape(M, N, L), params.dD[next_group])); - } - return 
cute::make_tuple(mC_mnl, mD_mnl); - } - - private: - Params const& params; - FusionCallbacks fusion_callbacks; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace collective -} // namespace epilogue -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_mma.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_mma.hpp deleted file mode 100644 index a2abb4b..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_mma.hpp +++ /dev/null @@ -1,360 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/gemm/dispatch_policy.hpp" - -#include "cute/algorithm/functional.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cute/algorithm/gemm.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { -using namespace cute; -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct CollectiveMma, TileShape_, - ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, - GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, - TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, - SmemCopyAtomB_, TransformB_> { - // - // Type Aliases - // - using DispatchPolicy = MainloopIntelXeXMX16Group; - using WorkgroupTileShape = TileShape_; - using ElementA = ElementA_; - using StrideA = StrideA_; - using InternalStrideA = cute::remove_pointer_t; - using ElementB = ElementB_; - using StrideB = StrideB_; - using InternalStrideB = cute::remove_pointer_t; - using TiledMma = TiledMma_; - using ElementAccumulator = typename TiledMma::ValTypeC; - using GmemTiledCopyA = GmemTiledCopyA_; - using GmemTiledCopyB = GmemTiledCopyB_; - using SmemLayoutAtomA = SmemLayoutAtomA_; - using SmemLayoutAtomB = SmemLayoutAtomB_; - using SmemCopyAtomA = SmemCopyAtomA_; - using SmemCopyAtomB = SmemCopyAtomB_; - using TransformA = TransformA_; - using TransformB = TransformB_; - using ArchTag = typename DispatchPolicy::ArchTag; - - static_assert( - platform::is_same::value, - "MainloopIntelXeXMX16Array requires that A and B have same type."); - - static_assert(std::is_same_v, - "Transformation for A is not currently supported on Intel PVC"); - static_assert(std::is_same_v, - "Transformation for B is not currently supported on Intel PVC"); - - static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; - - using MmaAtomShape = typename TiledMma::AtomShape_MNK; - - static constexpr auto BLK_M = get<0>(WorkgroupTileShape{}); - static constexpr auto BLK_N = get<1>(WorkgroupTileShape{}); - static constexpr auto BLK_K = get<2>(WorkgroupTileShape{}); - - static constexpr auto ATOM_M = - get<1>(typename TiledMma::ThrLayoutVMNK{}.shape()); - static constexpr auto ATOM_N = - get<2>(typename TiledMma::ThrLayoutVMNK{}.shape()); - static constexpr auto ATOM_K = - get<3>(typename TiledMma::ThrLayoutVMNK{}.shape()); - - static constexpr auto SG_M = ceil_div(BLK_M, ATOM_M); - static constexpr auto SG_N = ceil_div(BLK_N, ATOM_N); - static constexpr auto SG_K = ceil_div(BLK_K, ATOM_K); - using SubgroupTileShape = - Shape; - - static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K; - static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}); - - using Copy_A = typename Copy_Traits< - GmemTiledCopyA, InternalStrideA>::template DefaultTiledCopy; - using Copy_B = typename Copy_Traits< - GmemTiledCopyB, InternalStrideB>::template DefaultTiledCopy; - - using TensorMKL = - decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), - make_shape(0, 0, 0), InternalStrideA{})); //(m, k) - using TensorNKL = - decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), - make_shape(0, 0, 0), InternalStrideB{})); //(n, k) - using MainloopTensors = cute::tuple; - // Host side kernel arguments - struct Arguments { - ElementA const** ptr_A; - StrideA dA; - ElementB const** ptr_B; - StrideB dB; - }; - - struct Params { - ElementA const** ptr_A; - StrideA dA; 
- ElementB const** ptr_B; - StrideB dB; - }; - - // - // Methods - // - - CollectiveMma() = default; - - template - static constexpr Params to_underlying_arguments( - ProblemShape const& problem_shape, Arguments const& args, - void* workspace) { - (void)workspace; - - auto problem_shape_MNK = repeat_like( - typename ProblemShape::UnderlyingProblemShape{}, int32_t(1)); - ; - auto init_M = get<0>(problem_shape_MNK); - auto init_N = get<1>(problem_shape_MNK); - auto init_K = get<2>(problem_shape_MNK); - - return Params{args.ptr_A, args.dA, args.ptr_B, args.dB}; - } - - template - static bool can_implement(ProblemShape problem_shapes, - Arguments const& args) { - constexpr int copy_alignment_bits = 128; - constexpr int batch_alignment_bits = 512; - auto problem_shape_MNKL = append<4>(problem_shapes, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - bool implementable = true; - - constexpr int min_aligned_elements_A = - copy_alignment_bits / sizeof_bits::value; - constexpr int min_aligned_elements_B = - copy_alignment_bits / sizeof_bits::value; - constexpr int min_batch_aligned_elements_A = - batch_alignment_bits / sizeof_bits::value; - constexpr int min_batch_aligned_elements_B = - batch_alignment_bits / sizeof_bits::value; - for (int i = 0; i < problem_shapes.groups(); i++) { - auto problem_shape_MNKL = - append<4>(problem_shapes.get_host_problem_shape(i), 1); - auto [M, N, K, L] = problem_shape_MNKL; - - implementable &= cutlass::detail::check_alignment( - cute::make_shape(M, K, L), InternalStrideA{}); - implementable &= cutlass::detail::check_alignment( - cute::make_shape(N, K, L), InternalStrideB{}); - - if (L > 1) { - implementable &= - get<2>(InternalStrideA{}) % min_batch_aligned_elements_A == 0; - implementable &= - get<2>(InternalStrideB{}) % min_batch_aligned_elements_B == 0; - } - } - - if (!implementable) { - CUTLASS_TRACE_HOST( - " CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment " - "requirements for XE 2D copy.\n"); - } - - return implementable; - } - - /// Perform a subgroup-scoped matrix multiply-accumulate - template - CUTLASS_DEVICE void operator()(FrgTensorD& accum, TensorA gA, TensorB gB, - FrgTensorC const& src_accum, - KTileIterator k_tile_iter, - int const& k_tile_count, - BlkCoord const& blk_coord, int const& K_start, - int const& thread_idx, Params const& mainloop, - LoadTensors const& load_tensors) { - static_assert(is_rmem::value, - "D tensor must be rmem resident."); - static_assert(is_rmem::value, - "C tensor must be rmem resident."); - - (void)thread_idx; - - Copy_A tiled_copy_a{Copy_A{}.with(get<0>(load_tensors))}; - Copy_B tiled_copy_b{Copy_B{}.with(get<1>(load_tensors))}; - - auto thr_copy_A = tiled_copy_a.get_slice(thread_idx); - auto thr_copy_B = tiled_copy_b.get_slice(thread_idx); - - // Instantiate the MMA object and get thread slice - TiledMma tiled_mma; - // TODO(Codeplay): see if we can make this nicer - // To make all work items in a subgroup have the same global tensors pass in - // the index of work item 0 in each subgroup - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto first_thread_in_sg_idx = - sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; - auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); - - // Partition global counting tensors for MMA - Tensor tCgA = thr_mma.partition_A(gA); - Tensor tCgB = thr_mma.partition_B(gB); - - Tensor tCrA = make_tensor( - make_fragment_layout(tiled_copy_a, tCgA(_, _, _, 0).shape())); - Tensor tCrB = make_tensor( - make_fragment_layout(tiled_copy_b, tCgB(_, _, _, 
0).shape())); - - // Retile registers for copies - Tensor tArA = thr_copy_A.retile_D(tCrA); - Tensor tBrB = thr_copy_B.retile_D(tCrB); - - // Retile global counting tensors for copies - Tensor tAgA = thr_copy_A.retile_S(tCgA); - Tensor tBgB = thr_copy_B.retile_S(tCgB); - - auto tiled_prefetch_a = - cute::prefetch_selector, Int>, Num_SGs>( - tiled_copy_a); - auto tiled_prefetch_b = - cute::prefetch_selector, Int>, Num_SGs>( - tiled_copy_b); - auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx); - auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); - - // Partition global tile for prefetch - auto pAgA = thr_prefetch_A.partition_S(gA); - auto pBgB = thr_prefetch_B.partition_S(gB); - -#if CUTLASS_ENABLE_DEBUG_PRINTS - if (cutlass::thread(LOG_THREAD, LOG_GROUP)) { - print("======================= A: \n"); - print(" gA : "); - print(gA); - print("\n"); - print("tCgA : "); - print(tCgA); - print("\n"); - print("tAgA : "); - print(tAgA); - print("\n"); - - print("===================== B :\n"); - print(" gB : "); - print(gB); - print("\n"); - print("tCgB : "); - print(tCgB); - print("\n"); - print("tBgB : "); - print(tBgB); - print("\n"); - - print("===================== Config: \n"); - print(" threads per workgroup : "); - print(MaxThreadsPerBlock); - print("\n"); - print(" SubgroupTileShape : "); - print(SubgroupTileShape{}); - print("\n"); - } -#endif - - // - // Mainloop - // - const auto k_start_idx = crd2idx((*k_tile_iter), make_shape(K_start)); - constexpr int barrier_scope = 2; - int prefetch_k = k_start_idx; - - CUTLASS_PRAGMA_UNROLL - for (; prefetch_k < DispatchPolicy::Stages; prefetch_k++) { - prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); - prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); - } - - for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; - k_tile++, prefetch_k++) { - barrier_arrive(barrier_scope); - // Copy gmem to rmem for the first k_tile - copy(tiled_copy_a, tAgA(_, _, _, k_tile), tArA); - copy(tiled_copy_b, tBgB(_, _, _, k_tile), tBrB); - - if (prefetch_k < k_tile_count) { - prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); - prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); - } - - cute::gemm(tiled_mma, tCrA, tCrB, accum); - barrier_wait(barrier_scope); - } - } - - template - CUTLASS_DEVICE auto update_tensor_shape_stride( - Params const& mainloop_params, int32_t const& next_group, - ProblemShape_MNKL const& problem_shape_mnkl) { - const int32_t M = get<0>(problem_shape_mnkl); - const int32_t N = get<1>(problem_shape_mnkl); - const int32_t K = get<2>(problem_shape_mnkl); - - ElementA const* ptr_A_curr_batch = - reinterpret_cast(mainloop_params.ptr_A[next_group]); - ElementB const* ptr_B_curr_batch = - reinterpret_cast(mainloop_params.ptr_B[next_group]); - - Tensor mA = make_tensor(make_gmem_ptr(ptr_A_curr_batch), - make_shape(M, K, (int32_t)1), - mainloop_params.dA[next_group]); - Tensor mB = make_tensor(make_gmem_ptr(ptr_B_curr_batch), - make_shape(N, K, (int32_t)1), - mainloop_params.dB[next_group]); - - return cute::make_tuple(mA, mB); - } -}; - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/xe_builder.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/xe_builder.hpp deleted file mode 100644 index ca749c3..0000000 --- a/csrc/xpu/cutlass_kernels/collective/gemm/xe_builder.hpp +++ /dev/null @@ -1,234 +0,0 @@ 
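Editor's note on the deleted grouped mainloop above: `operator()` primes a prefetch window of `DispatchPolicy::Stages` k-tiles, then keeps that window running ahead of the copy/MMA pair, with split barriers (`barrier_arrive` / `barrier_wait`, scope 2) bracketing each iteration, while `update_tensor_shape_stride` rebuilds the A/B global tensors from the per-group pointer and stride arrays (`ptr_A[next_group]`, `dA[next_group]`) whenever the scheduler moves to the next group. The snippet below is a minimal, library-free sketch of just the k-loop scheduling, assuming a fixed prefetch distance; the helper names (`prefetch_tile`, `load_tile`, `mma_tile`) are placeholders, not CUTLASS, CuTe, or SYCL APIs, and the barriers are elided.

// Illustration only: software-pipelined k-loop with a fixed prefetch distance,
// mirroring the structure of the deleted mainloop (prime, then loop with a
// guarded prefetch each iteration). Host-side stand-in, not device code.
#include <cstdio>

namespace {
constexpr int kStages = 3;  // stands in for DispatchPolicy::Stages

void prefetch_tile(int k) { std::printf("prefetch k=%d\n", k); }  // issue A/B prefetch
void load_tile(int k)     { std::printf("load     k=%d\n", k); }  // gmem -> register copy
void mma_tile(int k)      { std::printf("mma      k=%d\n", k); }  // accumulate on loaded tile
}  // namespace

int main() {
  const int k_tile_count = 8;
  const int k_start_idx = 0;

  // Prime the pipeline: issue the first `kStages` prefetches before any compute.
  int prefetch_k = k_start_idx;
  for (; prefetch_k < kStages; ++prefetch_k) {
    prefetch_tile(prefetch_k);
  }

  // Steady state: load the current tile, keep the prefetch window ahead while
  // tiles remain, then multiply-accumulate on the freshly loaded fragments.
  for (int k_tile = k_start_idx; k_tile < k_start_idx + k_tile_count;
       ++k_tile, ++prefetch_k) {
    load_tile(k_tile);
    if (prefetch_k < k_tile_count) {
      prefetch_tile(prefetch_k);
    }
    mma_tile(k_tile);
  }
  return 0;
}

In the deleted kernel the same structure runs once per subgroup, and `can_implement` gates the launch by checking 128-bit copy alignment and, for batched shapes, 512-bit batch-stride alignment for every group in the problem set.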
-/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - *POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include -#include // cute::DefaultCopy -#include // cute::is_base_of_v -// #include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "xe_array_epilogue.hpp" -#include "xe_callbacks.hpp" -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::epilogue::collective { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// Used to specify epilogue subtile shape or dispatch to automatic computation -// of subtile shape -struct EpilogueTileAuto {}; - -// Used to let the builder pick the epilogue schedule automatically. 
-// Can be overridden with kernel schedule tags in -// cutlass/gemm/dispatch_policy.hpp -struct EpilogueScheduleAuto {}; - -template < - class ArchTag, class OpClass, class TileShape_MNK, class ClusterShape_MNK, - class EpilogueTileType, class ElementAccumulator, class ElementCompute, - class ElementC, class GmemLayoutTagC, int AlignmentC, class ElementD, - class GmemLayoutTagD, int AlignmentD, class EpilogueScheduleType, - class FusionOpOrCallbacks = cutlass::epilogue::fusion::LinearCombination< - ElementD, ElementCompute, ElementC, ElementCompute>, - class Enable = void> -struct CollectiveBuilder { - static_assert(cutlass::detail::dependent_false, - "Could not build a collective epilogue for given parameters."); -}; - -// helper sub-builder for epilogue fusion callbacks (for internal use by -// CollectiveBuilder only) -namespace detail { - -// callbacks builder with operation tag -template -struct CallbacksBuilder { - using Callbacks = fusion::FusionCallbacks; -}; - -// callbacks builder with callbacks passthrough -template -struct CallbacksBuilder>> { - using Callbacks = FusionCallbacks; -}; - -} // namespace detail - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::epilogue::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::epilogue::collective { - -namespace detail { -template -struct FusionOpInfo { - static_assert(cutlass::detail::dependent_false, - "Could not find a builder specialization."); -}; - -template -struct FusionOpInfo> { - constexpr static bool HasBuilder = true; - - template - using FusionCallbacks = cutlass::epilogue::fusion::FusionCallbacks< - DispatchPolicy, - cutlass::epilogue::fusion::LinearCombination, - TileShape_MNK, EpilogueTile>; -}; - -template