ROCm
diff --git a/projects/composablekernel/include/ck_tile/core/arch/mma/amdgcn_mma.hpp b/projects/composablekernel/include/ck_tile/core/arch/mma/amdgcn_mma.hpp
Lines changed: 7 additions & 6 deletions
diff --git a/projects/composablekernel/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp b/projects/composablekernel/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp
Lines changed: 6 additions & 4 deletions
diff --git a/projects/composablekernel/include/ck_tile/core/arch/mma/mfma/mfma_selector.hpp b/projects/composablekernel/include/ck_tile/core/arch/mma/mfma/mfma_selector.hpp
Lines changed: 46 additions & 49 deletions
@@ -17,7 +17,7 @@ namespace ck_tile::core::arch::mma {
 // TODO: Describe layout params.
 /**
  *  @class  amdgcn_mma_base
- *  @brief  Helper base class for amdgcn_mma structs to avoid a lot of code duplication. Also puts
+ *  @brief  Base class for amdgcn_mma structs to avoid a lot of code duplication. Also puts
  *          all generic parameter derivations and static asserts in one place. Houses all of the
  *          amdgcn struct types and variables, except for the exec() function.
  */
@@ -127,14 +127,13 @@ concept MmaOpI = requires(MmaOp op) {
  *  @tparam ADataType Datatype of input A
  *  @tparam BDataType Datatype of input B
  *  @tparam CDataType Datatype of accumulator
- *  @tparam FragM M-dimension of mma block
- *  @tparam FragN N-dimension of mma block
- *  @tparam FragK K-dimension of mma block
+ *  @tparam FragM M-dimension of mma intrinsic
+ *  @tparam FragN N-dimension of mma intrinsic
+ *  @tparam FragK K-dimension of mma intrinsic
  *  @tparam CtrlFlags Control flags for mma operation
  *  @tparam CompilerTarget The current compiler target
  *  @tparam Enabler SFINAE enabler
  */
-// clang-format off
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
@@ -145,17 +144,19 @@ template <typename ADataType,
           typename CompilerTarget,
           MmaOpFamily OpFamily_,
           typename Enabler = void>
+// clang-format off
 struct amdgcn_mma : amdgcn_mma_base<fp32_t, fp32_t, fp32_t, 1u, 1u, 1u, 1u, 1, 1, 1, 1, 1, 1, 1, Unsupported, MmaOpFamily::UNDEFINED>
+// clang-format on
 {
     // This is a default pass-through implementation that doesn't do anything practical.
     CK_TILE_DEVICE static CVecType const&
     exec(AVecType const& regsA, BVecType const& regsB, CVecType const& regsC)
     {
+        printf("[WARNING] Running amdgcn_mma dummy exec function!\n");
         ignore(regsA, regsB);
         return regsC; // No-op, just return C
     }
 };
-// clang-format on
 
 } // namespace ck_tile::core::arch::mma
 #pragma clang diagnostic pop
 
@@ -53,17 +53,18 @@ concept CtrlFlagsGfx9I = requires(CtrlFlags ctrlFlags) {
  * @brief Specialization of amdgcn_mma for MFMA on GFX9 targets
  *
  * This specialization implements the MFMA instruction for fp16_t A and B
- * matrices, and fp32_t accumulator matrix, with 16x16x16 block sizes.
+ * matrices, and fp32_t accumulator matrix, with 16x16x16 fragment sizes.
  *
  * @tparam CtrlFlags Control flags for the MFMA operation
  * @tparam CompilerTarget Current compiler target
  */
 // TODO: c++20 template <CtrlFlagsGfx9I CtrlFlags, amdgcn_target CompilerTarget>
 // TODO: c++20 requires
-// clang-format off
 template <typename CtrlFlags, typename CompilerTarget>
+// clang-format off
 struct amdgcn_mma<fp16_t, fp16_t, fp32_t, 16u, 16u, 16u, CtrlFlags, CompilerTarget, MmaOpFamily::DENSE, enable_if_target_family_gfx9_t<CompilerTarget>>
 : amdgcn_mma_base<fp16_t, fp16_t, fp32_t, 16u, 16u, 16u, 64u, 4, 1, 1, 1, 1, 4, 1, MfmaOp, MmaOpFamily::DENSE>
+// clang-format on
 {
     CK_TILE_DEVICE static auto
     exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType
@@ -82,16 +83,18 @@ struct amdgcn_mma<fp16_t, fp16_t, fp32_t, 16u, 16u, 16u, CtrlFlags, CompilerTarg
  * @brief Specialization of amdgcn_mma for MFMA on GFX950 targets
  *
  * This specialization implements the MFMA instruction for fp16_t A and B
- * matrices, and fp32_t accumulator matrix, with 16x16x32 block sizes.
+ * matrices, and fp32_t accumulator matrix, with 16x16x32 fragment sizes.
  *
  * @tparam CtrlFlags Control flags for the MFMA operation
  * @tparam CompilerTarget Current compiler target
  */
 // TODO: c++20 template <CtrlFlagsGfx9I CtrlFlags, amdgcn_target CompilerTarget>
 // TODO: c++20 requires
 template <typename CtrlFlags, typename CompilerTarget>
+// clang-format off
 struct amdgcn_mma<fp16_t, fp16_t, fp32_t, 16u, 16u, 32u, CtrlFlags, CompilerTarget, MmaOpFamily::DENSE, enable_if_target_id_t<CompilerTarget, amdgcn_target_id::GFX950>>
 : amdgcn_mma_base<fp16_t, fp16_t, fp32_t, 16u, 16u, 32u, 64u, 8, 1, 1, 1, 1, 4, 1, MfmaOp, MmaOpFamily::DENSE>
+// clang-format on
 {
     CK_TILE_DEVICE static auto
     exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType
@@ -104,6 +107,5 @@ struct amdgcn_mma<fp16_t, fp16_t, fp32_t, 16u, 16u, 32u, CtrlFlags, CompilerTarg
                                                        static_cast<int>(CtrlFlags::Blgp))};
     }
 };
-// clang-format on
 
 } // namespace ck_tile::core::arch::mma
@@ -18,28 +18,28 @@ namespace ck_tile::core::arch::mma {
  * @class MfmaDefaultSelector
  * @brief Implements a default MFMA selector strategy for gfx9 target architectures.
  * This implements the K dimension search strategy to find the largest supported MFMA
- * instruction for the given M/N block sizes and datatypes.
+ * instruction for the given M/N chunk sizes and datatypes.
  * If no supported instruction is found, falls back to an unsupported pass-through
  * implementation.
  * @tparam ADataType Data type of matrix A
  * @tparam BDataType Data type of matrix B
  * @tparam CDataType Data type of the accumulator
- * @tparam FragM Block M dimension size
- * @tparam FragN Block N dimension size
- * @tparam FragKTest Current Block K dimension size to test
+ * @tparam ChunkM Chunk M dimension size
+ * @tparam ChunkN Chunk N dimension size
+ * @tparam ChunkKTest Current Chunk K dimension size to test
  * @tparam CompilerTarget The compiler target
- * @note Here we assume that FragKTest is always a power-of-two integer.
- *       The search strategy starts from a maximum FragKTest size down to 1u by halving
+ * @note Here we assume that ChunkKTest is always a power-of-two integer.
+ *       The search strategy starts from a maximum ChunkKTest size down to 1u by halving
  *       each time.
  */
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t FragM,
-          uint32_t FragN,
-          uint32_t FragKTest,
+          uint32_t ChunkM,
+          uint32_t ChunkN,
+          uint32_t ChunkKTest,
           typename CompilerTarget> // TODO: c++20 amdgcn_target_arch_id CompilerTarget>
-// TODO: c++20 requires(is_gfx9_arch_id(CompilerTarget) && is_power_of_two_integer(FragKTest))
+// TODO: c++20 requires(is_gfx9_arch_id(CompilerTarget) && is_power_of_two_integer(ChunkKTest))
 struct MfmaDefaultSelector
 {
     private:
@@ -48,25 +48,25 @@ struct MfmaDefaultSelector
         amdgcn_mma<ADataType,
                    BDataType,
                    CDataType,
-                   FragM,
-                   FragN,
-                   FragKTest,
+                   ChunkM,
+                   ChunkN,
+                   ChunkKTest,
                    DefaultMfmaCtrlFlags, // By default, let's assume no special flags for MFMA
                    CompilerTarget,
                    MmaOpFamily::DENSE>;
 
     public:
     // If the candidate is supported (e.g., a backend implementation exists), then select it.
-    // Otherwise, test another smaller FragK. If no existing implementations, we will get FragK=0u
+    // Otherwise, test another smaller ChunkK. If no existing implementations, we will hit ChunkK=1u
     // and fall back to the unsupported pass-through implementation.
     using SelectedOp = std::conditional_t<MmaOpTraits<CandidateOp>::IsSupported,
                                           CandidateOp,
                                           typename MfmaDefaultSelector<ADataType,
                                                                        BDataType,
                                                                        CDataType,
-                                                                       FragM,
-                                                                       FragN,
-                                                                       FragKTest / 2u,
+                                                                       ChunkM,
+                                                                       ChunkN,
+                                                                       ChunkKTest / 2u,
                                                                        CompilerTarget>::SelectedOp>;
 };
 
@@ -77,25 +77,25 @@ struct MfmaDefaultSelector
  * @tparam ADataType Data type of matrix A
  * @tparam BDataType Data type of matrix B
  * @tparam CDataType Data type of the accumulator
- * @tparam FragM Block M dimension size
- * @tparam FragN Block N dimension size
+ * @tparam ChunkM Chunk M dimension size
+ * @tparam ChunkN Chunk N dimension size
  * @tparam CompilerTarget The compiler target
  */
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t FragM,
-          uint32_t FragN,
+          uint32_t ChunkM,
+          uint32_t ChunkN,
           typename CompilerTarget> // TODO: c++20 amdgcn_target_arch_id CompilerTarget>
-struct MfmaDefaultSelector<ADataType, BDataType, CDataType, FragM, FragN, 1u, CompilerTarget>
+struct MfmaDefaultSelector<ADataType, BDataType, CDataType, ChunkM, ChunkN, 1u, CompilerTarget>
 {
     // Default unsupported pass-through if no instruction is found
     using SelectedOp =
         amdgcn_mma<ADataType,
                    BDataType,
                    CDataType,
-                   FragM,
-                   FragN,
+                   ChunkM,
+                   ChunkN,
                    1u,
                    DefaultMfmaCtrlFlags, // By default, let's assume no special flags for MFMA
                    CompilerTarget,
@@ -105,32 +105,32 @@ struct MfmaDefaultSelector<ADataType, BDataType, CDataType, FragM, FragN, 1u, Co
 /**
  * @struct MmaDefaultSelector
  * @brief Implements the gfx9 default MMA selector strategy for wave-wise MMA decomposition.
- * This implements the M/N block size search strategy to find the largest supported MFMA
+ * This implements the M/N chunk size search strategy to find the largest supported MFMA
  * instruction for the given datatypes.
  * If no supported instruction is found, falls back to an unsupported pass-through implementation.
  * @tparam ADataType Data type of matrix A
  * @tparam BDataType Data type of matrix B
  * @tparam CDataType Data type of the accumulator
- * @tparam FragM Size of the M dimension of the fragment to decompose
- * @tparam FragN Size of the N dimension of the fragment to decompose
- * @tparam FragK Size of the K dimension of the fragment to decompose
+ * @tparam ChunkM Size of the M dimension of the chunk to decompose
+ * @tparam ChunkN Size of the N dimension of the chunk to decompose
+ * @tparam ChunkK Size of the K dimension of the chunk to decompose
  * @tparam CompilerTarget The compiler target
  * @tparam OpFamily The MMA operation family
  */
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t FragM,
-          uint32_t FragN,
-          uint32_t FragK,
+          uint32_t ChunkM,
+          uint32_t ChunkN,
+          uint32_t ChunkK,
           typename CompilerTarget,
           MmaOpFamily OpFamily> // TODO: c++20 amdgcn_target_arch_id CompilerTarget>
 struct MmaDefaultSelector<ADataType,
                           BDataType,
                           CDataType,
-                          FragM,
-                          FragN,
-                          FragK,
+                          ChunkM,
+                          ChunkN,
+                          ChunkK,
                           CompilerTarget,
                           OpFamily,
                           enable_if_all<enable_if_target_family_gfx9_t<CompilerTarget>,
@@ -162,23 +162,20 @@ struct MmaDefaultSelector<ADataType,
         typename MfmaDefaultSelector<ADataType, BDataType, CDataType, 1u, 1u, 1u, CompilerTarget>::
             SelectedOp;
 
-    // Check if each candidate is supported for the given fragment sizes
-    // For this case, we require the fragment sizes to be multiples of the MFMA shape
-    static constexpr bool IsSupported4x4 =  MmaOpTraits<CandidateOp4x4>::IsSupported &&
-                                            (FragM % CandidateOp4x4::kM == 0u) &&
-                                            (FragN % CandidateOp4x4::kN == 0u) && 
-                                            (FragK % CandidateOp4x4::kK == 0u);
-    static constexpr bool IsSupported16x16 = MmaOpTraits<CandidateOp16x16>::IsSupported && 
-                                            (FragM % CandidateOp16x16::kM == 0u) &&
-                                            (FragN % CandidateOp16x16::kN == 0u) && 
-                                            (FragK % CandidateOp16x16::kK == 0u);
-    static constexpr bool IsSupported32x32 = MmaOpTraits<CandidateOp32x32>::IsSupported && 
-                                            (FragM % CandidateOp32x32::kM == 0u) &&
-                                            (FragN % CandidateOp32x32::kN == 0u) && 
-                                            (FragK % CandidateOp32x32::kK == 0u);
+    // Check if each candidate is supported for the given chunk sizes
+    // For this case, we require the chunk sizes to be multiples of the MFMA shape
+    static constexpr bool IsSupported4x4 =
+        MmaOpTraits<CandidateOp4x4>::IsSupported && (ChunkM % CandidateOp4x4::kM == 0u) &&
+        (ChunkN % CandidateOp4x4::kN == 0u) && (ChunkK % CandidateOp4x4::kK == 0u);
+    static constexpr bool IsSupported16x16 =
+        MmaOpTraits<CandidateOp16x16>::IsSupported && (ChunkM % CandidateOp16x16::kM == 0u) &&
+        (ChunkN % CandidateOp16x16::kN == 0u) && (ChunkK % CandidateOp16x16::kK == 0u);
+    static constexpr bool IsSupported32x32 =
+        MmaOpTraits<CandidateOp32x32>::IsSupported && (ChunkM % CandidateOp32x32::kM == 0u) &&
+        (ChunkN % CandidateOp32x32::kN == 0u) && (ChunkK % CandidateOp32x32::kK == 0u);
 
     public:
-    // Select the largest supported MFMA operation for the given fragment shape
+    // Select the largest supported MFMA operation for the given chunk shape
     using SelectedOp = std::conditional_t<
         IsSupported32x32,
         CandidateOp32x32,