ROCm
diff --git a/projects/composablekernel/include/ck_tile/core/arch/mma/amdgcn_mma.hpp b/projects/composablekernel/include/ck_tile/core/arch/mma/amdgcn_mma.hpp
Lines changed: 19 additions & 13 deletions
diff --git a/projects/composablekernel/include/ck_tile/core/arch/mma/mfma/mfma_selector.hpp b/projects/composablekernel/include/ck_tile/core/arch/mma/mfma/mfma_selector.hpp
Lines changed: 61 additions & 56 deletions
@@ -18,13 +18,13 @@ namespace ck_tile::core::arch::mma {
  *  Meaning of amdgcn_mma layout parameters (general)
  * ---------------------------------------------------
  *
- * The fragment sizes and layout constants in the amdgcn_mma struct describe the mapping between
- * intrinsic input / output matrix elements and vector registers (lane x vector_item space). Note
- * that we end up having a mapping for A, B and C separately, although those for A and B are usually
- * similar if not identical. All mappings can be described as an unmerge operation on one of the
- * matrix dims (either K for AB or M for C), followed by remerging of the resulting subdims and raw
- * other dim into the Lane and Vector_item dimensions. When considering an unmerge operation on a
- * dimension K, we can label the resulting sub-dimensions as K0, K1, and K2, where K0 is the size
+ * The fragment (MmaTile) sizes and layout constants in the amdgcn_mma struct describe the mapping
+ * between intrinsic input / output matrix elements and vector registers (lane x vector_item space).
+ * Note that we end up having a mapping for A, B and C separately, although those for A and B are
+ * usually similar if not identical. All mappings can be described as an unmerge operation on one of
+ * the matrix dims (either K for AB or M for C), followed by remerging of the resulting subdims and
+ * raw other dim into the Lane and Vector_item dimensions. When considering an unmerge operation on
+ * a dimension K, we can label the resulting sub-dimensions as K0, K1, and K2, where K0 is the size
  * of the fastest changing dimension. K0 is also referred to as "The size of the first unmerge", and
  * K1 would be "The size of the second unmerge". There are never more than 2 unmerge operations, and
  * unmerge operations may be trivial (unmerge size of 1). Example double unmerge of size {3, 2} of a
@@ -96,7 +96,7 @@ namespace ck_tile::core::arch::mma {
  *
  * -- A / B Repeat --
  * Variable indicating that all matrix values are represented multiple times in the vector
- * reigsters, typically repeating in the lane dimension. This is always equal to the repeat value
+ * registers, typically repeating in the lane dimension. This is always equal to the repeat value
  * used in Tile Distribution encodings. There are two reasons to have non-trivial (non-1) value
  * here: MFMA block-hiding to create oblong "virtual" intrinsics, and RDNA3 input repetition.
  *
@@ -143,7 +143,7 @@ struct amdgcn_mma_base
     using BDataType = BDataType_;
     using CDataType = CDataType_;
 
-    // Fragment sizes, check description above.
+    // Fragment (MmaTile) sizes, check description above.
     static constexpr index_t kM = FragM; // M = M2 * M1 * M0
     static constexpr index_t kN = FragN;
     static constexpr index_t kK = FragK; // K = K2 * K1 * K0
@@ -224,9 +224,9 @@ concept MmaOpI = requires(MmaOp op) {
  *  @tparam ADataType Datatype of input A
  *  @tparam BDataType Datatype of input B
  *  @tparam CDataType Datatype of accumulator
- *  @tparam FragM M-dimension of mma intrinsic
- *  @tparam FragN N-dimension of mma intrinsic
- *  @tparam FragK K-dimension of mma intrinsic
+ *  @tparam FragM M-dimension of mma intrinsic (MmaTile)
+ *  @tparam FragN N-dimension of mma intrinsic (MmaTile)
+ *  @tparam FragK K-dimension of mma intrinsic (MmaTile)
  *  @tparam CtrlFlags Control flags for mma operation
  *  @tparam CompilerTarget The current compiler target
  *  @tparam OpFamily_ The type of operation (dense, sparse, scale, etc.)
@@ -251,7 +251,13 @@ struct amdgcn_mma : amdgcn_mma_base<fp32_t, fp32_t, fp32_t, 1u, 1u, 1u, 1u, 1, 1
     CK_TILE_DEVICE static CVecType const&
     exec(AVecType const& regsA, BVecType const& regsB, CVecType const& regsC)
     {
-        printf("[WARNING] Running amdgcn_mma dummy exec function!\n");
+        // Prints once per template instantiation, across all thread blocks and threads.
+        static __device__ int printed = 0;
+        if(threadIdx.x == 0 && atomicCAS(&printed, 0, 1) == 0)
+        {
+            printf("[WARNING] Running amdgcn_mma dummy exec function!\n");
+        }
+
         ignore(regsA, regsB);
         return regsC; // No-op, just return C
     }
 
@@ -18,28 +18,27 @@ namespace ck_tile::core::arch::mma {
  * @struct MfmaDefaultSelector
  * @brief Implements a default MFMA selector strategy for gfx9 target architectures.
  * This implements the K dimension search strategy to find the largest supported MFMA
- * instruction for the given M/N chunk sizes and datatypes.
- * If no supported instruction is found, falls back to an unsupported pass-through
- implementation.
- * @tparam ADataType Data type of matrix A
- * @tparam BDataType Data type of matrix B
- * @tparam CDataType Data type of the accumulator
- * @tparam ChunkM Chunk M dimension size
- * @tparam ChunkN Chunk N dimension size
- * @tparam ChunkKTest Current Chunk K dimension size to test
+ * instruction for the given M/N WaveTile sizes and datatypes.
+ * If no supported instruction is found, falls back to an unsupported pass-through implementation.
+ * @tparam ADataType      Data type of matrix A
+ * @tparam BDataType      Data type of matrix B
+ * @tparam CDataType      Data type of the accumulator
+ * @tparam WaveTileM      WaveTile M dimension size
+ * @tparam WaveTileN      WaveTile N dimension size
+ * @tparam WaveTileKTest  Current WaveTile K dimension size to test
  * @tparam CompilerTarget The compiler target
- * @note Here we assume that ChunkKTest is always a power-of-two integer.
- *       The search strategy starts from a maximum ChunkKTest size down to 1u by halving
+ * @note Here we assume that WaveTileKTest is always a power-of-two integer.
+ *       The search strategy starts from a maximum WaveTileKTest size down to 1u by halving
  *       each time.
  */
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t ChunkM,
-          uint32_t ChunkN,
-          uint32_t ChunkKTest,
+          uint32_t WaveTileM,
+          uint32_t WaveTileN,
+          uint32_t WaveTileKTest,
           typename CompilerTarget> // TODO: c++20 amdgcn_target_arch_id CompilerTarget>
-// TODO: c++20 requires(is_gfx9_arch_id(CompilerTarget) && is_power_of_two_integer(ChunkKTest))
+// TODO: c++20 requires(is_gfx9_arch_id(CompilerTarget) && is_power_of_two_integer(WaveTileKTest))
 struct MfmaDefaultSelector
 {
     private:
@@ -48,54 +47,60 @@ struct MfmaDefaultSelector
         amdgcn_mma<ADataType,
                    BDataType,
                    CDataType,
-                   ChunkM,
-                   ChunkN,
-                   ChunkKTest,
+                   WaveTileM,
+                   WaveTileN,
+                   WaveTileKTest,
                    DefaultMfmaCtrlFlags, // By default, let's assume no special flags for MFMA
                    CompilerTarget,
                    MmaOpFamily::DENSE>;
 
     public:
     // If the candidate is supported (e.g., a backend implementation exists), then select it.
-    // Otherwise, test another smaller ChunkK. If no existing implementations, we will get ChunkK=0u
-    // and fall back to the unsupported pass-through implementation.
+    // Otherwise, test another smaller WaveTileK. If no existing implementations, we will get
+    // WaveTileK=0u and fall back to the unsupported pass-through implementation.
     using SelectedOp = std::conditional_t<MmaOpTraits<CandidateOp>::IsSupported,
                                           CandidateOp,
                                           typename MfmaDefaultSelector<ADataType,
                                                                        BDataType,
                                                                        CDataType,
-                                                                       ChunkM,
-                                                                       ChunkN,
-                                                                       ChunkKTest / 2u,
+                                                                       WaveTileM,
+                                                                       WaveTileN,
+                                                                       WaveTileKTest / 2u,
                                                                        CompilerTarget>::SelectedOp>;
 };
 
 /**
  * @struct MfmaDefaultSelector
  * @brief Implements the base case for the default MFMA selector when no supported instruction is
  * found.
- * @tparam ADataType Data type of matrix A
- * @tparam BDataType Data type of matrix B
- * @tparam CDataType Data type of the accumulator
- * @tparam ChunkM Chunk M dimension size
- * @tparam ChunkN Chunk N dimension size
+ * @tparam ADataType      Data type of matrix A
+ * @tparam BDataType      Data type of matrix B
+ * @tparam CDataType      Data type of the accumulator
+ * @tparam WaveTileM      WaveTile M dimension size
+ * @tparam WaveTileN      WaveTile N dimension size
  * @tparam CompilerTarget The compiler target
  */
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t ChunkM,
-          uint32_t ChunkN,
+          uint32_t WaveTileM,
+          uint32_t WaveTileN,
           typename CompilerTarget> // TODO: c++20 amdgcn_target_arch_id CompilerTarget>
-struct MfmaDefaultSelector<ADataType, BDataType, CDataType, ChunkM, ChunkN, 1u, CompilerTarget>
+struct MfmaDefaultSelector<ADataType,
+                           BDataType,
+                           CDataType,
+                           WaveTileM,
+                           WaveTileN,
+                           1u,
+                           CompilerTarget>
 {
     // Default unsupported pass-through if no instruction is found
     using SelectedOp =
         amdgcn_mma<ADataType,
                    BDataType,
                    CDataType,
-                   ChunkM,
-                   ChunkN,
+                   WaveTileM,
+                   WaveTileN,
                    1u,
                    DefaultMfmaCtrlFlags, // By default, let's assume no special flags for MFMA
                    CompilerTarget,
@@ -105,32 +110,32 @@ struct MfmaDefaultSelector<ADataType, BDataType, CDataType, ChunkM, ChunkN, 1u,
 /**
  * @struct MmaDefaultSelector
  * @brief Implements the gfx9 default MMA selector strategy for wave-wise MMA decomposition.
- * This implements the M/N chunk size search strategy to find the largest supported MFMA
+ * This implements the M/N WaveTile size search strategy to find the largest supported MFMA
  * instruction for the given datatypes.
  * If no supported instruction is found, falls back to an unsupported pass-through implementation.
- * @tparam ADataType Data type of matrix A
- * @tparam BDataType Data type of matrix B
- * @tparam CDataType Data type of the accumulator
- * @tparam ChunkM Size of the M dimension of the chunk to decompose
- * @tparam ChunkN Size of the N dimension of the chunk to decompose
- * @tparam ChunkK Size of the K dimension of the chunk to decompose
+ * @tparam ADataType      Data type of matrix A
+ * @tparam BDataType      Data type of matrix B
+ * @tparam CDataType      Data type of the accumulator
+ * @tparam WaveTileM      Size of the M dimension of the WaveTile to decompose
+ * @tparam WaveTileN      Size of the N dimension of the WaveTile to decompose
+ * @tparam WaveTileK      Size of the K dimension of the WaveTile to decompose
  * @tparam CompilerTarget The compiler target
- * @tparam OpFamily The MMA operation family
+ * @tparam OpFamily       The MMA operation family
  */
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t ChunkM,
-          uint32_t ChunkN,
-          uint32_t ChunkK,
+          uint32_t WaveTileM,
+          uint32_t WaveTileN,
+          uint32_t WaveTileK,
           typename CompilerTarget,
           MmaOpFamily OpFamily> // TODO: c++20 amdgcn_target_arch_id CompilerTarget>
 struct MmaDefaultSelector<ADataType,
                           BDataType,
                           CDataType,
-                          ChunkM,
-                          ChunkN,
-                          ChunkK,
+                          WaveTileM,
+                          WaveTileN,
+                          WaveTileK,
                           CompilerTarget,
                           OpFamily,
                           enable_if_all<enable_if_target_family_gfx9_t<CompilerTarget>,
@@ -162,20 +167,20 @@ struct MmaDefaultSelector<ADataType,
         typename MfmaDefaultSelector<ADataType, BDataType, CDataType, 1u, 1u, 1u, CompilerTarget>::
             SelectedOp;
 
-    // Check if each candidate is supported for the given chunk sizes
-    // For this case, we require the chunk sizes to be multiples of the MFMA shape
+    // Check if each candidate is supported for the given WaveTile sizes
+    // For this case, we require the WaveTile sizes to be multiples of the MFMA shape
     static constexpr bool IsSupported4x4 =
-        MmaOpTraits<CandidateOp4x4>::IsSupported && (ChunkM % CandidateOp4x4::kM == 0u) &&
-        (ChunkN % CandidateOp4x4::kN == 0u) && (ChunkK % CandidateOp4x4::kK == 0u);
+        MmaOpTraits<CandidateOp4x4>::IsSupported && (WaveTileM % CandidateOp4x4::kM == 0u) &&
+        (WaveTileN % CandidateOp4x4::kN == 0u) && (WaveTileK % CandidateOp4x4::kK == 0u);
     static constexpr bool IsSupported16x16 =
-        MmaOpTraits<CandidateOp16x16>::IsSupported && (ChunkM % CandidateOp16x16::kM == 0u) &&
-        (ChunkN % CandidateOp16x16::kN == 0u) && (ChunkK % CandidateOp16x16::kK == 0u);
+        MmaOpTraits<CandidateOp16x16>::IsSupported && (WaveTileM % CandidateOp16x16::kM == 0u) &&
+        (WaveTileN % CandidateOp16x16::kN == 0u) && (WaveTileK % CandidateOp16x16::kK == 0u);
     static constexpr bool IsSupported32x32 =
-        MmaOpTraits<CandidateOp32x32>::IsSupported && (ChunkM % CandidateOp32x32::kM == 0u) &&
-        (ChunkN % CandidateOp32x32::kN == 0u) && (ChunkK % CandidateOp32x32::kK == 0u);
+        MmaOpTraits<CandidateOp32x32>::IsSupported && (WaveTileM % CandidateOp32x32::kM == 0u) &&
+        (WaveTileN % CandidateOp32x32::kN == 0u) && (WaveTileK % CandidateOp32x32::kK == 0u);
 
     public:
-    // Select the largest supported MFMA operation for the given chunk shape
+    // Select the largest supported MFMA operation for the given WaveTile shape
     using SelectedOp = std::conditional_t<
         IsSupported32x32,
         CandidateOp32x32,