Change some remaining references to Chunk to WaveTile

krithalith · krithalith · commit 9f4f0a7b0295 · 2026-03-18T15:39:15.000Z
diff --git a/projects/composablekernel/test/ck_tile/core/arch/mma/test_amdgcn_mma.cpp b/projects/composablekernel/test/ck_tile/core/arch/mma/test_amdgcn_mma.cpp
@@ -69,31 +69,31 @@ using DummyAmdgcnMma = amdgcn_mma<fp32_t,
 
 /*! @struct MmaDefaultSelector
  * @brief For dummy Id only, instantiate tests for both MFMA and WMMA selectors so we can them both
- * @tparam ADataType Data type of matrix A
- * @tparam BDataType Data type of matrix B
- * @tparam CDataType Data type of the accumulator
- * @tparam ChunkM Size of the M dimension of the chunk to decompose
- * @tparam ChunkN Size of the N dimension of the chunk to decompose
- * @tparam ChunkK Size of the K dimension of the chunk to decompose
+ * @tparam ADataType      Data type of matrix A
+ * @tparam BDataType      Data type of matrix B
+ * @tparam CDataType      Data type of the accumulator
+ * @tparam WaveTileM      Size of the M dimension of the WaveTile to decompose
+ * @tparam WaveTileN      Size of the N dimension of the WaveTile to decompose
+ * @tparam WaveTileK      Size of the K dimension of the WaveTile to decompose
  * @tparam CompilerTarget The compiler target
- * @tparam OpFamily The MMA operation family
+ * @tparam OpFamily       The MMA operation family
  */
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t ChunkM,
-          uint32_t ChunkN,
-          uint32_t ChunkK,
+          uint32_t WaveTileM,
+          uint32_t WaveTileN,
+          uint32_t WaveTileK,
           typename CompilerTarget,
           MmaOpFamily OpFamily>
 // TODO: c++20 amdgcn_target_arch_id CompilerTarget>
 // TODO: requires
 struct MmaDefaultSelector<ADataType,
                           BDataType,
                           CDataType,
-                          ChunkM,
-                          ChunkN,
-                          ChunkK,
+                          WaveTileM,
+                          WaveTileN,
+                          WaveTileK,
                           CompilerTarget,
                           OpFamily,
                           enable_if_all<enable_if_target_id_dummy_t<CompilerTarget>,
@@ -311,11 +311,11 @@ TEST(TestAmdgcnMma, MmaDefaultSelectorUnsupported)
     EXPECT_FALSE(MmaOpTraits<SelectedMma>::IsSupported);
 }
 
-// Test MmaDefaultSelector for supported DummyAmdgcnMma on chunk sizes other than 16x16x16
-// This tests that the selector can still pick the correct MMA op even if the chunk sizes differ
-TEST(TestAmdgcnMma, MmaDefaultSelectorSupportedChunk)
+// Test MmaDefaultSelector for supported DummyAmdgcnMma on WaveTile sizes other than 16x16x16
+// This tests that the selector can still pick the correct MMA op even if the WaveTile sizes differ
+TEST(TestAmdgcnMma, MmaDefaultSelectorSupportedWaveTile)
 {
-    // Select indirectly with a chunk size of 256x128x64
+    // Select indirectly with a WaveTile size of 256x128x64
     using SelectedMma = MmaDefaultSelector<fp32_t,
                                            fp32_t,
                                            fp32_t,
@@ -332,8 +332,8 @@ TEST(TestAmdgcnMma, MmaDefaultSelectorSupportedChunk)
     EXPECT_TRUE(MmaOpTraits<SelectedMma>::IsSupported);
 }
 
-// Test MmaDefaultSelector for a different chunk size and supported arch
-TEST(TestAmdgcnMma, MmaDefaultSelectorUnsupportedChunk)
+// Test MmaDefaultSelector for a different WaveTile size and supported arch
+TEST(TestAmdgcnMma, MmaDefaultSelectorUnsupportedWaveTile)
 {
     // This should fall back to unsupported since DummyAmdgcnMma only supports 16x16x16
     using SelectedMma = MmaDefaultSelector<fp32_t,
@@ -367,34 +367,34 @@ TEST(TestAmdgcnMma, MmaDefaultSelectorFp16Unsupported)
 // Test on real hardware for MmaOp selection.
 // This is not a GEMM kernel, but a simple test to ensure that the selected MmaOp works correctly on
 // real hardware. Assumption: inputs are all 1's The multiply-accumulate functionality can be tested
-// here by looping over the k dimension and accumulating the results. They should be equal to ChunkK
-// regardless of hardware.
+// here by looping over the k dimension and accumulating the results. They should be equal to
+// WaveTileK regardless of hardware.
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          uint32_t ChunkM,
-          uint32_t ChunkN,
-          uint32_t ChunkK>
+          uint32_t WaveTileM,
+          uint32_t WaveTileN,
+          uint32_t WaveTileK>
 __global__ void test_accum_over_k(void* a, void* b, void* c, void* out)
 {
     using Selector = MmaDefaultSelector<ADataType,
                                         BDataType,
                                         CDataType,
-                                        ChunkM,
-                                        ChunkN,
-                                        ChunkK,
+                                        WaveTileM,
+                                        WaveTileN,
+                                        WaveTileK,
                                         decltype(get_compiler_target()),
                                         MmaOpFamily::DENSE>;
 
     using MmaOp    = typename Selector::SelectedOp;
     using CVecType = typename MmaOp::CVecType;
 
-    static constexpr uint32_t kIters = ChunkK / MmaOp::kK;
+    static constexpr uint32_t kIters = WaveTileK / MmaOp::kK;
 
     // Initialize the accumulator
     CVecType result = *reinterpret_cast<typename MmaOp::CVecType*>(c);
 
-    // Accumulate input AxB over ChunkK/FragK iterations
+    // Accumulate input AxB over WaveTileK/FragK iterations
     for(uint32_t i = 0; i < kIters; ++i)
     {
         result = MmaOp::exec(*reinterpret_cast<typename MmaOp::AVecType*>(a),
@@ -430,16 +430,16 @@ TEST(TestAmdgcnMma, MmaSelector_F16_F16_F32_16x16x32_Real)
     using BType = fp16_t;
     using CType = fp32_t;
 
-    // Chunk size, also the expected fragment size from the selector.
+    // WaveTile size, also the expected fragment size (MmaTile) from the selector.
     // Note: Actual FragK might be slightly different due to hardware implementation, but the
     // test_accum_over_k kernel will loop over the K dimension to ensure that the total K is
     // correct.
-    static constexpr uint32_t ChunkM = 16;
-    static constexpr uint32_t ChunkN = 16;
-    static constexpr uint32_t ChunkK = 32;
-    static constexpr uint32_t FragM  = ChunkM;
-    static constexpr uint32_t FragN  = ChunkN;
-    static constexpr uint32_t FragK  = ChunkK;
+    static constexpr uint32_t WaveTileM = 16;
+    static constexpr uint32_t WaveTileN = 16;
+    static constexpr uint32_t WaveTileK = 32;
+    static constexpr uint32_t FragM     = WaveTileM;
+    static constexpr uint32_t FragN     = WaveTileN;
+    static constexpr uint32_t FragK     = WaveTileK;
 
     // Gfx11 has input data duplication and no accumulator padding (MultiplierC = 1)
     // TODO: c++20 use is_target_family_gfx11(currentArchId)
@@ -480,16 +480,16 @@ TEST(TestAmdgcnMma, MmaSelector_F16_F16_F32_16x16x32_Real)
     HIP_CHECK_ERROR(hipMemcpy(d_c, h_c.data(), CSize, hipMemcpyHostToDevice));
 
     const auto wave_size = getDeviceWaveSize();
-    test_accum_over_k<AType, BType, CType, ChunkM, ChunkN, ChunkK>
+    test_accum_over_k<AType, BType, CType, WaveTileM, WaveTileN, WaveTileK>
         <<<1, wave_size>>>(d_a, d_b, d_c, d_out);
     HIP_CHECK_ERROR(hipDeviceSynchronize());
 
     HIP_CHECK_ERROR(hipMemcpy(h_out.data(), d_out, CSize, hipMemcpyDeviceToHost));
 
-    // Output should be ChunkK for all elements, because the inputs are all 1's
+    // Output should be WaveTileK for all elements, because the inputs are all 1's
     for(size_t i = 0; i < CElements; ++i)
     {
-        CType expected = static_cast<CType>(ChunkK);
+        CType expected = static_cast<CType>(WaveTileK);
 
         EXPECT_NEAR(h_out[i], expected, 1e-3);
     }
@@ -502,7 +502,7 @@ TEST(TestAmdgcnMma, MmaSelector_F16_F16_F32_16x16x32_Real)
 
 // Do a live test. At minimum, there should be a solution on real hardware for F16_F16_F32_16x16x32
 // The selector should be able to pick the correct MmaOp as a multiple of 16x16x32, even if the
-// chunk sizes are larger than 16x16x32. This tests that the selector can handle larger chunk
+// WaveTile sizes are larger than 16x16x32. This tests that the selector can handle larger WaveTile
 // sizes and still select the correct MmaOp.
 TEST(TestAmdgcnMma, MmaSelector_F16_F16_F32_112x112x128_Real)
 {
@@ -528,13 +528,13 @@ TEST(TestAmdgcnMma, MmaSelector_F16_F16_F32_112x112x128_Real)
     using BType = fp16_t;
     using CType = fp32_t;
 
-    // Chunk size to test for decomposition.
-    // We expect the selector to pick a 16x16 chunk
-    static constexpr uint32_t ChunkM = 112;
-    static constexpr uint32_t ChunkN = 112;
-    static constexpr uint32_t ChunkK = 128;
+    // WaveTile size to test for decomposition.
+    // We expect the selector to pick a 16x16 WaveTile
+    static constexpr uint32_t WaveTileM = 112;
+    static constexpr uint32_t WaveTileN = 112;
+    static constexpr uint32_t WaveTileK = 128;
 
-    // The expected fragment size from the selector (multiple of 16).
+    // The expected fragment size from the selector (MmaTile, multiple of 16).
     // Note: Actual FragK might be slightly different due to hardware implementation, but the
     // test_accum_over_k kernel will loop over the K dimension to ensure that the total K is
     // correct.
@@ -581,16 +581,16 @@ TEST(TestAmdgcnMma, MmaSelector_F16_F16_F32_112x112x128_Real)
     HIP_CHECK_ERROR(hipMemcpy(d_c, h_c.data(), CSize, hipMemcpyHostToDevice));
 
     const auto wave_size = getDeviceWaveSize();
-    test_accum_over_k<AType, BType, CType, ChunkM, ChunkN, ChunkK>
+    test_accum_over_k<AType, BType, CType, WaveTileM, WaveTileN, WaveTileK>
         <<<1, wave_size>>>(d_a, d_b, d_c, d_out);
     HIP_CHECK_ERROR(hipDeviceSynchronize());
 
     HIP_CHECK_ERROR(hipMemcpy(h_out.data(), d_out, CSize, hipMemcpyDeviceToHost));
 
-    // Output should be ChunkK for all elements, because the inputs are all 1's
+    // Output should be WaveTileK for all elements, because the inputs are all 1's
     for(size_t i = 0; i < CElements; ++i)
     {
-        CType expected = static_cast<CType>(ChunkK);
+        CType expected = static_cast<CType>(WaveTileK);
 
         EXPECT_NEAR(h_out[i], expected, 1e-3);
     }
diff --git a/projects/composablekernel/test/ck_tile/core/arch/mma/test_amdgcn_sparse_mma.cpp b/projects/composablekernel/test/ck_tile/core/arch/mma/test_amdgcn_sparse_mma.cpp
@@ -144,29 +144,29 @@ TEST(SparseMMATrait, SparseSelector)
 template <typename AType,
           typename BType,
           typename CType,
-          uint32_t ChunkM,
-          uint32_t ChunkN,
-          uint32_t ChunkK>
+          uint32_t WaveTileM,
+          uint32_t WaveTileN,
+          uint32_t WaveTileK>
 __global__ void test_sparse_accum_over_k(void* a, void* b, void* c, void* out)
 {
     using CompilerTarget = decltype(get_compiler_target());
     using Selector       = MmaDefaultSelector<AType,
                                               BType,
                                               CType,
-                                              ChunkM,
-                                              ChunkN,
-                                              ChunkK,
+                                              WaveTileM,
+                                              WaveTileN,
+                                              WaveTileK,
                                               CompilerTarget,
                                               MmaOpFamily::SPARSE>;
     using MmaOp          = typename Selector::SelectedOp;
     using CVecType       = typename MmaOp::CVecType;
 
-    static constexpr uint32_t kIters = ChunkK / MmaOp::kK;
+    static constexpr uint32_t kIters = WaveTileK / MmaOp::kK;
 
     // Initialize the accumulator
     CVecType result = *reinterpret_cast<typename MmaOp::CVecType*>(c);
 
-    // Accumulate input AxB over ChunkK/FragK iterations
+    // Accumulate input AxB over WaveTileK/FragK iterations
     for(uint32_t i = 0; i < kIters; ++i)
     {
         result = MmaOp::exec(*reinterpret_cast<typename MmaOp::AVecType*>(a),
@@ -207,16 +207,16 @@ TEST(SparseMMATrait, MmaSelector_Sparse_F16_F16_F32_16x16x32_Real)
     using BType = fp16_t;
     using CType = fp32_t;
 
-    // Chunk size, also the expected fragment size from the selector.
+    // WaveTile size, also the expected fragment size (MmaTile) from the selector.
     // Note: Actual FragK might be slightly different due to hardware implementation, but the
     // test_accum_over_k kernel will loop over the K dimension to ensure that the total K is
     // correct.
-    static constexpr uint32_t ChunkM = 16;
-    static constexpr uint32_t ChunkN = 16;
-    static constexpr uint32_t ChunkK = 32;
-    static constexpr uint32_t FragM  = ChunkM;
-    static constexpr uint32_t FragN  = ChunkN;
-    static constexpr uint32_t FragK  = ChunkK;
+    static constexpr uint32_t WaveTileM = 16;
+    static constexpr uint32_t WaveTileN = 16;
+    static constexpr uint32_t WaveTileK = 32;
+    static constexpr uint32_t FragM     = WaveTileM;
+    static constexpr uint32_t FragN     = WaveTileN;
+    static constexpr uint32_t FragK     = WaveTileK;
 
     // The number of elements per thread
     uint32_t AElements = FragM * FragK / deviceWarpSize;