1414
1515namespace ck_tile ::core::arch::mma {
1616
17- // TODO: Describe layout params.
17+ /* *---------------------------------------------------
18+ * Meaning of amdgcn_mma layout parameters (general)
19+ * ---------------------------------------------------
20+ *
21+ * The fragment sizes and layout constants in the amdgcn_mma struct describe the mapping between
22+ * intrinsic input / output matrix elements and vector registers (lane x vector_item space). Note
23+ * that we end up having a mapping for A, B and C separately, although those for A and B are usually
24+ * similar if not identical. All mappings can be described as an unmerge operation on one of the
25+ * matrix dims (either K for AB or M for C), followed by remerging of the resulting subdims and raw
26+ * other dim into the Lane and Vector Item dimensions. When I consider an unmerge operation on a
27+ * dimension K, I like to label the resulting sub-dimensions as K0, K1, and K2, where K0 is the size
28+ * of the fastest changing dimension. K0 is also referred to as "The size of the first unmerge", and
29+ * K1 would be "The size of the second unmerge". There are never more than 2 unmerge operations, and
30+ * unmerge operations may be trivial (unmerge size of 1). Example double unmerge of size {3, 2} of a
31+ * K dimension of size 12:
32+ *
33+ * K K2 K1 K0
34+ * 0 0 0 0
35+ * 1 0 0 1
36+ * 2 0 1 0
37+ * 3 0 1 1
38+ * 4 0 2 0
39+ * 5 0 2 1
40+ * 6 1 0 0
41+ * 7 1 0 1
42+ * 8 1 1 0
43+ * 9 1 1 1
44+ * 10 1 2 0
45+ * 11 1 2 1
46+ *
47+ * Note that K0 = 2 (first unmerge size, fastest changing), K1 = 3 (second unmerge size,
48+ * second-fastest changing), and K2 = 12 / 2 / 3 = 2 (outermost dimension, whatever is left).
49+ *
50+ * If we were to use this unmerge op to describe an A matrix layout in registers, we might have for
51+ * example that L (lane dim) is composed of K1 and M, and V (vector item dim) is composed of K2 and
52+ * K0. Compactly described, this would be K{3, 2} L{K1M} V{K2K0}, and if the M dimension was 2 we
53+ * would have the following layout (6 lanes, 4 vector items each):
54+ *
55+ * | V0 | V1 | V2 | V3 |
56+ * L0 | M=0 K=0 | M=0 K=1 | M=0 K=6 | M=0 K=7 |
57+ * L1 | M=1 K=0 | M=1 K=1 | M=1 K=6 | M=1 K=7 |
58+ * L2 | M=0 K=2 | M=0 K=3 | M=0 K=8 | M=0 K=9 |
59+ * L3 | M=1 K=2 | M=1 K=3 | M=1 K=8 | M=1 K=9 |
60+ * L4 | M=0 K=4 | M=0 K=5 | M=0 K=10 | M=0 K=11 |
61+ * L5 | M=1 K=4 | M=1 K=5 | M=1 K=10 | M=1 K=11 |
62+ *
63+ * Note that all A matrix elements are now placed in a unique (lane, vector_item). In case a Repeat
64+ * dimension is used, every single matrix element is mapped to multiple (Lane, Vector_item)
65+ * locations, usually along the Lane dimension.
66+ *
67+ * Check out TileDistrEncRegMap which can print full forward and backward mapping tables for any
68+ * register mapping (expressed as a tile distribution encoding).
69+ *
70+ * ------------------------------------------
71+ * Individual amdgcn_mma layout parameters
72+ * ------------------------------------------
73+ *
74+ * -- ABKPerLane --
75+ * The number of K dim elements in each lane. Always the same for A and B, even when they have
77+ * different layouts. In terms of unmerge sizes, it's equal to K0 * K2, i.e. the product of the sizes
77+ * of the outermost and innermost dimensions after a double K unmerge.
78+ *
79+ * -- A / B NumAccess --
80+ * These two variables describe the size of the outermost dimension if two unmerge operations are
81+ * required for K (so K2). Alternatively, it can be described as the number of sets that the vector
82+ * dimension (which houses a number of K indices) is split into. We may be able to actually
83+ * remove A and B num access as well, but it sort of depends on how load and store tile work and
84+ * whether we want the user to always have to know about this. There are only two reasons for these
85+ * to ever not be 1, and they are different types of reasons:
86+ *
87+ * (logical correctness). You are dealing with scale MFMA fp8, which due to the index matrix layout
88+ * does not allow arbitrary K perms to simplify layouts. This means the layout can only properly be
89+ * described with a Num Access value of at least 2.
90+ *
91+ * (load / store manipulation). I think the load and store tile functions end up looking for the
92+ * size of the smallest unmerged K dimension (K0) to determine how many elements should be loaded at
93+ * a time. Different Num Access values will lead to different load / store behavior, even if
94+ * logically equivalent.
95+ *
96+ * -- A / B Repeat --
97+ * Variable indicating that all matrix values are represented multiple times in the vector
98+ * registers, typically repeating in the lane dimension. This is always equal to the repeat value
99+ * used in Tile Distribution encodings. There are two reasons to have non-trivial (non-1) value
100+ * here: MFMA block-hiding to create oblong "virtual" intrinsics, and RDNA3 input repetition.
101+ *
102+ * -- CMPerLane --
103+ * The number of M dim elements in each lane. In terms of unmerge sizes, it is equal to M0 * M2, i.e.
104+ * the product of the sizes of the outermost and innermost dimensions after a double M unmerge.
105+ *
106+ * -- CNumAccess --
107+ * Same as A / B NumAccess but for the M dim (so M2), but the mid-level code doesn't care about this
108+ * and will not try to request a specific value. Absolutely needed for logical correctness of
109+ * register mappings since we can not perform arbitrary M permutations without messing up the A
110+ * layout.
111+ */
112+
18113/* *
19114 * @class amdgcn_mma_base
20115 * @brief Base class for amdgcn_mma structs to avoid a lot of code duplication. Also puts
@@ -47,19 +142,19 @@ struct amdgcn_mma_base
47142 using BDataType = BDataType_;
48143 using CDataType = CDataType_;
49144
50- // Fragment sizes
51- static constexpr index_t kM = FragM;
145+ // Fragment sizes, check description above.
146+ static constexpr index_t kM = FragM; // M = M2 * M1 * M0
52147 static constexpr index_t kN = FragN;
53- static constexpr index_t kK = FragK;
148+ static constexpr index_t kK = FragK; // K = K2 * K1 * K0
54149
55- // Layout constants
56- static constexpr index_t kABKPerLane = kABKPerLane_ ;
57- static constexpr index_t kAKNumAccess = kAKNumAccess_ ;
58- static constexpr index_t kARepeat = kARepeat_ ;
59- static constexpr index_t kBKNumAccess = kBKNumAccess_ ;
60- static constexpr index_t kBRepeat = kBRepeat_ ;
61- static constexpr index_t kCMPerLane = kCMPerLane_ ;
62- static constexpr index_t kCMNumAccess = kCMNumAccess_ ;
150+ // Layout constants, check description above.
151+ static constexpr index_t kABKPerLane = kABKPerLane_ ; // K2 * K0
152+ static constexpr index_t kAKNumAccess = kAKNumAccess_ ; // K2
153+ static constexpr index_t kARepeat = kARepeat_ ; // RDNA3 repetition and MFMA block-hiding
154+ static constexpr index_t kBKNumAccess = kBKNumAccess_ ; // K2
155+ static constexpr index_t kBRepeat = kBRepeat_ ; // RDNA3 repetition and MFMA block-hiding
156+ static constexpr index_t kCMPerLane = kCMPerLane_ ; // M2 * M0
157+ static constexpr index_t kCMNumAccess = kCMNumAccess_ ; // M2
63158
64159 // Register types (derived)
65160 static constexpr index_t WaveSize = WaveSize_;
@@ -132,6 +227,7 @@ concept MmaOpI = requires(MmaOp op) {
132227 * @tparam FragK K-dimension of mma intrinsic
133228 * @tparam CtrlFlags Control flags for mma operation
134229 * @tparam CompilerTarget The current compiler target
230+ * @tparam OpFamily_ The type of operation (dense, sparse, scale, etc.)
135231 * @tparam Enabler SFINAE enabler
136232 */
137233template <typename ADataType,
@@ -145,6 +241,7 @@ template <typename ADataType,
145241 MmaOpFamily OpFamily_,
146242 typename Enabler = void >
147243// clang-format off
244+ // | A B C DataTypes |MNK + WaveSize |AParams |BPar |CPar |
148245struct amdgcn_mma : amdgcn_mma_base<fp32_t , fp32_t , fp32_t , 1u , 1u , 1u , 1u , 1 , 1 , 1 , 1 , 1 , 1 , 1 , Unsupported, MmaOpFamily::UNDEFINED>
149246// clang-format on
150247{
0 commit comments