
Commit 2bd83aa

Ognjen Plavsic (oplavsic) authored and committed
[AMD] Implement dotOperandMfma to linear layout conversion (triton-lang#4817)
This commit implements the dotOperandMfma layout to linear layout conversion under the following conditions:
- opIdx == 1
- kWidth == 8
- warpsPerCTA[0] == 1

This serves as preparation for a follow-up PR, which will add a bypassLDS pass.

---------

Co-authored-by: Ognjen Plavsic <[email protected]>
1 parent 0f611bc commit 2bd83aa

File tree: 3 files changed (+293, -2 lines)

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 50 additions & 1 deletion
@@ -247,6 +247,30 @@ SmallVector<unsigned> getWarpOrder(Attribute layout) {
   return order;
 }
 
+SmallVector<unsigned> getOrderForDotOperand(unsigned opIdx, unsigned rank) {
+  SmallVector<unsigned> order(rank);
+  // The 'order' field typically represents a descending sorted array of
+  // dimensions based on contiguity. For instance, in axisInfo utilities that
+  // retrieve tensor contiguity, it's assumed that the dimension with the
+  // highest contiguity corresponds to order[0].
+  //
+  // The relation between contiguity and order is only relevant if the layout
+  // interfaces with HBM, as is the case when we load a tensor from HBM to
+  // registers in the dot layout to bypass LDS. When bypassing LDS, we make the
+  // following assumptions about tensor layouts:
+  // - Tensor A (opIdx == 0) is considered to be row-major.
+  // - Tensor B (opIdx == 1) is considered to be column-major.
+  //
+  // Based on these assumptions, we define the following orders:
+  // - For opIdx == 0, we assume an order of [1, 0].
+  // - For opIdx == 1, we assume an order of [0, 1].
+  std::iota(order.rbegin(), order.rend(), 0);
+  if (opIdx == 1) {
+    std::swap(order[0], order[1]);
+  }
+  return order;
+}
+
 SmallVector<unsigned> getOrder(Attribute layout) {
   if (auto blockedLayout = dyn_cast<BlockedEncodingAttr>(layout)) {
     return llvm::to_vector(blockedLayout.getOrder());
@@ -261,7 +285,11 @@ SmallVector<unsigned> getOrder(Attribute layout) {
   if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
     auto rank = getWarpsPerCTA(dotLayout.getParent()).size();
     SmallVector<unsigned> order(rank);
-    std::iota(order.rbegin(), order.rend(), 0);
+    if (isa<AMDMfmaEncodingAttr>(dotLayout.getParent())) {
+      return getOrderForDotOperand(dotLayout.getOpIdx(), rank);
+    } else {
+      std::iota(order.rbegin(), order.rend(), 0);
+    }
     return order;
   }
   if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
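
To make the new order convention concrete, here is a small standalone C++ sketch (not part of the commit) that mirrors the getOrderForDotOperand logic with std::vector instead of SmallVector:

// Standalone mirror of getOrderForDotOperand, for illustration only; the real
// helper returns a SmallVector and lives in Dialect.cpp.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

static std::vector<unsigned> orderForDotOperand(unsigned opIdx, unsigned rank) {
  std::vector<unsigned> order(rank);
  // Fill in reverse: order = [rank - 1, ..., 1, 0].
  std::iota(order.rbegin(), order.rend(), 0);
  // For operand B, swap the first two entries: [1, 0] -> [0, 1].
  if (opIdx == 1)
    std::swap(order[0], order[1]);
  return order;
}

int main() {
  for (unsigned opIdx : {0u, 1u}) {
    auto o = orderForDotOperand(opIdx, /*rank=*/2);
    std::printf("opIdx=%u -> order=[%u, %u]\n", opIdx, o[0], o[1]);
  }
  // Prints:
  //   opIdx=0 -> order=[1, 0]   (A assumed row-major)
  //   opIdx=1 -> order=[0, 1]   (B assumed column-major)
  return 0;
}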
@@ -925,6 +953,27 @@ unsigned SharedEncodingAttr::getTotalElemsPerThread(ArrayRef<int64_t> shape,
 SmallVector<unsigned>
 DotOperandEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape,
                                           Type eltTy) const {
+
+  if (auto parent = mlir::dyn_cast<AMDMfmaEncodingAttr>(getParent())) {
+    auto rank = shape.size();
+    assert(rank == 2 || rank == 3);
+
+    auto idx = getOpIdx();
+    assert(idx == 0 || idx == 1);
+
+    SmallVector<unsigned> elemsPerThread(rank);
+
+    auto kWidth = getKWidth();
+    auto rep = parent.getMFMARepForOperands(shape, kWidth, idx);
+
+    if (rank == 3)
+      elemsPerThread[0] = rep[0];
+    elemsPerThread[rank - 2] = (idx == 0) ? rep[1] : rep[1] * kWidth;
+    elemsPerThread[rank - 1] = (idx == 0) ? rep[2] * kWidth : rep[2];
+
+    return elemsPerThread;
+  }
+
   llvm_unreachable("getElemsPerThread is not supported for dot operand");
   return SmallVector<unsigned>();
 }
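
To illustrate the arithmetic in the new getElemsPerThread branch, a minimal standalone sketch follows. The rep values below are hypothetical placeholders; the real ones come from AMDMfmaEncodingAttr::getMFMARepForOperands, which is not part of this diff:

// Illustration only: mirrors the rank == 2 arithmetic above with made-up
// repetition counts rep = {repBatch, repRow, repCol}.
#include <cstdio>

int main() {
  const unsigned rep[3] = {1, 4, 2}; // hypothetical values
  const unsigned kWidth = 8;

  for (unsigned idx : {0u, 1u}) {
    unsigned elemsPerThread[2];
    // Per the code above, the operand's K dimension (the last dim of A,
    // the second-to-last dim of B) is the one scaled by kWidth.
    elemsPerThread[0] = (idx == 0) ? rep[1] : rep[1] * kWidth;
    elemsPerThread[1] = (idx == 0) ? rep[2] * kWidth : rep[2];
    std::printf("opIdx=%u -> elemsPerThread=[%u, %u]\n", idx,
                elemsPerThread[0], elemsPerThread[1]);
  }
  // Prints:
  //   opIdx=0 -> elemsPerThread=[4, 16]
  //   opIdx=1 -> elemsPerThread=[32, 2]
  return 0;
}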

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 95 additions & 1 deletion
@@ -553,6 +553,96 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return combineCtaCgaWithShape(ctaLayout, getCTALayout(), shape);
 }
 
+std::optional<LinearLayout>
+dotOperandMfmaToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
+                             ArrayRef<int64_t> shape) {
+
+  // Current linear layout conversion for dot operand is only necessary to
+  // enable LDS bypass for operand B in the MFMA dot path. To achieve
+  // performance gains from bypassing LDS, the following conditions must be met:
+  //
+  // 1) opIdx == 1: Currently, only the B tensor (e.g. weights in moe-like
+  // kernels) bypasses LDS. This constraint is not strict and support for
+  // bypassing operand A (e.g. Q tensor in flash attention) will be added in
+  // the future.
+  //
+  // 2) B tensor must be column-major: This is required to support vectorized
+  // global load instructions, as MFMA instructions expect threads to hold B
+  // operand elements along the K dimension.
+  //
+  // 3) kWidth == 8: Ensures maximum global load vectorization for fp16
+  // operations.
+  // TODO: Generalize conversion to handle maximum kWidth for other types
+  // (e.g. fp8).
+  //
+  // 4) warpsPerCTA[mDim] == 1: This guarantees that every B tensor element is
+  // held by exactly one thread, maintaining the same number of global loads
+  // as in a blocked layout.
+  auto mfmaLayout = llvm::cast<AMDMfmaEncodingAttr>(dotMfmaLayout.getParent());
+
+  if (dotMfmaLayout.getOpIdx() == 0) {
+    return std::nullopt;
+  }
+  auto rank = shape.size();
+  bool hasBatchDim = rank == 3;
+  int mIndex = 0 + hasBatchDim;
+
+  auto kWidth = dotMfmaLayout.getKWidth();
+  auto warpsPerCTA = mfmaLayout.getWarpsPerCTA();
+
+  if (kWidth != 8 || warpsPerCTA[mIndex] != 1) {
+    return std::nullopt;
+  }
+
+  MLIRContext *ctx = dotMfmaLayout.getContext();
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  StringAttr kRegister = S("register");
+  StringAttr kLane = S("lane");
+
+  SmallVector<unsigned> order = triton::gpu::getOrder(dotMfmaLayout);
+  auto tileLayout = LinearLayout::empty();
+
+  if (mfmaLayout.getMDim() == 32) {
+    // Based on the canonical MFMA linear layout, which handles 4 consecutive
+    // elements along the register dimension, kWidth=8 means we have 8
+    // consecutive elements, so we have an additional {4, 0} base vector here.
+    // For the lane dim, since the MFMA thread arrangement is {K, N} = {2, 32},
+    // the mapping of the first 5 base vectors (up to thread 16) will be an
+    // identity along the N dim. Thread 32 will be mapped to element 8 in the K
+    // dimension, because kWidth == 8.
+    tileLayout = LinearLayout(
+        {{kRegister, {{1, 0}, {2, 0}, {4, 0}}},
+         {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}}},
+        {outDimNames[order[0]], outDimNames[order[1]]});
+  } else {
+    assert(mfmaLayout.getMDim() == 16);
+    // For the lane dim, since the MFMA thread arrangement is {K, N} = {4, 16},
+    // the mapping of the first 4 base vectors (up to thread 16) will be an
+    // identity along the N dim. Thread 16 will be mapped to element 8 in the K
+    // dimension, because kWidth == 8. Thread 32 is mapped to element 16, as
+    // that is 2*kWidth in the K dim.
+    tileLayout = LinearLayout(
+        {{kRegister, {{1, 0}, {2, 0}, {4, 0}}},
+         {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {8, 0}, {16, 0}}}},
+        {outDimNames[order[0]], outDimNames[order[1]]});
+  }
+
+  if (hasBatchDim) {
+    assert(order[2] == 0);
+    // Extend the base vector with one value to accommodate the batch
+    // dimension, which comes last.
+    tileLayout *= LinearLayout::identity1D(1, kRegister, outDimNames[order[2]]);
+    tileLayout *= LinearLayout::identity1D(1, kLane, outDimNames[order[2]]);
+  }
+
+  LinearLayout warpLayout =
+      identityND(S("warp"), warpsPerCTA, order, outDimNames);
+  LinearLayout ctaLayout = tileLayout * warpLayout;
+
+  return combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
+}
+
 std::optional<LinearLayout>
 AMDWmmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   int rank = shape.size();
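
As a sanity check on the lane bases chosen for the mDim == 32 case, here is a small standalone sketch (not part of the commit) that decodes a few lane IDs into (K, N) offsets for the B operand, following the comment above that the first coordinate is K. Bases of a Triton LinearLayout combine by XOR over GF(2):

// Illustration only: decode lane IDs with the mDim == 32, kWidth == 8 lane
// bases used above. Bit i of the lane ID contributes bases[i]; contributions
// combine with XOR.
#include <cstdio>

int main() {
  const int bases[6][2] = {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}};
  for (int lane : {0, 1, 15, 31, 32, 63}) {
    int k = 0, n = 0;
    for (int bit = 0; bit < 6; ++bit) {
      if (lane & (1 << bit)) {
        k ^= bases[bit][0];
        n ^= bases[bit][1];
      }
    }
    std::printf("lane %2d -> (K=%d, N=%d)\n", lane, k, n);
  }
  // Lanes 0..31 stay at K = 0 and walk the N dimension (identity along N);
  // lanes 32..63 start at K = 8, matching kWidth == 8.
  return 0;
}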
@@ -731,9 +821,13 @@ SliceEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return ret;
 }
 
-// TODO: DotOperandEncoding doesn't support LinearLayout conversion yet.
 std::optional<LinearLayout>
 DotOperandEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
+
+  if (auto mfmaLayout = llvm::dyn_cast<AMDMfmaEncodingAttr>(getParent())) {
+    return dotOperandMfmaToLinearLayout(*this, shape);
+  }
+
   return std::nullopt;
 }

unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp

Lines changed: 148 additions & 0 deletions
@@ -51,6 +51,11 @@ class LinearLayoutConversionsTest : public ::testing::Test {
                      isTransposed, CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd));
   }
 
+  DotOperandEncodingAttr amdDot(AMDMfmaEncodingAttr mfma, unsigned opIdx,
+                                unsigned kWidth) {
+    return DotOperandEncodingAttr::get(&ctx, opIdx, mfma, kWidth);
+  }
+
   AMDWmmaEncodingAttr wmma(ArrayRef<unsigned> warps) {
     SmallVector<unsigned> cpg(warps.size(), 1u);
     SmallVector<unsigned> cSplit(warps.size(), 1u);
@@ -651,6 +656,149 @@ TEST_F(LinearLayoutConversionsTest, MFMA32_2x4x1Warps) {
                 {S("dim0"), S("dim1"), S("dim2")}));
 }
 
+TEST_F(LinearLayoutConversionsTest, warp1onK_mfma32_rhs_kwidth8) {
+  auto parentMfma_1_8 = mfma(/*warps=*/{1, 8}, /*mDim=*/32, /*nDim=*/32,
+                             /*isTransposed=*/false);
+  auto amdDot_1_8 = amdDot(parentMfma_1_8, /*opIdx=*/1, /*kWidth=*/8);
+  EXPECT_EQ(
+      toLinearLayout({128, 128}, amdDot_1_8),
+      LinearLayout(
+          {{S("register"), {{1, 0}, {2, 0}, {4, 0}, {16, 0}, {32, 0}, {64, 0}}},
+           {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
+           {S("warp"), {{0, 32}, {0, 64}, {0, 0}}},
+           {S("block"), {}}},
+          {S("dim0"), S("dim1")}));
+
+  EXPECT_EQ(
+      toLinearLayout({128, 256}, amdDot_1_8),
+      LinearLayout(
+          {{S("register"), {{1, 0}, {2, 0}, {4, 0}, {16, 0}, {32, 0}, {64, 0}}},
+           {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
+           {S("warp"), {{0, 32}, {0, 64}, {0, 128}}},
+           {S("block"), {}}},
+          {S("dim0"), S("dim1")}));
+
+  EXPECT_EQ(toLinearLayout({32, 64}, amdDot_1_8),
+            LinearLayout(
+                {{S("register"), {{1, 0}, {2, 0}, {4, 0}, {16, 0}}},
+                 {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
+                 {S("warp"), {{0, 32}, {0, 0}, {0, 0}}},
+                 {S("block"), {}}},
+                {S("dim0"), S("dim1")}));
+
+  EXPECT_EQ(
+      toLinearLayout({256, 256}, amdDot_1_8),
+      LinearLayout(
+          {{S("register"),
+            {{1, 0}, {2, 0}, {4, 0}, {16, 0}, {32, 0}, {64, 0}, {128, 0}}},
+           {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
+           {S("warp"), {{0, 32}, {0, 64}, {0, 128}}},
+           {S("block"), {}}},
+          {S("dim0"), S("dim1")}));
+
+  auto parentMfma_1_4 = mfma(/*warps=*/{1, 4}, /*mDim=*/32, /*nDim=*/32,
+                             /*isTransposed=*/false);
+  auto amdDot_1_4 = amdDot(parentMfma_1_4, /*opIdx=*/1, /*kWidth=*/8);
+  EXPECT_EQ(toLinearLayout({256, 256}, amdDot_1_4),
+            LinearLayout(
+                {{S("register"),
+                  {{1, 0},
+                   {2, 0},
+                   {4, 0},
+                   {16, 0},
+                   {32, 0},
+                   {64, 0},
+                   {128, 0},
+                   {0, 128}}},
+                 {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
+                 {S("warp"), {{0, 32}, {0, 64}}},
+                 {S("block"), {}}},
+                {S("dim0"), S("dim1")}));
+}
+
+TEST_F(LinearLayoutConversionsTest, warp1onK_mfma16_rhs_kwidth8) {
+  auto parentMfma_1_4 = mfma(/*warps=*/{1, 4}, /*mDim=*/16, /*nDim=*/16,
+                             /*isTransposed=*/false);
+  auto amdDot_1_4 = amdDot(parentMfma_1_4, /*opIdx=*/1, /*kWidth=*/8);
+  EXPECT_EQ(
+      toLinearLayout({128, 128}, amdDot_1_4),
+      LinearLayout(
+          {{S("register"), {{1, 0}, {2, 0}, {4, 0}, {32, 0}, {64, 0}, {0, 64}}},
+           {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {8, 0}, {16, 0}}},
+           {S("warp"), {{0, 16}, {0, 32}}},
+           {S("block"), {}}},
+          {S("dim0"), S("dim1")}));
+
+  EXPECT_EQ(toLinearLayout({1, 128}, amdDot_1_4),
+            LinearLayout(
+                {{S("register"), {{0, 0}, {0, 0}, {0, 0}, {0, 64}}},
+                 {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 0}, {0, 0}}},
+                 {S("warp"), {{0, 16}, {0, 32}}},
+                 {S("block"), {}}},
+                {S("dim0"), S("dim1")}));
+
+  EXPECT_EQ(toLinearLayout({128, 1}, amdDot_1_4),
+            LinearLayout(
+                {{S("register"), {{1, 0}, {2, 0}, {4, 0}, {32, 0}, {64, 0}}},
+                 {S("lane"), {{0, 0}, {0, 0}, {0, 0}, {0, 0}, {8, 0}, {16, 0}}},
+                 {S("warp"), {{0, 0}, {0, 0}}},
+                 {S("block"), {}}},
+                {S("dim0"), S("dim1")}));
+
+  EXPECT_EQ(toLinearLayout({256, 256}, amdDot_1_4),
+            LinearLayout(
+                {{S("register"),
+                  {{1, 0},
+                   {2, 0},
+                   {4, 0},
+                   {32, 0},
+                   {64, 0},
+                   {128, 0},
+                   {0, 64},
+                   {0, 128}}},
+                 {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {8, 0}, {16, 0}}},
+                 {S("warp"), {{0, 16}, {0, 32}}},
+                 {S("block"), {}}},
+                {S("dim0"), S("dim1")}));
+
+  auto parentMfma_1_8 = mfma(/*warps=*/{1, 8}, /*mDim=*/16, /*nDim=*/16,
+                             /*isTransposed=*/false);
+  auto amdDot_1_8 = amdDot(parentMfma_1_8, /*opIdx=*/1, /*kWidth=*/8);
+  EXPECT_EQ(
+      toLinearLayout({256, 256}, amdDot_1_8),
+      LinearLayout(
+          {{S("register"),
+            {{1, 0}, {2, 0}, {4, 0}, {32, 0}, {64, 0}, {128, 0}, {0, 128}}},
+           {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {8, 0}, {16, 0}}},
+           {S("warp"), {{0, 16}, {0, 32}, {0, 64}}},
+           {S("block"), {}}},
+          {S("dim0"), S("dim1")}));
+
+  auto parentMfma_1_8_1 = mfma(/*warps=*/{1, 1, 8}, /*mDim=*/16, /*nDim=*/16,
+                               /*isTransposed=*/false);
+  auto amdDot_1_8_1 = amdDot(parentMfma_1_8_1, /*opIdx=*/1, /*kWidth=*/8);
+
+  EXPECT_EQ(toLinearLayout({1, 256, 256}, amdDot_1_8_1),
+            LinearLayout({{S("register"),
+                           {{0, 1, 0},
+                            {0, 2, 0},
+                            {0, 4, 0},
+                            {0, 32, 0},
+                            {0, 64, 0},
+                            {0, 128, 0},
+                            {0, 0, 128}}},
+                          {S("lane"),
+                           {{0, 0, 1},
+                            {0, 0, 2},
+                            {0, 0, 4},
+                            {0, 0, 8},
+                            {0, 8, 0},
+                            {0, 16, 0}}},
+                          {S("warp"), {{0, 0, 16}, {0, 0, 32}, {0, 0, 64}}},
+                          {S("block"), {}}},
+                         {S("dim0"), S("dim1"), S("dim2")}));
+}
+
 TEST_F(LinearLayoutConversionsTest, WMMA_2x4Warps) {
   auto legacy = wmma(/*warps=*/{2, 4});
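
For completeness, a minimal sketch (not part of the commit) of what a negative test could look like, reusing the mfma and amdDot fixture helpers above and the std::optional returned by DotOperandEncodingAttr::toLinearLayout; configurations outside the supported opIdx == 1 / kWidth == 8 case are expected to fall back to std::nullopt:

// Hypothetical test, not present in this commit: unsupported dot-operand
// configurations should return std::nullopt rather than a LinearLayout.
TEST_F(LinearLayoutConversionsTest, DotOperandMfmaFallsBackToNullopt) {
  auto parent = mfma(/*warps=*/{1, 4}, /*mDim=*/32, /*nDim=*/32,
                     /*isTransposed=*/false);
  // opIdx == 0 (operand A) is not converted yet.
  EXPECT_FALSE(amdDot(parent, /*opIdx=*/0, /*kWidth=*/8)
                   .toLinearLayout({128, 128})
                   .has_value());
  // kWidth != 8 is not converted yet.
  EXPECT_FALSE(amdDot(parent, /*opIdx=*/1, /*kWidth=*/4)
                   .toLinearLayout({128, 128})
                   .has_value());
}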
