@@ -239,6 +239,42 @@ LinearLayout sharedToLinearLayoutLeadingOffset(ArrayRef<int64_t> shape,
   return combineCtaCgaWithShape(tileLayout, shared.getCTALayout(), shape);
 }

+LinearLayout warpsDotOperand(MLIRContext *ctx, ArrayRef<unsigned> warpShape,
+                             ArrayRef<unsigned> warpOrder, unsigned inner) {
+  // Let warpsPerCTAMma = {2, 2}; then warpsPerCTA = {2, 1} for opA and
+  // warpsPerCTA = {1, 2} for opB. Assume warpOrder = {1, 0} and that C is
+  // tiled by 2x2 warp tiles. With warpOrder = {1, 0}, C is owned by the
+  // warps as follows:
+  //   C: 0 | 1
+  //      - | -
+  //      2 | 3
+  // To compute C, we need the following warp tilings of A and B:
+  //   A: 0 1 | 0 1    B: 0 2 | 1 3
+  //      - - | - -       - - | - -
+  //      2 3 | 2 3       0 2 | 1 3
+  // In other words, we need to broadcast along K.
+  auto rank = warpShape.size();
+  auto dimNames = standardOutDimNames(ctx, rank);
+  LinearLayout warpLayout = LinearLayout::empty();
+
+  // We have to broadcast along the inner (K) dimension.
+  // For A, moving along M takes us from warp 0 to warp 2.
+  // For B, moving along N takes us from warp 0 to warp 1.
+  // As such, warpOrder = {1, 0} gives A the correct broadcasting; the same
+  // holds when warpOrder is {0, 1}, as on Hopper.
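+  // Worked example (illustrative): with warpShape = {2, 2},
+  // warpOrder = {1, 0}, and inner = 1 (opA, K = dim 1), the loop below emits
+  // a zeros1D base for dim 1 and then an identity1D base for dim 0, so
+  //   warps 0, 1 -> (0, 0) and warps 2, 3 -> (1, 0)
+  // in (dim0, dim1) coordinates, i.e. warps {0, 1} own row 0 of A and warps
+  // {2, 3} own row 1, matching the diagram above.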
+  for (auto d : warpOrder) {
+    if (d == inner) {
+      warpLayout *= LinearLayout::zeros1D(warpShape[d], S("warp"), dimNames[d]);
+    } else {
+      warpLayout *=
+          LinearLayout::identity1D(warpShape[d], S("warp"), dimNames[d]);
+    }
+  }
+  return warpLayout;
+}
+
 } // anonymous namespace

 std::optional<LinearLayout>
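For intuition, a minimal usage sketch of the new helper on the 2x2 example from its comment (illustrative only; assumes an `MLIRContext *ctx` is in scope):

    // opA: K is the innermost dim (inner = rank - 1 = 1), so it is broadcast.
    LinearLayout aWarps =
        warpsDotOperand(ctx, /*warpShape=*/{2, 2}, /*warpOrder=*/{1, 0}, 1);
    // opB: K is dim rank - 2 = 0.
    LinearLayout bWarps =
        warpsDotOperand(ctx, /*warpShape=*/{2, 2}, /*warpOrder=*/{1, 0}, 0);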
@@ -470,7 +506,9 @@ AMDWmmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {

   // We use the order from fastest varying to slowest varying. So each base
   // vector is a tuple of values mapping to matrix C's (N, M[, B]) indices.
-  SmallVector<unsigned> order = triton::gpu::getOrder(*this);
+  SmallVector<unsigned> threadOrder = getThreadOrder();
+  assert(threadOrder[0] == mIndex || threadOrder[0] == nIndex);
+  assert(threadOrder[1] == mIndex || threadOrder[1] == nIndex);

   // For wmma with 16x16 output, each of the 32 threads holds 8 elements.
   //
@@ -498,29 +536,106 @@ AMDWmmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
           ? LinearLayout(
                 {{kRegister, {/*gap*/ {0, 2}, {0, 4}, {0, 8}}},
                  {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 1}}}},
-                {outDimNames[order[0]], outDimNames[order[1]]})
+                {outDimNames[threadOrder[0]], outDimNames[threadOrder[1]]})
           : LinearLayout(
                 {{kRegister, {{0, 1}, {0, 2}, {0, 4}}},
                  {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 8}}}},
-                {outDimNames[order[0]], outDimNames[order[1]]});
+                {outDimNames[threadOrder[0]], outDimNames[threadOrder[1]]});

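+  // Worked example (illustrative): in the second variant the out dims are
+  // (fastest, slower); lane bits 0-3 walk the fastest dim (0..15), lane bit 4
+  // jumps 8 in the slower dim, and register bits 0-2 give 8 consecutive
+  // slower-dim elements. E.g. lane 17 (bits 0 and 4 set) with register 3
+  // (bits 0 and 1 set) maps to (1, 8 + 3) = (1, 11).
+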
   if (hasBatchDim) {
-    assert(order[2] == 0);
+    int batchIndex = 0;
     // Extend the base vector with one value to accommodate the batch
     // dimension, which appears last.
-    tileLayout *= LinearLayout::identity1D(1, kRegister, outDimNames[order[2]]);
-    tileLayout *= LinearLayout::identity1D(1, kLane, outDimNames[order[2]]);
+    tileLayout *=
+        LinearLayout::identity1D(1, kRegister, outDimNames[batchIndex]);
+    tileLayout *= LinearLayout::identity1D(1, kLane, outDimNames[batchIndex]);
   }

   // And each warp takes the same register and lane sub-layout. So multiply
   // with an identity layout for the warp.
+  auto warpOrder = getWarpOrder();
   LinearLayout warpLayout =
-      identityStandardND(S("warp"), getWarpsPerCTA(), order);
-  LinearLayout ctaLayout = tileLayout * warpLayout;
+      identityStandardND(S("warp"), getWarpsPerCTA(), warpOrder);
+  // Reorder dim names in repetition order so that combineCtaCgaWithShape
+  // generates the proper extension of the layout.
+  auto repOrder = getRepOrder();
+  SmallVector<StringAttr> repDimNames;
+  for (auto dim : repOrder)
+    repDimNames.push_back(outDimNames[dim]);
+  LinearLayout ctaLayout = tileLayout.transposeOuts(repDimNames) *
+                           warpLayout.transposeOuts(repDimNames);

   return combineCtaCgaWithShape(ctaLayout, getCTALayout(), shape);
 }

+std::optional<LinearLayout>
+wmmaDotOperandToLinearLayout(DotOperandEncodingAttr dotWmmaLayout,
+                             ArrayRef<int64_t> shape) {
+  auto wmmaLayout = llvm::cast<AMDWmmaEncodingAttr>(dotWmmaLayout.getParent());
+  auto rank = shape.size();
+  bool hasBatchDim = rank == 3;
+  auto kDim = dotWmmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
+  int32_t kSize = shape[kDim];
+  MLIRContext *ctx = dotWmmaLayout.getContext();
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+  StringAttr kRegister = S("register");
+  StringAttr kLane = S("lane");
+  StringAttr kWarp = S("warp");
+  // Lane order:
+  //   operand A: [1, 0] / [2, 1, 0]
+  //   operand B: [0, 1] / [1, 2, 0]
+  // In both cases it is [k, nonK] / [k, nonK, batch].
+  SmallVector<unsigned> laneOrder = triton::gpu::getOrder(dotWmmaLayout);
+  // Generate the contiguous part of the register bases (i.e. kWidth).
+  std::vector<std::vector<int32_t>> registerBase;
+  const int32_t kWidth = dotWmmaLayout.getKWidth();
+  for (int i = 1; i < kWidth; i *= 2)
+    registerBase.push_back(std::vector<int32_t>{i, 0});
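+  // For example (illustrative): kWidth = 8 yields
+  // registerBase = {{1, 0}, {2, 0}, {4, 0}}, i.e. three register bits
+  // addressing 8 contiguous elements along k.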
+  std::vector<std::vector<int32_t>> laneBase = {{0, 1}, {0, 2}, {0, 4}, {0, 8}};
+  switch (wmmaLayout.getVersion()) {
+  case 1:
+    // WMMA version 1 duplicates values in lanes 0-15 and 16-31.
+    laneBase.push_back({0, 0});
+    break;
+  case 2:
+    // WMMA version 2 offsets values in lanes 0-15 and 16-31 across the k
+    // dimension.
+    laneBase.push_back({kWidth, 0});
+    break;
+  default:
+    assert(false && "unexpected version");
+  }
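+  // Concretely (illustrative, kWidth = 8): in version 1, lane 16 reads the
+  // same elements as lane 0 (extra base {0, 0}); in version 2, lane 16
+  // starts at k = 8 (extra base {8, 0}).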
+  // Generate the layout for a single wmma instruction.
+  LinearLayout tileLayout(
+      {{kRegister, registerBase}, {kLane, laneBase}},
+      {outDimNames[laneOrder[0]], outDimNames[laneOrder[1]]});
+  if (hasBatchDim) {
+    assert(laneOrder[2] == 0);
+    // Extend the base vector with one value to accommodate the batch
+    // dimension, which appears last.
+    tileLayout *=
+        LinearLayout::identity1D(1, kRegister, outDimNames[laneOrder[2]]);
+    tileLayout *= LinearLayout::identity1D(1, kLane, outDimNames[laneOrder[2]]);
+  }
+
+  // Generate the warp layout.
+  auto warpsPerCTA = wmmaLayout.getWarpsPerCTA();
+  auto warpOrder = triton::gpu::getWarpOrder(dotWmmaLayout);
+  LinearLayout warpLayout = warpsDotOperand(ctx, warpsPerCTA, warpOrder, kDim);
+
+  // Reorder dim names in repetition order so that combineCtaCgaWithShape
+  // generates the proper extension of the layout.
+  auto repOrder = wmmaLayout.getRepOrderForOperand(dotWmmaLayout.getOpIdx());
+  SmallVector<StringAttr> repDimNames;
+  for (auto dim : repOrder)
+    repDimNames.push_back(outDimNames[dim]);
+
+  // Join the instruction layout and the warps using the repetition order of
+  // the dimensions.
+  LinearLayout ctaLayout = tileLayout.transposeOuts(repDimNames) *
+                           warpLayout.transposeOuts(repDimNames);
+
+  return combineCtaCgaWithShape(ctaLayout, wmmaLayout.getCTALayout(), shape);
+}
+
 std::optional<LinearLayout>
 BlockedEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   assert(shape.size() == getOrder().size());
@@ -604,44 +719,6 @@ NvidiaMmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return combineCtaCgaWithShape(ctaLayout, getCTALayout(), shape);
 }

-LinearLayout warpsNvidiaDot(MLIRContext *ctx, ArrayRef<unsigned> mmaWarpShape,
-                            ArrayRef<unsigned> mmaWarpOrder, bool isA) {
-  // Let warpsPerCTAMma = {2, 2}, then
-  // warpsPerCTA = {2, 1} for opA and warpsPerCTA = {1, 2} for opB
-  // assume warpOrder = {1, 0}
-  // Assume that C is tiled by 2x2 tiles. Since warpOrder={1, 0}, we have that
-  // the C is owned as per the following layout:
-  // C: 0 | 1
-  //    - | -
-  //    2 | 3
-  // In order to be able to compute C, we need the following warp tiling of
-  // A and B:
-  // A: 0 1 | 0 1    B: 0 2 | 1 3
-  //    - - | - -       - - | - -
-  //    2 3 | 2 3       0 2 | 1 3
-  // In other words, we need to broadcast along K
-  auto rank = mmaWarpOrder.size();
-  auto inner = isA ? rank - 1 : rank - 2;
-  auto dimNames = standardOutDimNames(ctx, rank);
-  LinearLayout warpLayout = LinearLayout::empty();
-
-  // We have to broadcast along the inner dimension
-  // For A, when moving along M we go from 0 to 2.
-  // For B, when moving along N we go from 0 to 1.
-  // As such, choosing the order of A {1, 0}, gives us the correct broadcasting
-  // Same happens if the mmaWarpOrder is {0, 1}, like in Hopper
-  for (auto d : mmaWarpOrder) {
-    if (d == inner) {
-      warpLayout *=
-          LinearLayout::zeros1D(mmaWarpShape[d], S("warp"), dimNames[d]);
-    } else {
-      warpLayout *=
-          LinearLayout::identity1D(mmaWarpShape[d], S("warp"), dimNames[d]);
-    }
-  }
-  return warpLayout;
-}
-
 LinearLayout nvidiaDotToLinearLayout(ArrayRef<int64_t> shape,
                                      DotOperandEncodingAttr dot) {
   int rank = shape.size();
@@ -662,8 +739,9 @@ LinearLayout nvidiaDotToLinearLayout(ArrayRef<int64_t> shape,
   }
   auto ctaLayout =
       nvidiaMmaTile(ctx, tileShape, kWidth, getOrder(dot), dot.getRepOrder());
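+  // K is the innermost dim for operand A and the second innermost for B.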
+  auto kDim = isA ? rank - 1 : rank - 2;
   ctaLayout *=
-      warpsNvidiaDot(ctx, mma.getWarpsPerCTA(), mma.getWarpOrder(), isA)
+      warpsDotOperand(ctx, mma.getWarpsPerCTA(), mma.getWarpOrder(), kDim)
           .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));

   return combineCtaCgaWithShape(ctaLayout, getCTALayout(dot), shape);
@@ -674,6 +752,8 @@ DotOperandEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   auto parent = getParent();
   if (auto mfmaLayout = llvm::dyn_cast<AMDMfmaEncodingAttr>(parent)) {
     return mfmaDotToLinearLayout(*this, shape);
+  } else if (auto wmmaLayout = llvm::dyn_cast<AMDWmmaEncodingAttr>(parent)) {
+    return wmmaDotOperandToLinearLayout(*this, shape);
   } else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(parent)) {
     return nvidiaDotToLinearLayout(shape, *this);
   }
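For reference, a minimal sketch of how the new dispatch path would be exercised (illustrative only: `dotEnc` is an assumed `DotOperandEncodingAttr` whose parent is an `AMDWmmaEncodingAttr`; the shape and the printing are for demonstration):

    // Sketch: query the linear layout for operand A of a WMMA dot.
    DotOperandEncodingAttr dotEnc = ...; // parent: AMDWmmaEncodingAttr
    std::optional<LinearLayout> ll = dotEnc.toLinearLayout({/*M=*/64, /*K=*/64});
    if (ll)
      llvm::outs() << ll->toString() << "\n";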