Commit 4d3c498

[AMD][gfx12] WMMA AMD16x16x32 support for i4 operands (#7012)
# New contributor declaration
- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [ ] I have not added any `lit` tests.
  - [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)

# PR description
Previous WMMA support was hardcoded for 16x16x16 tiles, which matched older AMD GPU capabilities. Starting with gfx1200, AMD supports 16x32 input for matrix A and 32x16 input for matrix B (for i4 types). To support this, we introduce a mapping from the dot operation's configuration (i.e., shape and element type information) to the corresponding WMMA instruction. This abstraction lets the backend dynamically determine the key instruction parameters, kDim and kWidth, which is exactly what is needed to support varying K dimensions in WMMA instructions. A small sketch of the kDim selection rule follows.
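For orientation, here is a minimal standalone sketch of the kDim selection that the lowering performs; it mirrors the rule added in SharedToDotOperandWMMA.cpp in the diff below, and the helper name `pickKDim` plus the `main` harness are illustrative, not code from this PR.

```cpp
#include <cassert>

// Sketch only: mirrors `(getVersion() == 2 && kWidth == 16) ? 32 : 16`
// from the diff below. Version 2 corresponds to gfx12-style WMMA;
// kWidth is the number of K elements each thread holds per operand rep.
int pickKDim(int version, int kWidth) {
  return (version == 2 && kWidth == 16) ? 32 : 16;
}

int main() {
  assert(pickKDim(2, 16) == 32); // gfx12 i4 operands -> 16x16x32 intrinsic
  assert(pickKDim(1, 16) == 16); // gfx11-style WMMA keeps K = 16
  assert(pickKDim(2, 8) == 16);  // other operand widths keep K = 16
  return 0;
}
```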
1 parent 5c63c72 commit 4d3c498

File tree: 9 files changed, +387 / -46 lines


include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 2 additions & 3 deletions
@@ -1143,11 +1143,10 @@ Row |
   let hasCustomAssemblyFormat = 1;

   let extraClassDeclaration = extraDistributedDeclaration # [{
-    SmallVector<int64_t> getElemsPerInstrForOperands() const;
+    SmallVector<int64_t> getElemsPerInstrForOperands(int kDim, int opIdx) const;
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape,
-                                          Type elemType, int kWidth, int opIdx) const;
+                                          Type elemType, int kWidth, int kDim, int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
-    unsigned getKWidthForOperands() const;
     static SmallVector<unsigned> getMNKDimPerInstr();
   }];
 }

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 8 additions & 12 deletions
@@ -1845,15 +1845,19 @@ SmallVector<unsigned> AMDWmmaEncodingAttr::getCTASplitNum() const {
   return SmallVector<unsigned>(getCTALayout().getCTASplitNum());
 }

-SmallVector<int64_t> AMDWmmaEncodingAttr::getElemsPerInstrForOperands() const {
-  return {16, 16};
+SmallVector<int64_t>
+AMDWmmaEncodingAttr::getElemsPerInstrForOperands(int kDim, int opIdx) const {
+  if (opIdx == 0)
+    return {16, kDim};
+  else
+    return {kDim, 16};
 }

 SmallVector<int64_t>
 AMDWmmaEncodingAttr::getRepForOperand(ArrayRef<int64_t> operandShape,
-                                      Type elemType, int kWidth,
+                                      Type elemType, int kWidth, int kDim,
                                       int opIdx) const {
-  auto operandTileShape = getElemsPerInstrForOperands();
+  auto operandTileShape = getElemsPerInstrForOperands(kDim, opIdx);
   assert(operandTileShape.size() == 2);
   auto warpsPerCTA = getWarpsPerCTA();
   auto rank = operandShape.size();
@@ -1881,14 +1885,6 @@ SmallVector<unsigned> AMDWmmaEncodingAttr::getMNKDimPerInstr() {
   return {16, 16, 16};
 }

-unsigned AMDWmmaEncodingAttr::getKWidthForOperands() const {
-  SmallVector<unsigned> sizePerThread(getRank(), 1);
-  auto numReplicated = getVersion() == 1 ? 2 : 1;
-  auto elemsPerInstr =
-      numReplicated * product(getElemsPerInstrForOperands()) / 32;
-  return elemsPerInstr;
-}
-
 //===----------------------------------------------------------------------===//
 // Mma encoding
 //===----------------------------------------------------------------------===//
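To make the new per-operand tile shapes concrete, here is a standalone rendering of `getElemsPerInstrForOperands` using plain integers; the free-function name and the `main` harness are illustrative, while the MLIR attribute method in the diff above is the real implementation.

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Operand A (opIdx == 0) covers an M x K = 16 x kDim tile per instruction;
// operand B (opIdx == 1) covers a K x N = kDim x 16 tile.
std::array<int64_t, 2> elemsPerInstrForOperand(int kDim, int opIdx) {
  return opIdx == 0 ? std::array<int64_t, 2>{16, kDim}
                    : std::array<int64_t, 2>{kDim, 16};
}

int main() {
  // gfx12 i4 case: kDim = 32 gives 16x32 for A and 32x16 for B,
  // matching the tensor shapes in the new lit test below.
  assert((elemsPerInstrForOperand(32, 0) == std::array<int64_t, 2>{16, 32}));
  assert((elemsPerInstrForOperand(32, 1) == std::array<int64_t, 2>{32, 16}));
  return 0;
}
```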

test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir

Lines changed: 16 additions & 0 deletions
@@ -98,6 +98,21 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }

+  // CHECK-LABEL: wmma2_dot_int8_32
+  tt.func @wmma2_dot_int8_32(%arg0: tensor<16x32xi4, #ttg.dot_op<{opIdx = 0, parent = #mma2, kWidth = 16}>>, %arg1: tensor<32x16xi4, #ttg.dot_op<{opIdx = 1, parent = #mma2, kWidth = 16}>>, %arg2: tensor<16x16xi32, #mma2>) {
+    // CHECK-COUNT-16: llvm.extractvalue %{{.*}} : !llvm.struct<(i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4)>
+    // CHECK-COUNT-16: llvm.insertelement {{.*}} : vector<16xi4>
+    // CHECK: llvm.bitcast %{{.*}} : vector<16xi4> to vector<2xi32>
+    // CHECK-COUNT-16: llvm.extractvalue %{{.*}} : !llvm.struct<(i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4)>
+    // CHECK-COUNT-16: llvm.insertelement {{.*}} : vector<16xi4>
+    // CHECK: llvm.bitcast %{{.*}} : vector<16xi4> to vector<2xi32>
+    // CHECK-COUNT-8: llvm.extractvalue %{{.*}} : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+    // CHECK: wmma.i32.16x16x32.iu4{{.*}} : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
+    %0 = tt.dot %arg0, %arg1, %arg2 {inputPrecision = 2 : i32, maxNumImpreciseAcc = 0 : i32} : tensor<16x32xi4, #ttg.dot_op<{opIdx = 0, parent = #mma2, kWidth = 16}>> * tensor<32x16xi4, #ttg.dot_op<{opIdx = 1, parent = #mma2, kWidth = 16}>> -> tensor<16x16xi32, #mma2>
+    // CHECK-COUNT-8: llvm.insertvalue {{.*}} : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+    tt.return
+  }
+
   // CHECK-LABEL: wmma1_dot_int4_32
   tt.func @wmma1_dot_int4_32(%arg0: tensor<16x16xi4, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, %arg1: tensor<16x16xi4, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, %arg2: tensor<16x16xi32, #mma1>) {
     // CHECK-COUNT-16: llvm.extractvalue %{{.*}} : !llvm.struct<(i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4, i4)>
@@ -136,6 +151,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }

+
   // CHECK-LABEL: blocked_to_wmma1
   tt.func @blocked_to_wmma1(%arg0: tensor<128x16xi32, #blocked>) {
     // CHECK-COUNT-16: llvm.extractvalue {{.*}} : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)>
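A quick arithmetic check of the operand packing the new test expects (plain arithmetic, not Triton code): with kWidth = 16 i4 elements per thread per rep, each operand rep is 64 bits, which is exactly the vector<16xi4> to vector<2xi32> bitcast in the CHECK lines above.

```cpp
#include <cassert>

int main() {
  const unsigned kWidth = 16;  // i4 elements held per thread per rep
  const unsigned elemBits = 4; // width of an i4 element
  const unsigned packedBits = kWidth * elemBits; // 64 bits per rep
  const unsigned i32Words = packedBits / 32;     // packs into vector<2xi32>
  assert(packedBits == 64 && i32Words == 2);
  return 0;
}
```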
third_party/amd/include/TritonAMDGPUTransforms/WmmaGroup.h (new file)

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+#ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_WMMAGROUP_H_
+#define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_WMMAGROUP_H_
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+
+struct WmmaIntrinsic {
+  // Chooses a suitable wmma intrinsic for the given input case.
+  static FailureOr<WmmaIntrinsic> selectFor(int version, unsigned mDim,
+                                            unsigned nDim, unsigned inputKDim,
+                                            Type aElemType, Type bElemType,
+                                            Type dElemType);
+
+  WmmaIntrinsic(StringRef symbol, unsigned m, unsigned n, unsigned k,
+                unsigned kB, Type aET, Type bET, Type dET)
+      : name(symbol), mDim(m), nDim(n), kDim(k), kBase(kB), aElementType(aET),
+        bElementType(bET), dElementType(dET) {}
+  WmmaIntrinsic(const WmmaIntrinsic &other) = default;
+  WmmaIntrinsic(WmmaIntrinsic &&other) = default;
+  WmmaIntrinsic() = default;
+  WmmaIntrinsic &operator=(WmmaIntrinsic &&other) = default;
+
+  llvm::StringRef name;
+
+  // m, n, and k refer to the shapes of the two operands of a wmma intrinsic:
+  // Operand A has shape [m]x[k]; operand B has shape [k]x[n].
+
+  unsigned mDim;
+  unsigned nDim;
+  unsigned kDim;
+
+  // kBase is the number of elements each thread holds.
+  unsigned kBase;
+
+  Type aElementType;
+  Type bElementType;
+  Type dElementType;
+};
+} // namespace mlir
+
+#endif // TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_WMMAGROUP_H_
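A hedged usage sketch of the new lookup for the gfx12 i4 case follows. Only `WmmaIntrinsic::selectFor` and its fields come from this PR; the surrounding MLIR setup is ordinary boilerplate, and the expected kDim of 32 is inferred from the 16x16x32 lit test above.

```cpp
#include "TritonAMDGPUTransforms/WmmaGroup.h"

#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;

void queryGfx12Int4Intrinsic() {
  MLIRContext ctx;
  Builder builder(&ctx);
  Type i4Ty = builder.getIntegerType(4);
  Type i32Ty = builder.getIntegerType(32);

  // version 2 = gfx12 WMMA; operands are a 16x32 (A) and a 32x16 (B) i4 tile.
  FailureOr<WmmaIntrinsic> intrinsic = WmmaIntrinsic::selectFor(
      /*version=*/2, /*mDim=*/16, /*nDim=*/16, /*inputKDim=*/32,
      /*aElemType=*/i4Ty, /*bElemType=*/i4Ty, /*dElemType=*/i32Ty);
  if (failed(intrinsic))
    return; // unsupported shape / element-type combination

  unsigned kDim = intrinsic->kDim; // expected to be 32 for this case
  (void)kDim;
}
```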

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp

Lines changed: 6 additions & 3 deletions
@@ -164,12 +164,15 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,

   auto elemTy = aTensorTy.getElementType();
   int kWidth = encoding.getKWidth();
-  auto elemsPerInstr = wmmaLayout.getElemsPerInstrForOperands();
+
+  int kDim = (wmmaLayout.getVersion() == 2 && kWidth == 16) ? 32 : 16;
+  auto elemsPerInstr = wmmaLayout.getElemsPerInstrForOperands(kDim, opIdx);
   auto wmmaInstrK = elemsPerInstr[opIdx == 0 ? 1 : 0];
   auto wmmaInstrNonK = elemsPerInstr[opIdx == 0 ? 0 : 1];
   assert(wmmaInstrNonK == 16);

-  auto numReps = wmmaLayout.getRepForOperand(shape, elemTy, kWidth, opIdx);
+  auto numReps =
+      wmmaLayout.getRepForOperand(shape, elemTy, kWidth, kDim, opIdx);
   auto numRepNonK = numReps[opIdx == 0 ? 1 : 2];
   auto numRepK = numReps[opIdx == 0 ? 2 : 1];
   auto repB = numReps[0];
@@ -179,7 +182,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
   Value waveSize = tb.i32_val(iWaveSize);
   Value linearWaveId = tb.udiv(thread, waveSize);

-  unsigned numElemsPerThreadPerRep = wmmaLayout.getKWidthForOperands();
+  unsigned numElemsPerThreadPerRep = kWidth;

   Value lane = tb.urem(thread, waveSize);
   unsigned int maxNumWarps = shape[nonKDimIdx] / wmmaInstrNonK;

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp

Lines changed: 51 additions & 13 deletions
@@ -23,6 +23,7 @@

 #include "../PatternTritonGPUOpToLLVM.h"
 #include "../TritonAMDGPUToLLVM/SchedInstructions.h"
+#include "TritonAMDGPUTransforms/WmmaGroup.h"
 #include "Utility.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
@@ -211,13 +212,27 @@ StringRef getWmmaIntrinsicName(Type aElTy, Type bElTy, Type dElTy, Type valATy,
   return intrinsics[h];
 }

+std::string addInstructionSuffix(std::string intrinsicName, unsigned kWidth,
+                                 Type aElTy, Type bElTy, Type dElTy,
+                                 bool tied) {
+  if (tied) {
+    intrinsicName += ".tied";
+  } else {
+    if (isa<FloatType>(aElTy) && aElTy.getIntOrFloatBitWidth() == 8)
+      intrinsicName += "." + getTypeStr(bElTy);
+    intrinsicName += ".v" + std::to_string(kWidth) + getTypeStr(dElTy);
+    intrinsicName += ".v" + std::to_string(kWidth) + getTypeStr(aElTy);
+  }
+
+  return intrinsicName;
+}
+
 Value generateWMMAIntrinsic(ConversionPatternRewriter &rewriter, Location loc,
                             Value valA, Value valB, Value valC, Type aElType,
-                            Type bElType, Type dElType,
+                            Type bElType, Type dElType, StringRef name,
                             std::optional<bool> tiedLower) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
-  auto name = getWmmaIntrinsicName(aElType, bElType, dElType, valA.getType(),
-                                   valC.getType(), tiedLower.has_value());
+
   LLVM::FastmathFlagsAttr defaultFlags{};
   SmallVector<Value> operands;
   if (aElType.isInteger())
@@ -240,12 +255,12 @@ Value generateWMMAIntrinsic(ConversionPatternRewriter &rewriter, Location loc,

 Value generateWMMAOp(ConversionPatternRewriter &rewriter, Location loc,
                      Value valA, Value valB, Value valC, Type aElType,
-                     Type bElType, Type dElType,
+                     Type bElType, Type dElType, StringRef intrinsicName,
                      std::optional<bool> tiedLower) {
   // Independent of wmma version because builtin functions are backward
   // compatible
   return generateWMMAIntrinsic(rewriter, loc, valA, valB, valC, aElType,
-                               bElType, dElType, tiedLower);
+                               bElType, dElType, intrinsicName, tiedLower);
 }

 // Conduct the Dot conversion.
@@ -266,16 +281,33 @@ LogicalResult convertDot(DotOp op, DotOpAdaptor adaptor,
   auto aTensorTy = cast<RankedTensorType>(a.getType());
   auto bTensorTy = cast<RankedTensorType>(b.getType());
   auto dTensorTy = cast<RankedTensorType>(d.getType());
-  auto elemTy = aTensorTy.getElementType();
+  auto aElemTy = aTensorTy.getElementType();
+  auto bElemTy = bTensorTy.getElementType();
+  auto dElemTy = dTensorTy.getElementType();
+
+  const auto kDimOperandSize = aTensorTy.getShape().back();
+
+  std::string intrinsicName;
+  FailureOr<WmmaIntrinsic> maybeWmmaIntrinsic =
+      WmmaIntrinsic::selectFor(wmmaVer, mnkDim[0], mnkDim[1], kDimOperandSize,
+                               aElemTy, bElemTy, dElemTy);
+  if (failed(maybeWmmaIntrinsic)) {
+    return op.emitError(
+        "no matching matrix core intrinsic due to unsupported element type");
+  }
+
+  unsigned kDim = maybeWmmaIntrinsic->kDim;

   auto aEncoding = cast<DotOperandEncodingAttr>(aTensorTy.getEncoding());
   auto bEncoding = cast<DotOperandEncodingAttr>(bTensorTy.getEncoding());
   int kWidth = aEncoding.getKWidth();
+  intrinsicName = maybeWmmaIntrinsic->name;

-  auto repA =
-      wmmaLayout.getRepForOperand(aTensorTy.getShape(), elemTy, kWidth, 0);
-  auto repB =
-      wmmaLayout.getRepForOperand(bTensorTy.getShape(), elemTy, kWidth, 1);
+  auto repA = wmmaLayout.getRepForOperand(aTensorTy.getShape(), aElemTy, kWidth,
+                                          kDim, 0);
+  auto repB = wmmaLayout.getRepForOperand(bTensorTy.getShape(), bElemTy, kWidth,
+                                          kDim, 1);

   assert(repA[2] == repB[1]);

@@ -307,6 +339,9 @@ LogicalResult convertDot(DotOp op, DotOpAdaptor adaptor,
   auto vecTy = vec_ty(dstElemTy, elemsPerVec);
   bool tied = numRepM % 2 == 0 && paddedOutputElemSize == 2;
   int tiedGroup = tied ? 2 : 1;
+
+  intrinsicName = addInstructionSuffix(intrinsicName, kWidth, aElemTy, bElemTy,
+                                       dElemTy, tied);
   for (int b = 0; b < numRepB; ++b) {
     for (int m = 0; m < numRepM / tiedGroup; ++m) {
       for (int n = 0; n < numRepN; ++n) {
@@ -334,11 +369,12 @@ LogicalResult convertDot(DotOp op, DotOpAdaptor adaptor,
                           ha[{b, m * tiedGroup + subTied, k}], acc,
                           bTensorTy.getElementType(),
                           aTensorTy.getElementType(), dstElemTy,
-                          optTied)
+                          intrinsicName, optTied)
                     : generateWMMAOp(
                           rewriter, loc, ha[{b, m * tiedGroup + subTied, k}],
                           hb[{b, n, k}], acc, aTensorTy.getElementType(),
-                          bTensorTy.getElementType(), dstElemTy, optTied);
+                          bTensorTy.getElementType(), dstElemTy,
+                          intrinsicName, optTied);
       }
     }
     for (unsigned v = 0; v < dElemsToStorePerThread; ++v) {
@@ -360,7 +396,9 @@ LogicalResult convertDot(DotOp op, DotOpAdaptor adaptor,
   Value res = packLLElements(loc, typeConverter, fc, rewriter, structTy);

   const size_t mmaCount = numRepB * numRepM * numRepN * numRepK;
-  setNumGeneratedMMAs(op, mmaCount, mnkDim[0], mnkDim[1], mnkDim[2], elemTy);
+  setNumGeneratedMMAs(op, mmaCount, maybeWmmaIntrinsic->mDim,
+                      maybeWmmaIntrinsic->nDim, maybeWmmaIntrinsic->kDim,
+                      aElemTy);

   rewriter.replaceOp(op, res);
   return success();
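For clarity on how the final intrinsic symbol is assembled, here is a simplified standalone rendering of `addInstructionSuffix` with MLIR types replaced by plain strings. The type spellings and the sample base name are illustrative assumptions; the real spellings come from `getTypeStr`, which this diff does not show.

```cpp
#include <cassert>
#include <string>

// Mirrors the control flow of addInstructionSuffix above: tied accumulation
// only appends ".tied"; otherwise the suffix encodes the D and A vector types
// (plus the B element type for 8-bit float A operands).
std::string addSuffix(std::string name, unsigned kWidth, bool aIsFp8,
                      const std::string &aTy, const std::string &bTy,
                      const std::string &dTy, bool tied) {
  if (tied) {
    name += ".tied";
  } else {
    if (aIsFp8)
      name += "." + bTy;
    name += ".v" + std::to_string(kWidth) + dTy;
    name += ".v" + std::to_string(kWidth) + aTy;
  }
  return name;
}

int main() {
  // Tied case: only ".tied" is appended to the base name.
  assert(addSuffix("wmma.f16.16x16x16.f16", 16, false, "f16", "f16", "f16",
                   /*tied=*/true) == "wmma.f16.16x16x16.f16.tied");
  return 0;
}
```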
