Skip to content

Commit 6e41010

Browse files
authored
[Blackwell][Clean up] Introduce interface for MMAv5 ops (triton-lang#5848)
The goal is to let `TCGen5MMAOp` and `TCGen5MMAScaledOp` share an interface so that the rest of the code can work generically with them. The MMA pipelining pass gets cleaned up a lot, and the accum init flag optimization is now automatically enabled for `TCGen5MMAScaledOp` as well. --------- Co-authored-by: Masahiro Masuda <[email protected]>
1 parent d691926 commit 6e41010

File tree

12 files changed

+129
-51
lines changed

12 files changed

+129
-51
lines changed

include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,8 @@ mlir_tablegen(TritonNvidiaGPUAttrDefs.cpp.inc -gen-attrdef-defs)
1515
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
1616
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
1717
add_public_tablegen_target(TritonNvidiaGPUAttrDefsIncGen)
18+
19+
set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOpInterfaces.td)
20+
mlir_tablegen(TritonNvidiaGPUOpInterfaces.h.inc -gen-op-interface-decls)
21+
mlir_tablegen(TritonNvidiaGPUOpInterfaces.cpp.inc -gen-op-interface-defs)
22+
add_public_tablegen_target(TritonNvidiaGPUOpInterfacesIncGen)

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
#define GET_ATTRDEF_CLASSES
3838
#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.h.inc"
3939

40+
#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.h.inc"
41+
4042
#define GET_OP_CLASSES
4143
#include "triton/Dialect/TritonNvidiaGPU/IR/Ops.h.inc"
4244

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#ifndef TRITON_NVIDIAGPU_OP_INTERFACES
2+
#define TRITON_NVIDIAGPU_OP_INTERFACES
3+
4+
include "mlir/IR/OpBase.td"
5+
6+
def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
7+
let description = [{
8+
This interface is implemented by MMAv5 dot and dot scaled ops.
9+
}];
10+
11+
let cppNamespace = "::mlir::triton::nvidia_gpu";
12+
13+
// We can add more methods as needed.
14+
let methods = [
15+
InterfaceMethod<"Return the accumulator init flag.",
16+
"::mlir::Value",
17+
"useAccumulator">,
18+
InterfaceMethod<"Set the accumulator init flag.",
19+
"void",
20+
"setUseAccumulator",
21+
(ins "::mlir::Value":$flag)>,
22+
InterfaceMethod<"Associate a new barrier to this MMAv5 op.",
23+
"void",
24+
"setBarrier",
25+
(ins "::mlir::Value":$barrier)>,
26+
InterfaceMethod<"Return the accumulator.",
27+
"::mlir::Value",
28+
"getAccumulator">,
29+
InterfaceMethod<"Set the accumulator.",
30+
"void",
31+
"setAccumulator",
32+
(ins "::mlir::Value":$accum)>,
33+
InterfaceMethod<"Return the predicate of this op.",
34+
"::mlir::Value",
35+
"getPredicate">,
36+
InterfaceMethod<"Set the predicate of this op.",
37+
"void",
38+
"setPredicate",
39+
(ins "::mlir::Value":$pred)>,
40+
];
41+
}
42+
#endif // TRITON_NVIDIAGPU_OP_INTERFACES

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#define TRITONNVIDIAGPU_OPS
2424

2525
include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUDialect.td"
26+
include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td"
2627
include "mlir/Dialect/Arith/IR/ArithBase.td"
2728
include "triton/Dialect/Triton/IR/TritonTypes.td"
2829
include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
@@ -326,7 +327,7 @@ def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {
326327
let assemblyFormat = "attr-dict";
327328
}
328329

329-
def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>]> {
330+
def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>, DeclareOpInterfaceMethods<MMAv5OpInterface>]> {
330331
let summary = "block level op mapping to tensorcore gen5 mma";
331332

332333
let description = [{
@@ -349,7 +350,7 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryE
349350
let assemblyFormat = "$a`,` $b`,` $d`,` $useD`,` $pred (`,` $barrier^)? attr-dict `:` functional-type(operands, results)";
350351
}
351352

352-
def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>]> {
353+
def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>, DeclareOpInterfaceMethods<MMAv5OpInterface>]> {
353354
let summary = "block level op mapping to tensorcore gen5 mma";
354355

355356
let description = [{

lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,21 @@ class TMEMAllocWithUnusedInit
2828
op.getResult().getUsers().end());
2929
if (users.size() > 2)
3030
return failure();
31-
triton::nvidia_gpu::TCGen5MMAOp mmaOp = nullptr;
31+
triton::nvidia_gpu::MMAv5OpInterface mmaOp = nullptr;
3232
triton::nvidia_gpu::TMEMLoadOp tmemLoad = nullptr;
3333
for (auto user : users) {
3434
if (auto load = dyn_cast<triton::nvidia_gpu::TMEMLoadOp>(user)) {
3535
tmemLoad = load;
36-
} else if (auto mma = dyn_cast<triton::nvidia_gpu::TCGen5MMAOp>(user)) {
36+
} else if (auto mma =
37+
dyn_cast<triton::nvidia_gpu::MMAv5OpInterface>(user)) {
3738
mmaOp = mma;
3839
}
3940
}
4041
if (!mmaOp)
4142
return failure();
4243
if (tmemLoad && !mmaOp->isBeforeInBlock(tmemLoad))
4344
return failure();
44-
Value useAccFlag = mmaOp.getUseD();
45+
Value useAccFlag = mmaOp.useAccumulator();
4546
if (!useAccFlag)
4647
return failure();
4748
auto flagConstOp = useAccFlag.getDefiningOp<arith::ConstantOp>();
@@ -63,7 +64,7 @@ bool dotSupportsAccInitFlag(Operation *op) {
6364
// initialization that would degrade the performance.
6465
return !wgDotOp.needsPartialAccumulator();
6566
}
66-
if (isa<triton::nvidia_gpu::TCGen5MMAOp>(op)) {
67+
if (isa<triton::nvidia_gpu::MMAv5OpInterface>(op)) {
6768
return true;
6869
}
6970
return false;
@@ -76,8 +77,8 @@ std::pair<Value, Operation *> getAccumulatorUseAndDef(Operation *op) {
7677
if (auto wgDotOp = dyn_cast<triton::nvidia_gpu::WarpGroupDotOp>(op)) {
7778
return std::make_pair(wgDotOp.getC(), wgDotOp);
7879
}
79-
if (auto tc05MmaOp = dyn_cast<triton::nvidia_gpu::TCGen5MMAOp>(op)) {
80-
auto accVal = tc05MmaOp.getD();
80+
if (auto tc05MmaOp = dyn_cast<triton::nvidia_gpu::MMAv5OpInterface>(op)) {
81+
auto accVal = tc05MmaOp.getAccumulator();
8182
auto tmemAlloc = accVal.getDefiningOp<triton::nvidia_gpu::TMEMAllocOp>();
8283
if (!tmemAlloc ||
8384
tmemAlloc->getParentRegion() != tc05MmaOp->getParentRegion())
@@ -104,8 +105,9 @@ void setUseAccFlag(Operation *op, Value useAcc) {
104105

105106
if (auto wgDotOp = dyn_cast<triton::nvidia_gpu::WarpGroupDotOp>(op)) {
106107
wgDotOp.getUseCMutable().assign(useAcc);
107-
} else if (auto tc05MmaOp = dyn_cast<triton::nvidia_gpu::TCGen5MMAOp>(op)) {
108-
tc05MmaOp.getUseDMutable().assign(useAcc);
108+
} else if (auto tc05MmaOp =
109+
dyn_cast<triton::nvidia_gpu::MMAv5OpInterface>(op)) {
110+
tc05MmaOp.setUseAccumulator(useAcc);
109111
} else {
110112
assert(false && "Unexpected op which implements a DotOpInterface");
111113
}

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
1111
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
1212
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
13+
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
1314
#include <memory>
1415

1516
namespace mlir::triton::gpu {
@@ -90,8 +91,7 @@ class FuseTransMMAV3Plus : public OpRewritePattern<LocalAllocOp> {
9091
PatternRewriter &rewriter) const override {
9192
if (!allocOp.getSrc() || !allocOp->hasOneUse() ||
9293
!isa<triton::nvidia_gpu::WarpGroupDotOp,
93-
triton::nvidia_gpu::TCGen5MMAOp,
94-
triton::nvidia_gpu::TCGen5MMAScaledOp>(
94+
triton::nvidia_gpu::MMAv5OpInterface>(
9595
*allocOp->getUsers().begin()))
9696
return failure();
9797

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
88
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
99
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
10+
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
1011
#include "llvm/Support/Casting.h"
1112

1213
using namespace mlir;
@@ -97,18 +98,11 @@ Operation *mlir::triton::predicateOp(RewriterBase &rewriter, Operation *op,
9798
expectOp.getPredMutable().assign(mask);
9899
return op;
99100
}
100-
if (auto mmav5Op = dyn_cast<ttng::TCGen5MMAOp>(op)) {
101+
if (auto mmav5Op = dyn_cast<ttng::MMAv5OpInterface>(op)) {
101102
rewriter.setInsertionPoint(mmav5Op);
102-
Value mask = getPredMask(rewriter, mmav5Op.getPred().getType(),
103-
mmav5Op.getPred(), pred);
104-
mmav5Op.getPredMutable().assign(mask);
105-
return op;
106-
}
107-
if (auto mmav5Op = dyn_cast<ttng::TCGen5MMAScaledOp>(op)) {
108-
rewriter.setInsertionPoint(mmav5Op);
109-
Value mask = getPredMask(rewriter, mmav5Op.getPred().getType(),
110-
mmav5Op.getPred(), pred);
111-
mmav5Op.getPredMutable().assign(mask);
103+
auto currPred = mmav5Op.getPredicate();
104+
Value mask = getPredMask(rewriter, currPred.getType(), currPred, pred);
105+
mmav5Op.setPredicate(mask);
112106
return op;
113107
}
114108
if (auto tmemStoreOp = dyn_cast<ttng::TMEMStoreOp>(op)) {

lib/Dialect/TritonGPU/Transforms/Pipeliner/TC05MMAPipeline.cpp

Lines changed: 15 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ struct MMAInfo {
6262
// accumulator for the given MMA operation. The TMEMAllocOp and TMEMLoadOp must
6363
// be in the same region as the MMA operation.
6464
std::optional<std::pair<ttng::TMEMAllocOp, ttng::TMEMLoadOp>>
65-
getTMemAllocAndLoad(Operation *mmaOp) {
65+
getTMemAllocAndLoad(ttng::MMAv5OpInterface mmaOp) {
6666
auto acc = mmaOp->getOperand(2).getDefiningOp<ttng::TMEMAllocOp>();
6767
if (!acc || acc->getParentRegion() != mmaOp->getParentRegion()) {
6868
return std::nullopt;
@@ -230,20 +230,16 @@ getAccUseFlagFalseInLoop(scf::ForOp forOp, Value useAccFlagUse) {
230230
}
231231

232232
std::optional<MMAInfo::AccOverridePoint>
233-
getAccOverrideOrFlagFalseInLoop(scf::ForOp forOp, Operation *mmaOp) {
233+
getAccOverrideOrFlagFalseInLoop(scf::ForOp forOp,
234+
ttng::MMAv5OpInterface mmaOp) {
234235
auto tmemAllocAndLoad = getTMemAllocAndLoad(mmaOp);
235236
assert(tmemAllocAndLoad.has_value() && "Expected tmem alloc and load");
236237
auto [accAlloc, accLoad] = tmemAllocAndLoad.value();
237238
auto accOverridePoint = getAccOverridePointInLoop(forOp, accAlloc, accLoad);
238239

239240
if (!accOverridePoint.has_value()) {
240-
if (auto op = dyn_cast<ttng::TCGen5MMAOp>(mmaOp)) {
241-
auto useAccFlag = op.getUseD();
242-
accOverridePoint = getAccUseFlagFalseInLoop(forOp, useAccFlag);
243-
} else if (auto op = dyn_cast<ttng::TCGen5MMAScaledOp>(mmaOp)) {
244-
auto useAccFlag = op.getUseD();
245-
accOverridePoint = getAccUseFlagFalseInLoop(forOp, useAccFlag);
246-
}
241+
auto useAccFlag = mmaOp.useAccumulator();
242+
accOverridePoint = getAccUseFlagFalseInLoop(forOp, useAccFlag);
247243
}
248244

249245
return accOverridePoint;
@@ -281,7 +277,7 @@ Value createSingleBufferView(IRRewriter &builder, Value alloc, int idx) {
281277
builder.create<arith::ConstantIntOp>(alloc.getLoc(), idx, 32));
282278
}
283279

284-
Value createBarrierAlloc(scf::ForOp forOp, Operation *mmaOp, int numStages) {
280+
Value createBarrierAlloc(scf::ForOp forOp, int numStages) {
285281
IRRewriter rewriter(forOp->getContext());
286282
rewriter.setInsertionPoint(forOp);
287283
MLIRContext *ctx = forOp.getContext();
@@ -490,7 +486,8 @@ void updateAccDefsInLoop(IRRewriter &builder, scf::ForOp forOp, MMAInfo &info,
490486
// hoisted tmem allocs. Also, update the acc loads and stores to use the new
491487
// tmem allocs.
492488
void hoistAndUseTMemAlloc(IRRewriter &builder, scf::ForOp forOp,
493-
Operation *mmaOp, MMAInfo &info, int numStages) {
489+
ttng::MMAv5OpInterface mmaOp, MMAInfo &info,
490+
int numStages) {
494491
builder.setInsertionPoint(forOp);
495492
Value zero = builder.create<arith::ConstantIntOp>(forOp.getLoc(), 0, 32);
496493
Value one = builder.create<arith::ConstantIntOp>(forOp.getLoc(), 1, 32);
@@ -515,11 +512,7 @@ void hoistAndUseTMemAlloc(IRRewriter &builder, scf::ForOp forOp,
515512
createSingleBufferView(builder, insertSlice, info.accInsertIdx);
516513
}
517514

518-
if (auto op = dyn_cast<ttng::TCGen5MMAOp>(mmaOp)) {
519-
op.getDMutable().assign(insertSlice);
520-
} else if (auto op = dyn_cast<ttng::TCGen5MMAScaledOp>(mmaOp)) {
521-
op.getDMutable().assign(insertSlice);
522-
}
515+
mmaOp.setAccumulator(insertSlice);
523516

524517
updateAccUsesInLoop(builder, forOp, info, newAlloc, numStages);
525518
assert(isa<BlockArgument>(info.accExtractIdx));
@@ -545,26 +538,22 @@ void hoistAndUseTMemAlloc(IRRewriter &builder, scf::ForOp forOp,
545538

546539
// Create multi-buffered barrier allocs and lower the MMA to MMA + wait barrier
547540
void createBarrierAndWaitOps(IRRewriter &builder, scf::ForOp forOp,
548-
Operation *mmaOp, MMAInfo &info, int numStages) {
541+
ttng::MMAv5OpInterface mmaOp, MMAInfo &info,
542+
int numStages) {
549543
builder.setInsertionPoint(forOp);
550544
Value zero = builder.create<arith::ConstantIntOp>(forOp.getLoc(), 0, 32);
551545
Value one = builder.create<arith::ConstantIntOp>(forOp.getLoc(), 1, 32);
552546
Value numStagesVal =
553547
builder.create<arith::ConstantIntOp>(forOp.getLoc(), numStages, 32);
554548

555-
info.barrierAlloc = createBarrierAlloc(forOp, mmaOp, numStages);
549+
info.barrierAlloc = createBarrierAlloc(forOp, numStages);
556550

557551
Location loc = mmaOp->getLoc();
558552
builder.setInsertionPoint(mmaOp);
559553

560554
Value barrierSlice =
561555
createSingleBufferView(builder, info.barrierAlloc, info.barrierIdx);
562-
563-
if (auto op = dyn_cast<ttng::TCGen5MMAOp>(mmaOp)) {
564-
op.getBarrierMutable().assign(barrierSlice);
565-
} else if (auto op = dyn_cast<ttng::TCGen5MMAScaledOp>(mmaOp)) {
566-
op.getBarrierMutable().assign(barrierSlice);
567-
}
556+
mmaOp.setBarrier(barrierSlice);
568557

569558
builder.setInsertionPointAfter(mmaOp);
570559
auto waitOp =
@@ -653,10 +642,11 @@ FailureOr<scf::ForOp> preProcessLoopForTC05MMAPipelining(scf::ForOp forOp,
653642
}
654643

655644
IRRewriter builder(forOp->getContext());
656-
for (auto mmaOp : mmaOps) {
645+
for (auto op : mmaOps) {
657646
// Avoid pipelining if in the backward slice of the mmaOp there is an
658647
// operation that is already assigned a stage, as it would make the pipeline
659648
// deeper than we are prepared for.
649+
auto mmaOp = cast<ttng::MMAv5OpInterface>(op);
660650
SetVector<Operation *> backwardSlice;
661651
BackwardSliceOptions opt;
662652
opt.omitBlockArguments = true;

lib/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_triton_library(TritonNvidiaGPUIR
55
DEPENDS
66
TritonNvidiaGPUTableGen
77
TritonNvidiaGPUAttrDefsIncGen
8+
TritonNvidiaGPUOpInterfacesIncGen
89

910
LINK_LIBS PUBLIC
1011
TritonIR

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#include "mlir/Support/LLVM.h"
2626
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
2727

28+
#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.cpp.inc"
29+
2830
#define GET_OP_CLASSES
2931
#include "triton/Dialect/TritonNvidiaGPU/IR/Ops.cpp.inc"
3032

@@ -273,6 +275,24 @@ bool TCGen5MMAOp::verifyDims() {
273275
return aShape[aShape.size() - 1] == bShape[aShape.size() - 2];
274276
}
275277

278+
Value TCGen5MMAOp::useAccumulator() { return getUseD(); }
279+
280+
void TCGen5MMAOp::setUseAccumulator(Value flag) {
281+
getUseDMutable().assign(flag);
282+
}
283+
284+
void TCGen5MMAOp::setBarrier(Value barrier) {
285+
getBarrierMutable().assign(barrier);
286+
}
287+
288+
Value TCGen5MMAOp::getAccumulator() { return getD(); }
289+
290+
void TCGen5MMAOp::setAccumulator(Value accum) { getDMutable().assign(accum); }
291+
292+
Value TCGen5MMAOp::getPredicate() { return getPred(); }
293+
294+
void TCGen5MMAOp::setPredicate(Value pred) { getPredMutable().assign(pred); }
295+
276296
// -- TMEMStoreOp --
277297
LogicalResult TMEMStoreOp::verify() {
278298
if (!isa<triton::nvidia_gpu::TensorMemorySpaceAttr>(
@@ -317,6 +337,28 @@ bool TCGen5MMAScaledOp::verifyDims() {
317337
return aKdim == bKdim;
318338
}
319339

340+
Value TCGen5MMAScaledOp::useAccumulator() { return getUseD(); }
341+
342+
void TCGen5MMAScaledOp::setUseAccumulator(Value flag) {
343+
getUseDMutable().assign(flag);
344+
}
345+
346+
void TCGen5MMAScaledOp::setBarrier(Value barrier) {
347+
getBarrierMutable().assign(barrier);
348+
}
349+
350+
Value TCGen5MMAScaledOp::getAccumulator() { return getD(); }
351+
352+
void TCGen5MMAScaledOp::setAccumulator(Value accum) {
353+
getDMutable().assign(accum);
354+
}
355+
356+
Value TCGen5MMAScaledOp::getPredicate() { return getPred(); }
357+
358+
void TCGen5MMAScaledOp::setPredicate(Value pred) {
359+
getPredMutable().assign(pred);
360+
}
361+
320362
// -- TMEMLoadOp --
321363
LogicalResult TMEMLoadOp::verify() {
322364
if (!isa<triton::nvidia_gpu::TensorMemorySpaceAttr>(

0 commit comments

Comments (0)