Commit 26d7722

[TritonNvidiaGPU] Tighten WGMMA verifier; improve FenceInsertion (#6801)
* verify that WarpGroupDotOp's result encoding is always NVMMA Hopper encoding
* clean up some code with this
* teach FenceInsertion to look through WarpSpecializeOp
* deduplicate fences (e.g. two dots in a loop with captured reg->shared operands)
1 parent 6ae57f9 commit 26d7722
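For illustration only (not part of the commit), a minimal MLIR sketch of the invariant the tightened verifier enforces: the result of ttng.warp_group_dot must carry a Hopper (version-major 3) #ttg.nvidia_mma encoding. The attribute parameters and tensor shapes below are invented for the example.

#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}>
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#smem = #ttg.shared_memory

// Accepted: the result tensor uses the Hopper NVMMA layout.
%d = ttng.warp_group_dot %a, %b, %c : !ttg.memdesc<64x64xf16, #shared, #smem> * !ttg.memdesc<64x64xf16, #shared, #smem> -> tensor<64x64xf32, #mma>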

5 files changed: +132, -72 lines changed

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
Lines changed: 34 additions & 26 deletions

@@ -72,32 +72,40 @@ def TTNG_ClusterWaitOp : TTNG_Op<"cluster_wait", []> {
 //
 // WarpGroupDot Op
 //
-def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [DeclareOpInterfaceMethods<InferTypeOpInterface>,
-                                                     DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
-                                                     DeclareOpInterfaceMethods<DotOpInterface>,
-                                                     TypesMatchWith<"result's type matches accumulator's type",
-                                                                    "d", "c", "$_self">]> {
-  let summary = "warp group dot";
-
-  let description = [{
-    $d = matrix_multiply($a, $b) + $c. For docs on InputPrecisionAttr, see TT_DotOp
-  }];
-
-  let arguments = (ins TTG_TensorOrMemDesc:$a,
-                       TTG_TensorOrMemDesc:$b,
-                       TT_FpIntTensor:$c,
-                       Optional<I1>:$useC,
-                       DefaultValuedAttr<TT_InputPrecisionAttr, "::mlir::triton::InputPrecision::IEEE">:$inputPrecision,
-                       DefaultValuedAttr<I32Attr, "0">:$maxNumImpreciseAcc,
-                       DefaultValuedAttr<BoolAttr, "false">:$isAsync);
-
-  let results = (outs TT_FpIntTensor:$d);
-
-  let assemblyFormat = "$a`,` $b`,` $c (`,` $useC^)? attr-dict `:` type($a) `*` type($b) `->` type($d)";
-
-  let extraClassDeclaration = [{
-    bool needsPartialAccumulator();
-  }];
+def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [
+  DeclareOpInterfaceMethods<InferTypeOpInterface>,
+  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+  DeclareOpInterfaceMethods<DotOpInterface>,
+  TypesMatchWith<"result's type matches accumulator's type", "d", "c", "$_self">
+]> {
+  let summary = "warp group dot";
+
+  let description = [{
+    $d = matrix_multiply($a, $b) + $c. For docs on InputPrecisionAttr, see TT_DotOp
+  }];
+
+  let arguments = (ins
+    TTG_TensorOrMemDesc:$a,
+    TTG_TensorOrMemDesc:$b,
+    TT_FpIntTensor:$c,
+    Optional<I1>:$useC,
+    DefaultValuedAttr<TT_InputPrecisionAttr, "::mlir::triton::InputPrecision::IEEE">:$inputPrecision,
+    DefaultValuedAttr<I32Attr, "0">:$maxNumImpreciseAcc,
+    DefaultValuedAttr<BoolAttr, "false">:$isAsync
+  );
+
+  let results = (outs TT_FpIntTensor:$d);
+
+  let assemblyFormat = [{
+    $a`,` $b`,` $c (`,` $useC^)? attr-dict
+    `:` type($a) `*` type($b) `->` type($d)
+  }];
+
+  let extraClassDeclaration = [{
+    bool needsPartialAccumulator();
+  }];
+
+  let hasVerifier = 1;
 }
 
 def TTNG_WarpGroupDotWaitOp : TTNG_Op<"warp_group_dot_wait", [DeclareOpInterfaceMethods<InferTypeOpInterface>,

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
Lines changed: 14 additions & 8 deletions

@@ -34,7 +34,7 @@ namespace triton {
 namespace nvidia_gpu {
 
 // -- WarpGroupDotOp --
-mlir::LogicalResult WarpGroupDotOp::inferReturnTypes(
+LogicalResult WarpGroupDotOp::inferReturnTypes(
     MLIRContext *context, std::optional<Location> location, ValueRange operands,
     DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
     SmallVectorImpl<Type> &inferredReturnTypes) {
@@ -43,21 +43,27 @@ mlir::LogicalResult WarpGroupDotOp::inferReturnTypes(
   inferredReturnTypes.push_back(accTy);
 
   // verify encodings
-  auto aEnc =
-      cast<triton::gpu::TensorOrMemDesc>(operands[0].getType()).getEncoding();
-  auto bEnc =
-      cast<triton::gpu::TensorOrMemDesc>(operands[1].getType()).getEncoding();
+  auto aEnc = cast<TensorOrMemDesc>(operands[0].getType()).getEncoding();
+  auto bEnc = cast<TensorOrMemDesc>(operands[1].getType()).getEncoding();
   auto retEnc = accTy.getEncoding();
   if (aEnc) {
     assert(bEnc);
     Dialect &dialect = aEnc.getDialect();
     auto interface = cast<DialectInferLayoutInterface>(&dialect);
     if (interface->inferDotOpEncoding(aEnc, 0, retEnc, location).failed())
-      return mlir::failure();
+      return failure();
     if (interface->inferDotOpEncoding(bEnc, 1, retEnc, location).failed())
-      return mlir::failure();
+      return failure();
   }
-  return mlir::success();
+  return success();
+}
+
+LogicalResult WarpGroupDotOp::verify() {
+  auto nvmmaEnc =
+      dyn_cast<NvidiaMmaEncodingAttr>(getD().getType().getEncoding());
+  if (!nvmmaEnc || !nvmmaEnc.isHopper())
+    return emitOpError("WGMMA result layout must be Hopper NVMMA");
+  return success();
 }
 
 void WarpGroupDotOp::getEffects(
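A hedged illustration (layouts and shapes invented) of IR the new WarpGroupDotOp::verify() rejects: a result encoding that is not a Hopper NVMMA layout now produces the diagnostic added above at verification time, instead of surfacing later during lowering.

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
// error: 'ttng.warp_group_dot' op WGMMA result layout must be Hopper NVMMA
%d = ttng.warp_group_dot %a, %b, %c : !ttg.memdesc<64x64xf16, #shared, #smem> * !ttg.memdesc<64x64xf16, #shared, #smem> -> tensor<64x64xf32, #blocked>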

lib/Dialect/TritonNvidiaGPU/Transforms/FenceInsertion.cpp
Lines changed: 30 additions & 25 deletions

@@ -39,25 +39,17 @@ struct FenceInsertionPass
     if (computeCapability < 90)
       return;
     ModuleOp mod = getOperation();
-    mod.walk([&](Operation *op) {
-      bool isMMAv3 = isa<ttng::WarpGroupDotOp>(op);
-      if (!isMMAv3 && !isa<ttng::MMAv5OpInterface>(op))
-        return WalkResult::advance();
-      OpBuilder builder(op);
-      auto a = op->getOperand(0);
-      auto b = op->getOperand(1);
-      if (isMMAv3) {
-        auto mmaEncoding = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
-            cast<RankedTensorType>(op->getResult(0).getType()).getEncoding());
-        if (!mmaEncoding || !mmaEncoding.isHopper())
-          return WalkResult::advance();
-      }
+    mod.walk([&](tt::DotOpInterface dotOp) {
+      Value a = dotOp.getA();
+      Value b = dotOp.getB();
       bool aDependsOnShared = dependOnCopyRegToShared(a);
       bool bDependsOnShared = dependOnCopyRegToShared(b);
       if (!aDependsOnShared && !bDependsOnShared)
         return WalkResult::advance();
-      Operation *fence = builder.create<ttng::FenceAsyncSharedOp>(
-          op->getLoc(), /*bCluster=*/false);
+
+      OpBuilder builder(dotOp);
+      auto fence = builder.create<ttng::FenceAsyncSharedOp>(dotOp.getLoc(),
+                                                            /*bCluster=*/false);
       // If there is all the dependencies are outside of the loop try to hoist
       // the fence.
       while (auto loopOp = fence->getParentOfType<LoopLikeOpInterface>()) {
@@ -69,6 +61,14 @@ struct FenceInsertionPass
           break;
         loopOp.moveOutOfLoop(fence);
       }
+
+      // If the previous op is already a fence, this one isn't needed.
+      if (auto lastFence = dyn_cast_or_null<ttng::FenceAsyncSharedOp>(
+              fence->getPrevNode())) {
+        if (lastFence.getBCluster() == fence.getBCluster())
+          fence.erase();
+      }
+
       return WalkResult::advance();
     });
   }
@@ -88,6 +88,7 @@ struct FenceInsertionPass
     visited.insert(operand);
     if (!isa<triton::gpu::MemDescType>(operand.getType()))
       return false;
+
     auto op = operand.getDefiningOp();
     if (op) {
       // reach an alloc copying from register, we need a fence.
@@ -100,26 +101,30 @@ struct FenceInsertionPass
       }
       return false;
     }
+
     // reach BlockArgument
     BlockArgument arg = cast<BlockArgument>(operand);
     unsigned argNum = arg.getArgNumber();
     Operation *argOwner = arg.getOwner()->getParentOp();
-    // support ForOp only
+    // look through ForOp iter argument
     if (auto forOp = dyn_cast<scf::ForOp>(argOwner)) {
+      assert(argNum != 0 && "induction var cannot be memdesc type");
+      --argNum;
      // prologue
-      auto iterOperands = forOp.getInitArgs();
-      if (argNum == 0)
-        return false;
-      if (dependOnCopyRegToShared(iterOperands[argNum - 1], visited))
+      if (dependOnCopyRegToShared(forOp.getInitArgs()[argNum], visited))
        return true;
      // yield
      auto yieldOp = forOp.getBody()->getTerminator();
-      Value v = yieldOp->getOperand(argNum - 1);
-      auto entry = std::make_pair<Operation *, unsigned>(std::move(yieldOp),
-                                                         std::move(argNum));
-      if (dependOnCopyRegToShared(v, visited))
-        return true;
+      Value v = yieldOp->getOperand(argNum);
+      return dependOnCopyRegToShared(v, visited);
     }
+
+    // look through `ttg.warp_specialize`.
+    if (auto wsOp = dyn_cast<ttg::WarpSpecializePartitionsOp>(argOwner)) {
+      return dependOnCopyRegToShared(
+          wsOp.getParentOp().getExplicitCaptures()[argNum]);
+    }
+
     // Conservatively return true for other ops
     return true;
   }
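A hypothetical sketch (all SSA names and shapes invented; layout aliases as in the sketch near the top) of the deduplication case from the commit message, two dots in a loop whose operands are register-to-shared copies: the fence inserted for each dot is hoisted to the same position above the loop, and the second, now-adjacent duplicate is erased because its previous op is already an equivalent fence.

%A = ttg.local_alloc %a_reg : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem>
%B = ttg.local_alloc %b_reg : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem>
// After FenceInsertion, a single hoisted fence guards both dots:
// ttng.fence_async_shared {bCluster = false}
scf.for %i = %c0_i32 to %c32_i32 step %c1_i32 : i32 {
  %d0 = ttng.warp_group_dot %A, %B, %acc0 : !ttg.memdesc<64x64xf16, #shared, #smem> * !ttg.memdesc<64x64xf16, #shared, #smem> -> tensor<64x64xf32, #mma>
  %d1 = ttng.warp_group_dot %A, %B, %acc1 : !ttg.memdesc<64x64xf16, #shared, #smem> * !ttg.memdesc<64x64xf16, #shared, #smem> -> tensor<64x64xf32, #mma>
}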

test/TritonGPU/fence-inserstion.mlir
Lines changed: 48 additions & 0 deletions

@@ -91,3 +91,51 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
     tt.return
   }
 }
+
+// -----
+
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 64, unpacked = true>
+
+module attributes {ttg.target = "cuda:100", "ttg.num-warps" = 4 : i32} {
+
+// CHECK-LABEL: @mma_inside_warp_specialize
+tt.func @mma_inside_warp_specialize(%src: tensor<64x64xf16, #blocked>) {
+  %A = ttg.local_alloc %src : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem>
+  %B = ttg.local_alloc : () -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable>
+  %D = ttng.tmem_alloc : () -> !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>
+
+  ttg.warp_specialize(%A, %B, %D)
+  default {
+    ttg.warp_yield
+  }
+  // CHECK: partition0
+  partition0(%lhs: !ttg.memdesc<64x64xf16, #shared, #smem>, %rhs: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>, %acc: !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>) num_warps(4) {
+    %true = arith.constant true
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c32_i32 = arith.constant 32 : i32
+    // CHECK: ttng.fence_async_shared
+    // CHECK-NEXT: scf.for
+    scf.for %i = %c0_i32 to %c32_i32 step %c1_i32 : i32 {
+      // CHECK-NEXT: ttng.tc_gen5_mma
+      ttng.tc_gen5_mma %lhs, %rhs, %acc, %true, %true : !ttg.memdesc<64x64xf16, #shared, #smem>, !ttg.memdesc<64x64xf16, #shared, #smem, mutable>, !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>
+      // CHECK-NEXT: ttng.tc_gen5_mma
+      ttng.tc_gen5_mma %lhs, %rhs, %acc, %true, %true : !ttg.memdesc<64x64xf16, #shared, #smem>, !ttg.memdesc<64x64xf16, #shared, #smem, mutable>, !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>
+    }
+    ttg.warp_return
+  }
+  // CHECK: partition1
+  partition1(%lhs: !ttg.memdesc<64x64xf16, #shared, #smem>, %rhs: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>, %acc: !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>) num_warps(4) {
+    // CHECK-NOT: ttng.fence_async_shared
+    %true = arith.constant true
+    // CHECK: ttng.tc_gen5_mma
+    ttng.tc_gen5_mma %rhs, %rhs, %acc, %true, %true : !ttg.memdesc<64x64xf16, #shared, #smem, mutable>, !ttg.memdesc<64x64xf16, #shared, #smem, mutable>, !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>
+    ttg.warp_return
+  } : (!ttg.memdesc<64x64xf16, #shared, #smem>, !ttg.memdesc<64x64xf16, #shared, #smem, mutable>, !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>) -> ()
+  tt.return
+}
+
+}

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM.cpp
Lines changed: 6 additions & 13 deletions

@@ -71,28 +71,21 @@ struct WarpGroupDotOpConversion
     auto loc = op.getLoc();
     // D = A * B + C
     Value A = op.getA();
-    Value D = op.getResult();
+    TypedValue<RankedTensorType> D = op.getResult();
 
     // Here we assume the DotOp's operands always comes from shared memory.
     auto AShapePerCTA = getShapePerCTA(A.getType());
     size_t reduceAxis = 1;
     unsigned K = AShapePerCTA[reduceAxis];
     bool isOuter = K == 1;
 
-    NvidiaMmaEncodingAttr mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(
-        cast<RankedTensorType>(D.getType()).getEncoding());
-    if (!isOuter && mmaLayout &&
-        supportMMA(op.getOperand(0), mmaLayout.getVersionMajor())) {
-      if (mmaLayout.isHopper()) {
-        return convertWGMMA(op, adaptor, getTypeConverter(), rewriter,
-                            getThreadId(rewriter, loc));
-      }
-
-      llvm::report_fatal_error(
-          "Unsupported MMA kind found when converting WarpGroupDotOp to LLVM.");
+    auto mmaLayout = cast<NvidiaMmaEncodingAttr>(D.getType().getEncoding());
+    if (!isOuter && supportMMA(op.getOperand(0), mmaLayout.getVersionMajor())) {
+      return convertWGMMA(op, adaptor, getTypeConverter(), rewriter,
+                          getThreadId(rewriter, loc));
     }
 
-    llvm::report_fatal_error(
+    return op.emitError(
         "Unsupported WarpGroupDotOp found when converting TritonGPU to LLVM.");
   }
 };
