[AMD] Use ttg.mask in AMD StreamPipeliner (#7620)

PMylon · web-flow · commit 33462c8ea3e1 · 2025-07-23T18:13:46.000Z
This PR prepares the infrastructure for handling masked operations in the
AMD backend by moving the MaskOp handling functions (wrapInMaskOp and
resolveMaskOp) into the shared pipelining utilities, so that the AMD StreamPipeliner can use them.
diff --git a/include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h b/include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h
@@ -68,6 +68,15 @@ bool isOuterLoop(scf::ForOp forOp);
 /// Function to mask operations during scheduling.
 Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
 
+/// Replace `op` with a MaskOp predicated on `pred` whose body holds a clone of
+/// `op`; `op` itself is erased and the new MaskOp is returned.
+Operation *wrapInMaskOp(RewriterBase &rewriter, Operation *op, Value pred);
+
+/// Empty the `peeledMaskOps` whose predicate is statically false, then lower
+/// every MaskOp in `moduleOp` to predicated operations.
+void resolveMaskOp(ModuleOp moduleOp,
+                   DenseSet<triton::gpu::MaskOp> &peeledMaskOps);
+
 // Return true if the given ForOp has the attribute
 // `tt.disallow_acc_multi_buffer` set to true.
 bool getDisallowAccMultiBuffer(scf::ForOp forOp);
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp
@@ -2,10 +2,12 @@
 #include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "triton/Analysis/AxisInfo.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -279,6 +281,69 @@ Operation *mlir::triton::predicateOp(RewriterBase &rewriter, Operation *op,
   return op;
 }
 
+Operation *mlir::triton::wrapInMaskOp(RewriterBase &rewriter, Operation *op,
+                                      Value pred) {
+  Location loc = op->getLoc();
+  auto maskOp = rewriter.create<ttg::MaskOp>(loc, op->getResultTypes(), pred);
+  rewriter.createBlock(&maskOp->getRegion(0));
+  rewriter.setInsertionPointToStart(&maskOp->getRegion(0).front());
+  Operation *wrapped = rewriter.clone(*op);
+  rewriter.create<ttg::MaskReturnOp>(loc, wrapped->getResults());
+  // Replace through the rewriter so the use update and erase stay tracked.
+  rewriter.replaceOp(op, maskOp->getResults());
+  return maskOp;
+}
+
+void mlir::triton::resolveMaskOp(ModuleOp moduleOp,
+                                 DenseSet<ttg::MaskOp> &peeledMaskOps) {
+  IRRewriter rewriter(moduleOp);
+
+  // Canonicalize the IR to simplify the arithmetic ops defining the mask
+  auto arithDialect =
+      moduleOp.getContext()->getLoadedDialect<arith::ArithDialect>();
+  RewritePatternSet patterns(moduleOp.getContext());
+  arithDialect->getCanonicalizationPatterns(patterns);
+  if (mlir::applyPatternsGreedily(moduleOp, std::move(patterns)).failed())
+    return llvm::report_fatal_error("Failed to canonicalize the IR");
+
+  // Prune all the statically dead mask ops in the epilogue. This is a
+  // hack, ideally we should do it for all the mask ops, but it is incorrect if
+  // we have speculatively executed async cp operations that will store to shmem
+  // even if the mask is false.
+  for (auto maskOp : peeledMaskOps) {
+    // Test the predicate once: nothing below changes it, so re-checking it per
+    // body op would spin forever on a mask that is not statically false.
+    if (!isConstantIntValue(maskOp.getPred(), 0))
+      continue;
+    rewriter.setInsertionPoint(maskOp);
+    while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
+      Operation *op = &maskOp.getBody()->front();
+      // Redirect uses of the dead op's results to poison values.
+      SmallVector<Value> results;
+      for (auto result : op->getResults())
+        results.push_back(rewriter.create<mlir::ub::PoisonOp>(
+            op->getLoc(), result.getType()));
+      op->replaceAllUsesWith(results);
+      op->erase();
+    }
+  }
+
+  // Lower every MaskOp: hoist its body ops before it and predicate each one.
+  SmallVector<ttg::MaskOp> maskOps;
+  moduleOp->walk([&](ttg::MaskOp maskOp) { maskOps.push_back(maskOp); });
+  for (auto maskOp : maskOps) {
+    rewriter.setInsertionPoint(maskOp);
+    while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
+      Operation *op = &maskOp.getBody()->front();
+      rewriter.moveOpBefore(op, maskOp);
+      triton::predicateOp(rewriter, op, maskOp.getPred());
+    }
+    maskOp->replaceAllUsesWith(
+        maskOp.getBody()->getTerminator()->getOperands());
+    maskOp->erase();
+  }
+}
+
 // Return true if the given ForOp has the attribute
 // `tt.disallow_acc_multi_buffer` set to true.
 bool mlir::triton::getDisallowAccMultiBuffer(scf::ForOp forOp) {
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp
@@ -1,5 +1,4 @@
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Support/LLVM.h"
@@ -43,67 +42,6 @@ static void pipelineWgmma(ModuleOp moduleOp, unsigned numStages) {
   }
 }
 
-static Operation *wrapInMaskOp(RewriterBase &rewriter, Operation *op,
-                               Value pred) {
-  auto mask = rewriter.create<MaskOp>(op->getLoc(), op->getResultTypes(), pred);
-  rewriter.createBlock(&mask->getRegion(0));
-  rewriter.setInsertionPointToStart(&mask->getRegion(0).front());
-  auto newOp = rewriter.clone(*op);
-  rewriter.create<MaskReturnOp>(op->getLoc(), newOp->getResults());
-  op->replaceAllUsesWith(mask->getResults());
-  rewriter.eraseOp(op);
-  return mask;
-}
-
-static void resolveMaskOp(ModuleOp moduleOp, DenseSet<MaskOp> &peeledMaskOps) {
-  IRRewriter rewriter(moduleOp);
-
-  // Canonicalize the IR to simplify the arithmetic ops defining the mask
-  auto arithDialect =
-      moduleOp.getContext()->getLoadedDialect<arith::ArithDialect>();
-  RewritePatternSet patterns(moduleOp.getContext());
-  arithDialect->getCanonicalizationPatterns(patterns);
-  if (applyPatternsGreedily(moduleOp, std::move(patterns)).failed())
-    return llvm::report_fatal_error("Failed to canonicalize the IR");
-
-  // Prune all the statically dead mask ops in the epilogue. This is a
-  // hack, ideally we should do it for all the mask ops, but it is incorrect if
-  // we have speculatively executed async cp operations that will store to shmem
-  // even if the mask is false.
-  for (auto maskOp : peeledMaskOps) {
-    rewriter.setInsertionPoint(maskOp);
-    while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
-      Operation *op = &maskOp.getBody()->front();
-      if (isConstantIntValue(maskOp.getPred(), 0)) {
-        if (op->getNumResults() > 0) {
-          SmallVector<Value> results;
-          for (auto result : op->getResults()) {
-            auto poisonOp =
-                rewriter.create<ub::PoisonOp>(op->getLoc(), result.getType());
-            results.push_back(poisonOp);
-          }
-          op->replaceAllUsesWith(results);
-        }
-        op->erase();
-      }
-    }
-  }
-
-  SmallVector<MaskOp> maskOps;
-  moduleOp->walk([&](MaskOp maskOp) { maskOps.push_back(maskOp); });
-  for (auto maskOp : maskOps) {
-    rewriter.setInsertionPoint(maskOp);
-    while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
-      Operation *op = &maskOp.getBody()->front();
-      rewriter.moveOpBefore(op, maskOp);
-      op = triton::predicateOp(rewriter, op, maskOp.getPred());
-    }
-    maskOp->replaceAllUsesWith(
-        maskOp.getBody()->getTerminator()->getOperands());
-    maskOp->erase();
-  }
-}
-
 static bool hasMMAv5WaitsInLastStage(scf::ForOp forOp,
                                      CoarseSchedule &schedule) {
   int maxStage = schedule.getNumStages() - 1;
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
@@ -53,7 +53,7 @@ Operation *streamPredication(RewriterBase &rewriter, Operation *op,
     ifOp.getElseBodyBuilder().create<scf::YieldOp>(loc, dotOp->getOperand(2));
     return ifOp;
   }
-  return tt::predicateOp(rewriter, op, pred);
+  return tt::wrapInMaskOp(rewriter, op, pred);
 }
 
 //===----------------------------------------------------------------------===//
@@ -974,9 +974,9 @@ buildSchedule(scf::ForOp &forOp, int numStages, const LoadToInfoMap &loadToInfo,
 }
 } // namespace ChainedDotSchedule
 
-LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch,
-                           int localPrefetch, bool useAsyncCopy,
-                           bool waitAtTail) {
+FailureOr<scf::ForOp> pipelineLoop(scf::ForOp forOp, int numStages,
+                                   int globalPrefetch, int localPrefetch,
+                                   bool useAsyncCopy, bool waitAtTail) {
 
   triton::AMD::ModuleAxisInfoAnalysis axisInfoAnalysis(
       forOp->getParentOfType<ModuleOp>());
@@ -1019,9 +1019,24 @@ LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch,
     if (part != tt::PipeliningOption::PipelinerPart::Prologue)
       return;
 
-    if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
+    auto annotateLoad = [](Operation *loadOp) {
       loadOp->setAttr("amd.pipeliner_part",
-                      StringAttr::get(op->getContext(), "prologue"));
+                      StringAttr::get(loadOp->getContext(), "prologue"));
+    };
+
+    if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
+      annotateLoad(loadOp);
+      return;
+    }
+    // loadOp may be wrapped by a MaskOp as predicateFn execution
+    // precedes annotation
+    if (auto maskOp = dyn_cast<ttg::MaskOp>(op)) {
+      for (auto &innerOp : maskOp.getBody()->without_terminator()) {
+        if (auto loadOp = dyn_cast<tt::LoadOp>(&innerOp)) {
+          annotateLoad(loadOp);
+          return;
+        }
+      }
     }
   };
   // Set the final schedule as our scheduling function
@@ -1077,6 +1092,10 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase<PipelinePass> {
                          useAsyncCopy, waitAtTail);
     }
 
+    // NOTE: Leave empty for now, until we utilize customEpiloguePeeling
+    DenseSet<ttg::MaskOp> peeledMaskOps;
+    tt::resolveMaskOp(moduleOp, peeledMaskOps);
+
     if (useAsyncCopy) {
       llvm::SmallSetVector<ttg::AsyncWaitOp, 8> waitOps;
       moduleOp.walk([&](ttg::AsyncWaitOp waitOp) { waitOps.insert(waitOp); });