[PIPELINE] Implementing expander option to leave stage predicates as an unresolved op (#6836)

pawelszczerbuk · web-flow · commit 1572e341bfcf · 2025-05-15T20:42:54.000Z
We would like to implement epilogue peeling by a custom number of
iterations to help with cases where the last loop iteration is almost
entirely predicated out except for the final MMAv5 wait.
To avoid adding hard-to-debug complexity to the expander, we will peel
the epilogue after the expansion and resolve the stage predicates
manually (masking out the instructions from non-last iterations after
the loop, and removing the mask from the one-before-last iteration in
the loop).
Adding an option to delay resolving the predicate until after the
expansion makes such a transformation much easier, as we won't need to
analyze the arithmetic ops used to build the logical predicates for the
ops.
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -313,6 +313,18 @@ def TTG_LocalStoreOp : TTG_Op<"local_store"> {
   }];
 }
 
+// Placeholder for a software-pipeliner stage predicate that has not been
+// resolved into arithmetic yet. It carries the loop induction variable, upper
+// bound and step (all constrained to the same signless-integer or index type)
+// together with the `maxStage` and `stage` numbers as I32 attributes, and
+// yields an i1. The pipeline expander's default, resolved form of the same
+// predicate is `iv < ub - (maxStage - stage) * step`.
+//
+// Example of the custom assembly:
+//   ttg.predicate_stage %iv, %ub, %step maxStage 2 stage 0 : i32 -> i1
+def TTG_PredicateStageOp: TTG_Op<"predicate_stage",
+                                [Pure, AllTypesMatch<["iv", "ub", "step"]>]> {
+  let summary = "pipeliner stage predicate";
+  let arguments = (ins AnySignlessIntegerOrIndex:$iv,
+                       AnySignlessIntegerOrIndex:$ub,
+                       AnySignlessIntegerOrIndex:$step,
+                       I32Attr:$maxStage,
+                       I32Attr:$stage);
+  let results = (outs I1:$result);
+  let assemblyFormat = "$iv `,` $ub `,` $step `maxStage` $maxStage `stage` $stage attr-dict `:` type($iv) `->` type($result)";
+}
+
 def TTG_Fp4ToFpOp : TTG_Op<"fp4_to_fp", [Pure]> {
   let summary = "Upcast fp4 (e2m1) to fp";
 
diff --git a/include/triton/Dialect/TritonGPU/Transforms/PipelineExpander.h b/include/triton/Dialect/TritonGPU/Transforms/PipelineExpander.h
@@ -57,6 +57,12 @@ struct PipeliningOption {
   /// pipeliner will have to predicate operations in the prologue/epilogue.
   bool supportDynamicLoops = false;
 
+  /// Optional hook used to build the predicate of each pipeline stage when the
+  /// epilogue is not peeled. It is invoked as
+  /// `fn(rewriter, inductionVar, upperBound, step, maxStage, stage)` and must
+  /// return the predicate value for `stage`. When left null, the expander
+  /// falls back to `emitPredicateForStage`.
+  using EmitPredicateStageFnType = std::function<Value(
+      RewriterBase &, Value, Value, Value, uint64_t, uint64_t)>;
+  EmitPredicateStageFnType emitPredicateStageFn = nullptr;
+
   // Callback to predicate operations when the prologue or epilogue are not
   // peeled. This takes the original operation, an i1 predicate value and the
   // pattern rewriter. It is expected to replace the given operation with
@@ -95,6 +101,10 @@ FailureOr<scf::ForOp> pipelineForLoop(RewriterBase &rewriter, scf::ForOp forOp,
                                       const PipeliningOption &options,
                                       bool *modifiedIR = nullptr);
 
+/// Default stage-predicate emitter. Using `rewriter`, creates the arith ops
+/// computing `inductionVar < upperBound - (maxStage - stage) * step` (signed
+/// comparison) and returns the resulting value. The expander uses it whenever
+/// `PipeliningOption::emitPredicateStageFn` is not set.
+Value emitPredicateForStage(RewriterBase &rewriter, Value inductionVar,
+                            Value upperBound, Value step, uint64_t maxStage,
+                            uint64_t stage);
+
 } // namespace triton
 } // namespace mlir
 
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipelineExpander.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipelineExpander.cpp
@@ -67,6 +67,8 @@ struct LoopPipelinerInternal {
   triton::PipeliningOption::AnnotationlFnType annotateFn = nullptr;
   bool peelEpilogue;
   triton::PipeliningOption::PredicateOpFnType predicateFn = nullptr;
+  triton::PipeliningOption::EmitPredicateStageFnType emitPredicateStageFn =
+      nullptr;
 
   // When peeling the kernel we generate several version of each value for
   // different stage of the prologue. This map tracks the mapping between
@@ -160,6 +162,10 @@ bool LoopPipelinerInternal::initializeLoopInfo(
     LDBG("--no epilogue or predicate set -> BAIL");
     return false;
   }
+  emitPredicateStageFn = options.emitPredicateStageFn;
+  if (emitPredicateStageFn == nullptr) {
+    emitPredicateStageFn = mlir::triton::emitPredicateForStage;
+  }
   std::vector<std::pair<Operation *, unsigned>> schedule;
   options.getScheduleFn(forOp, schedule);
   if (schedule.empty()) {
@@ -490,20 +496,10 @@ LogicalResult LoopPipelinerInternal::createKernel(
   if (!peelEpilogue) {
     // Create a predicate for each stage except the last stage.
     Location loc = newForOp.getLoc();
-    Type t = ub.getType();
     for (unsigned i = 0; i < maxStage; i++) {
       // c = ub - (maxStage - i) * step
-      Value c = rewriter.create<arith::SubIOp>(
-          loc, ub,
-          rewriter.create<arith::MulIOp>(
-              loc, step,
-              rewriter.create<arith::ConstantOp>(
-                  loc, rewriter.getIntegerAttr(t, int64_t(maxStage - i)))));
-
-      Value pred = rewriter.create<arith::CmpIOp>(
-          newForOp.getLoc(), arith::CmpIPredicate::slt,
-          newForOp.getInductionVar(), c);
-      predicates[i] = pred;
+      predicates[i] = emitPredicateStageFn(rewriter, newForOp.getInductionVar(),
+                                           ub, step, maxStage, i);
     }
   }
   for (Operation *op : opOrder) {
@@ -852,3 +848,19 @@ mlir::triton::pipelineForLoop(RewriterBase &rewriter, ForOp forOp,
 
   return newForOp;
 }
+
+/// Builds the default predicate for `stage`:
+///   inductionVar < upperBound - (maxStage - stage) * step   (signed compare)
+/// The constant is materialized with the induction variable's type, and every
+/// created op uses the induction variable's location.
+Value mlir::triton::emitPredicateForStage(RewriterBase &rewriter,
+                                          Value inductionVar, Value upperBound,
+                                          Value step, uint64_t maxStage,
+                                          uint64_t stage) {
+  Location ivLoc = inductionVar.getLoc();
+  // (maxStage - stage) as a constant of the induction variable's type.
+  auto distanceAttr =
+      rewriter.getIntegerAttr(inductionVar.getType(), maxStage - stage);
+  Value distance = rewriter.create<arith::ConstantOp>(ivLoc, distanceAttr);
+  // Ops are created innermost-first: constant, multiply, subtract, compare.
+  Value offset = rewriter.create<arith::MulIOp>(ivLoc, step, distance);
+  Value bound = rewriter.create<arith::SubIOp>(ivLoc, upperBound, offset);
+  return rewriter.create<arith::CmpIOp>(ivLoc, arith::CmpIPredicate::slt,
+                                        inductionVar, bound);
+}
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp
@@ -62,6 +62,17 @@ static void expandLoops(ModuleOp moduleOp) {
             std::vector<std::pair<Operation *, unsigned>> &schedule) {
           schedule = finalSchedule;
         };
+    // Testing feature: allow for unresolved predicate stage ops
+    // in the loop body.
+    if (forOp->hasAttr("__test_keep_predicate_stage")) {
+      options.emitPredicateStageFn =
+          [](RewriterBase &rewriter, Value inductionVar, Value upperBound,
+             Value step, uint64_t maxStage, uint64_t stage) {
+            return rewriter.create<triton::gpu::PredicateStageOp>(
+                inductionVar.getLoc(), inductionVar, upperBound, step, maxStage,
+                stage);
+          };
+    }
     IRRewriter rewriter(forOp);
     FailureOr<scf::ForOp> newForOp =
         triton::pipelineForLoop(rewriter, forOp, options);
diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir
@@ -1696,3 +1696,38 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // Pipeline a loop tagged with `__test_keep_predicate_stage` and verify that
+  // the stage predicate stays in the loop body as an unresolved
+  // `ttg.predicate_stage` op on the loop's induction variable, upper bound and
+  // step.
+  // CHECK-LABEL: @predicate_stage1
+  // CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to %[[UB:.*]] step %[[STEP:.*]] iter_args
+  // CHECK: ttg.predicate_stage %[[IV]], %[[UB]], %[[STEP]] maxStage 2 stage 0 : i32 -> i1
+  tt.func public @predicate_stage1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c1016800_i32 = arith.constant 1016800 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1016800_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked>
+    %4 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
+    %5 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
+    %6 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
+    // The loop requests 3 stages via `tt.num_stages` (see the attribute
+    // dictionary after the loop body).
+    scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32  : i32 {
+      %7 = arith.addi %1, %arg4 : i32
+      %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked>
+      %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked>
+      %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked>
+      %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+      %12 = tt.load %11, %10 : tensor<1024x!tt.ptr<f32>, #blocked>
+      %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+      %14 = tt.load %13, %10 : tensor<1024x!tt.ptr<f32>, #blocked>
+      %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked>
+      %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+      tt.store %16, %15, %10 : tensor<1024x!tt.ptr<f32>, #blocked>
+    } {tt.num_stages = 3 : i32, __test_keep_predicate_stage}
+    tt.return
+  }
+}