Commit 1d6b7dd

[Warp Specialization] Fix partition loops capture order (#6757)
Ops need to be rematerialized in topological order.
1 parent 9c480c9 commit 1d6b7dd
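
Before this change, each rematerializable defining op was cloned into the partition regions one at a time, at the top of each region, in the order the capture walk discovered it. The walk proceeds from uses toward definitions, so a clone could end up placed before the clone of its operand and be left referring to the original value defined above the ttg.warp_specialize op. The fix collects the ops first, topologically sorts them, and clones them through an IRMapping so each clone's operands resolve to the in-region copies that precede it.

A minimal sketch of the fixed cloning scheme, assuming MLIR's C++ API; the helper name cloneIntoRegions and the regions parameter are illustrative, not part of the patch:

#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/SetVector.h"

using namespace mlir;

// Clone `opsToClone` into each region so that every op is emitted after the
// ops defining its operands, with operands remapped to the in-region clones.
static void cloneIntoRegions(OpBuilder &b, SetVector<Operation *> opsToClone,
                             ArrayRef<Region *> regions) {
  // Discovery order (uses toward defs) gives no def-before-use guarantee;
  // restore one before cloning.
  opsToClone = topologicalSort(opsToClone);
  for (Region *region : regions) {
    b.setInsertionPointToStart(&region->front());
    IRMapping mapping; // redirects operands of later clones to earlier clones
    for (Operation *op : opsToClone) {
      Value copy = b.clone(*op, mapping)->getResult(0);
      mapping.map(op->getResult(0), copy);
      replaceAllUsesInRegionWith(op->getResult(0), copy, *region);
    }
  }
}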

2 files changed: +47 -9 lines changed
lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp

Lines changed: 26 additions & 8 deletions
@@ -1,3 +1,4 @@
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
@@ -188,6 +189,11 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
   // captures and thread them in to the regions.
   SetVector<Value> captures;
   getUsedValuesDefinedAbove(wsOp.getPartitionOpHolder(), captures);
+
+  // Find the subgraph that should be cloned into the partition regions. The
+  // explicit captures are the leaves of the subgraph.
+  SetVector<Operation *> opsToClone;
+  SmallVector<Value> explicitCaptures;
   for (unsigned i = 0; i < captures.size(); ++i) {
     Value capture = captures[i];
 
@@ -198,11 +204,7 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
         (defOp->hasTrait<OpTrait::ConstantLike>() ||
          isa<RankedTensorType>(capture.getType()))) {
       captures.insert(defOp->operand_begin(), defOp->operand_end());
-      for (Region *region : wsOp.getPartitionRegions()) {
-        b.setInsertionPointToStart(&region->front());
-        Value copy = b.clone(*capture.getDefiningOp())->getResult(0);
-        replaceAllUsesInRegionWith(capture, copy, *region);
-      }
+      opsToClone.insert(defOp);
       continue;
     }
 
@@ -211,14 +213,30 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
                                "FIXME: capturing tensor values into warp "
                                "partitions is not supported");
     }
-    wsOp->insertOperands(wsOp.getNumOperands(), capture);
-    for (Region *region : wsOp.getPartitionRegions()) {
+    explicitCaptures.push_back(capture);
+  }
+
+  // Clone the ops into each region in topological order.
+  opsToClone = topologicalSort(opsToClone);
+  for (Region *region : wsOp.getPartitionRegions()) {
+    b.setInsertionPointToStart(&region->front());
+    IRMapping mapping;
+    for (Operation *op : opsToClone) {
+      Value copy = b.clone(*op, mapping)->getResult(0);
+      mapping.map(op->getResult(0), copy);
+      replaceAllUsesInRegionWith(op->getResult(0), copy, *region);
+    }
+  }
+
+  // Replace the leaves with explicit captures.
+  wsOp->insertOperands(wsOp.getNumOperands(), explicitCaptures);
+  for (Region *region : wsOp.getPartitionRegions()) {
+    for (Value capture : explicitCaptures) {
       BlockArgument arg =
           region->addArgument(capture.getType(), capture.getLoc());
       replaceAllUsesInRegionWith(capture, arg, *region);
     }
   }
-
   return success();
 }
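
For intuition about the sorting step, here is a self-contained sketch (the setup is invented for illustration; topologicalSort and the other MLIR APIs are real) showing a use-first discovery order being restored to def-before-use order:

#include <cassert>
#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "llvm/ADT/SetVector.h"

using namespace mlir;

int main() {
  MLIRContext ctx;
  ctx.loadDialect<arith::ArithDialect>();
  OpBuilder b(&ctx);
  auto module = ModuleOp::create(b.getUnknownLoc());
  b.setInsertionPointToStart(module.getBody());

  // %c = arith.constant 1 : i32
  // %e = arith.extsi %c : i32 to i64
  auto cst = b.create<arith::ConstantIntOp>(b.getUnknownLoc(), 1, 32);
  auto ext = b.create<arith::ExtSIOp>(b.getUnknownLoc(), b.getI64Type(), cst);

  // Simulate discovery from uses toward definitions: the user is found first.
  SetVector<Operation *> discovered;
  discovered.insert(ext);
  discovered.insert(cst);

  // topologicalSort restores def-before-use order: {constant, extsi}.
  SetVector<Operation *> sorted = topologicalSort(discovered);
  assert(sorted[0] == cst.getOperation() && sorted[1] == ext.getOperation());
  return 0;
}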

test/TritonGPU/partition-loops.mlir

Lines changed: 21 additions & 1 deletion
@@ -206,8 +206,8 @@ tt.func @trivial_tensor_captures(%arg0: f16, %lb: i32, %ub: i32, %step: i32) {
   // CHECK: ttg.warp_specialize(%arg1, %arg2, %arg3, %arg0)
   scf.for %i = %lb to %ub step %step : i32 {
     // CHECK: partition0(%arg4: i32, %arg5: i32, %arg6: i32, %arg7: f16) num_warps(4)
-    // CHECK-NEXT: [[SPLAT:%.*]] = tt.splat %arg7 : f16 -> tensor<32xf16>
     // CHECK-NEXT: [[RANGE:%.*]] = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    // CHECK-NEXT: [[SPLAT:%.*]] = tt.splat %arg7 : f16 -> tensor<32xf16>
     // CHECK-NEXT: scf.for
     // CHECK-NEXT: "use"([[RANGE]], [[SPLAT]])
     "use"(%0, %1) {ttg.partition = 1} : (tensor<256xi32>, tensor<32xf16>) -> ()
@@ -238,4 +238,24 @@ tt.func @dce_before_warp_allocation(%lb: i32, %ub: i32, %step: i32) {
   tt.return
 }
 
+// CHECK-LABEL: @capture_order
+tt.func public @capture_order(%arg0: i32) {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #blocked>
+  %1 = arith.extsi %0 : tensor<4xi32, #blocked> to tensor<4xi64, #blocked>
+  // CHECK: ttg.warp_specialize
+  // CHECK: partition0
+  // CHECK: [[VALUE:%.*]] = tt.make_range
+  // CHECK-NEXT: [[EXT:%.*]] = arith.extsi [[VALUE]]
+  // CHECK-NEXT: scf.for
+  scf.for %arg1 = %c0_i32 to %arg0 step %c1_i32 : i32 {
+    // CHECK-NEXT: "use"([[VALUE]])
+    "use"(%0) : (tensor<4xi32, #blocked>) -> ()
+    // CHECK-NEXT: "use"([[EXT]])
+    "use"(%1) : (tensor<4xi64, #blocked>) -> ()
+  } {ttg.partition.stages = [1 : i32, 0 : i32]}
+  tt.return
+}
+
 }
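
The new @capture_order test pins this down: %1 (arith.extsi) is defined in terms of %0 (tt.make_range), and both are used inside the loop. The CHECK lines insist that the rematerialized make_range appears before the extsi that consumes it inside partition0, an ordering the old discovery-order cloning did not guarantee.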
