[NVWS] pass on stage/cluster attributes (#7649)

3gx · web-flow · commit 70f902087bfa · 2025-07-25T21:47:03.000Z
Adds stage/cluster attributes to `try_wait/arrive/commit` in the `lower-aref` pass. A lit test will be added once PR triton-lang/triton#7648 is merged, because it depends on a lit test in that PR.
diff --git a/include/triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h b/include/triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h
diff --git a/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp b/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp
@@ -1,4 +1,3 @@
-#include "PartitionBuilder.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dominance.h"
@@ -9,6 +8,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
diff --git a/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.cpp b/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.cpp
@@ -1,4 +1,4 @@
-#include "PartitionBuilder.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 
diff --git a/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/RewritePartitionDependencies.cpp b/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/RewritePartitionDependencies.cpp
@@ -1,10 +1,10 @@
-#include "PartitionBuilder.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
 #include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
diff --git a/test/NVWS/lower_aref.mlir b/test/NVWS/lower_aref.mlir
@@ -24,45 +24,45 @@ module attributes {"ttg.num-warps" = 4 : i32} {
     partition0 num_warps(4) {
       scf.for %arg3 = %arg0 to %arg1 step %arg2  : i32 {
         %2 = "op_a"() : () -> tensor<1xi32, #blocked>
-        %3 = nvws.aref.put.enter %1[%c0_i32, %c0_i32] : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]> -> !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1>
+        %3 = nvws.aref.put.enter %1[%c0_i32, %c0_i32] {loop.cluster = 1 : i32, loop.stage = 3 : i32}: <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]> -> !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1>
         ttg.local_store %2, %3 : tensor<1xi32, #blocked> -> !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1>
         // CHECK: op_a
         // CHECK-NEXT: [[EMPTYMBAR:%.*]] = ttg.memdesc_index [[EMPTY]]
-        // CHECK-NEXT: ttng.wait_barrier [[EMPTYMBAR]]
+        // CHECK-NEXT: ttng.wait_barrier [[EMPTYMBAR]], {{.*}} {loop.cluster = 1 : i32, loop.stage = 3 : i32}
         // CHECK: local_store
         // CHECK-NEXT: [[FULLMBAR:%.*]] = ttg.memdesc_index [[FULL]]
-        // CHECK-NEXT: ttng.arrive_barrier [[FULLMBAR]], 1
-        nvws.aref.put.exit %1[%c0_i32] [#nvws.async_op<none>] : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]>
+        // CHECK-NEXT: ttng.arrive_barrier [[FULLMBAR]], 1 {loop.cluster = 1 : i32, loop.stage = 3 : i32}
+        nvws.aref.put.exit %1[%c0_i32] [#nvws.async_op<none>] {loop.cluster = 1 : i32, loop.stage = 3 : i32} : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]>
       }
       nvws.warp_group.yield
     }
     partition1 num_warps(4) {
       scf.for %arg3 = %arg0 to %arg1 step %arg2  : i32 {
         // CHECK: [[FULLMBAR:%.*]] = ttg.memdesc_index [[FULL]]
-        // CHECK-NEXT: ttng.wait_barrier [[FULLMBAR]]
+        // CHECK-NEXT: ttng.wait_barrier [[FULLMBAR]], {{.*}} {loop.cluster = 2 : i32, loop.stage = 3 : i32}
         // CHECK: [[VAL:%.*]] = ttg.local_load
         // CHECK-NEXT: [[EMPTYMBAR:%.*]] = ttg.memdesc_index [[EMPTY]]
-        // CHECK-NEXT: ttng.arrive_barrier [[EMPTYMBAR]], 1
+        // CHECK-NEXT: ttng.arrive_barrier [[EMPTYMBAR]], 1 {loop.cluster = 2 : i32, loop.stage = 3 : i32}
         // CHECK: "op_b"([[VAL]])
-        %2 = nvws.aref.get.enter %1[%c0_i32, %c0_i32] : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]> -> !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1>
+        %2 = nvws.aref.get.enter %1[%c0_i32, %c0_i32] {loop.cluster = 2 : i32, loop.stage = 3 : i32}: <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]> -> !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1>
         %3 = ttg.local_load %2 : !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1> -> tensor<1xi32, #blocked>
-        nvws.aref.get.exit %1[%c0_i32] [#nvws.async_op<none>] : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]>
+        nvws.aref.get.exit %1[%c0_i32] [#nvws.async_op<none>] {loop.cluster = 2 : i32, loop.stage = 3 : i32} : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]>
         "op_b"(%3) : (tensor<1xi32, #blocked>) -> ()
       }
       nvws.warp_group.return
     }
     partition2 num_warps(4) {
       scf.for %arg3 = %arg0 to %arg1 step %arg2  : i32 {
         // CHECK: [[FULLMBAR:%.*]] = ttg.memdesc_index [[FULL]]
-        // CHECK-NEXT: ttng.wait_barrier [[FULLMBAR]]
+        // CHECK-NEXT: ttng.wait_barrier [[FULLMBAR]], {{.*}} {loop.cluster = 3 : i32, loop.stage = 4 : i32}
         // CHECK: [[VAL:%.*]] = ttg.local_load
         // CHECK-NEXT: [[EMPTYMBAR:%.*]] = ttg.memdesc_index [[EMPTY]]
-        // CHECK-NEXT: ttng.arrive_barrier [[EMPTYMBAR]], 1
+        // CHECK-NEXT: ttng.arrive_barrier [[EMPTYMBAR]], 1 {loop.cluster = 3 : i32, loop.stage = 4 : i32}
         // CHECK: "op_c"([[VAL]])
         // CHECK: "op_d"([[VAL]])
-        %2 = nvws.aref.get.enter %1[%c0_i32, %c0_i32] : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]> -> !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1>
+        %2 = nvws.aref.get.enter %1[%c0_i32, %c0_i32] {loop.cluster = 3 : i32, loop.stage = 4 : i32}: <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]> -> !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1>
         %3 = ttg.local_load %2 : !ttg.memdesc<1xi32, #shared, #smem, mutable, 2x1> -> tensor<1xi32, #blocked>
-        nvws.aref.get.exit %1[%c0_i32] [#nvws.async_op<none>] : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]>
+        nvws.aref.get.exit %1[%c0_i32] [#nvws.async_op<none>] {loop.cluster = 3 : i32, loop.stage = 4 : i32} : <[!ttg.memdesc<2x1xi32, #shared, #smem, mutable>]>
         "op_c"(%3) : (tensor<1xi32, #blocked>) -> ()
         "op_d"(%3) : (tensor<1xi32, #blocked>) -> ()
       }
diff --git a/third_party/nvidia/lib/Dialect/NVWS/Transforms/LowerAref.cpp b/third_party/nvidia/lib/Dialect/NVWS/Transforms/LowerAref.cpp
@@ -37,6 +37,7 @@
 #include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
 #include "nvidia/include/Dialect/NVWS/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
@@ -61,6 +62,16 @@ namespace {
 
 // ----------------------------------------------------------------------------
 
+void assignStageCluster(Operation *op, StageCluster stageCluster,
+                        OpBuilder &builder) {
+  if (stageCluster) {
+    op->setAttr(triton::kLoopStageAttrName,
+                builder.getI32IntegerAttr(stageCluster->first));
+    op->setAttr(triton::kLoopClusterAttrName,
+                builder.getI32IntegerAttr(stageCluster->second));
+  }
+}
+
 struct ArefValue {
   Value emptyMbars;
   Value fullMbars;
@@ -266,7 +277,9 @@ LogicalResult rewritePutEnterOp(ArefCreateOp arefOp, ArefPutEnterOp op,
   // get empty barrier at a given stage
   Value emptyBarrier = getEmptyBarrier(rewriter, loc, arefVal, op.getStage());
 
-  rewriter.create<WaitBarrierOp>(loc, emptyBarrier, op.getPhase());
+  auto waitOp =
+      rewriter.create<WaitBarrierOp>(loc, emptyBarrier, op.getPhase());
+  assignStageCluster(waitOp, getStageCluster(op), rewriter);
   auto views = getSubViews(arefVal, op.getStage(), loc, rewriter);
   assert(views.size() == op.getResults().size());
 
@@ -287,7 +300,8 @@ LogicalResult rewriteGetEnterOp(ArefCreateOp arefOp, ArefGetEnterOp op,
   rewriter.setInsertionPointAfter(op);
 
   Value fullBarrier = getFullBarrier(rewriter, loc, arefVal, op.getStage());
-  rewriter.create<WaitBarrierOp>(loc, fullBarrier, op.getPhase());
+  auto waitOp = rewriter.create<WaitBarrierOp>(loc, fullBarrier, op.getPhase());
+  assignStageCluster(waitOp, getStageCluster(op), rewriter);
   auto views = getSubViews(arefVal, op.getStage(), loc, rewriter);
   assert(views.size() == op.getResults().size());
 
@@ -298,17 +312,19 @@ LogicalResult rewriteGetEnterOp(ArefCreateOp arefOp, ArefGetEnterOp op,
 }
 
 LogicalResult insertArriveBarrier(Location loc, ArrayAttr asyncOps,
-                                  PatternRewriter &rewriter, Value mbar) {
+                                  PatternRewriter &rewriter, Value mbar,
+                                  StageCluster stageCluster) {
   for (auto asyncOp : asyncOps) {
     auto asyncOpEnum = cast<AsyncOpAttr>(asyncOp).getValue();
+    Operation *arriveOp = {};
     switch (asyncOpEnum) {
     case AsyncOp::NONE:
     case AsyncOp::WGMMA:
-      rewriter.create<nvidia_gpu::ArriveBarrierOp>(loc, mbar, 1);
+      arriveOp = rewriter.create<nvidia_gpu::ArriveBarrierOp>(loc, mbar, 1);
       break;
     case AsyncOp::TC5MMA:
     case AsyncOp::TMEMCopy:
-      rewriter.create<nvidia_gpu::TCGen5CommitOp>(loc, mbar);
+      arriveOp = rewriter.create<nvidia_gpu::TCGen5CommitOp>(loc, mbar);
       break;
 
     case AsyncOp::TMALoad:
@@ -318,6 +334,8 @@ LogicalResult insertArriveBarrier(Location loc, ArrayAttr asyncOps,
     default:
       llvm_unreachable("unknown async op");
     }
+    if (arriveOp)
+      assignStageCluster(arriveOp, stageCluster, rewriter);
   }
 
   return success();
@@ -328,15 +346,17 @@ LogicalResult rewritePutExitOp(ArefPutExitOp op, PatternRewriter &rewriter,
   auto loc = op->getLoc();
   rewriter.setInsertionPointAfter(op);
   Value fullBarrier = getFullBarrier(rewriter, loc, arefVal, op.getStage());
-  return insertArriveBarrier(loc, op.getAsyncOps(), rewriter, fullBarrier);
+  return insertArriveBarrier(loc, op.getAsyncOps(), rewriter, fullBarrier,
+                             getStageCluster(op));
 }
 
 LogicalResult rewriteGetExitOp(ArefGetExitOp op, PatternRewriter &rewriter,
                                ArefValue arefVal) {
   auto loc = op->getLoc();
   rewriter.setInsertionPointAfter(op);
   Value emptyBarrier = getEmptyBarrier(rewriter, loc, arefVal, op.getStage());
-  return insertArriveBarrier(loc, op.getAsyncOps(), rewriter, emptyBarrier);
+  return insertArriveBarrier(loc, op.getAsyncOps(), rewriter, emptyBarrier,
+                             getStageCluster(op));
 }
 
 LogicalResult rewriteArefDestroyOp(ArefDestroyOp op, PatternRewriter &rewriter,

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-#include "PartitionBuilder.h"`
	`1`	`+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"`
`2`	`2`	`#include "triton/Dialect/TritonGPU/Transforms/Partition.h"`
`3`	`3`	`#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"`
`4`	`4`