
Commit 4b184cc

patch workaround by correctly setting stage/cluster attributes (#8797)
* Patches the workaround for the loop scheduler by using the stage/cluster from the previous tmem access op in the partition to set the stage/cluster for the put.exit op and, if needed, for the follow-up put.enter op.
1 parent 046ab0e commit 4b184cc
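
The gist of the change: instead of hard-coding loop.stage = 1 on the enter/exit if-ops, the aref state now remembers, per partition, the stage/cluster of the last tmem access op that owned the buffer, and the put.exit (and, when needed, the follow-up put.enter) fall back to that recorded value when the op at the insertion point carries no scheduling attributes. Below is a minimal standalone sketch of that bookkeeping; ArefState, StageCluster, recordAccess, and stageClusterForRelease are illustrative stand-ins (with std::unordered_map in place of LLVM's DenseMap), not the pass's real types.

// Minimal sketch of the per-partition stage/cluster fallback used by the patch.
// All names here are illustrative stand-ins for the pass's real data structures.
#include <cstdint>
#include <iostream>
#include <optional>
#include <unordered_map>

using PartitionId = int32_t;

struct StageCluster {
  int32_t stage = 0;
  int32_t cluster = 0;
};

struct ArefState {
  // Last stage/cluster seen for a tmem access in each partition
  // (stands in for the DenseMap<PartitionId, StageCluster> added by the commit).
  std::unordered_map<PartitionId, std::optional<StageCluster>> stageClusters;

  // Called when a tmem load/store/mma op is rewritten: remember its
  // scheduling attributes for its partition.
  void recordAccess(std::optional<PartitionId> partition,
                    std::optional<StageCluster> sc) {
    if (partition)
      stageClusters[*partition] = sc;
  }

  // Called when emitting put.exit: if the insertion point has no explicit
  // stage/cluster, reuse the one recorded for the current partition.
  std::optional<StageCluster>
  stageClusterForRelease(std::optional<PartitionId> partition) const {
    if (!partition)
      return std::nullopt; // outside the warp-specialized region
    auto it = stageClusters.find(*partition);
    if (it == stageClusters.end())
      return std::nullopt;
    return it->second;
  }
};

int main() {
  ArefState state;
  // The MMA in partition 1 carries {stage = 2, cluster = 2}, as in the new test.
  state.recordAccess(1, StageCluster{2, 2});
  // The scf.if that ends the accumulator's lifetime has no stage/cluster of its
  // own, so the put.exit falls back to the recorded value.
  if (auto sc = state.stageClusterForRelease(1))
    std::cout << "put.exit gets stage " << sc->stage << ", cluster "
              << sc->cluster << "\n";
  return 0;
}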

File tree

2 files changed: +78 −14 lines changed


test/NVWS/aref-tmem-insertion.mlir

Lines changed: 54 additions & 0 deletions
@@ -788,3 +788,57 @@ module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#blocked3 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
+module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
+  // CHECK-LABEL: @if_split_workaround
+  tt.func @if_split_workaround(%arg0: !tt.tensordesc<tensor<1x64xf16, #shared>>, %arg1: tensor<64x128x!tt.ptr<f16>, #blocked3> {tt.contiguity = dense<[1, 64]> : tensor<2xi32>, tt.divisibility = dense<16> : tensor<2xi32>}) {
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %true = arith.constant true
+    %false = arith.constant false
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
+    %c32_i32 = arith.constant 32 : i32
+    %result, %token = ttng.tmem_alloc : () -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
+    %0 = ttng.tmem_store %cst, %result[%token], %true : tensor<128x128xf32, #blocked> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+    // CHECK: scf.for
+    %1:3 = scf.for %arg2 = %c0_i32 to %c32_i32 step %c1_i32 iter_args(%arg3 = %true, %arg4 = %arg1, %arg5 = %0) -> (i1, tensor<64x128x!tt.ptr<f16>, #blocked3>, !ttg.async.token) : i32 {
+      %2:3 = "get_offsets"(%arg2) {loop.cluster = 3 : i32, loop.stage = 0 : i32, ttg.partition = array<i32: 1, 2>} : (i32) -> (i32, tensor<64x128xi32, #blocked3>, i32)
+      %3 = tt.splat %2#0 {loop.cluster = 3 : i32, loop.stage = 0 : i32, ttg.partition = array<i32: 2>} : i32 -> tensor<128xi32, #blocked2>
+      %4 = tt.descriptor_gather %arg0[%3, %2#2] {loop.cluster = 3 : i32, loop.stage = 0 : i32, ttg.partition = array<i32: 2>} : (!tt.tensordesc<tensor<1x64xf16, #shared>>, tensor<128xi32, #blocked2>, i32) -> tensor<128x64xf16, #blocked1>
+      %5 = tt.addptr %arg4, %2#1 {loop.cluster = 3 : i32, loop.stage = 1 : i32, tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 64]> : tensor<2xi32>, tt.divisibility = dense<16> : tensor<2xi32>, ttg.partition = array<i32: 1>} : tensor<64x128x!tt.ptr<f16>, #blocked3>, tensor<64x128xi32, #blocked3>
+      %6 = tt.load %5 {loop.cluster = 3 : i32, loop.stage = 1 : i32, ttg.partition = array<i32: 1>} : tensor<64x128x!tt.ptr<f16>, #blocked3>
+      %7 = ttg.local_alloc %4 {loop.cluster = 2 : i32, loop.stage = 2 : i32, ttg.partition = array<i32: 2>} : (tensor<128x64xf16, #blocked1>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
+      %8 = ttg.local_alloc %6 {loop.cluster = 2 : i32, loop.stage = 2 : i32, ttg.partition = array<i32: 1>} : (tensor<64x128xf16, #blocked3>) -> !ttg.memdesc<64x128xf16, #shared, #smem>
+      // CHECK: tc_gen5_mma {{.*}} {loop.cluster = 2 : i32, loop.stage = 2 : i32
+      %9 = ttng.tc_gen5_mma %7, %8, %result[%arg5], %arg3, %true {loop.cluster = 2 : i32, loop.stage = 2 : i32, tt.self_latency = 1 : i32, ttg.partition = array<i32: 1>} : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+      %10 = arith.cmpi eq, %arg2, %c0_i32 {loop.cluster = 1 : i32, loop.stage = 3 : i32, ttg.partition = array<i32: 0, 1>} : i32
+      %11 = arith.select %10, %false, %true {loop.cluster = 1 : i32, loop.stage = 3 : i32, ttg.partition = array<i32: 1>} : i1
+      // CHECK: scf.if
+      // CHECK-NEXT: put.exit {{.*}} {loop.cluster = 2 : i32, loop.stage = 2 : i32
+      // CHECK: } {loop.cluster = 2 : i32, loop.stage = 2 : i32
+      // CHECK: scf.if
+      // CHECK: } {loop.cluster = 4 : i32, loop.stage = 3 : i32
+      // CHECK: scf.if
+      // CHECK-NEXT: put.enter {{.*}} {loop.cluster = 2 : i32, loop.stage = 2 : i32
+      // CHECK: } {loop.cluster = 2 : i32, loop.stage = 2 : i32
+      %12 = scf.if %10 -> (!ttg.async.token) {
+        %result_0, %token_1 = ttng.tmem_load %result[%9] {ttg.partition = array<i32: 0>} : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+        "acc_user"(%result_0) {ttg.partition = array<i32: 0>} : (tensor<128x128xf32, #blocked>) -> ()
+        scf.yield {ttg.partition = array<i32: 0, 1>} %token_1 : !ttg.async.token
+      } else {
+        scf.yield {ttg.partition = array<i32: 0, 1>} %9 : !ttg.async.token
+      } {loop.cluster = 4 : i32, loop.stage = 3 : i32, ttg.partition = array<i32: 0, 1>, ttg.partition.outputs = [array<i32: 1>]}
+      scf.yield {ttg.partition = array<i32: 0, 1, 2>} %11, %5, %12 : i1, tensor<64x128x!tt.ptr<f16>, #blocked3>, !ttg.async.token
+    } {tt.disallow_acc_multi_buffer, tt.num_stages = 3 : i32, tt.scheduled_max_stage = 3 : i32, tt.warp_specialize, ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 1>, array<i32: 1>, array<i32: 1>], ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32], ttg.warp_specialize.tag = 2 : i32}
+    tt.return
+  }
+}

third_party/nvidia/lib/Dialect/NVWS/Transforms/InsertTmemAref.cpp

Lines changed: 24 additions & 14 deletions
@@ -402,10 +402,15 @@ struct TMEMAref {
       token = op.getToken();
     }
     partitionId = paritionIdStageCluster.first;
+    if (partitionId)
+      stageClusters[*partitionId] = paritionIdStageCluster.second;
     buffer = {};
   }
-  void release(OpBuilder &b, Location loc, StageCluster stageCluster) {
+  void release(OpBuilder &b, Location loc) {
     assert(asyncOp);
+    StageCluster stageCluster;
+    if (partitionId)
+      stageCluster = stageClusters[*partitionId];
     if (kind == PUT) {
       createInto<ArefPutExitOp>(
           b, loc, {partitionId, stageCluster}, aref, token,
@@ -447,6 +452,7 @@ struct TMEMAref {
   Kind kind;
   std::optional<PartitionId> partitionId;
   std::optional<AsyncOp> asyncOp;
+  DenseMap<PartitionId, StageCluster> stageClusters;
 };
 
 TmemAccessDag::Node *
@@ -458,25 +464,18 @@ insertTmemArefImpl(TmemAccessDag::Node *node,
   if (curPartitionId && node->partitionId != curPartitionId) {
     OpBuilder b(node->op);
     Operation *prevOp = nullptr;
-    StageCluster prevStageCluster;
     if (node->parent) {
       // release right after the last op which owns the tmem
       prevOp = node->parent->op;
       b.setInsertionPointAfter(prevOp);
-      prevStageCluster = getStageCluster(prevOp);
     } else {
       // if we are inside if-stmt or for-stmt subdag and need to change
       // ownerhip, release at the top of the block
       // the parentDag op would be if-stmt or for-stmt
       prevOp = node->parentDag->op;
       b.setInsertionPointToStart(node->op->getBlock());
     }
-    if (!node->partitionId) {
-      // if node->partitionId is not set, it means we are outside ws-region
-      // reset prevPartitionId and prevStageCluster to defaults
-      prevStageCluster = {};
-    }
-    state.release(b, prevOp->getLoc(), prevStageCluster);
+    state.release(b, prevOp->getLoc());
 
     // acquire right before op that acquires ownership of tmem
     auto curOp = node->op;
@@ -489,6 +488,10 @@ insertTmemArefImpl(TmemAccessDag::Node *node,
       curOp = node->parentDag->op;
     }
     auto stageCluster = getStageCluster(curOp);
+    // if stage-cluster is empty, use the stage-cluster used from the last op
+    // that acquired ownership of tmem in a partition
+    if (!stageCluster && partitionId)
+      stageCluster = state.stageClusters[*partitionId];
     state.acquire(b, curOp->getLoc(), {partitionId, stageCluster});
   }
 
@@ -519,16 +522,22 @@ insertTmemArefImpl(TmemAccessDag::Node *node,
 
   OpBuilder b(node->op);
   if (auto tmemLoadOp = dyn_cast<TMEMLoadOp>(node->op)) {
+    if (auto id = node->partitionId)
+      state.stageClusters[*id] = getStageCluster(node->op);
     tmemLoadOp.getSrcMutable().assign(
         state.getBuffer(b, node->partitionId, node->op));
     tmemLoadOp.getDepMutable().clear();
    tmemLoadOp.getToken().replaceAllUsesWith(state.replToken);
   } else if (auto tmemStoreOp = dyn_cast<TMEMStoreOp>(node->op)) {
+    if (auto id = node->partitionId)
+      state.stageClusters[*id] = getStageCluster(node->op);
     tmemStoreOp.getDstMutable().assign(
         state.getBuffer(b, node->partitionId, node->op));
     tmemStoreOp.getDepMutable().clear();
     tmemStoreOp.getToken().replaceAllUsesWith(state.replToken);
   } else if (auto mmaOp = dyn_cast<MMAv5OpInterface>(node->op)) {
+    if (auto id = node->partitionId)
+      state.stageClusters[*id] = getStageCluster(node->op);
     if (mmaOp.getAccumulator() == state.origBuffer) {
       mmaOp.getAccDepMutable().clear();
       mmaOp.getToken().replaceAllUsesWith(state.replToken);
@@ -640,10 +649,11 @@ LogicalResult insertTmemAref(TmemAccessDag &accessDag) {
     // aref is used outside ws-loop, find the last point in the same block as
     // create op to have matching exit
     auto op1 = arefOp->getBlock()->findAncestorOpInBlock(*node->op);
+    if (auto id = node->partitionId)
+      state.stageClusters[*id] = {};
     b.setInsertionPointAfter(op1);
   }
-  stageCluster = getStageCluster(node->op);
-  state.release(b, node->op->getLoc(), stageCluster);
+  state.release(b, node->op->getLoc());
 
   if (state.kind == TMEMAref::GET) {
     // When the state ends up in a GET operation, we need to acquire and release
@@ -661,7 +671,7 @@ LogicalResult insertTmemAref(TmemAccessDag &accessDag) {
       }
     }
     state.acquire(b, node->op->getLoc(), {otherPartitionId, {}});
-    state.release(b, node->op->getLoc(), {});
+    state.release(b, node->op->getLoc());
   }
 
   return success();
@@ -751,8 +761,8 @@ void workaroundForLoopScheduler(triton::FuncOp funcOp) {
   // patch loop.stage=1
   enterIf->setAttrs(ifOp->getAttrs());
   exitIf->setAttrs(ifOp->getAttrs());
-  enterIf->setAttr(kLoopStageAttrName, b.getI32IntegerAttr(1));
-  exitIf->setAttr(kLoopStageAttrName, b.getI32IntegerAttr(1));
+  assignStage(b, enterIf, getStageCluster(putEnterOp));
+  assignStage(b, exitIf, getStageCluster(putExitOp));
 
   SetVector<int> enterExitIds, middleIds;
   enterExitIds.insert(1);
