Commit 4c6349d

[WS] fix assign-stage-phase to propagate to for-op control opnds (#8634)
- fixes assign-stage-phase to propagate the default partition to a for-op's control operands when needed, not just to the for-op itself
- tracks the root partition in aref-tmem-insertion, removing the ad-hoc skipping of tmem insertion
1 parent: bae3b79

4 files changed: +81 −16 lines changed
test/NVWS/aref-tmem-insertion.mlir

Lines changed: 6 additions & 4 deletions

@@ -558,15 +558,17 @@ module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
     // CHECK-NEXT: aref.create
     // CHECK-NEXT: aref.put.enter
     %result, %token = ttng.tmem_alloc : () -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
-    scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 : i32 {
-      %0 = ttg.local_alloc %arg1 : (tensor<128x64xf16, #blocked1>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
+    %5 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %token) -> (!ttg.async.token) : i32 {
+      %0 = ttg.local_alloc %arg1 {ttg.partition = array<i32: 0>} : (tensor<128x64xf16, #blocked1>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
       %1 = tt.descriptor_load %arg2[%arg3, %arg3] {ttg.partition = array<i32: 2>} : !tt.tensordesc<tensor<64x128xf16, #shared>> -> tensor<64x128xf16, #blocked1>
       %2 = arith.addf %1, %1 {ttg.partition = array<i32: 0>} : tensor<64x128xf16, #blocked1>
       %3 = ttg.local_alloc %2 {ttg.partition = array<i32: 0>} : (tensor<64x128xf16, #blocked1>) -> !ttg.memdesc<64x128xf16, #shared, #smem>
       // CHECK: aref.buffer
-      %4 = ttng.tc_gen5_mma %0, %3, %result[%token], %true, %true {ttg.partition = array<i32: 1>} : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
-    } {tt.num_stages = 2 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32], ttg.warp_specialize.tag = 18 : i32}
+      %4 = ttng.tc_gen5_mma %0, %3, %result[%arg4], %true, %true {ttg.partition = array<i32: 1>} : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+      scf.yield %4 : !ttg.async.token
+    } {ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 1>], tt.num_stages = 2 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32], ttg.warp_specialize.tag = 18 : i32}
     // CHECK: aref.put.exit
+    ttng.tmem_load %result[%5] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
     tt.return
   }
test/NVWS/assign_stage_phase.mlir

Lines changed: 52 additions & 0 deletions

@@ -674,3 +674,55 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
+module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
+  // CHECK-LABEL: @for_loop_control_operand_ppg
+  tt.func @for_loop_control_operand_ppg(%lb: i32, %ub: i32, %step: i32, %ptr0: !tt.ptr<i32>) {
+    %true = arith.constant true
+    %arefBuf = ttng.tmem_alloc : () -> !ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+    %aref = nvws.aref.create %arefBuf : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>
+    %_0, %tok = nvws.aref.put.enter %aref : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+    // CHECK: put.enter
+    // CHECK-NEXT: [[RET:%.*]]:5 = scf.for
+    %tok0 = scf.for %iv0 = %lb to %ub step %step iter_args(%tok1 = %tok) -> (!ttg.async.token) : i32 {
+      // CHECK-NEXT: tt.addptr {{.*}} {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: tt.load {{.*}} {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: "lb1"({{.*}}) {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: "step1"({{.*}}) {ttg.partition = array<i32: 0, 1, 2>}
+      %ptrub = tt.addptr %ptr0, %iv0 {ttg.partition = array<i32: 1, 2>} : !tt.ptr<i32>, i32
+      %ub1 = tt.load %ptrub {ttg.partition = array<i32: 1, 2>} : !tt.ptr<i32>
+      %lb1 = "lb1"(%iv0) {ttg.partition = array<i32: 1, 2>} : (i32) -> i32
+      %step1 = "step1"(%iv0) {ttg.partition = array<i32: 1, 2>} : (i32) -> i32
+      // CHECK-NEXT: [[RET1:%.*]]:3 = scf.for
+      %tok5 = scf.for %iv = %lb1 to %ub1 step %step1 iter_args(%tok2 = %tok1) -> (!ttg.async.token) : i32 {
+        %sA = "load1"(%iv) {ttg.partition = array<i32: 1>} : (i32) -> !ttg.memdesc<128x64xf32, #shared, #smem>
+        %sB = "load2"(%iv) {ttg.partition = array<i32: 1>} : (i32) -> !ttg.memdesc<64x128xf32, #shared, #smem>
+        %buf = nvws.aref.buffer %aref, %tok2 {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+        ttng.tc_gen5_mma %sA, %sB, %buf, %true, %true {ttg.partition = array<i32: 2>} : !ttg.memdesc<128x64xf32, #shared, #smem>, !ttg.memdesc<64x128xf32, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+        scf.yield {ttg.partition = array<i32: 1, 2>} %tok2 : !ttg.async.token
+      } {ttg.partition = array<i32: 1, 2>, ttg.partition.outputs = [array<i32: 2>]}
+      // CHECK: scf.yield
+      // CHECK-NEXT: {ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 2>, array<i32: 0, 2>, array<i32: 2>]}
+      // CHECK-NEXT: nvws.aref.put.exit {{.*}}[[[RET1]]#1]
+      nvws.aref.put.exit %aref, %tok5 [#nvws.async_op<tc5mma>] {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+      %_1, %token_2 = nvws.aref.get.enter %aref {ttg.partition = array<i32: 1>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+      nvws.aref.get.exit %aref, %token_2 [#nvws.async_op<none>] {ttg.partition = array<i32: 1>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+      %buf1, %tok6 = nvws.aref.put.enter %aref {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+      // CHECK: aref.put.enter
+      // CHECK-NEXT: scf.yield
+      scf.yield {ttg.partition = array<i32: 1, 2>} %tok6 : !ttg.async.token
+      // CHECK-NEXT: {tt.warp_specialize, ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 2>, array<i32: 0, 2>, array<i32: 2>, array<i32: 0, 1>, array<i32: 0, 1>]}
+    } {tt.warp_specialize, ttg.partition = array<i32: 1, 2>, ttg.partition.outputs = [array<i32: 2>]}
+    // CHECK-NEXT: aref.put.exit {{.*}}[[[RET]]#1]
+    nvws.aref.put.exit %aref, %tok0 [#nvws.async_op<tc5mma>] : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+    %_2, %token_2 = nvws.aref.get.enter %aref : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+    nvws.aref.get.exit %aref, %token_2 [#nvws.async_op<none>] : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+    tt.return
+  }
+}

third_party/nvidia/lib/Dialect/NVWS/Transforms/AssignStagePhase.cpp

Lines changed: 5 additions & 0 deletions

@@ -465,6 +465,11 @@ void visitBackwardSlice(scf::ForOp wsLoop, Value value,
       visitBackwardSlice(wsLoop,
                          forOp.getBody()->getTerminator()->getOperand(*pos),
                          callback, visited);
+      // visit control operands of for-op
+      for (int idx = 0; idx < forOp.getNumControlOperands(); ++idx) {
+        auto control = forOp.getOperand(idx);
+        visitBackwardSlice(wsLoop, control, callback, visited);
+      }
     }
   } else if (wsLoop.getBody()->findAncestorOpInBlock(*defOp)) {
     callback(defOp);

third_party/nvidia/lib/Dialect/NVWS/Transforms/InsertTmemAref.cpp

Lines changed: 18 additions & 12 deletions

@@ -246,23 +246,27 @@ struct TmemAccessDag {
     return accessDag;
   }

-  std::set<PartitionId> collectPartitions(Node *node) {
+  std::pair<bool, std::set<PartitionId>> collectPartitions(Node *node) {
     std::set<PartitionId> partitions;
+    bool hasRootPartition = false;
     if (node->partitionId)
       partitions.insert(*node->partitionId);

     while (node->user) {
       node = node->user.get();
       if (node->partitionId)
         partitions.insert(*node->partitionId);
+      else
+        hasRootPartition = true;
       for (auto &subDag : node->subDags) {
         if (subDag) {
-          auto ps = collectPartitions(subDag.get());
+          auto [rootPartition, ps] = collectPartitions(subDag.get());
+          hasRootPartition = hasRootPartition || rootPartition;
           partitions.insert(ps.begin(), ps.end());
         }
       }
     }
-    return partitions;
+    return {hasRootPartition, partitions};
   };

   void printNode(Node *node, int indent, llvm::raw_ostream &os) {
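To make the new contract concrete, here is a small self-contained model of the traversal (a toy node chain, not the real TmemAccessDag): users with an explicit partition id land in the set, while any untagged user flips hasRootPartition, marking the implicit root/default partition as an owner.

#include <iostream>
#include <memory>
#include <optional>
#include <set>
#include <utility>

// Toy stand-in for the access-DAG node: a linear chain of users, each
// optionally tagged with a partition id. Untagged users belong to the
// implicit root/default partition.
struct ToyNode {
  std::optional<int> partitionId;
  std::unique_ptr<ToyNode> user;
};

std::pair<bool, std::set<int>> collectPartitions(ToyNode *node) {
  std::set<int> partitions;
  bool hasRootPartition = false;
  if (node->partitionId)
    partitions.insert(*node->partitionId);
  while (node->user) {
    node = node->user.get();
    if (node->partitionId)
      partitions.insert(*node->partitionId);
    else
      hasRootPartition = true; // untagged user => root partition is an owner
  }
  return {hasRootPartition, partitions};
}

int main() {
  // Models: tmem_alloc -> tc_gen5_mma (@1) -> tmem_load (untagged, root)
  ToyNode root;
  root.user = std::make_unique<ToyNode>();
  root.user->partitionId = 1;
  root.user->user = std::make_unique<ToyNode>();
  auto [hasRoot, parts] = collectPartitions(&root);
  std::cout << hasRoot << " " << parts.size() << "\n"; // prints: 1 1
}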
@@ -273,20 +277,23 @@ struct TmemAccessDag {
     }
     std::set<PartitionId> partitions;
     os << "|- [" << node->op << "]";
+    bool hasRootPartition = false;
     if (node->partitionId)
       partitions.insert(*node->partitionId);
+    else
+      hasRootPartition = true;
     if (node->op) {
       os << node->op->getName().getStringRef() << " ";
       if (auto tmemAlloc = dyn_cast<TMEMAllocOp>(node->op)) {
         if (tmemAlloc.getSrc()) {
           os << " %src ";
         } else {
-          partitions = collectPartitions(node);
+          std::tie(hasRootPartition, partitions) = collectPartitions(node);
         }
       }
       os << " ";
     }
-    os << "[";
+    os << "[" << (hasRootPartition ? "root" : "") << "]";
     for (auto partition : partitions) {
       os << " @" << partition << " ";
     }
@@ -526,10 +533,6 @@ LogicalResult insertTmemAref(TmemAccessDag &accessDag) {
   auto rootNode = accessDag.getRootNode();
   auto allocOp = cast<TMEMAllocOp>(rootNode->op);

-  // do nothing for alloc with src, whose user is in the same partition
-  if (allocOp.getSrc() && rootNode->user->partitionId == rootNode->partitionId)
-    return success();
-
   std::optional<bool> isMultiStaged;
   for (auto user : allocOp.getResult().getUsers()) {
     if (auto mmaOp = dyn_cast<MMAv5OpInterface>(user)) {
@@ -613,7 +616,8 @@ LogicalResult insertTmemAref(TmemAccessDag &accessDag) {
   // the corresponding partition to prevent deadlocks. This is necessary
   // because if we're inside an outer loop, re-entering the loop without
   // posting a matching GET operation for the PUT would cause the dead-lock.
-  auto partitions = accessDag.collectPartitions(accessDag.getRootNode());
+  auto [hasRootPartition, partitions] =
+      accessDag.collectPartitions(accessDag.getRootNode());
   std::optional<int> otherPartitionId;
   // since we only have two partition, we just pick the other partition for
   // get
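The deadlock the comment describes can be pictured with a single-slot producer/consumer handshake. A loose analogy in plain C++, not the NVWS runtime: a binary semaphore stands in for the aref's empty/full tracking, put.enter for acquiring the slot, get.exit for releasing it.

#include <semaphore>

// Loose analogy only: a one-buffer aref behaves like a binary semaphore on
// the "slot is free" state. put.enter waits for a free slot; the matching
// get.exit frees it again.
std::binary_semaphore slotFree{1};

void putEnter() { slotFree.acquire(); } // waits until the slot is free
void getExit() { slotFree.release(); }  // consumer done; slot free again

int main() {
  for (int i = 0; i < 2; ++i) {
    putEnter();
    // ... producer fills the buffer, consumer drains it ...
    getExit(); // omit this and the second putEnter() never returns:
               // the re-entry deadlock the pass guards against
  }
}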
@@ -747,9 +751,11 @@ LogicalResult runOnFunction(triton::FuncOp funcOp) {

   for (auto &accessDag : tmemDags) {
     LLVM_DEBUG({ accessDag.printDag(llvm::dbgs()); });
-    auto partitions = accessDag.collectPartitions(accessDag.getRootNode());
+    auto [hasRootPartition, partitions] =
+        accessDag.collectPartitions(accessDag.getRootNode());
     assert(partitions.size() <= 2 && "expecting at most 2 partitions");
-    if (!partitions.empty())
+    auto totalOwners = hasRootPartition + partitions.size();
+    if (totalOwners > 1)
       if (failed(insertTmemAref(accessDag)))
         return failure();
   }

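The gating change is worth spelling out: previously any non-empty partition set triggered aref insertion, while the removed alloc-with-src early exit above patched over the single-owner case ad hoc. Now the implicit root partition counts as one owner alongside the explicit partitions, and insertion happens only when ownership is shared. A small sketch of the decision (needsAref is a hypothetical helper, not from the patch):

#include <cstddef>
#include <set>

// Sketch of the new gate: the implicit root/default partition counts as one
// owner alongside the explicitly assigned partitions; an aref is only needed
// once more than one owner touches the allocation.
bool needsAref(bool hasRootPartition, const std::set<int> &partitions) {
  // bool promotes to 0 or 1, matching `hasRootPartition + partitions.size()`.
  std::size_t totalOwners = hasRootPartition + partitions.size();
  return totalOwners > 1;
}

int main() {
  // Single explicit owner: no aref (the old !partitions.empty() check
  // would have inserted one here).
  bool a = needsAref(false, {1});
  // Explicit partition plus root-partition accesses: aref needed.
  bool b = needsAref(true, {1});
  // Root-partition accesses only: still a single owner, no aref.
  bool c = needsAref(true, {});
  return (a == false && b == true && c == false) ? 0 : 1;
}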