
Commit d90b234

[Warp Specialization] Fix accidentally assigning ops to default partition (#6777)
Ops that have no backward dependency on any other partition are expected to be left in the root partition, because it is easier to let later passes handle rematerialization than to have higher-level passes duplicate them and assign explicit stages. Accidentally assigning partitions to these ops causes later passes to fail. This change also makes those passes hard-fail instead of silently skipping the loop, because the code generated by warp specialization can be invalid if it is not split: the partitions expect to run concurrently.
1 parent 44ecbec commit d90b234
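
To make the intent concrete, the sketch below (not part of the commit) shows the kind of check a helper like hasDefPartition, called in the first hunk of LoadMMASpecialization.cpp below, could perform: walk the backward slice of an op within the loop and see whether it reaches an op that already has a partition. Only WarpSchedule::isScheduled, scf::ForOp, and Block::findAncestorOpInBlock are taken from the diff or upstream MLIR; the function name, signature, and simplifications (direct operands only, no nested-region operands) are assumptions for illustration.

    // Illustrative sketch only -- not the helper from this commit.
    #include "mlir/Dialect/SCF/IR/SCF.h"
    #include "mlir/IR/Operation.h"
    #include "llvm/ADT/SetVector.h"

    using namespace mlir;

    // WarpSchedule is the schedule class used by these passes; its header is
    // part of the TritonGPU warp specialization transforms (include omitted).
    // Returns true if `op` (directly nested in `loop`) transitively depends on
    // another op in the loop body that is already assigned to a partition.
    static bool dependsOnPartitionedOp(scf::ForOp loop, Operation *op,
                                       WarpSchedule &schedule) {
      llvm::SetVector<Operation *> worklist;
      worklist.insert(op);
      for (unsigned i = 0; i < worklist.size(); ++i) {
        Operation *cur = worklist[i];
        if (cur != op && schedule.isScheduled(cur))
          return true; // reaches an op that already has a partition
        for (Value operand : cur->getOperands()) {
          Operation *def = operand.getDefiningOp();
          if (!def)
            continue; // block argument (e.g. loop iter_arg): stop here
          // Normalize to the op directly nested in the loop body, mirroring
          // the findAncestorOpInBlock calls in the pass. The real pass also
          // walks operands captured by nested regions (getNestedOperands).
          if (Operation *anc = loop.getBody()->findAncestorOpInBlock(*def))
            worklist.insert(anc);
        }
      }
      return false; // no def in any partition: keep `op` in the root partition
    }

Ops for which such a check returns false are exactly the ones the commit leaves in the root partition instead of pulling them into the partition of their consumer.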

File tree

6 files changed: +198 -131 lines changed


lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 3 additions & 2 deletions
@@ -229,7 +229,8 @@ static void scheduleDependencies(scf::ForOp loop, WarpSchedule &schedule,

     Operation *defOp =
         loop.getBody()->findAncestorOpInBlock(*dep.getDefiningOp());
-    if (!defOp || !schedule.trySchedule(partition, defOp))
+    if (!defOp || !hasDefPartition(loop, defOp, schedule) ||
+        !schedule.trySchedule(partition, defOp))
       continue;
     llvm::append_range(deps, getNestedOperands(defOp));
   }
@@ -392,7 +393,7 @@ void propagatePartitions(scf::ForOp loop, WarpSchedule &schedule) {
   // For each partition, place users of its outputs in a cluster if it is not
   // already assigned to a partition.
   auto useCallback = [&](OpResult result, OpOperand &use, unsigned distance) {
-    Operation *user = use.getOwner();
+    Operation *user = loop.getBody()->findAncestorOpInBlock(*use.getOwner());
     if (!schedule.isScheduled(user)) {
       // Add the current partition as a def to the cluster.
       opClusters.getOrCreate(user)->defPartitions.insert(&partition);
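
A note on the second hunk: the user of a partition output can be nested inside a region of the loop body (for example in an scf.if), while the schedule tracks only ops directly nested in the loop. The helper below is a hypothetical illustration of the MLIR Block::findAncestorOpInBlock call the hunk relies on; its name and signature are not from the commit.

    #include "mlir/Dialect/SCF/IR/SCF.h"
    #include "mlir/IR/Operation.h"

    using namespace mlir;

    // Hypothetical helper (illustration only): map a use to the op the warp
    // schedule actually tracks, i.e. the ancestor of the user that is directly
    // nested in the loop body. findAncestorOpInBlock walks up the parent chain
    // of the user until it reaches an op whose parent block is the loop body,
    // and returns nullptr if the user is not inside the loop at all.
    static Operation *getTopLevelUserInLoop(scf::ForOp loop, OpOperand &use) {
      return loop.getBody()->findAncestorOpInBlock(*use.getOwner());
    }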

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp

Lines changed: 1 addition & 1 deletion
@@ -269,6 +269,6 @@ void PartitionLoops::runOnOperation() {

   for (scf::ForOp loop : loops) {
     if (failed(partitionLoop(loop)))
-      continue;
+      return signalPassFailure();
   }
 }

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/RewritePartitionDependencies.cpp

Lines changed: 1 addition & 1 deletion
@@ -568,6 +568,6 @@ void RewritePartitionDependencies::runOnOperation() {

   for (scf::ForOp loop : loops) {
     if (failed(rewritePartitionDependencies(loop)))
-      continue;
+      return signalPassFailure();
   }
 }

test/TritonGPU/load-mma-specialization.mlir

Lines changed: 4 additions & 31 deletions
@@ -172,54 +172,27 @@ tt.func @unsupported_load() {
   // CHECK-NEXT: [[DONE_MBAR0:%.*]] = ttg.memdesc_subview [[DONE_MBAR]][%c0_i32]
   // CHECK-NEXT: ttng.init_barrier [[DONE_MBAR0]], 1

-  // CHECK-NEXT: [[A_SHARED:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<128x64xf16,
-  // CHECK-NEXT: [[B_SHARED:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16,
-
-  // CHECK-NEXT: [[OPER_EMPTY_MBAR:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1xi64
-  // CHECK-NEXT: [[OPER_EMPTY_MBAR0:%.*]] = ttg.memdesc_subview [[OPER_EMPTY_MBAR]][%c0_i32]
-  // CHECK-NEXT: init_barrier [[OPER_EMPTY_MBAR0]], 1
-
-  // CHECK-NEXT: [[OPER_READY_MBAR:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1xi64
-  // CHECK-NEXT: [[OPER_READY_MBAR0:%.*]] = ttg.memdesc_subview [[OPER_READY_MBAR]][%c0_i32]
-  // CHECK-NEXT: init_barrier [[OPER_READY_MBAR0]], 1
-
-  // CHECK-NEXT: arrive_barrier [[OPER_EMPTY_MBAR]], 1
-
   // CHECK-NEXT: scf.for
   scf.for %k = %c0_i32 to %k_tiles step %c1_i32 iter_args(%acc = %zero) -> tensor<128x128xf32, #acc_layout> : i32 {
     // CHECK-NEXT: get_ptrs
     %a_ptrs, %b_ptrs = "get_ptrs"(%k) : (i32) -> (tensor<128x64x!tt.ptr<f16>, #oper_layout>, tensor<64x128x!tt.ptr<f16>, #oper_layout>)
-    // CHECK-NEXT: [[A:%.*]] = tt.load
     %a = tt.load %a_ptrs : tensor<128x64x!tt.ptr<f16>, #oper_layout>
-    // CHECK-NEXT: [[B:%.*]] = tt.load
     %b = tt.load %b_ptrs : tensor<64x128x!tt.ptr<f16>, #oper_layout>

-    // CHECK-NEXT: wait_barrier [[OPER_EMPTY_MBAR]]
-    // CHECK-NEXT: local_store [[A]], [[A_SHARED]]
     %a_shared = ttg.local_alloc %a : (tensor<128x64xf16, #oper_layout>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
-    // CHECK-NEXT: local_store [[B]], [[B_SHARED]]
     %b_shared = ttg.local_alloc %b : (tensor<64x128xf16, #oper_layout>) -> !ttg.memdesc<64x128xf16, #shared, #smem>
-    // CHECK-NEXT: arrive_barrier [[OPER_READY_MBAR]], 1

     %c_tmem, %c_tok = ttng.tmem_alloc %acc : (tensor<128x128xf32, #acc_layout>) -> (!ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
-    // CHECK-NEXT: [[IS_LAST:%.*]] = arith.cmpi eq, %{{.*}}, %c31_i32
-    // CHECK-NEXT: wait_barrier [[OPER_READY_MBAR]]
-    // CHECK-NEXT: ttng.tc_gen5_mma %{{.*}}, [[ACC]][], %true, %true, [[DONE_MBAR0]][[[IS_LAST]]], [[OPER_EMPTY_MBAR]][%true] {ttg.partition = 1 : i32}
+    // CHECK: [[IS_LAST:%.*]] = arith.cmpi eq, %{{.*}}, %c31_i32
+    // CHECK-NEXT: ttng.tc_gen5_mma %{{.*}}, [[ACC]][], %true, %true, [[DONE_MBAR0]][[[IS_LAST]]] {ttg.partition = 1 : i32}
     %mma_tok = ttng.tc_gen5_mma %a_shared, %b_shared, %c_tmem[%c_tok], %true, %true : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable>
     %c, %load_tok = ttng.tmem_load %c_tmem[%mma_tok] : !ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #acc_layout>

-    // CHECK-NEXT: [[NEXT_PHASE:%.*]] = arith.xori
-    // CHECK-NEXT: yield [[NEXT_PHASE]]
-
     scf.yield %c : tensor<128x128xf32, #acc_layout>
-  // CHECK-NEXT: ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32]
+  // CHECK: ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32]
   } {tt.warp_specialize}

   // CHECK-NEXT: ttng.wait_barrier [[DONE_MBAR0]], %c0_i32
-  // CHECK-NEXT: ttng.inval_barrier [[OPER_READY_MBAR0]]
-  // CHECK-NEXT: ttg.local_dealloc [[OPER_READY_MBAR]]
-  // CHECK-NEXT: ttng.inval_barrier [[OPER_EMPTY_MBAR0]]
-  // CHECK-NEXT: ttg.local_dealloc [[OPER_EMPTY_MBAR]]
   // CHECK-NEXT: ttng.inval_barrier [[DONE_MBAR0]]
   // CHECK-NEXT: ttg.local_dealloc [[DONE_MBAR]]

@@ -749,7 +722,7 @@ tt.func @matmul_tma_acc_with_conditional_def_and_use_no_multibuf_flag(
     %b_shared = ttg.local_alloc %b : (tensor<64x128xf16, #oper_layout>) -> !ttg.memdesc<64x128xf16, #shared, #smem>
     %c_tmem, %c_tok = ttng.tmem_alloc %acc : (tensor<128x128xf32, #acc_layout>) -> (!ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)

-    // CHECK-NEXT: [[DO_EPILOGUE:%.*]] = arith.cmpi eq, [[K:%.*]], %c0_i32
+    // CHECK-NEXT: [[DO_EPILOGUE:%.*]] = arith.cmpi eq, [[K:%.*]], %c0_i32 : i32
     // CHECK-NEXT: [[MMA_TOK:%.*]] = ttng.tc_gen5_mma %{{[0-9]+}}, %{{[0-9]+}}, [[ACC_BUF]][], [[FLAG]], %true, {{.*}}, [[ACC_READY_BUF0]][[[DO_EPILOGUE]]] {ttg.partition = 1 : i32}
     %mma_tok = ttng.tc_gen5_mma %a_shared, %b_shared, %c_tmem[%c_tok], %flag, %true : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable>
     %c, %load_tok = ttng.tmem_load %c_tmem[%mma_tok] : !ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #acc_layout>

test/TritonGPU/partition-loops.mlir

Lines changed: 20 additions & 11 deletions
@@ -1,20 +1,10 @@
-// RUN: triton-opt %s -allow-unregistered-dialect -tritongpu-partition-loops -verify-diagnostics -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -tritongpu-partition-loops -verify-diagnostics -canonicalize | FileCheck %s

 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 !ty = tensor<1xi32, #blocked>

 module attributes {"ttg.num-warps" = 4 : i32} {

-tt.func @still_has_ssa_deps(%lb: i32, %ub: i32, %step: i32) {
-  scf.for %i = %lb to %ub step %step : i32 {
-    // expected-warning @below {{non-root partition #0 has direct SSA consumer}}
-    %0 = "op_a"() {ttg.partition = 0} : () -> !ty
-    // expected-note @below {{use at distance 0 in partition #1 here}}
-    "op_b"(%0) {ttg.partition = 1} : (!ty) -> ()
-  } {ttg.partition.stages = [0, 1]}
-  tt.return
-}
-
 // CHECK-LABEL: @no_partitions
 tt.func @no_partitions(%lb: i32, %ub: i32, %step: i32) {
   // CHECK-NEXT: scf.for
@@ -259,3 +249,22 @@ tt.func public @capture_order(%arg0: i32) {
 }

 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+!ty = tensor<1xi32, #blocked>
+
+module attributes {"ttg.num-warps" = 4 : i32} {
+
+tt.func @still_has_ssa_deps(%lb: i32, %ub: i32, %step: i32) {
+  scf.for %i = %lb to %ub step %step : i32 {
+    // expected-warning @below {{non-root partition #0 has direct SSA consumer}}
+    %0 = "op_a"() {ttg.partition = 0} : () -> !ty
+    // expected-note @below {{use at distance 0 in partition #1 here}}
+    "op_b"(%0) {ttg.partition = 1} : (!ty) -> ()
+  } {ttg.partition.stages = [0, 1]}
+  tt.return
+}
+
+}
