Skip to content

Commit 0ec07e7

Browse files
authored
[Warp Spec] Optimize partitioning by hoisting above broadcasts (#7692)
This PR adds a small optimization step to partitioning that transforms ```mlir %x = producer() partition=0 %y = broadcast %x : <Axf32> -> <AxBxf32> partition=0 use(%y) partition=1 ``` into ```mlir %x = producer() partition=0 %y = broadcast %x : <Axf32> -> <AxBxf32> partition=1 use(%y) partition=1 ``` in order to reduce the amount of shared memory needed for inter-partition data transfer.
1 parent 460deb6 commit 0ec07e7

File tree

4 files changed

+72
-13
lines changed

4 files changed

+72
-13
lines changed

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/Partition.cpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
33
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
44
#include "llvm/ADT/SCCIterator.h"
5+
#include "llvm/IR/Use.h"
56

67
using namespace mlir;
78
using namespace triton;
@@ -119,16 +120,14 @@ bool WarpSchedule::trySchedule(Partition *partition, Operation *op) {
119120

120121
FailureOr<WarpSchedule> WarpSchedule::deserialize(scf::ForOp loop) {
121122
auto stages = loop->getAttrOfType<ArrayAttr>(kPartitionStagesAttrName);
122-
if (!stages) {
123-
return mlir::emitWarning(loop.getLoc(), "missing '")
124-
<< kPartitionStagesAttrName << "' attribute";
125-
}
123+
if (!stages)
124+
return failure();
126125

127126
WarpSchedule result;
128127
for (auto [idx, attr] : llvm::enumerate(stages)) {
129128
auto stage = dyn_cast<IntegerAttr>(attr);
130129
if (!stage || stage.getInt() < 0) {
131-
return mlir::emitWarning(loop.getLoc(), "partition stages attribute '")
130+
return mlir::emitError(loop.getLoc(), "partition stages attribute '")
132131
<< kPartitionStagesAttrName << "' has invalid element " << attr;
133132
}
134133

@@ -140,10 +139,8 @@ FailureOr<WarpSchedule> WarpSchedule::deserialize(scf::ForOp loop) {
140139
Partition *partition = result.getRootPartition();
141140
if (auto attr = op.getAttrOfType<IntegerAttr>(kPartitionAttrName)) {
142141
int64_t idx = attr.getInt();
143-
if (idx < 0 || idx >= result.partitions.size()) {
144-
return mlir::emitWarning(op.getLoc(), "invalid partition index ")
145-
<< idx;
146-
}
142+
if (idx < 0 || idx >= result.partitions.size())
143+
return mlir::emitError(op.getLoc(), "invalid partition index ") << idx;
147144
partition = result.partitions[idx].get();
148145
}
149146
result.insert(partition, &op);

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionScheduling.cpp

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,14 @@ static void scheduleUsers(scf::ForOp loop, WarpSchedule &schedule,
149149
// first-order partition assignment to the operations in the scheme and its
150150
// users and/or dependencies. This sets up the initial partitioning of the ops.
151151
static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
152-
WarpSchedule schedule;
152+
// Check for an existing schedule.
153+
if (FailureOr<WarpSchedule> scheduleOr = WarpSchedule::deserialize(loop);
154+
succeeded(scheduleOr))
155+
return {std::move(*scheduleOr)};
153156

154157
// Start by creating the default partition, a partition for all loads, and
155158
// a partition for all MMAs.
159+
WarpSchedule schedule;
156160
Partition *defaultPartition = schedule.addPartition(0);
157161
Partition *mmaPartition = schedule.addPartition(1);
158162
Partition *loadPartition = schedule.addPartition(0);
@@ -479,6 +483,39 @@ void propagatePartitions(scf::ForOp loop, WarpSchedule &schedule) {
479483
}
480484
}
481485

486+
// Rematerialize chains of broadcasts where the user is in a different partition
487+
// than the broadcast to reduce the amount of data that needs to be transferred.
488+
void rematerializeBroadcasts(WarpSchedule &schedule, OpOperand *use) {
489+
static_assert(
490+
std::is_base_of_v<OpTrait::OneResult<BroadcastOp>, BroadcastOp> &&
491+
std::is_base_of_v<OpTrait::OneResult<ExpandDimsOp>, ExpandDimsOp>);
492+
493+
Operation *defOp = use->get().getDefiningOp();
494+
while (isa_and_nonnull<BroadcastOp, ExpandDimsOp>(defOp)) {
495+
Operation *clone = OpBuilder(defOp).clone(*defOp);
496+
Partition *userPartition = schedule.getPartition(use->getOwner());
497+
assert(userPartition && "user not scheduled");
498+
schedule.insert(userPartition, clone);
499+
use->set(clone->getResult(0));
500+
501+
defOp = clone->getOperand(0).getDefiningOp();
502+
use = &clone->getOpOperand(0);
503+
}
504+
}
505+
506+
void optimizeSchedule(scf::ForOp loop, WarpSchedule &schedule) {
507+
for (Partition &partition : schedule.getPartitions()) {
508+
SmallVector<OpOperand *> uses;
509+
schedule.iterateOutputs(loop, &partition,
510+
[&](Operation *defOp, OpOperand &use) {
511+
if (!isa<scf::YieldOp>(use.getOwner()))
512+
uses.push_back(&use);
513+
});
514+
for (OpOperand *use : uses)
515+
rematerializeBroadcasts(schedule, use);
516+
}
517+
}
518+
482519
//===----------------------------------------------------------------------===//
483520
// Pass Definition
484521
//===----------------------------------------------------------------------===//
@@ -507,6 +544,7 @@ void PartitionScheduling::runOnOperation() {
507544
for (scf::ForOp loop : loops) {
508545
if (std::optional<WarpSchedule> schedule = getInitialSchedule(loop)) {
509546
propagatePartitions(loop, *schedule);
547+
optimizeSchedule(loop, *schedule);
510548
schedule->serialize(loop);
511549
}
512550
}

test/TritonGPU/partition-scheduling.mlir

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ tt.func public @attention_forward(
2828
%zero = arith.constant dense<0.0> : tensor<256x64xf32, #blocked>
2929
%one = arith.constant dense<1.0> : tensor<256xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
3030

31-
%QK_tmem, %QK_tok = ttng.tmem_alloc : () -> (!ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable>, !ttg.async.token)
3231

3332
%loop_outs:4 = scf.for %i = %c0_i32 to %n_tiles step %c64_i32 iter_args(
3433
%l_i = %one,
@@ -46,6 +45,7 @@ tt.func public @attention_forward(
4645
%K_shared = ttg.local_alloc %K : (tensor<64x64xf16, #load_blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem>
4746

4847
%K_trans = ttg.memdesc_trans %K_shared {order = array<i32: 1, 0>} : !ttg.memdesc<64x64xf16, #shared, #smem> -> !ttg.memdesc<64x64xf16, #shared_T, #smem>
48+
%QK_tmem, %QK_tok = ttng.tmem_alloc : () -> (!ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable>, !ttg.async.token)
4949
%QK_mma_tok = ttng.tc_gen5_mma %Q_shared, %K_trans, %QK_tmem[%QK_tok], %false, %true : !ttg.memdesc<256x64xf16, #shared, #smem>, !ttg.memdesc<64x64xf16, #shared_T, #smem>, !ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable>
5050

5151
%QK, %QK_load_tok = ttng.tmem_load %QK_tmem[%QK_mma_tok] : !ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable> -> tensor<256x64xf32, #blocked>
@@ -138,4 +138,28 @@ tt.func public @mma_operand_view(
138138
tt.return
139139
}
140140

141+
// CHECK-LABEL: @optimize_broadcast
142+
tt.func @optimize_broadcast(%arg0: i32) {
143+
%c0_i32 = arith.constant 0 : i32
144+
%c1_i32 = arith.constant 1 : i32
145+
// CHECK: scf.for
146+
scf.for %i = %c0_i32 to %arg0 step %c1_i32 : i32 {
147+
// CHECK: [[X:%.*]] = "producer"{{.*}}partition = 0
148+
%x = "producer"() {ttg.partition = 0 : i32} : () -> tensor<128xf32>
149+
150+
// CHECK-DAG: [[X0_P0:%.*]] = tt.expand_dims [[X]] {{.*}}partition = 0
151+
// CHECK-DAG: [[X0_P1:%.*]] = tt.expand_dims [[X]] {{.*}}partition = 1
152+
%x0 = tt.expand_dims %x {axis = 0 : i32} : tensor<128xf32> -> tensor<1x128xf32>
153+
// CHECK-DAG: [[X1_P0:%.*]] = tt.broadcast [[X0_P0]] {{.*}}partition = 0
154+
// CHECK-DAG: [[X1_P1:%.*]] = tt.broadcast [[X0_P1]] {{.*}}partition = 1
155+
%x1 = tt.broadcast %x0 : tensor<1x128xf32> -> tensor<128x128xf32>
156+
157+
// CHECK: "use"([[X1_P0]]) {{.*}}partition = 0
158+
"use"(%x1) {ttg.partition = 0 : i32} : (tensor<128x128xf32>) -> ()
159+
// CHECK: "use"([[X1_P1]]) {{.*}}partition = 1
160+
"use"(%x1) {ttg.partition = 1 : i32} : (tensor<128x128xf32>) -> ()
161+
} {tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32]}
162+
tt.return
163+
}
164+
141165
}

test/TritonGPU/rewrite-partition-dependencies.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ tt.func @no_def_op(%lb: i32, %ub: i32, %step: i32) {
337337
module attributes {"ttg.num-warps" = 4 : i32} {
338338

339339
tt.func @invalid_attribute(%lb: i32, %ub: i32, %step: i32) {
340-
// expected-warning @below {{partition stages attribute 'ttg.partition.stages' has invalid element "a"}}
340+
// expected-error @below {{partition stages attribute 'ttg.partition.stages' has invalid element "a"}}
341341
scf.for %i = %lb to %ub step %step : i32 {
342342
scf.yield
343343
} {ttg.partition.stages = ["a"]}
@@ -359,7 +359,7 @@ module attributes {"ttg.num-warps" = 4 : i32} {
359359

360360
tt.func @invalid_attribute(%lb: i32, %ub: i32, %step: i32) {
361361
scf.for %k = %lb to %ub step %step : i32 {
362-
// expected-warning @below {{invalid partition index -1}}
362+
// expected-error @below {{invalid partition index -1}}
363363
"op"() {ttg.partition = -1} : () -> ()
364364
scf.yield
365365
} {ttg.partition.stages = [2, 2]}

0 commit comments

Comments (0)