 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
 #include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
 #include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
@@ -202,17 +205,12 @@ LogicalResult DependencyRewriter::run() {
        llvm::zip(schedule.getPartitions(), partitionUseInfo)) {
     // The amount of buffering is based on the longest distance to a user.
     for (auto &[output, info] : useInfo) {
-      // FIXME: No IR support for passing simple scalars through shared
-      // memory.
-      auto tensorType = dyn_cast<RankedTensorType>(output.getType());
-      if (!tensorType) {
-        return mlir::emitWarning(output.getLoc(),
-                                 "FIXME: only tensor SSA dependencies between "
-                                 "partitions are supported");
-      }
+      b.setLoc(output.getLoc());
+      ImplicitLocOpBuilder endBuilder(b.getLoc(), loop->getNextNode());

-      Operation *defOp;
+      bool isScalar = false;
       Value tmp = output;
+      Operation *defOp;
       while (true) {
         if (auto arg = dyn_cast<BlockArgument>(tmp)) {
           tmp = loop.getBody()->getTerminator()->getOperand(arg.getArgNumber() -
@@ -222,14 +220,31 @@ LogicalResult DependencyRewriter::run() {
         defOp = tmp.getDefiningOp();
         break;
       }
+      Value val = output;
+      auto tensorType = dyn_cast<RankedTensorType>(output.getType());
+      if (!tensorType) {
+        isScalar = true;
+        b.setInsertionPointAfterValue(output);
+        auto mod = output.getParentRegion()->getParentOfType<ModuleOp>();
+        auto nWarps = lookupNumWarps(mod);
+        auto threadsPerWarp =
+            triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod);
+        int CTAs = triton::gpu::TritonGPUDialect::getNumCTAs(mod);
+        Attribute encoding = getDefaultBlockedEncoding(
+            b.getContext(), {1}, nWarps, threadsPerWarp, CTAs);
+        tensorType = RankedTensorType::get({1}, output.getType(), encoding);
+        StageCluster srcStageCluster = getStageCluster(defOp);
+
+        defOp = b.createInto<triton::SplatOp>(partition, srcStageCluster,
+                                              tensorType, output);
+        val = defOp->getResult(0);
+      }

       // Buffer the value based on the greatest distance to a consumer
       // partition.
       int maxDistance = info.getMaxUseDistance(partition);

       // Allocate buffers for the value and its associated barriers.
-      b.setLoc(output.getLoc());
-      ImplicitLocOpBuilder endBuilder(b.getLoc(), loop->getNextNode());
       AsyncRef aref = allocateAsyncValue(tensorType, maxDistance);

       unsigned numConsumers = info.consumers.size();
@@ -249,20 +264,24 @@ LogicalResult DependencyRewriter::run() {
         // partition with it.
         Value value = b.createInto<LocalLoadOp>(*usePartition, sinkSrcCluster,
                                                 tensorType, view);
+        if (isScalar) {
+          value = b.createInto<triton::UnsplatOp>(*usePartition, sinkSrcCluster,
+                                                  value);
+        }
         for (OpOperand *use : uses)
           use->set(value);
         exitOp(b);
       }

       // Set up production of the value
-      if (isa<BlockArgument>(output))
+      if (isa<BlockArgument>(val))
         b.setInsertionPointToStart(loop.getBody());
       else
         b.setInsertionPointAfter(defOp);

       StageCluster srcStageCluster = getStageCluster(defOp);
       auto [view, exitOp] = aref.putView(b, partition, srcStageCluster);
-      b.createInto<LocalStoreOp>(partition, srcStageCluster, output, view);
+      b.createInto<LocalStoreOp>(partition, srcStageCluster, val, view);
       exitOp(b);
     }
   }
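
In short, this change replaces the old FIXME path, which emitted a warning and bailed out whenever a non-tensor SSA value crossed a partition boundary, with real support for scalars: the producer partition splats the scalar into a 1-element tensor (using a default blocked encoding derived from the module's num-warps, threads-per-warp, and num-CTAs), stores that tensor through the shared-memory aref buffer like any other dependency, and each consumer partition unsplats it back to a scalar after the load. A minimal sketch of the IR this should produce, assuming current `tt`/`ttg` op spellings; the encodings, element type, and memdesc details are illustrative, not taken from an actual test:

```mlir
// Producer partition: wrap the scalar so it can live in shared memory.
%wrapped = tt.splat %scalar : i32 -> tensor<1xi32, #blocked>
ttg.local_store %wrapped, %view
    : tensor<1xi32, #blocked> -> !ttg.memdesc<1xi32, #shared, #smem, mutable>

// Consumer partition: load the 1-element tensor and recover the scalar.
%loaded = ttg.local_load %view
    : !ttg.memdesc<1xi32, #shared, #smem, mutable> -> tensor<1xi32, #blocked>
%scalar2 = tt.unsplat %loaded : tensor<1xi32, #blocked>
```

Note that `val` (the splatted tensor, or the original `output` when it was already a tensor) is what feeds the `LocalStoreOp`, which is why the production-side checks were switched from `output` to `val`.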