[Backend] Emit bar.warp.sync for barriers of 1 warp (#7336)

Mogball · web-flow · commit 677a30c8d9ce · 2025-06-27T13:40:12.000-07:00
In warp-specialized regions with only 1 warp (32 threads), we can emit
`bar.warp.sync` instead of a `barrier.sync` with an explicit thread count.
This is slightly more efficient.
diff --git a/test/Conversion/warp_specialize_to_llvm.mlir b/test/Conversion/warp_specialize_to_llvm.mlir
@@ -8,7 +8,7 @@ llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 :
 llvm.func @rewrite_barriers() attributes {allocation.offset = 32 : i32} {
   // CHECK: barrier.sync.aligned 2, 128
   // CHECK: barrier.sync.aligned 3, 64
-  // CHECK: barrier.sync.aligned 4, 32
+  // CHECK: bar.warp.sync
 
   // CHECK: bb{{[0-9]+}}:
   // CHECK-NEXT: barrier.sync.aligned 0, 128
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp
@@ -1,5 +1,6 @@
 #include "TargetInfo.h"
 #include "TritonNVIDIAGPUToLLVM/PTXAsmFormat.h"
+#include "Utility.h"
 #include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Conversion/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -87,6 +88,12 @@ static void createBarrier(TritonLLVMIRRewriter &b, unsigned barIdx,
                           std::optional<unsigned> numThreads, bool aligned) {
   assert(barIdx < 16 && "not enough barriers");
 
+  // If a partition has only 1 warp, use `bar.warp.sync`.
+  if (numThreads && *numThreads == 32) {
+    LLVM::NVIDIA::createSyncWarp(b.getLoc(), b);
+    return;
+  }
+
   PTXBuilder ptxBuilder;
   std::string ptxString;
   llvm::raw_string_ostream os(ptxString);
@@ -101,6 +108,10 @@ static void createBarrier(TritonLLVMIRRewriter &b, unsigned barIdx,
   ptxBuilder.launch(b, b.getLoc(), void_ty(b.getContext()));
 }
 
+static void createAllBarrier(TritonLLVMIRRewriter &b, unsigned barIdx) {
+  createBarrier(b, barIdx, /*numThreads=*/std::nullopt, /*aligned=*/false);
+}
+
 //===----------------------------------------------------------------------===//
 // elideTrivialCaptures
 //===----------------------------------------------------------------------===//
@@ -268,14 +279,12 @@ static void rewritePartitionRegions(WarpSpecializeOp ws, Block *switchLoop,
 
     // The shared memory is only live for the entry into the region, so put
     // another barrier here.
-    createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                  /*aligned=*/false);
+    createAllBarrier(b, kSwitchLoopBarrierIdx);
 
     // Rewrite all warp returns.
     partition->walk([&](WarpReturnOp op) {
       TritonLLVMIRRewriter b(op.getLoc(), op);
-      createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                    /*aligned=*/false);
+      createAllBarrier(b, kSwitchLoopBarrierIdx);
       if (auto actRegs = ws.getActualRegisters()) {
         createRegRealloc(b, (*actRegs)[partition->getRegionNumber() + 1],
                          lowRegs);
@@ -393,8 +402,7 @@ static LogicalResult lowerWarpSpecialize(LLVM::LLVMFuncOp func,
   b.setInsertionPointToStart(switchLoop);
   if (maxnreg)
     createRegRealloc(b, maxnreg.getInt(), lowRegs);
-  createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                /*aligned=*/false);
+  createAllBarrier(b, kSwitchLoopBarrierIdx);
   Value statePtr = LLVM::getSharedMemoryBase(b.getLoc(), b, targetInfo, func);
   Value relWid = b.sub(wid, b.i32_val(defaultNumWarps));
 
@@ -448,10 +456,8 @@ static LogicalResult lowerWarpSpecialize(LLVM::LLVMFuncOp func,
   Block *defaultBlock = new Block;
   funcBlocks.insert(std::next(switchLoop->getIterator()), defaultBlock);
   b.setInsertionPointToStart(defaultBlock);
-  createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                /*aligned=*/false);
-  createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                /*aligned=*/false);
+  createAllBarrier(b, kSwitchLoopBarrierIdx);
+  createAllBarrier(b, kSwitchLoopBarrierIdx);
   auto latchBr = b.create<LLVM::BrOp>(switchLoop);
   disableLICM(latchBr);
 
@@ -498,18 +504,15 @@ static LogicalResult lowerWarpSpecialize(LLVM::LLVMFuncOp func,
 
     // First barrier releases the waiting warpgroups. The second barrier ensures
     // they have read the captures before the memory is released upon entry.
-    createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                  /*aligned=*/false);
+    createAllBarrier(b, kSwitchLoopBarrierIdx);
     if (auto actRegs = ws.getActualRegisters())
       createRegRealloc(b, defRegs, actRegs->front());
-    createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                  /*aligned=*/false);
+    createAllBarrier(b, kSwitchLoopBarrierIdx);
     b.create<LLVM::BrOp>(&ws.getDefaultRegion().front());
 
     ws.getDefaultRegion().walk([&, ws = ws](WarpYieldOp op) mutable {
       TritonLLVMIRRewriter b(op.getLoc(), op);
-      createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                    /*aligned=*/false);
+      createAllBarrier(b, kSwitchLoopBarrierIdx);
       if (auto actRegs = ws.getActualRegisters())
         createRegRealloc(b, actRegs->front(), defRegs);
       b.replaceOpWithNewOp<LLVM::BrOp>(op, op.getOperands(), after);
@@ -532,8 +535,7 @@ static LogicalResult lowerWarpSpecialize(LLVM::LLVMFuncOp func,
     Value cst = b.i8_val(partitionStateCounter);
     for (int32_t i : llvm::seq(maxNumWarps))
       b.store(cst, b.gep(ptrTy, i8_ty, statePtr, LLVM::GEPArg(i)));
-    createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
-                  /*aligned=*/false);
+    createAllBarrier(b, kSwitchLoopBarrierIdx);
   });
   b.setInsertionPointToStart(switchExit);
   b.create<LLVM::ReturnOp>(ValueRange());