[AMD] Introduce amdgpu.cond_barrier (#5360)

jungpark-mlir · antiagainst · web-flow · commit 89c0b0abdfac · 2024-12-08T08:43:16.000-08:00
CondBarrierOp (amdgpu.cond_barrier) emits a barrier instruction only when
its predicate argument is true. This provides a way to synchronize a
subset of the threads in a block: the execution sequences of the threads
deliberately diverge while the threads still stay in sync. However, the
user must guarantee that all threads converge at the end by having the
remaining threads execute a cond_barrier whose predicate is true.
Conceptually, this is similar to having a barrier inside an if
statement. This op allows us to avoid blocking the whole block when
suitable, which helps scheduling.

---------

Co-authored-by: Lei Zhang &lt;antiagainst@gmail.com&gt;
diff --git a/test/TritonGPU/amd/amd-conditional-barrier.mlir b/test/TritonGPU/amd/amd-conditional-barrier.mlir
@@ -0,0 +1,33 @@
+// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s
+
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @conditional_barrier() {
+    // CHECK-LABEL: llvm.func @conditional_barrier
+    // Compare operands are captured, not named: SSA numbering may change.
+    // CHECK:   %[[CMP0:.+]] = llvm.icmp "ne" %[[LHS:.+]], %[[RHS:.+]] : i32
+    // CHECK:   %[[CMP1:.+]] = llvm.icmp "eq" %[[LHS]], %[[RHS]] : i32
+    // CHECK:   llvm.cond_br %[[CMP0]], ^bb1, ^bb2
+    // CHECK: ^bb1:
+    // CHECK:   rocdl.s.barrier
+    // CHECK:   llvm.br ^bb2
+    // CHECK: ^bb2:
+    // CHECK:   llvm.add
+    // CHECK:   llvm.cond_br %[[CMP1]], ^bb3, ^bb4
+    // CHECK: ^bb3:
+    // CHECK:   rocdl.s.barrier
+    // CHECK:   llvm.br ^bb4
+    // CHECK: ^bb4:
+    // CHECK:   llvm.return
+    // Threads with id / 256 != 0 take the first barrier, the others the second.
+    %c256_i32 = arith.constant 256 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %0 = rocdl.workitem.id.x : i32
+    %1 = arith.divsi %0, %c256_i32 : i32
+    %2 = arith.cmpi ne, %1, %c0_i32 : i32
+    %3 = arith.cmpi eq, %1, %c0_i32 : i32
+    amdgpu.cond_barrier %2
+    %4 = arith.addi %0, %c256_i32 : i32
+    amdgpu.cond_barrier %3
+    tt.return
+  }
+}
diff --git a/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td b/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td
@@ -152,6 +152,23 @@ def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
   let assemblyFormat = [{ attr-dict }];
 }
 
+def CondBarrierOp : TT_AMDGPU_Op<"cond_barrier"> {
+  let arguments = (ins I1:$pred);
+  let summary = "Conditionally set barriers to synchronize partial threads in a block";
+  // The only operand is the i1 predicate `pred`; the op declares no results.
+  let description = [{
+      condBarrierOp sets barrier instruction only when the given argument is true.
+      This provides a way to synchronize partial threads in a block, deliberately
+      diverges the execution sequences. However, user should guarantee all threads
+      converge at the end by calling condBarrierOp(true) with the remaining threads.
+      Conceptually, this is similar to having an execution barrier inside an if statement.
+      This op allows us to avoid blocking the whole block when suitable to help scheduling.
+      NB. This doesn't set any memory fence.
+  }];
+  // Textual form: the predicate operand followed by the attribute dictionary.
+  let assemblyFormat = "$pred attr-dict";
+}
+
 //
 // AMD Buffer operations.
 //
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/SPMDOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/SPMDOpToLLVM.cpp
@@ -1,5 +1,7 @@
+#include "Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "PatternTritonGPUOpToLLVM.h"
 #include "Utility.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 
 using namespace mlir;
 
@@ -25,10 +27,37 @@ struct GetNumProgramsOpConversion
   }
 };
 
+struct CondBarrierOpConversion
+    : public ConvertOpToLLVMPattern<triton::amdgpu::CondBarrierOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+  // Lowers cond_barrier to a cond_br that guards a block with rocdl.s.barrier.
+  LogicalResult
+  matchAndRewrite(triton::amdgpu::CondBarrierOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
+    Block *currentBlock = rewriter.getInsertionBlock();
+    Block *afterCondBarBlock =
+        rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint());
+    Block *trueBlock = rewriter.createBlock(afterCondBarBlock);
+    rewriter.setInsertionPointToEnd(currentBlock);
+    rewriter.create<LLVM::CondBrOp>(loc, adaptor.getPred(), trueBlock,
+                                    afterCondBarBlock);
+    // currentBlock now ends with the branch: pred ? trueBlock : the tail.
+    // Fill trueBlock: the barrier, then an unconditional jump to the tail.
+    rewriter.setInsertionPointToStart(trueBlock);
+    rewriter.create<ROCDL::SBarrierOp>(loc);
+    rewriter.create<LLVM::BrOp>(loc, afterCondBarBlock);
+    // The branch/barrier sequence stands in for the op, so erase the original.
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::triton::AMD::populateSPMDOpToLLVMPattern(
     LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
     PatternBenefit benefit) {
   patterns.add<GetNumProgramsOpConversion>(typeConverter, benefit);
+  patterns.add<CondBarrierOpConversion>(typeConverter, benefit); // cond_barrier
 }