From f3ea386b0b040ec50c398a6509da1489ef3f3aa9 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Tue, 16 Sep 2025 13:56:40 -0700
Subject: [PATCH] [AMDGPU] Add s_cluster_barrier on gfx1250

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   1 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  10 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   4 +
 .../Target/AMDGPU/AMDGPULowerIntrinsics.cpp   |  68 +++++++++--
 llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp  |   1 +
 llvm/lib/Target/AMDGPU/SIDefines.h            |   9 +-
 .../lower-intrinsics-cluster-barrier.ll       | 107 ++++++++++++++++++
 7 files changed, 191 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fda16e42d2c6b..32b5aa5ac1377 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -696,6 +696,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
 //===----------------------------------------------------------------------===//
 // GFX1250+ only builtins.
 //===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_s_cluster_barrier, "v", "n", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index c35715965daeb..b6b475a7565ba 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -1478,6 +1478,16 @@ void test_prefetch(generic void *fptr, global void *gptr) {
   __builtin_amdgcn_global_prefetch(gptr, 8);
 }
 
+// CHECK-LABEL: @test_s_cluster_barrier(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.cluster.barrier()
+// CHECK-NEXT:    ret void
+//
+void test_s_cluster_barrier()
+{
+  __builtin_amdgcn_s_cluster_barrier();
+}
+
 // CHECK-LABEL: @test_global_add_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d9eabf972cc26..afce1fe6af854 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3691,6 +3691,10 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
 // gfx1250 intrinsics
 // ===----------------------------------------------------------------------===//
 
+// Vanilla cluster sync-barrier
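+// The attributes match the other s_barrier intrinsics: IntrNoMem because the
+// barrier itself performs no memory access, IntrHasSideEffects so it can never
+// be deleted or duplicated, and IntrConvergent so control-flow transforms
+// cannot change the set of threads that execute it.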
+def int_amdgcn_s_cluster_barrier : ClangBuiltin<"__builtin_amdgcn_s_cluster_barrier">,
+  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 // Async waits decrement ASYNCcnt and tensor waits decrement TENSORcnt which is
 // modeled as InaccessibleMem.
 class AMDGPUWaitAsyncIntrinsic :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index a30d9cb0412a4..d490788a97685 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 #define DEBUG_TYPE "amdgpu-lower-intrinsics"
 
@@ -49,7 +50,6 @@ class AMDGPULowerIntrinsicsLegacy : public ModulePass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetPassConfig>();
-    AU.setPreservesCFG();
   }
 };
 
@@ -73,6 +73,7 @@ bool AMDGPULowerIntrinsicsImpl::run() {
     case Intrinsic::amdgcn_s_barrier_signal:
     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
     case Intrinsic::amdgcn_s_barrier_wait:
+    case Intrinsic::amdgcn_s_cluster_barrier:
       forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
       break;
     }
@@ -81,13 +82,14 @@ bool AMDGPULowerIntrinsicsImpl::run() {
   return Changed;
 }
 
-// Optimize barriers and lower s_barrier to a sequence of split barrier
-// intrinsics.
+// Optimize barriers and lower s_(cluster_)barrier to a sequence of split
+// barrier intrinsics.
 bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
   assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
          I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
          I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
-         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
+         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
+         I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier);
 
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
   bool IsSingleWaveWG = false;
@@ -99,7 +101,59 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
 
   IRBuilder<> B(&I);
 
-  if (IsSingleWaveWG) {
+  // Lower the s_cluster_barrier intrinsic first. There is no corresponding
+  // hardware instruction in any subtarget.
+  if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) {
+    // The default cluster barrier expects one signal per workgroup. So we need
+    // a workgroup barrier first.
+    if (IsSingleWaveWG) {
+      B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+    } else {
+      Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+      Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+      Value *IsFirst = B.CreateIntrinsic(
+          B.getInt1Ty(), Intrinsic::amdgcn_s_barrier_signal_isfirst,
+          {BarrierID_32});
+      B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+                        {BarrierID_16});
+
+      Instruction *ThenTerm =
+          SplitBlockAndInsertIfThen(IsFirst, I.getIterator(), false);
+      B.SetInsertPoint(ThenTerm);
+    }
+
+    // Now we can signal the cluster barrier from a single wave and wait for the
+    // barrier in all waves.
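+    // For a multi-wave workgroup the net expansion is (labels illustrative;
+    // -1 = WORKGROUP, -3 = CLUSTER):
+    //   %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+    //   call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+    //   br i1 %isfirst, label %signal, label %wait
+    // signal:
+    //   call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+    //   br label %wait
+    // wait:
+    //   call void @llvm.amdgcn.s.barrier.wait(i16 -3)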
+    Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::CLUSTER);
+    Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::CLUSTER);
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+                      {BarrierID_32});
+
+    B.SetInsertPoint(&I);
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+                      {BarrierID_16});
+
+    I.eraseFromParent();
+    return true;
+  }
+
+  bool IsWorkgroupScope = false;
+
+  if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
+      I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
+      I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) {
+    int BarrierID = cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
+    if (BarrierID == AMDGPU::Barrier::TRAP ||
+        BarrierID == AMDGPU::Barrier::WORKGROUP ||
+        (BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST &&
+         BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST))
+      IsWorkgroupScope = true;
+  } else {
+    assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
+    IsWorkgroupScope = true;
+  }
+
+  if (IsWorkgroupScope && IsSingleWaveWG) {
     // Down-grade waits, remove split signals.
     if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
@@ -134,9 +188,7 @@ PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
   AMDGPULowerIntrinsicsImpl Impl(M, TM);
   if (!Impl.run())
     return PreservedAnalyses::all();
-  PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
-  return PA;
+  return PreservedAnalyses::none();
 }
 
 bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index eeb6de512bf5e..e17c2113ca398 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -360,6 +360,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
     switch (II->getIntrinsicID()) {
     case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_s_cluster_barrier:
     case Intrinsic::amdgcn_s_barrier_signal:
     case Intrinsic::amdgcn_s_barrier_signal_var:
     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index ecc4659ee0e81..ecc28244cc71e 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1108,7 +1108,14 @@ enum Register_Flag : uint8_t {
 namespace AMDGPU {
 namespace Barrier {
 
-enum Type { TRAP = -2, WORKGROUP = -1 };
+enum Type {
+  CLUSTER_TRAP = -4,
+  CLUSTER = -3,
+  TRAP = -2,
+  WORKGROUP = -1,
+  NAMED_BARRIER_FIRST = 1,
+  NAMED_BARRIER_LAST = 16,
+};
 
 enum {
   BARRIER_SCOPE_WORKGROUP = 0,
diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll
new file mode 100644
index 0000000000000..b66011be8defd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s
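+;
+; The pass expands s_cluster_barrier into a workgroup barrier (ID -1), a
+; cluster barrier signal (ID -3) issued by the first wave only, and a cluster
+; barrier wait in all waves; single-wave workgroups replace the workgroup
+; phase with a plain wave_barrier.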
+
+declare void @foo(i1)
+
+; Verify that the explicit use of a split cluster barrier isn't optimized away.
+define amdgpu_kernel void @split_barriers() "amdgpu-flat-work-group-size"="32,32" {
+; CHECK-LABEL: define amdgpu_kernel void @split_barriers(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; CHECK-NEXT:    [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
+; CHECK-NEXT:    call void @foo(i1 [[ISFIRST]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+  call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+  %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
+  call void @foo(i1 %isfirst)
+  ret void
+}
+
+define amdgpu_kernel void @s_cluster_barrier() {
+; CHECK-LABEL: define amdgpu_kernel void @s_cluster_barrier(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; CHECK-NEXT:    br label %[[BB3]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.cluster.barrier()
+  ret void
+}
+
+define amdgpu_kernel void @s_cluster_barrier_wg32() "amdgpu-flat-work-group-size"="32,32" {
+; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
+; NOOPT-SAME: ) #[[ATTR1]] {
+; NOOPT-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; NOOPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; NOOPT:       [[BB2]]:
+; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; NOOPT-NEXT:    br label %[[BB3]]
+; NOOPT:       [[BB3]]:
+; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; NOOPT-NEXT:    ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
+; OPT-WAVE32-SAME: ) #[[ATTR1]] {
+; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; OPT-WAVE32-NEXT:    ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
+; OPT-WAVE64-SAME: ) #[[ATTR1]] {
+; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; OPT-WAVE64-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.cluster.barrier()
+  ret void
+}
+
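+; A 64-item workgroup is a single wave only under wave64; under wave32 it
+; spans two waves, so the full workgroup + cluster expansion is kept.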
+define amdgpu_kernel void @s_cluster_barrier_wg64() "amdgpu-flat-work-group-size"="64,64" {
+; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
+; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
+; NOOPT-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; NOOPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; NOOPT:       [[BB2]]:
+; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; NOOPT-NEXT:    br label %[[BB3]]
+; NOOPT:       [[BB3]]:
+; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; NOOPT-NEXT:    ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
+; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE32-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; OPT-WAVE32-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-WAVE32:       [[BB2]]:
+; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; OPT-WAVE32-NEXT:    br label %[[BB3]]
+; OPT-WAVE32:       [[BB3]]:
+; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; OPT-WAVE32-NEXT:    ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
+; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
+; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
+; OPT-WAVE64-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.cluster.barrier()
+  ret void
+}