-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Optimize out s_barrier_signal/_wait #116993
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Extend the optimization that converts s_barrier to wave_barrier (nop) when the number of work items is not larger than wave size. This handles the "split barrier" form of s_barrier where the barrier is represented by separate intrinsics (s_barrier_signal/s_barrier_wait). Note: the version where s_barrier is used in gfx12 (and later split) has the optimization already, but some front-ends may prefer to use split intrinsics and this is being addressed by the patch.
|
@llvm/pr-subscribers-backend-amdgpu Author: Piotr Sobczak (piotrAMD) Changes[AMDGPU] Optimize out s_barrier_signal/_wait Extend the optimization that converts s_barrier to wave_barrier (nop) This handles the "split barrier" form of s_barrier where the barrier Full diff: https://github.com/llvm/llvm-project/pull/116993.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3522ece24f1c45..f01f57de460c75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1843,8 +1843,9 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
}
}
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- if (STI.hasSplitBarriers()) {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
+ if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
@@ -2161,6 +2162,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_wait:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b02f9bf80d3fc..4743ff0f65690e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9605,7 +9605,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
- case Intrinsic::amdgcn_s_barrier: {
+ case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_wait: {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
@@ -9615,8 +9617,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
0);
}
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- if (ST.hasSplitBarriers()) {
+ if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
SDValue K =
DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
SDValue BarSignal =
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
new file mode 100644
index 00000000000000..42983b335f497c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s -global-isel | FileCheck %s
+
+; CHECK-LABEL: {{^}}signal_unknown_wgs:
+; CHECK: s_barrier_signal
+define amdgpu_kernel void @signal_unknown_wgs() {
+ tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) #0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}signal_flat_wgs_attr_32_128:
+; CHECK: s_barrier_signal
+define amdgpu_kernel void @signal_flat_wgs_attr_32_128() #1 {
+ tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) #0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}signal_flat_wgs_attr_32_64:
+; CHECK: :
+; CHECK-NEXT: ; wave barrier
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @signal_flat_wgs_attr_32_64() #2 {
+ tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) #0
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}wait_unknown_wgs:
+; CHECK: s_barrier_wait
+define amdgpu_kernel void @wait_unknown_wgs() {
+ tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) #0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}wait_flat_wgs_attr_32_128:
+; CHECK: s_barrier_wait
+define amdgpu_kernel void @wait_flat_wgs_attr_32_128() #1 {
+ tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) #0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}wait_flat_wgs_attr_32_64:
+; CHECK: :
+; CHECK-NEXT: ; wave barrier
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @wait_flat_wgs_attr_32_64() #2 {
+ tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) #0
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier.signal(i32 immarg) #0
+declare void @llvm.amdgcn.s.barrier.wait(i16 immarg) #0
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="16,32" }
|
ruiling
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Extend the optimization that converts s_barrier to wave_barrier (nop)
when the number of work items is not larger than wave size.
This handles the "split barrier" form of s_barrier where the barrier
is represented by separate intrinsics (s_barrier_signal/s_barrier_wait).
Note: the version where s_barrier is used in gfx12 (and later split)
has the optimization already, but some front-ends may prefer to use
split intrinsics and this is being addressed by the patch.