Skip to content

Conversation

@piotrAMD
Copy link
Collaborator

@piotrAMD piotrAMD commented Nov 20, 2024

Extend the optimization that converts s_barrier to wave_barrier (nop)
when the number of work items is not larger than wave size.

This handles the "split barrier" form of s_barrier where the barrier
is represented by separate intrinsics (s_barrier_signal/s_barrier_wait).
Note: on gfx12, the form where the s_barrier intrinsic is used (and is later
split by the backend into s_barrier_signal/s_barrier_wait) already has this
optimization, but some front-ends may prefer to emit the split intrinsics
directly; this patch extends the optimization to that case.

Extend the optimization that converts s_barrier to wave_barrier (nop)
when the number of work items is not larger than wave size.

This handles the "split barrier" form of s_barrier where the barrier
is represented by separate intrinsics (s_barrier_signal/s_barrier_wait).
Note: on gfx12, the form where the s_barrier intrinsic is used (and is later
split by the backend into s_barrier_signal/s_barrier_wait) already has this
optimization, but some front-ends may prefer to emit the split intrinsics
directly; this patch extends the optimization to that case.
@llvmbot
Copy link
Member

llvmbot commented Nov 20, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Piotr Sobczak (piotrAMD)

Changes

[AMDGPU] Optimize out s_barrier_signal/_wait

Extend the optimization that converts s_barrier to wave_barrier (nop)
when the number of work items is not larger than wave size.

This handles the "split barrier" form of s_barrier where the barrier
is represented by separate intrinsics (s_barrier_signal/s_barrier_wait).
Note: on gfx12, the form where the s_barrier intrinsic is used (and is later
split by the backend into s_barrier_signal/s_barrier_wait) already has this
optimization, but some front-ends may prefer to emit the split intrinsics
directly; this patch extends the optimization to that case.


Full diff: https://github.com/llvm/llvm-project/pull/116993.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+5-2)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5-3)
  • (added) llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll (+56)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3522ece24f1c45..f01f57de460c75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1843,8 +1843,9 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
     }
   }
 
-  // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-  if (STI.hasSplitBarriers()) {
+  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
+  if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
+    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
     MachineBasicBlock *MBB = MI.getParent();
     const DebugLoc &DL = MI.getDebugLoc();
     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
@@ -2161,6 +2162,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   case Intrinsic::amdgcn_init_whole_wave:
     return selectInitWholeWave(I);
   case Intrinsic::amdgcn_s_barrier:
+  case Intrinsic::amdgcn_s_barrier_signal:
+  case Intrinsic::amdgcn_s_barrier_wait:
     return selectSBarrier(I);
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b02f9bf80d3fc..4743ff0f65690e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9605,7 +9605,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
     return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
   }
-  case Intrinsic::amdgcn_s_barrier: {
+  case Intrinsic::amdgcn_s_barrier:
+  case Intrinsic::amdgcn_s_barrier_signal:
+  case Intrinsic::amdgcn_s_barrier_wait: {
     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
@@ -9615,8 +9617,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                        0);
     }
 
-    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-    if (ST.hasSplitBarriers()) {
+    if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
+      // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
       SDValue K =
           DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
       SDValue BarSignal =
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
new file mode 100644
index 00000000000000..42983b335f497c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s -global-isel | FileCheck %s
+
+; CHECK-LABEL: {{^}}signal_unknown_wgs:
+; CHECK: s_barrier_signal
+define amdgpu_kernel void @signal_unknown_wgs() {
+  tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) #0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}signal_flat_wgs_attr_32_128:
+; CHECK: s_barrier_signal
+define amdgpu_kernel void @signal_flat_wgs_attr_32_128() #1 {
+  tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) #0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}signal_flat_wgs_attr_32_64:
+; CHECK: :
+; CHECK-NEXT: ; wave barrier
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @signal_flat_wgs_attr_32_64() #2 {
+  tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) #0
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}wait_unknown_wgs:
+; CHECK: s_barrier_wait
+define amdgpu_kernel void @wait_unknown_wgs() {
+  tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) #0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}wait_flat_wgs_attr_32_128:
+; CHECK: s_barrier_wait
+define amdgpu_kernel void @wait_flat_wgs_attr_32_128() #1 {
+  tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) #0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}wait_flat_wgs_attr_32_64:
+; CHECK: :
+; CHECK-NEXT: ; wave barrier
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @wait_flat_wgs_attr_32_64() #2 {
+  tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) #0
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier.signal(i32 immarg) #0
+declare void @llvm.amdgcn.s.barrier.wait(i16 immarg) #0
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="16,32" }

Copy link
Contributor

@ruiling ruiling left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@piotrAMD piotrAMD merged commit a96ec01 into llvm:main Nov 26, 2024
8 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

7 participants