diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3522ece24f1c4..8247c8581dde8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1832,19 +1832,25 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { } bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { + Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); if (TM.getOptLevel() > CodeGenOptLevel::None) { unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; if (WGSize <= STI.getWavefrontSize()) { - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); + // If the workgroup fits in a wave, remove s_barrier_signal and lower + // s_barrier/s_barrier_wait to wave_barrier. + if (IntrinsicID == Intrinsic::amdgcn_s_barrier || + IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); + } MI.eraseFromParent(); return true; } } - // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait - if (STI.hasSplitBarriers()) { + if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { + // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) @@ -2161,6 +2167,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_init_whole_wave: return selectInitWholeWave(I); case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_s_barrier_signal: + case Intrinsic::amdgcn_s_barrier_wait: return selectSBarrier(I); case Intrinsic::amdgcn_raw_buffer_load_lds: case 
Intrinsic::amdgcn_raw_ptr_buffer_load_lds: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5b02f9bf80d3f..05442230d5252 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9605,18 +9605,26 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE; return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); } - case Intrinsic::amdgcn_s_barrier: { + case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_s_barrier_signal: + case Intrinsic::amdgcn_s_barrier_wait: { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; - if (WGSize <= ST.getWavefrontSize()) - return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, - Op.getOperand(0)), - 0); + if (WGSize <= ST.getWavefrontSize()) { + // If the workgroup fits in a wave, remove s_barrier_signal and lower + // s_barrier/s_barrier_wait to wave_barrier. 
+ if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal) + return Op.getOperand(0); + else + return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, + MVT::Other, Op.getOperand(0)), + 0); + } } - // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait - if (ST.hasSplitBarriers()) { + if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { + // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait SDValue K = DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); SDValue BarSignal = diff --git a/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll new file mode 100644 index 0000000000000..d26d406df5220 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @signal_unknown_wgs() { +; CHECK-LABEL: signal_unknown_wgs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_barrier_signal -1 +; CHECK-NEXT: s_endpgm + tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) + ret void +} + +define amdgpu_kernel void @signal_flat_wgs_attr_32_128() #1 { +; CHECK-LABEL: signal_flat_wgs_attr_32_128: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_barrier_signal -1 +; CHECK-NEXT: s_endpgm + tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) + ret void +} + +define amdgpu_kernel void @signal_flat_wgs_attr_16_32() #2 { +; CHECK-LABEL: signal_flat_wgs_attr_16_32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_endpgm + tail call void @llvm.amdgcn.s.barrier.signal(i32 -1) + ret void +} + + +define amdgpu_kernel void @wait_unknown_wgs() { +; CHECK-LABEL: wait_unknown_wgs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_barrier_wait -1 +; CHECK-NEXT: s_endpgm + tail call void 
@llvm.amdgcn.s.barrier.wait(i16 -1) + ret void +} + +define amdgpu_kernel void @wait_flat_wgs_attr_32_128() #1 { +; CHECK-LABEL: wait_flat_wgs_attr_32_128: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_barrier_wait -1 +; CHECK-NEXT: s_endpgm + tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) + ret void +} + +define amdgpu_kernel void @wait_flat_wgs_attr_16_32() #2 { +; CHECK-LABEL: wait_flat_wgs_attr_16_32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ; wave barrier +; CHECK-NEXT: s_endpgm + tail call void @llvm.amdgcn.s.barrier.wait(i16 -1) + ret void +} + +declare void @llvm.amdgcn.s.barrier.signal(i32 immarg) #0 +declare void @llvm.amdgcn.s.barrier.wait(i16 immarg) #0 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" } +attributes #2 = { nounwind "amdgpu-flat-work-group-size"="16,32" }