diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 4df55eac5d76b..bfdd8cf1bc2b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -164,6 +164,52 @@ class AMDGPUWaitSGPRHazards { BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP)); } + /// Combine two S_WAITCNT_DEPCTR immediates into one by taking, for each + /// DepCtr field, the smaller of the two encoded values. + unsigned mergeMasks(unsigned Mask1, unsigned Mask2) { + unsigned Mask = 0xffff; + Mask = AMDGPU::DepCtr::encodeFieldSaSdst( + Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1), + AMDGPU::DepCtr::decodeFieldSaSdst(Mask2))); + Mask = AMDGPU::DepCtr::encodeFieldVaVcc( + Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1), + AMDGPU::DepCtr::decodeFieldVaVcc(Mask2))); + Mask = AMDGPU::DepCtr::encodeFieldVmVsrc( + Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1), + AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2))); + Mask = AMDGPU::DepCtr::encodeFieldVaSdst( + Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1), + AMDGPU::DepCtr::decodeFieldVaSdst(Mask2))); + Mask = AMDGPU::DepCtr::encodeFieldVaVdst( + Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1), + AMDGPU::DepCtr::decodeFieldVaVdst(Mask2))); + Mask = AMDGPU::DepCtr::encodeFieldHoldCnt( + Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1), + AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2))); + Mask = AMDGPU::DepCtr::encodeFieldVaSsrc( + Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1), + AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2))); + return Mask; + } + + /// Try to fold \p Mask into the S_WAITCNT_DEPCTR that directly precedes + /// \p MI (skipping debug instructions) rather than emitting a new one. + /// \returns true if an existing instruction was updated, false if the + /// caller still has to emit the wait itself. + bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI, + unsigned Mask) { + auto *MBB = MI->getParent(); + if (MI == MBB->instr_begin()) + return false; + + auto It = prev_nodbg(MI, MBB->instr_begin()); + if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR) + return false; + + It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm())); + return true; + } + bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { enum { WA_VALU = 0x1, 
WA_SALU = 0x2, WA_VCC = 0x4 }; @@ -362,10 +408,12 @@ class AMDGPUWaitSGPRHazards { Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0); } if (Emit) { - auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(Mask); - updateGetPCBundle(NewMI); + if (!mergeConsecutiveWaitAlus(MI, Mask)) { + auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(Mask); + updateGetPCBundle(NewMI); + } Emitted = true; } } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b51cf536467b9..ac6b07bad3e35 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -164,6 +164,18 @@ inline unsigned getSaSdstBitWidth() { return 1; } /// \returns SaSdst bit shift inline unsigned getSaSdstBitShift() { return 0; } +/// \returns VaSsrc width +inline unsigned getVaSsrcBitWidth() { return 1; } + +/// \returns VaSsrc bit shift +inline unsigned getVaSsrcBitShift() { return 8; } + +/// \returns HoldCnt width +inline unsigned getHoldCntWidth() { return 1; } + +/// \returns HoldCnt bit shift +inline unsigned getHoldCntBitShift() { return 7; } + } // end anonymous namespace namespace llvm { @@ -1740,6 +1752,14 @@ unsigned decodeFieldVaVcc(unsigned Encoded) { return unpackBits(Encoded, getVaVccBitShift(), getVaVccBitWidth()); } +unsigned decodeFieldVaSsrc(unsigned Encoded) { + return unpackBits(Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth()); +} + +unsigned decodeFieldHoldCnt(unsigned Encoded) { + return unpackBits(Encoded, getHoldCntBitShift(), getHoldCntWidth()); +} + unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) { return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } @@ -1780,6 +1800,22 @@ unsigned encodeFieldVaVcc(unsigned VaVcc) { return encodeFieldVaVcc(0xffff, VaVcc); } +unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) { + return 
packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth()); +} + +unsigned encodeFieldVaSsrc(unsigned VaSsrc) { + return encodeFieldVaSsrc(0xffff, VaSsrc); +} + +unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) { + return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth()); +} + +unsigned encodeFieldHoldCnt(unsigned HoldCnt) { + return encodeFieldHoldCnt(0xffff, HoldCnt); +} + } // namespace DepCtr //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f54d5a273ca37..184f40bccfff8 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1180,6 +1180,12 @@ unsigned decodeFieldVaSdst(unsigned Encoded); /// \returns Decoded VaVcc from given immediate \p Encoded. unsigned decodeFieldVaVcc(unsigned Encoded); +/// \returns Decoded VaSsrc from given immediate \p Encoded. +unsigned decodeFieldVaSsrc(unsigned Encoded); + +/// \returns Decoded HoldCnt from given immediate \p Encoded. +unsigned decodeFieldHoldCnt(unsigned Encoded); + /// \returns \p VmVsrc as an encoded Depctr immediate. unsigned encodeFieldVmVsrc(unsigned VmVsrc); @@ -1210,6 +1216,18 @@ unsigned encodeFieldVaVcc(unsigned VaVcc); /// \returns \p Encoded combined with encoded \p VaVcc. unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc); +/// \returns \p HoldCnt as an encoded Depctr immediate. +unsigned encodeFieldHoldCnt(unsigned HoldCnt); + +/// \returns \p Encoded combined with encoded \p HoldCnt. +unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt); + +/// \returns \p VaSsrc as an encoded Depctr immediate. +unsigned encodeFieldVaSsrc(unsigned VaSsrc); + +/// \returns \p Encoded combined with encoded \p VaSsrc. 
+unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc); + } // namespace DepCtr namespace Exp { diff --git a/llvm/test/CodeGen/AMDGPU/merge-consecutive-wait-alus.mir b/llvm/test/CodeGen/AMDGPU/merge-consecutive-wait-alus.mir new file mode 100644 index 0000000000000..d8f4c9c8f14b5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-consecutive-wait-alus.mir @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass amdgpu-wait-sgpr-hazards -o - %s | FileCheck %s + + +--- +name: merge_consecutive_wait_alus +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: merge_consecutive_wait_alus + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo + ; CHECK-NEXT: S_WAITCNT_DEPCTR 61946 + ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo + renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc + S_WAITCNT_DEPCTR 65530 + renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc +... 
+--- +name: merge_consecutive_wait_alus_two_bb +body: | + ; CHECK-LABEL: name: merge_consecutive_wait_alus_two_bb + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo + ; CHECK-NEXT: S_WAITCNT_DEPCTR 65530 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAITCNT_DEPCTR 61951 + ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo + bb.0: + liveins: $vgpr0 + + renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc + S_WAITCNT_DEPCTR 65530 + + bb.1: + liveins: $sgpr0 + + renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc +... +--- +name: meta_instructions +machineFunctionInfo: +body: | + bb.0: + ; CHECK-LABEL: name: meta_instructions + ; CHECK: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo + ; CHECK-NEXT: S_WAITCNT_DEPCTR 65530 + ; CHECK-NEXT: SCHED_BARRIER 0 + ; CHECK-NEXT: S_WAITCNT_DEPCTR 61951 + ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo + renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc + S_WAITCNT_DEPCTR 65530 + SCHED_BARRIER 0 + renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc +... 
+--- +name: debug_instruction +machineFunctionInfo: +body: | + bb.0: + ; CHECK-LABEL: name: debug_instruction + ; CHECK: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo + ; CHECK-NEXT: S_WAITCNT_DEPCTR 61946 + ; CHECK-NEXT: DBG_VALUE $sgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo + renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc + S_WAITCNT_DEPCTR 65530 + DBG_VALUE $sgpr0 + renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc +... +