diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h index 2b32308c7c075..55e7abd933a10 100644 --- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h +++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h @@ -114,6 +114,16 @@ class LiveRegMatrix : public MachineFunctionPass { /// the segment [Start, End). bool checkInterference(SlotIndex Start, SlotIndex End, MCRegister PhysReg); + /// Check for interference in the segment [Start, End) that may prevent + /// assignment to PhysReg, like checkInterference. Returns a lane mask of + /// which lanes of the physical register interfere in the segment [Start, End) + /// of some other interval already assigned to PhysReg. + /// + /// If this function returns LaneBitmask::getNone(), PhysReg is completely + /// free at the segment [Start, End). + LaneBitmask checkInterferenceLanes(SlotIndex Start, SlotIndex End, + MCRegister PhysReg); + /// Assign VirtReg to PhysReg. /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and /// update VirtRegMap. The live range is expected to be available in PhysReg. diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index c8c722359a4c4..de5e6c42a6b97 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -237,6 +237,41 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End, return false; } +LaneBitmask LiveRegMatrix::checkInterferenceLanes(SlotIndex Start, + SlotIndex End, + MCRegister PhysReg) { + // Construct artificial live range containing only one segment [Start, End). + VNInfo valno(0, Start); + LiveRange::Segment Seg(Start, End, &valno); + LiveRange LR; + LR.addSegment(Seg); + + LaneBitmask InterferingLanes; + + // Check for interference with that segment + for (MCRegUnitMaskIterator MCRU(PhysReg, TRI); MCRU.isValid(); ++MCRU) { + auto [Unit, Lanes] = *MCRU; + // LR is stack-allocated. 
LiveRegMatrix caches queries by a key that + // includes the address of the live range. If (for the same reg unit) this + // function is called twice, without any other query() + // calls in between (on heap-allocated LiveRanges) - which would invalidate + // the cached query - the LR address seen the second time may well be the + // same as that seen the first time, while the Start/End/valno may not - yet + // the same cached result would be fetched. To avoid that, we don't cache + // this query. + // + // FIXME: the usability of the Query API needs to be improved to avoid + // subtle bugs due to query identity. Avoiding caching, for example, would + // greatly simplify things. + LiveIntervalUnion::Query Q; + Q.reset(UserTag, LR, Matrix[Unit]); + if (Q.checkInterference()) + InterferingLanes |= Lanes; + } + + return InterferingLanes; +} + Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const { const LiveInterval *VRegInterval = nullptr; for (MCRegUnit Unit : TRI->regunits(PhysReg)) { diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 1254c7be18214..e38a4633d0617 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -187,6 +188,7 @@ class VirtRegRewriter : public MachineFunctionPass { MachineRegisterInfo *MRI = nullptr; SlotIndexes *Indexes = nullptr; LiveIntervals *LIS = nullptr; + LiveRegMatrix *LRM = nullptr; VirtRegMap *VRM = nullptr; LiveDebugVariables *DebugVars = nullptr; DenseSet RewriteRegs; @@ -199,9 +201,9 @@ class VirtRegRewriter : public MachineFunctionPass { void handleIdentityCopy(MachineInstr &MI); void expandCopyBundle(MachineInstr &MI) const; bool 
subRegLiveThrough(const MachineInstr &MI, MCRegister SuperPhysReg) const; - bool needLiveOutUndefSubregDef(const LiveInterval &LI, - const MachineBasicBlock &MBB, unsigned SubReg, - MCPhysReg PhysReg) const; + LaneBitmask liveOutUndefPhiLanesForUndefSubregDef( + const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg, + MCPhysReg PhysReg, const MachineInstr &MI) const; public: static char ID; @@ -234,6 +236,7 @@ INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter", INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter", @@ -249,6 +252,7 @@ void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); AU.addRequired(); + AU.addRequired(); if (!ClearVirtRegs) AU.addPreserved(); @@ -263,6 +267,7 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { MRI = &MF->getRegInfo(); Indexes = &getAnalysis().getSI(); LIS = &getAnalysis().getLIS(); + LRM = &getAnalysis(); VRM = &getAnalysis(); DebugVars = &getAnalysis(); LLVM_DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n" @@ -535,24 +540,36 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI, return false; } -/// Check if we need to maintain liveness for undef subregister lanes that are -/// live out of a block. -bool VirtRegRewriter::needLiveOutUndefSubregDef(const LiveInterval &LI, - const MachineBasicBlock &MBB, - unsigned SubReg, - MCPhysReg PhysReg) const { +/// Compute a lanemask for undef lanes which need to be preserved out of the +/// defining block for a register assignment. 
+LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef( + const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg, + MCPhysReg PhysReg, const MachineInstr &MI) const { LaneBitmask UndefMask = ~TRI->getSubRegIndexLaneMask(SubReg); + LaneBitmask LiveOutUndefLanes; + for (const LiveInterval::SubRange &SR : LI.subranges()) { LaneBitmask NeedImpDefLanes = UndefMask & SR.LaneMask; if (NeedImpDefLanes.any() && !LIS->isLiveOutOfMBB(SR, &MBB)) { for (const MachineBasicBlock *Succ : MBB.successors()) { if (LIS->isLiveInToMBB(SR, Succ)) - return true; + LiveOutUndefLanes |= NeedImpDefLanes; } } } - return false; + SlotIndex MIIndex = LIS->getInstructionIndex(MI); + SlotIndex BeforeMIUses = MIIndex.getBaseIndex(); + LaneBitmask InterferingLanes = + LRM->checkInterferenceLanes(BeforeMIUses, MIIndex.getRegSlot(), PhysReg); + LiveOutUndefLanes &= ~InterferingLanes; + + LLVM_DEBUG(if (LiveOutUndefLanes.any()) { + dbgs() << "Need live out undef defs for " << printReg(PhysReg) + << LiveOutUndefLanes << " from " << printMBBReference(MBB) << '\n'; + }); + + return LiveOutUndefLanes; } void VirtRegRewriter::rewrite() { @@ -611,8 +628,29 @@ void VirtRegRewriter::rewrite() { assert(MO.isDef()); if (MO.isUndef()) { const LiveInterval &LI = LIS->getInterval(VirtReg); - if (needLiveOutUndefSubregDef(LI, *MBBI, SubReg, PhysReg)) - SuperDefs.push_back(PhysReg); + + LaneBitmask LiveOutUndefLanes = + liveOutUndefPhiLanesForUndefSubregDef(LI, *MBBI, SubReg, + PhysReg, MI); + if (LiveOutUndefLanes.any()) { + SmallVector CoveringIndexes; + + // TODO: Just use one super register def if none of the lanes + // are needed? + if (!TRI->getCoveringSubRegIndexes( + *MRI, MRI->getRegClass(VirtReg), LiveOutUndefLanes, + CoveringIndexes)) + llvm_unreachable( + "cannot represent required subregister defs"); + + // Try to represent the minimum needed live out def as a + // sequence of subregister defs. 
+ // + // FIXME: It would be better if we could directly represent + // liveness with a lanemask instead of spamming operands. + for (unsigned SubIdx : CoveringIndexes) + SuperDefs.push_back(TRI->getSubReg(PhysReg, SubIdx)); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index e819d5d3b1656..da8aa54469835 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -603,6 +603,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: s_mov_b32 s14, s43 ; GISEL-NEXT: s_mov_b32 s15, s42 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 ; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] @@ -1383,6 +1384,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: s_xor_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir index 285e7e22264a0..215200c770245 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[DEF2:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) - ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF 
implicit-def $sgpr24_sgpr25_sgpr26_sgpr27 + ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25 ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec @@ -83,7 +83,7 @@ body: | ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27 + ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir index 995a5d267fbed..b8818c5550ad4 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: dead undef [[DEF3:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM undef renamable 
$sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) - ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr24_sgpr25_sgpr26_sgpr27 + ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25 ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec @@ -80,7 +80,7 @@ body: | ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27 + ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.6, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir b/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir new file mode 100644 index 0000000000000..786ce40203836 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 
-start-before=greedy,2 -stop-after=tailduplication -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2_assigned_physreg_interference +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2_assigned_physreg_interference + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr4_vgpr5 + ; CHECK-NEXT: EXP 0, killed renamable $vgpr3, renamable $vgpr4, renamable $vgpr5, killed renamable $vgpr2, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr3_vgpr4_vgpr5 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) + ; CHECK-NEXT: EXP 0, killed renamable $vgpr3, renamable $vgpr4, renamable $vgpr5, killed renamable $vgpr2, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0, $vgpr2 + + %2:vgpr_32 = COPY $vgpr2 + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + 
%1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %0:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), addrspace 8) + + bb.3: + EXP 0, killed %0.sub0, killed %0.sub1, killed %0.sub2, %2:vgpr_32, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir index a8ed114f8cd78..86b6c5982b4cb 100644 --- a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir +++ b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir @@ -23,7 +23,7 @@ body: | ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1 ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 ; CHECK-NEXT: {{ $}} @@ -71,7 +71,7 @@ body: | ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2 ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 ; CHECK-NEXT: {{ $}} @@ -119,7 +119,7 @@ body: | ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit 
$exec, implicit-def $vgpr2, implicit-def $vgpr0 ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 ; CHECK-NEXT: {{ $}} @@ -168,7 +168,7 @@ body: | ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1 ; CHECK-NEXT: S_NOP 0, implicit renamable $vgpr0_vgpr1 ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 @@ -249,3 +249,115 @@ body: | S_ENDPGM 0 ... + +# In bb.2, %0 should be assigned to vgpr0_vgpr1. Make sure the value +# copied from $vgpr0 into %3 isn't clobbered by the undef phi def for +# %0.sub1. +--- +name: assigned_physreg_subregister_interference +tracksRegLiveness: true +frameInfo: + adjustsStack: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr63' +body: | + ; CHECK-LABEL: name: assigned_physreg_subregister_interference + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr34, 2, 
$vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr35, 3, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr36, 4, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr37, 5, $vgpr40 + ; CHECK-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $vgpr0_vgpr1:0x000000000000000F + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $sgpr5 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec + ; CHECK-NEXT: renamable $vcc = V_CMP_EQ_U64_e64 $sgpr4_sgpr5, killed $vgpr0_vgpr1, implicit $exec + ; CHECK-NEXT: renamable $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 killed renamable $vcc, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: dead $sgpr30_sgpr31 = noconvergent SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + ; CHECK-NEXT: renamable $vgpr1 = COPY $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 123, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = COPY renamable $sgpr34_sgpr35 + ; CHECK-NEXT: renamable $vgpr0 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec + ; CHECK-NEXT: $sgpr37 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 5 + ; CHECK-NEXT: $sgpr36 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 4 + ; CHECK-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 3 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 2 + ; CHECK-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 1 + ; CHECK-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 0 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, 
implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + bb.0: + liveins: $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $vgpr0, $vgpr1, $vgpr63 + + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr34, 2, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr35, 3, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr36, 4, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr37, 5, $vgpr63 + undef %0.sub0:vreg_64 = COPY $vgpr0 + %0.sub1:vreg_64 = COPY $vgpr1 + ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + renamable $sgpr34_sgpr35 = S_MOV_B64 $exec + + bb.1: + liveins: $vgpr63, $sgpr34_sgpr35 + + renamable $sgpr4 = V_READFIRSTLANE_B32 %0.sub0, implicit $exec + renamable $sgpr5 = V_READFIRSTLANE_B32 %0.sub1, implicit $exec + renamable $vcc = V_CMP_EQ_U64_e64 $sgpr4_sgpr5, %0, implicit $exec + renamable $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 killed renamable $vcc, implicit-def $exec, implicit-def dead $scc, implicit $exec + + bb.2: + liveins: $vgpr63, $sgpr4_sgpr5:0x000000000000000F, $sgpr34_sgpr35, $sgpr36_sgpr37 + + dead $sgpr30_sgpr31 = noconvergent SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + %3:vgpr_32 = COPY $vgpr0 + undef %0.sub0:vreg_64 = V_MOV_B32_e32 123, implicit $exec + $exec = S_XOR_B64_term $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc + S_CBRANCH_EXECNZ %bb.1, implicit $exec + + bb.3: + liveins: $vgpr63, $sgpr34_sgpr35 + + $exec = S_MOV_B64_term killed renamable $sgpr34_sgpr35 + + bb.4: + liveins: $vgpr63 + + ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def 
$sgpr32, implicit $sgpr32 + %6:vgpr_32 = V_ADD_U32_e32 1, %3, implicit $exec + $vgpr0 = COPY %6 + $sgpr37 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 5 + $sgpr36 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 4 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 3 + $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2 + $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 1 + $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0 + SI_RETURN implicit $vgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 7e76e2bf9e894..2999ddb831588 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -172,8 +172,8 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 @@ -417,8 +417,8 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 @@ -1263,8 +1263,8 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 @@ -1510,8 +1510,8 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 
v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 6c56dee76142c..2d5e5a9160fdf 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -526,6 +526,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: ; implicit-def: $vgpr3 ; GFX906-NEXT: ; implicit-def: $vgpr13 ; GFX906-NEXT: ; implicit-def: $vgpr11 ; GFX906-NEXT: ; implicit-def: $vgpr14 @@ -534,7 +535,6 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; implicit-def: $vgpr16 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] -; GFX906-NEXT: ; implicit-def: $vgpr3 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2