diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h index 84050bf170737..486392ca3c49d 100644 --- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h +++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h @@ -118,6 +118,16 @@ class LiveRegMatrix { /// the segment [Start, End). bool checkInterference(SlotIndex Start, SlotIndex End, MCRegister PhysReg); + /// Check for interference in the segment [Start, End) that may prevent + /// assignment to PhysReg, like checkInterference. Returns a lane mask of + /// which lanes of the physical register interfere in the segment [Start, End) + /// of some other interval already assigned to PhysReg. + /// + /// If this function returns LaneBitmask::getNone(), PhysReg is completely + /// free at the segment [Start, End). + LaneBitmask checkInterferenceLanes(SlotIndex Start, SlotIndex End, + MCRegister PhysReg); + /// Assign VirtReg to PhysReg. /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and /// update VirtRegMap. The live range is expected to be available in PhysReg. diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index a57233cf37da0..bc8c59381a40e 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -244,6 +244,41 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End, return false; } +LaneBitmask LiveRegMatrix::checkInterferenceLanes(SlotIndex Start, + SlotIndex End, + MCRegister PhysReg) { + // Construct artificial live range containing only one segment [Start, End). + VNInfo valno(0, Start); + LiveRange::Segment Seg(Start, End, &valno); + LiveRange LR; + LR.addSegment(Seg); + + LaneBitmask InterferingLanes; + + // Check for interference with that segment + for (MCRegUnitMaskIterator MCRU(PhysReg, TRI); MCRU.isValid(); ++MCRU) { + auto [Unit, Lanes] = *MCRU; + // LR is stack-allocated. LiveRegMatrix caches queries by a key that + // includes the address of the live range. If (for the same reg unit) this + // checkInterference overload is called twice, without any other query() + // calls in between (on heap-allocated LiveRanges) - which would invalidate + // the cached query - the LR address seen the second time may well be the + // same as that seen the first time, while the Start/End/valno may not - yet + // the same cached result would be fetched. To avoid that, we don't cache + // this query. + // + // FIXME: the usability of the Query API needs to be improved to avoid + // subtle bugs due to query identity. Avoiding caching, for example, would + // greatly simplify things. + LiveIntervalUnion::Query Q; + Q.reset(UserTag, LR, Matrix[Unit]); + if (Q.checkInterference()) + InterferingLanes |= Lanes; + } + + return InterferingLanes; +} + Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const { const LiveInterval *VRegInterval = nullptr; for (MCRegUnit Unit : TRI->regunits(PhysReg)) { diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 46253b1743d97..26a12512c87be 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -203,6 +204,7 @@ class VirtRegRewriter : public MachineFunctionPass { MachineRegisterInfo *MRI = nullptr; SlotIndexes *Indexes = nullptr; LiveIntervals *LIS = nullptr; + LiveRegMatrix *LRM = nullptr; VirtRegMap *VRM = nullptr; LiveDebugVariables *DebugVars = nullptr; DenseSet RewriteRegs; @@ -215,6 +217,9 @@ class VirtRegRewriter : public MachineFunctionPass { void handleIdentityCopy(MachineInstr &MI); void expandCopyBundle(MachineInstr &MI) const; bool subRegLiveThrough(const MachineInstr &MI, MCRegister SuperPhysReg) const; + LaneBitmask liveOutUndefPhiLanesForUndefSubregDef( + const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg, + MCPhysReg PhysReg, const MachineInstr &MI) const; public: static char ID; @@ -247,6 +252,7 @@ INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter", INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter", @@ -262,6 +268,7 @@ void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); AU.addRequired(); + AU.addRequired(); if (!ClearVirtRegs) AU.addPreserved(); @@ -276,6 +283,7 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { MRI = &MF->getRegInfo(); Indexes = &getAnalysis().getSI(); LIS = &getAnalysis().getLIS(); + LRM = &getAnalysis().getLRM(); VRM = &getAnalysis().getVRM(); DebugVars = &getAnalysis(); LLVM_DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n" @@ -548,6 +556,40 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI, return false; } +/// Compute a lanemask for undef lanes which need to be preserved out of the +/// defining block for a register assignment for a subregister def. \p PhysReg +/// is assigned to \p LI, which is the main range. +LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef( + const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg, + MCPhysReg PhysReg, const MachineInstr &MI) const { + LaneBitmask UndefMask = ~TRI->getSubRegIndexLaneMask(SubReg); + LaneBitmask LiveOutUndefLanes; + + for (const LiveInterval::SubRange &SR : LI.subranges()) { + // Figure out which lanes are undef live into a successor. + LaneBitmask NeedImpDefLanes = UndefMask & SR.LaneMask; + if (NeedImpDefLanes.any() && !LIS->isLiveOutOfMBB(SR, &MBB)) { + for (const MachineBasicBlock *Succ : MBB.successors()) { + if (LIS->isLiveInToMBB(SR, Succ)) + LiveOutUndefLanes |= NeedImpDefLanes; + } + } + } + + SlotIndex MIIndex = LIS->getInstructionIndex(MI); + SlotIndex BeforeMIUses = MIIndex.getBaseIndex(); + LaneBitmask InterferingLanes = + LRM->checkInterferenceLanes(BeforeMIUses, MIIndex.getRegSlot(), PhysReg); + LiveOutUndefLanes &= ~InterferingLanes; + + LLVM_DEBUG(if (LiveOutUndefLanes.any()) { + dbgs() << "Need live out undef defs for " << printReg(PhysReg) + << LiveOutUndefLanes << " from " << printMBBReference(MBB) << '\n'; + }); + + return LiveOutUndefLanes; +} + void VirtRegRewriter::rewrite() { bool NoSubRegLiveness = !MRI->subRegLivenessEnabled(); SmallVector SuperDeads; @@ -602,6 +644,32 @@ void VirtRegRewriter::rewrite() { MO.setIsUndef(true); } else if (!MO.isDead()) { assert(MO.isDef()); + if (MO.isUndef()) { + const LiveInterval &LI = LIS->getInterval(VirtReg); + + LaneBitmask LiveOutUndefLanes = + liveOutUndefPhiLanesForUndefSubregDef(LI, *MBBI, SubReg, + PhysReg, MI); + if (LiveOutUndefLanes.any()) { + SmallVector CoveringIndexes; + + // TODO: Just use one super register def if none of the lanes + // are needed? + if (!TRI->getCoveringSubRegIndexes( + *MRI, MRI->getRegClass(VirtReg), LiveOutUndefLanes, + CoveringIndexes)) + llvm_unreachable( + "cannot represent required subregister defs"); + + // Try to represent the minimum needed live out def as a + // sequence of subregister defs. + // + // FIXME: It would be better if we could directly represent + // liveness with a lanemask instead of spamming operands. + for (unsigned SubIdx : CoveringIndexes) + SuperDefs.push_back(TRI->getSubReg(PhysReg, SubIdx)); + } + } } } diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 8625432992397..055e9850de3d6 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -38,24 +38,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc - ; GFX90A-NEXT: $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr10 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr24 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.59, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 + ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22 + ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24 ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: @@ -111,8 +106,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: @@ -395,8 +390,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec @@ -434,8 +429,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec @@ -484,8 +479,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec @@ -535,8 +530,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec @@ -589,8 +584,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} @@ -643,8 +638,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec @@ -689,8 +684,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} @@ -719,8 +714,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.46 ; GFX90A-NEXT: {{ $}} @@ -748,8 +743,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.62 ; GFX90A-NEXT: {{ $}} @@ -773,8 +768,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec @@ -880,8 +875,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir index 6603f2ef7adef..7421a2e10c3b5 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir @@ -32,7 +32,7 @@ body: | ; CHECK-NEXT: dead undef [[DEF2:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25 ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec @@ -82,7 +82,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24 + ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir index fa95f4c134174..8ae6c27955896 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[DEF3:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25 ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec @@ -78,7 +78,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX8_IMM undef renamable $sgpr4_sgpr5, 32, 0 :: (invariant load (s256), addrspace 4) ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24 + ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.6, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir b/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir new file mode 100644 index 0000000000000..786ce40203836 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -start-before=greedy,2 -stop-after=tailduplication -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2_assigned_physreg_interference +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2_assigned_physreg_interference + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr4_vgpr5 + ; CHECK-NEXT: EXP 0, killed renamable $vgpr3, renamable $vgpr4, renamable $vgpr5, killed renamable $vgpr2, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr3_vgpr4_vgpr5 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) + ; CHECK-NEXT: EXP 0, killed renamable $vgpr3, renamable $vgpr4, renamable $vgpr5, killed renamable $vgpr2, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0, $vgpr2 + + %2:vgpr_32 = COPY $vgpr2 + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %0:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), addrspace 8) + + bb.3: + EXP 0, killed %0.sub0, killed %0.sub1, killed %0.sub2, %2:vgpr_32, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll new file mode 100644 index 0000000000000..7caa563d8b298 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s + +; Check for verifier error after tail duplication. An implicit_def of +; a subregsiter is needed to maintain liveness after assignment. + +define amdgpu_vs void @test(i32 inreg %cmp, i32 %e0) { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %load +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: s_mov_b32 s3, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: buffer_load_format_xy v[1:2], v1, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: exp mrt0 v0, v1, v2, v0 +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: exp mrt0 v0, v1, v2, v0 +; CHECK-NEXT: s_endpgm +entry: + %cond = icmp eq i32 %cmp, 0 + br i1 %cond, label %end, label %load + +load: + %data1 = call <2 x i32> @llvm.amdgcn.struct.buffer.load.format.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0) + %e1 = extractelement <2 x i32> %data1, i32 0 + %e2 = extractelement <2 x i32> %data1, i32 1 + br label %end + +end: + %out1 = phi i32 [ 0, %entry ], [ %e1, %load ] + %out2 = phi i32 [ poison, %entry ], [ %e2, %load ] + call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 %e0, i32 %out1, i32 %out2, i32 %e0, i1 false, i1 false) + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir new file mode 100644 index 0000000000000..86b6c5982b4cb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir @@ -0,0 +1,363 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -start-before=greedy,2 -stop-after=tailduplication -verify-machineinstrs -o - %s | FileCheck %s + +# The partial def of %0 introduces a live out undef def of %0.sub1 +# into bb.3. We need to maintain this liveness with an explicit def of +# the physical subregister. Without this, a verifier error would +# appear after tail duplication. + +--- +name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1 + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8) + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %0:vreg_64 = BUFFER_LOAD_FORMAT_XY_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8) + + bb.3: + EXP 0, killed %0.sub0, killed %0.sub1, undef %2:vgpr_32, undef %2:vgpr_32, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2 + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %0:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), addrspace 8) + + bb.3: + EXP 0, killed %0.sub0, killed %0.sub1, killed %0.sub2, undef %2:vgpr_32, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub0_sub2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub0_sub2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2, implicit-def $vgpr0 + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + undef %0.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %0:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), addrspace 8) + + bb.3: + EXP 0, killed %0.sub0, killed %0.sub1, killed %0.sub2, undef %2:vgpr_32, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +# Test another use of the value before the block end. +--- +name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_undef_use_in_def_block +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_undef_use_in_def_block + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1 + ; CHECK-NEXT: S_NOP 0, implicit renamable $vgpr0_vgpr1 + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8) + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + S_NOP 0, implicit %0 + S_BRANCH %bb.3 + + bb.2: + S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %0:vreg_64 = BUFFER_LOAD_FORMAT_XY_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8) + + bb.3: + EXP 0, killed %0.sub0, killed %0.sub1, undef %2:vgpr_32, undef %2:vgpr_32, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +# The undef subregister is not live out, no implicit def should be added for it +--- +name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_no_phi_use +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_no_phi_use + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr0, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8) + ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr0, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %0:vreg_64 = BUFFER_LOAD_FORMAT_XY_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8) + + bb.3: + EXP 0, killed %0.sub0, killed %0.sub0, undef %2:vgpr_32, undef %2:vgpr_32, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +# In bb.2, %0 should be assigned to vgpr0_vgpr1. Make sure the value +# copied from $vgpr0 into %3 isn't clobbered by the undef phi def for +# %0.sub1. +--- +name: assigned_physreg_subregister_interference +tracksRegLiveness: true +frameInfo: + adjustsStack: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr63' +body: | + ; CHECK-LABEL: name: assigned_physreg_subregister_interference + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr34, 2, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr35, 3, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr36, 4, $vgpr40 + ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr37, 5, $vgpr40 + ; CHECK-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $vgpr0_vgpr1:0x000000000000000F + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $sgpr5 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec + ; CHECK-NEXT: renamable $vcc = V_CMP_EQ_U64_e64 $sgpr4_sgpr5, killed $vgpr0_vgpr1, implicit $exec + ; CHECK-NEXT: renamable $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 killed renamable $vcc, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: dead $sgpr30_sgpr31 = noconvergent SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + ; CHECK-NEXT: renamable $vgpr1 = COPY $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 123, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = COPY renamable $sgpr34_sgpr35 + ; CHECK-NEXT: renamable $vgpr0 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec + ; CHECK-NEXT: $sgpr37 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 5 + ; CHECK-NEXT: $sgpr36 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 4 + ; CHECK-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 3 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 2 + ; CHECK-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 1 + ; CHECK-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 0 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + bb.0: + liveins: $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $vgpr0, $vgpr1, $vgpr63 + + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr34, 2, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr35, 3, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr36, 4, $vgpr63 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr37, 5, $vgpr63 + undef %0.sub0:vreg_64 = COPY $vgpr0 + %0.sub1:vreg_64 = COPY $vgpr1 + ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + renamable $sgpr34_sgpr35 = S_MOV_B64 $exec + + bb.1: + liveins: $vgpr63, $sgpr34_sgpr35 + + renamable $sgpr4 = V_READFIRSTLANE_B32 %0.sub0, implicit $exec + renamable $sgpr5 = V_READFIRSTLANE_B32 %0.sub1, implicit $exec + renamable $vcc = V_CMP_EQ_U64_e64 $sgpr4_sgpr5, %0, implicit $exec + renamable $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 killed renamable $vcc, implicit-def $exec, implicit-def dead $scc, implicit $exec + + bb.2: + liveins: $vgpr63, $sgpr4_sgpr5:0x000000000000000F, $sgpr34_sgpr35, $sgpr36_sgpr37 + + dead $sgpr30_sgpr31 = noconvergent SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + %3:vgpr_32 = COPY $vgpr0 + undef %0.sub0:vreg_64 = V_MOV_B32_e32 123, implicit $exec + $exec = S_XOR_B64_term $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc + S_CBRANCH_EXECNZ %bb.1, implicit $exec + + bb.3: + liveins: $vgpr63, $sgpr34_sgpr35 + + $exec = S_MOV_B64_term killed renamable $sgpr34_sgpr35 + + bb.4: + liveins: $vgpr63 + + ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + %6:vgpr_32 = V_ADD_U32_e32 1, %3, implicit $exec + $vgpr0 = COPY %6 + $sgpr37 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 5 + $sgpr36 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 4 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 3 + $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2 + $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 1 + $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0 + SI_RETURN implicit $vgpr0 + +... diff --git a/llvm/test/MachineVerifier/AMDGPU/issue98474-missing-def-liveout-physical-subregister.mir b/llvm/test/MachineVerifier/AMDGPU/issue98474-missing-def-liveout-physical-subregister.mir new file mode 100644 index 0000000000000..892a4298bbdb5 --- /dev/null +++ b/llvm/test/MachineVerifier/AMDGPU/issue98474-missing-def-liveout-physical-subregister.mir @@ -0,0 +1,36 @@ +# XFAIL: * +# RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -run-pass=none -filetype=null %s + +# FIXME: This should fail the machine verifier. There is a missing def +# of $vgpr2 in bb.1, which is needed since it's live into bb.3 + +--- +name: missing_live_out_subreg_def +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + + S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit killed $scc + + bb.1: + liveins: $vgpr0 + + renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + liveins: $vgpr0 + + renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr1_vgpr2 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8) + + bb.3: + liveins: $vgpr0, $vgpr1_vgpr2 + + EXP 0, killed renamable $vgpr0, killed renamable $vgpr1, renamable $vgpr2, renamable $vgpr0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +...