diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 7793907c032d2..2e424ee078a78 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -910,12 +910,18 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) .add(MI.getOperand(1)); MI.getOperand(1).setReg(TmpReg); - } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), - MI, MI.getDebugLoc())) { + return true; + } + + if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI, + MI.getDebugLoc())) { I = std::next(I); MI.eraseFromParent(); + return true; } - return true; + + if (!SrcReg.isVirtual()) + return true; } if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { SIInstrWorklist worklist; @@ -941,7 +947,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { if (PHISources.contains(MI)) return; Register DstReg = MI->getOperand(0).getReg(); - const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg); V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, TRI->getRegSizeInBits(*DstRC)); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ec5c5bb349ac4..4bff0204bdcb2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6861,13 +6861,10 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, // Emit the actual waterfall loop, executing the wrapped instruction for each // unique value of \p ScalarOps across all lanes. In the best case we execute 1 // iteration, in the worst case we execute 64 (once per lane). -static void -emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, - MachineRegisterInfo &MRI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &BodyBB, - const DebugLoc &DL, - ArrayRef<MachineOperand *> ScalarOps) { +static void emitLoadScalarOpsFromVGPRLoop( + const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, + MachineBasicBlock &BodyBB, const DebugLoc &DL, + ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) { MachineFunction &MF = *LoopBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -6876,8 +6873,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineBasicBlock::iterator I = LoopBB.begin(); Register CondReg; - - for (MachineOperand *ScalarOp : ScalarOps) { + for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) { unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI); unsigned NumSubRegs = RegSize / 32; Register VScalarOp = ScalarOp->getReg(); @@ -6906,7 +6902,15 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, } // Update ScalarOp operand to use the SGPR ScalarOp. - ScalarOp->setReg(CurReg); + if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid()) + ScalarOp->setReg(CurReg); + else { + // Insert the copy into the same block as the use. + BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL, + TII.get(AMDGPU::COPY), PhySGPRs[Idx]) + .addReg(CurReg); + ScalarOp->setReg(PhySGPRs[Idx]); + } ScalarOp->setIsKill(); } else { SmallVector<Register, 8> ReadlanePieces; @@ -6975,7 +6979,14 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, } // Update ScalarOp operand to use the SGPR ScalarOp. 
- ScalarOp->setReg(SScalarOp); + if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid()) + ScalarOp->setReg(SScalarOp); + else { + BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL, + TII.get(AMDGPU::COPY), PhySGPRs[Idx]) + .addReg(SScalarOp); + ScalarOp->setReg(PhySGPRs[Idx]); + } ScalarOp->setIsKill(); } } @@ -7006,7 +7017,10 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef<MachineOperand *> ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin = nullptr, - MachineBasicBlock::iterator End = nullptr) { + MachineBasicBlock::iterator End = nullptr, + ArrayRef<Register> PhySGPRs = {}) { + assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) && + "Physical SGPRs must be empty or match the number of scalar operands"); MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -7091,7 +7105,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps); + emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps, + PhySGPRs); MachineBasicBlock::iterator First = RemainderBB->begin(); // Restore SCC @@ -7328,27 +7343,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { MachineOperand *Dest = &MI.getOperand(0); if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { - // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and - // following copies, we also need to move copies from and to physical - // registers into the loop block. - unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); - unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); - - // Also move the copies to physical registers into the loop block - MachineBasicBlock &MBB = *MI.getParent(); - MachineBasicBlock::iterator Start(&MI); - while (Start->getOpcode() != FrameSetupOpcode) - --Start; - MachineBasicBlock::iterator End(&MI); - while (End->getOpcode() != FrameDestroyOpcode) - ++End; - // Also include following copies of the return value - ++End; - while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && - MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr)) - ++End; - CreatedBB = - loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End); + createWaterFall(&MI, MDT, {Dest}); } } @@ -7611,6 +7606,33 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, legalizeOperandsVALUt16(MI, OpIdx, MRI); } +void SIInstrInfo::createWaterFall(MachineInstr *MI, MachineDominatorTree *MDT, + ArrayRef<MachineOperand *> ScalarOps, + ArrayRef<Register> PhySGPRs) const { + if (MI->getOpcode() == AMDGPU::SI_CALL_ISEL) { + // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the + // following copies; we also need to move copies from and to physical + // registers into the loop block. 
+ // Also move the copies to physical registers into the loop block + MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator Start(MI); + while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP) + --Start; + MachineBasicBlock::iterator End(MI); + while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN) + ++End; + + // Also include following copies of the return value + ++End; + while (End != MBB.end() && End->isCopy() && + MI->definesRegister(End->getOperand(1).getReg(), &RI)) + ++End; + + loadMBUFScalarOperandsFromVGPR(*this, *MI, ScalarOps, MDT, Start, End, + PhySGPRs); + } +} + void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const { @@ -7630,6 +7652,87 @@ void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, assert(Worklist.empty() && "Deferred MachineInstr are not supposed to re-populate worklist"); } + + for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : Worklist.WaterFalls) + createWaterFall(Entry.first, MDT, Entry.second.MOs, Entry.second.SGPRs); + + for (std::pair<MachineInstr *, bool> Entry : Worklist.V2PhySCopiesToErase) + if (Entry.second) + Entry.first->eraseFromParent(); +} +void SIInstrInfo::createReadFirstLaneFromCopyToPhysReg( + MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const { + // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and + // hope for the best. + const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg); + ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(DstRC, 4); + if (BaseIndices.empty() || BaseIndices.size() == 1) { + Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), NewDst) + .add(Inst.getOperand(1)); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), + DstReg) + .addReg(NewDst); + } else { + SmallVector<Register> DstRegs; + for (unsigned i = 0; i < BaseIndices.size(); ++i) { + Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), NewDst) + .addReg(Inst.getOperand(1).getReg(), 0, BaseIndices[i]); + + DstRegs.push_back(NewDst); + } + MachineInstrBuilder MIB = + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::REG_SEQUENCE), DstReg); + for (unsigned i = 0; i < BaseIndices.size(); ++i) { + MIB.addReg(DstRegs[i]); + MIB.addImm(RI.getSubRegFromChannel(i)); + } + } +} + +void SIInstrInfo::handleCopyToPhyHelper(SIInstrWorklist &Worklist, + Register DstReg, MachineInstr &Inst, + MachineRegisterInfo &MRI) const { + if (DstReg == AMDGPU::M0) { + createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst); + Worklist.V2PhySCopiesToErase.try_emplace(&Inst, true); + return; + } + Register SrcReg = Inst.getOperand(1).getReg(); + MachineBasicBlock::iterator I = Inst.getIterator(); + MachineBasicBlock::iterator E = Inst.getParent()->end(); + // Only search the current block, since a physreg's def & use cannot cross + // blocks when MF.NoPhi = false. + while (++I != E) { + // Currently, we only support waterfall on SI_CALL_ISEL. 
+ if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) { + MachineInstr *UseMI = &*I; + for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) { + if (UseMI->getOperand(i).isReg() && + UseMI->getOperand(i).getReg() == DstReg) { + MachineOperand *MO = &UseMI->getOperand(i); + MO->setReg(SrcReg); + V2PhysSCopyInfo &V2SCopyInfo = Worklist.WaterFalls[UseMI]; + V2SCopyInfo.MOs.push_back(MO); + V2SCopyInfo.SGPRs.push_back(DstReg); + Worklist.V2PhySCopiesToErase.try_emplace(&Inst, true); + } + } + } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG && + I->getOperand(0).isReg() && + I->getOperand(0).getReg() == DstReg) { + createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst); + Worklist.V2PhySCopiesToErase.try_emplace(&Inst, true); + } else if (I->readsRegister(DstReg, &RI)) + // The COPY cannot be erased if another kind of instruction uses it. + Worklist.V2PhySCopiesToErase[&Inst] = false; + if (I->findRegisterDefOperand(DstReg, &RI)) + break; + } } void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, @@ -8106,19 +8209,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Register DstReg = Inst.getOperand(0).getReg(); const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); - // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and - // hope for the best. if (Inst.isCopy() && DstReg.isPhysical() && RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { - Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), - get(AMDGPU::V_READFIRSTLANE_B32), NewDst) - .add(Inst.getOperand(1)); - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), - DstReg) - .addReg(NewDst); - - Inst.eraseFromParent(); + handleCopyToPhyHelper(Worklist, DstReg, Inst, MRI); return; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e979eeb0bdf3a..b2e35bc5ea53d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -52,6 +52,13 @@ static const MachineMemOperand::Flags MOLastUse = static const MachineMemOperand::Flags MOCooperative = MachineMemOperand::MOTargetFlag3; +struct V2PhysSCopyInfo { + // Operands that need to be replaced by the waterfall loop + SmallVector<MachineOperand *> MOs; + // Target physical registers replacing the MOs + SmallVector<Register> SGPRs; +}; + /// Utility to store machine instructions worklist. struct SIInstrWorklist { SIInstrWorklist() = default; @@ -79,6 +86,9 @@ struct SIInstrWorklist { SetVector<MachineInstr *> &getDeferredList() { return DeferredList; } + DenseMap<MachineInstr *, V2PhysSCopyInfo> WaterFalls; + DenseMap<MachineInstr *, bool> V2PhySCopiesToErase; + private: /// InstrList contains the MachineInstrs. SetVector<MachineInstr *> InstrList; @@ -1407,6 +1417,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const; + /// Wrapper function for generating a waterfall loop for instruction \p MI. + /// This function takes related predecessor and successor instructions + /// (e.g. 
the call sequence) into consideration. + void createWaterFall(MachineInstr *MI, MachineDominatorTree *MDT, + ArrayRef<MachineOperand *> ScalarOps, + ArrayRef<Register> PhySGPRs = {}) const; void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; @@ -1595,6 +1611,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const TargetSchedModel &getSchedModel() const { return SchedModel; } + void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, + Register DstReg, + MachineInstr &Inst) const; + + void handleCopyToPhyHelper(SIInstrWorklist &Worklist, Register DstReg, + MachineInstr &Inst, + MachineRegisterInfo &MRI) const; + // Enforce operand's \p OpName even alignment if required by target. // This is used if an operand is a 32 bit register but needs to be aligned // regardless. diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll deleted file mode 100644 index 34f4476f7fd6a..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s - -; CHECK: illegal VGPR to SGPR copy - -declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0 -declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0 -declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0 - -define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 { - call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0) - ret void -} - -define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 { - call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0) - ret void -} - -define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 { - call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) - ret void -} - -attributes #0 = { nounwind } - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll new file mode 100644 index 0000000000000..920a6e3e92347 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll @@ -0,0 +1,231 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope %s + +define hidden void @external_void_func_a15i32_inreg([15 x i32] inreg %args) #0 { +; CHECK-LABEL: external_void_func_a15i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +define hidden void @external_void_func_a16i32_inreg([16 x i32] inreg %args) #0 { +; CHECK-LABEL: external_void_func_a16i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +define hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: external_void_func_a15i32_inreg_i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +define void 
@test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 { +; CHECK-LABEL: test_call_external_void_func_a15i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s59, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[40:41], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[40:41] +; CHECK-NEXT: v_writelane_b32 v1, s30, 0 +; CHECK-NEXT: s_mov_b32 s40, s26 +; CHECK-NEXT: s_mov_b32 s41, s25 +; CHECK-NEXT: s_mov_b32 s42, s24 +; CHECK-NEXT: s_mov_b32 s43, s23 +; CHECK-NEXT: s_mov_b32 s44, s22 +; CHECK-NEXT: s_mov_b32 s45, s21 +; CHECK-NEXT: s_mov_b32 s46, s20 +; CHECK-NEXT: s_mov_b32 s47, s19 +; CHECK-NEXT: s_mov_b32 s56, s18 +; CHECK-NEXT: s_mov_b32 s57, s17 +; CHECK-NEXT: s_mov_b32 s58, s16 +; CHECK-NEXT: s_mov_b64 s[60:61], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v1, s31, 1 +; CHECK-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0 +; CHECK-NEXT: s_and_saveexec_b64 vcc, vcc +; CHECK-NEXT: s_getpc_b64 s[62:63] +; CHECK-NEXT: s_add_u32 s62, s62, external_void_func_a15i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s63, s63, external_void_func_a15i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s0, s58 +; CHECK-NEXT: s_mov_b32 s1, s57 +; CHECK-NEXT: s_mov_b32 s2, s56 +; CHECK-NEXT: s_mov_b32 s3, s47 +; CHECK-NEXT: s_mov_b32 s16, s46 +; CHECK-NEXT: s_mov_b32 s17, s45 +; CHECK-NEXT: s_mov_b32 s18, s44 +; CHECK-NEXT: s_mov_b32 s19, s43 +; CHECK-NEXT: s_mov_b32 s20, s42 +; CHECK-NEXT: s_mov_b32 s21, s41 +; CHECK-NEXT: s_mov_b32 s22, s40 +; CHECK-NEXT: s_mov_b32 s23, s27 +; CHECK-NEXT: s_mov_b32 s24, s28 +; CHECK-NEXT: s_mov_b32 s25, s29 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[62:63] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, vcc +; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[60:61] +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 +; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s59 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0) + ret void +} + +define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 { +; CHECK-LABEL: test_call_external_void_func_a16i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s74, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[40:41], -1 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[40:41] +; CHECK-NEXT: v_writelane_b32 v2, s30, 0 +; CHECK-NEXT: s_mov_b32 s40, s27 +; CHECK-NEXT: s_mov_b32 s41, s26 +; CHECK-NEXT: s_mov_b32 s42, s25 +; CHECK-NEXT: s_mov_b32 s43, s24 +; CHECK-NEXT: s_mov_b32 s44, s23 +; CHECK-NEXT: s_mov_b32 s45, s22 +; CHECK-NEXT: s_mov_b32 s46, s21 +; CHECK-NEXT: s_mov_b32 s47, s20 +; CHECK-NEXT: s_mov_b32 s56, s19 +; CHECK-NEXT: s_mov_b32 s57, s18 +; CHECK-NEXT: s_mov_b32 s58, s17 +; CHECK-NEXT: s_mov_b32 s59, s16 +; CHECK-NEXT: s_mov_b64 s[60:61], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: 
v_writelane_b32 v2, s31, 1 +; CHECK-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s27, v1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s27, v1 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[16:17], s26, v0 +; CHECK-NEXT: s_and_b64 s[16:17], vcc, s[16:17] +; CHECK-NEXT: s_and_saveexec_b64 s[62:63], s[16:17] +; CHECK-NEXT: s_getpc_b64 s[72:73] +; CHECK-NEXT: s_add_u32 s72, s72, external_void_func_a16i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s73, s73, external_void_func_a16i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s0, s59 +; CHECK-NEXT: s_mov_b32 s1, s58 +; CHECK-NEXT: s_mov_b32 s2, s57 +; CHECK-NEXT: s_mov_b32 s3, s56 +; CHECK-NEXT: s_mov_b32 s16, s47 +; CHECK-NEXT: s_mov_b32 s17, s46 +; CHECK-NEXT: s_mov_b32 s18, s45 +; CHECK-NEXT: s_mov_b32 s19, s44 +; CHECK-NEXT: s_mov_b32 s20, s43 +; CHECK-NEXT: s_mov_b32 s21, s42 +; CHECK-NEXT: s_mov_b32 s22, s41 +; CHECK-NEXT: s_mov_b32 s23, s40 +; CHECK-NEXT: s_mov_b32 s24, s28 +; CHECK-NEXT: s_mov_b32 s25, s29 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[72:73] +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[62:63] +; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[60:61] +; CHECK-NEXT: v_readlane_b32 s31, v2, 1 +; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s74 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0) + ret void +} + +define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s74, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[40:41], -1 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[40:41] +; CHECK-NEXT: v_writelane_b32 v2, s30, 0 +; CHECK-NEXT: s_mov_b32 s40, s27 +; CHECK-NEXT: s_mov_b32 s41, s26 +; CHECK-NEXT: s_mov_b32 s42, s25 +; CHECK-NEXT: s_mov_b32 s43, s24 +; CHECK-NEXT: s_mov_b32 s44, s23 +; CHECK-NEXT: s_mov_b32 s45, s22 +; CHECK-NEXT: s_mov_b32 s46, s21 +; CHECK-NEXT: s_mov_b32 s47, s20 +; CHECK-NEXT: s_mov_b32 s56, s19 +; CHECK-NEXT: s_mov_b32 s57, s18 +; CHECK-NEXT: s_mov_b32 s58, s17 +; CHECK-NEXT: s_mov_b32 s59, s16 +; CHECK-NEXT: s_mov_b64 s[60:61], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v2, s31, 1 +; CHECK-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s27, v1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s27, v1 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[16:17], s26, v0 +; CHECK-NEXT: s_and_b64 s[16:17], vcc, s[16:17] +; CHECK-NEXT: s_and_saveexec_b64 s[62:63], s[16:17] +; CHECK-NEXT: s_getpc_b64 s[72:73] +; CHECK-NEXT: s_add_u32 s72, s72, external_void_func_a15i32_inreg_i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s73, s73, external_void_func_a15i32_inreg_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s0, s59 +; CHECK-NEXT: s_mov_b32 s1, s58 +; CHECK-NEXT: s_mov_b32 s2, s57 +; CHECK-NEXT: s_mov_b32 s3, 
s56 +; CHECK-NEXT: s_mov_b32 s16, s47 +; CHECK-NEXT: s_mov_b32 s17, s46 +; CHECK-NEXT: s_mov_b32 s18, s45 +; CHECK-NEXT: s_mov_b32 s19, s44 +; CHECK-NEXT: s_mov_b32 s20, s43 +; CHECK-NEXT: s_mov_b32 s21, s42 +; CHECK-NEXT: s_mov_b32 s22, s41 +; CHECK-NEXT: s_mov_b32 s23, s40 +; CHECK-NEXT: s_mov_b32 s24, s28 +; CHECK-NEXT: s_mov_b32 s25, s29 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[72:73] +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[62:63] +; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[60:61] +; CHECK-NEXT: v_readlane_b32 s31, v2, 1 +; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s74 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll deleted file mode 100644 index 242b5e9aeaf42..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ /dev/null @@ -1,78 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 2> %t.err < %s | FileCheck %s -; RUN: FileCheck -check-prefix=ERR %s < %t.err -; FIXME: These tests cannot be tail called, and should be executed in a waterfall loop. 
- -declare hidden void @void_func_i32_inreg(i32 inreg) - -; ERR: error: :0:0: in function tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy -; ERR: error: :0:0: in function indirect_tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy - -define void @tail_call_i32_inreg_divergent(i32 %vgpr) { -; CHECK-LABEL: tail_call_i32_inreg_divergent: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12 -; CHECK-NEXT: ; illegal copy v0 to s0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: s_mov_b32 s32, s33 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - tail call void @void_func_i32_inreg(i32 inreg %vgpr) - ret void -} - -@constant = external hidden addrspace(4) constant ptr - -define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { -; CHECK-LABEL: indirect_tail_call_i32_inreg_divergent: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: ; illegal copy v0 to s0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: s_mov_b32 s32, s33 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %fptr = load ptr, ptr addrspace(4) @constant, align 8 - tail call void %fptr(i32 inreg %vgpr) - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll new file mode 100644 index 0000000000000..c8abc3c4fb3ac --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll @@ -0,0 +1,444 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx900 < %s | FileCheck %s + +define hidden void @void_func_i32_inreg(i32 inreg) { +; CHECK-LABEL: void_func_i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +define void @tail_call_i32_inreg_divergent(i32 %vgpr) { +; CHECK-LABEL: tail_call_i32_inreg_divergent: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s19, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: v_writelane_b32 v1, s30, 0 +; CHECK-NEXT: s_mov_b64 s[16:17], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v1, s31, 1 +; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s18, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s18, v0 +; CHECK-NEXT: s_and_saveexec_b64 vcc, vcc +; CHECK-NEXT: s_getpc_b64 s[20:21] +; CHECK-NEXT: s_add_u32 s20, s20, void_func_i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s21, s21, void_func_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s0, s18 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, vcc +; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 +; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s19 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + tail call void @void_func_i32_inreg(i32 inreg %vgpr) + ret void +} + +@constant = external hidden addrspace(4) constant ptr + +define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { +; CHECK-LABEL: indirect_tail_call_i32_inreg_divergent: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: v_writelane_b32 v40, s16, 20 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, constant@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, constant@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: s_load_dwordx2 s[64:65], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 
v40, s66, 18 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[54:55], exec +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s16, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s16, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[66:67], vcc +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 +; CHECK-NEXT: s_mov_b32 s0, s16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[64:65] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[66:67] +; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[54:55] +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 20 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %fptr = load ptr, ptr addrspace(4) @constant, align 8 + tail call void %fptr(i32 inreg %vgpr) + ret void +} + +declare void @user(ptr addrspace(5)) + +define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 { +; CHECK-LABEL: v_multiple_frame_indexes_literal_offsets: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: v_mov_b32_e32 v3, 8 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_movk_i32 s32, 0x400 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: .LBB3_1: ; =>This Inner 
Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s15, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s15, v3 +; CHECK-NEXT: s_and_saveexec_b64 s[52:53], vcc +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, user@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, user@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_mov_b32 s0, s15 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[52:53] +; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_endpgm + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %alloca0 = alloca [2 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %vgpr, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca0, ptr addrspace(5) %alloca1 + call void @user(ptr addrspace(5) inreg %select) + ret void +} + +declare void @user_i32_inreg_i32_i32_inreg(i32 inreg, i32, i32 inreg) +define amdgpu_kernel void @call_user_i32_inreg_i32_i32_inreg(i32 %a, i32 %a1, i32 %a2, i32 %b, i32 %b1, i32 %b2, i32 %c) #0 { +; CHECK-LABEL: call_user_i32_inreg_i32_i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx8 s[64:71], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v3, s66 +; CHECK-NEXT: v_mov_b32_e32 v4, s65 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s64, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_mov_b32_e32 v4, s69 +; CHECK-NEXT: v_mov_b32_e32 v5, s68 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s67, v0 +; CHECK-NEXT: s_add_u32 s48, s8, 32 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: s_addc_u32 s49, s9, 0 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s15, v4 +; CHECK-NEXT: v_readfirstlane_b32 s16, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s15, v4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s16, v3 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[52:53], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, user_i32_inreg_i32_i32_inreg@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, user_i32_inreg_i32_i32_inreg@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[48:49] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_mov_b32_e32 v0, s70 +; CHECK-NEXT: 
s_mov_b32 s1, s15 +; CHECK-NEXT: s_mov_b32 s0, s16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: ; implicit-def: $vgpr4 +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[52:53] +; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_endpgm + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %cmp.a = icmp sgt i32 %vgpr, %a + %cmp.b = icmp sgt i32 %vgpr, %b + %sel.a = select i1 %cmp.a, i32 %a1, i32 %a2 + %sel.b = select i1 %cmp.b, i32 %b1, i32 %b2 + call void @user_i32_inreg_i32_i32_inreg(i32 inreg %sel.a, i32 %c, i32 inreg %sel.b) + ret void +} + +declare void @user_ft_inreg_ft_ft_inreg(float inreg, float, float inreg) +define amdgpu_kernel void @call_user_ft_inreg_ft_ft_inreg(i32 %a, float %a1, float %a2, i32 %b, float %b1, float %b2, float %c) #0 { +; CHECK-LABEL: call_user_ft_inreg_ft_ft_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx8 s[64:71], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v3, s66 +; CHECK-NEXT: v_mov_b32_e32 v4, s65 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s64, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_mov_b32_e32 v4, s69 +; CHECK-NEXT: v_mov_b32_e32 v5, s68 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s67, v0 +; CHECK-NEXT: s_add_u32 s48, s8, 32 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: s_addc_u32 s49, s9, 0 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s15, v4 +; CHECK-NEXT: v_readfirstlane_b32 s16, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s15, v4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s16, v3 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[52:53], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, user_ft_inreg_ft_ft_inreg@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, user_ft_inreg_ft_ft_inreg@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[48:49] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_mov_b32_e32 v0, s70 +; CHECK-NEXT: s_mov_b32 s1, s15 +; CHECK-NEXT: s_mov_b32 s0, s16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: ; implicit-def: $vgpr4 +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[52:53] +; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_endpgm + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %cmp.a = icmp sgt i32 %vgpr, %a + %cmp.b = icmp sgt i32 %vgpr, %b + %sel.a = select i1 %cmp.a, float %a1, float %a2 + %sel.b = select i1 %cmp.b, float %b1, 
float %b2 + call void @user_ft_inreg_ft_ft_inreg(float inreg %sel.a, float %c, float inreg %sel.b) + ret void +} + +declare void @user_2xft_inreg_ft_2xft_inreg(<2 x float> inreg, float, <2 x float> inreg) +define amdgpu_kernel void @call_user_2xft_inreg_ft_2xft_inreg(i32 %a, <2 x float> %a1, <2 x float> %a2, i32 %b, <2 x float> %b1, <2 x float> %b2, float %c) #0 { +; CHECK-LABEL: call_user_2xft_inreg_ft_2xft_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5] +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8 +; CHECK-NEXT: s_load_dword s10, s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s11, s[8:9], 0x18 +; CHECK-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20 +; CHECK-NEXT: s_load_dword s54, s[8:9], 0x30 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v3, s6 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s10, v0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_mov_b32_e32 v4, s7 +; CHECK-NEXT: v_mov_b32_e32 v5, s5 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: v_mov_b32_e32 v5, s14 +; CHECK-NEXT: v_mov_b32_e32 v6, s12 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s11, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; CHECK-NEXT: v_mov_b32_e32 v6, s15 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 +; CHECK-NEXT: s_add_u32 s48, s8, 56 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; CHECK-NEXT: s_addc_u32 s49, s9, 0 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s15, v6 +; CHECK-NEXT: v_readfirstlane_b32 s16, v5 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s15, v6 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s16, v5 +; CHECK-NEXT: v_readfirstlane_b32 s17, v4 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s17, v4 +; CHECK-NEXT: v_readfirstlane_b32 s18, v3 +; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s18, v3 +; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; CHECK-NEXT: s_and_saveexec_b64 s[52:53], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, user_2xft_inreg_ft_2xft_inreg@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, user_2xft_inreg_ft_2xft_inreg@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[48:49] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_mov_b32_e32 v0, s54 +; CHECK-NEXT: s_mov_b32 s3, s15 +; CHECK-NEXT: s_mov_b32 s2, s16 +; CHECK-NEXT: s_mov_b32 s1, s17 +; CHECK-NEXT: s_mov_b32 s0, s18 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CHECK-NEXT: ; implicit-def: $vgpr6 +; CHECK-NEXT: ; implicit-def: $vgpr5 +; CHECK-NEXT: ; implicit-def: $vgpr4 +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: 
s_xor_b64 exec, exec, s[52:53] +; CHECK-NEXT: s_cbranch_execnz .LBB6_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_endpgm + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %cmp.a = icmp sgt i32 %vgpr, %a + %cmp.b = icmp sgt i32 %vgpr, %b + %sel.a = select i1 %cmp.a, <2 x float> %a1, <2 x float> %a2 + %sel.b = select i1 %cmp.b, <2 x float> %b1, <2 x float> %b2 + call void @user_2xft_inreg_ft_2xft_inreg(<2 x float> inreg %sel.a, float %c, <2 x float> inreg %sel.b) + ret void +} + +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }