diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index c5b9bd9de66e1..19357635ecfc1 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1844,6 +1844,20 @@ The AMDGPU backend supports the following calling conventions: ..TODO:: Describe. + ``amdgpu_gfx_whole_wave`` Used for AMD graphics targets. Functions with this calling convention + cannot be used as entry points. They must have an i1 as the first argument, + which will be mapped to the value of EXEC on entry into the function. Other + arguments will contain poison in their inactive lanes. Similarly, the return + value for the inactive lanes is poison. + + The function will run with all lanes enabled, i.e. EXEC will be set to -1 in the + prologue and restored to its original value in the epilogue. The inactive lanes + will be preserved for all the registers used by the function. Active lanes + will only be preserved for the callee saved registers. + + In all other respects, functions with this calling convention behave like + ``amdgpu_gfx`` functions. + ``amdgpu_gs`` Used for Mesa/AMDPAL geometry shaders. ..TODO:: Describe. diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index c7e4bdf3ff811..a2311d2ac285d 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -181,6 +181,7 @@ enum Kind { kw_amdgpu_cs_chain_preserve, kw_amdgpu_kernel, kw_amdgpu_gfx, + kw_amdgpu_gfx_whole_wave, kw_tailcc, kw_m68k_rtdcc, kw_graalcc, diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h index d68491eb5535c..ef761eb1aed73 100644 --- a/llvm/include/llvm/IR/CallingConv.h +++ b/llvm/include/llvm/IR/CallingConv.h @@ -284,6 +284,9 @@ namespace CallingConv { RISCV_VLSCall_32768 = 122, RISCV_VLSCall_65536 = 123, + // Calling convention for AMDGPU whole wave functions. + AMDGPU_Gfx_WholeWave = 124, + /// The highest possible ID. Must be some 2^k - 1. MaxID = 1023 }; @@ -294,8 +297,13 @@ namespace CallingConv { /// directly or indirectly via a call-like instruction.
constexpr bool isCallableCC(CallingConv::ID CC) { switch (CC) { + // Called with special intrinsics: + // llvm.amdgcn.cs.chain case CallingConv::AMDGPU_CS_Chain: case CallingConv::AMDGPU_CS_ChainPreserve: + // llvm.amdgcn.call.whole.wave + case CallingConv::AMDGPU_Gfx_WholeWave: + // Hardware entry points: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_GS: diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index ce813e1d7b1c4..520c6a00a9c07 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(amdgpu_cs_chain_preserve); KEYWORD(amdgpu_kernel); KEYWORD(amdgpu_gfx); + KEYWORD(amdgpu_gfx_whole_wave); KEYWORD(tailcc); KEYWORD(m68k_rtdcc); KEYWORD(graalcc); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index b7f6950f679ef..00277757c0955 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2272,6 +2272,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { CC = CallingConv::AMDGPU_CS_ChainPreserve; break; case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break; + case lltok::kw_amdgpu_gfx_whole_wave: + CC = CallingConv::AMDGPU_Gfx_WholeWave; + break; case lltok::kw_tailcc: CC = CallingConv::Tail; break; case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break; case lltok::kw_graalcc: CC = CallingConv::GRAAL; break; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 145ef10f28f35..3e40915b6a920 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { break; case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break; case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break; + case CallingConv::AMDGPU_Gfx_WholeWave: + Out << "amdgpu_gfx_whole_wave"; + break; case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break; case CallingConv::RISCV_VectorCall: Out << "riscv_vector_cc"; diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 7a03663e129dc..fc067459dcba3 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1232,6 +1232,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) { case CallingConv::AArch64_SVE_VectorCall: case CallingConv::WASM_EmscriptenInvoke: case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: case CallingConv::M68k_INTR: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 9bd573e773610..e7b491e76724e 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2979,6 +2979,16 @@ void Verifier::visitFunction(const Function &F) { "perfect forwarding!", &F); break; + case CallingConv::AMDGPU_Gfx_WholeWave: + Check(!F.arg_empty() && F.arg_begin()->getType()->isIntegerTy(1), + "Calling convention requires first argument to be i1", &F); + Check(!F.arg_begin()->hasInRegAttr(), + "Calling convention requires first argument to not be inreg", &F); + Check(!F.isVarArg(), + "Calling convention does not support varargs or " + "perfect forwarding!", + &F); + break; } // Check that the argument values match the function type for this function... 
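For reference, the IR shape that the new lexer, parser, and Verifier rules above accept looks like the following. This is only an illustrative sketch (the function names are made up; the full patterns are exercised by the tests further down): the first parameter must be an i1 that is not marked inreg, and it carries the caller's EXEC mask on entry.

define amdgpu_gfx_whole_wave i32 @example(i1 %active, i32 %a) {
  ; %a is poison in the lanes that were inactive at the call site, so give
  ; those lanes a defined value before doing any whole-wave processing.
  %x = select i1 %active, i32 %a, i32 0
  ret i32 %x
}

; The convention can also be spelled by number (see llvm/test/Bitcode/compatibility.ll):
declare cc124 void @example.decl(i1)

The comment added to isCallableCC groups this convention with the amdgpu_cs_chain conventions under "Called with special intrinsics" and names llvm.amdgcn.call.whole.wave as the way such functions are reached, rather than an ordinary call instruction.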
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 14101e57f5143..3d8d274f06246 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -374,8 +374,10 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, return true; } - unsigned ReturnOpc = - IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN; + const bool IsWholeWave = MFI->isWholeWaveFunction(); + unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN + : IsShader ? AMDGPU::SI_RETURN_TO_EPILOG + : AMDGPU::SI_RETURN; auto Ret = B.buildInstrNoInsert(ReturnOpc); if (!FLI.CanLowerReturn) @@ -383,6 +385,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, else if (!lowerReturnVal(B, Val, VRegs, Ret)) return false; + if (IsWholeWave) + addOriginalExecToReturn(B.getMF(), Ret); + // TODO: Handle CalleeSavedRegsViaCopy. B.insertInstr(Ret); @@ -632,6 +637,17 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (DL.getTypeStoreSize(Arg.getType()) == 0) continue; + if (Info->isWholeWaveFunction() && Idx == 0) { + assert(VRegs[Idx].size() == 1 && "Expected only one register"); + + // The first argument for whole wave functions is the original EXEC value. + B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP) + .addDef(VRegs[Idx][0]); + + ++Idx; + continue; + } + const bool InReg = Arg.hasAttribute(Attribute::InReg); if (Arg.hasAttribute(Attribute::SwiftSelf) || @@ -1347,6 +1363,7 @@ bool AMDGPUCallLowering::lowerTailCall( SmallVector, 12> ImplicitArgRegs; if (Info.CallConv != CallingConv::AMDGPU_Gfx && + Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave && !AMDGPU::isChainCC(Info.CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) @@ -1524,7 +1541,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // after the ordinary user argument registers. SmallVector, 12> ImplicitArgRegs; - if (Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx && + Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) { // With a fixed ABI, allocate fixed registers before user arguments. 
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1592,3 +1610,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return true; } + +void AMDGPUCallLowering::addOriginalExecToReturn( + MachineFunction &MF, MachineInstrBuilder &Ret) const { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF); + Ret.addReg(Setup->getOperand(0).getReg()); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index a6e801f2a547b..e0033d59d10bb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering { bool lowerReturnVal(MachineIRBuilder &B, const Value *Val, ArrayRef VRegs, MachineInstrBuilder &Ret) const; + void addOriginalExecToReturn(MachineFunction &MF, + MachineInstrBuilder &Ret) const; + public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 2bfd56f9f3554..891d362503f15 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -315,6 +315,10 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; +// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return, +// so we don't mark it as equivalent. + class GISelSop2Pat < SDPatternOperator node, Instruction inst, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3d040fb705a8d..b037cdd5393ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1143,6 +1143,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::Cold: return CC_AMDGPU_Func; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return CC_SI_Gfx; case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: @@ -1168,6 +1169,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, case CallingConv::AMDGPU_LS: return RetCC_SI_Shader; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return RetCC_SI_Gfx; case CallingConv::C: case CallingConv::Fast: @@ -5875,6 +5877,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) + NODE_NAME_CASE(WHOLE_WAVE_SETUP) + NODE_NAME_CASE(WHOLE_WAVE_RETURN) } return nullptr; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 4e8c6c7ea3b27..39bb0adfc1a17 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -608,6 +608,12 @@ enum NodeType : unsigned { BUFFER_ATOMIC_FMAX, BUFFER_ATOMIC_COND_SUB_U32, LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32, + + // Set up a whole wave function. + WHOLE_WAVE_SETUP, + + // Return from a whole wave function. 
+ WHOLE_WAVE_RETURN, }; } // End namespace AMDGPUISD diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index ce58e93a15207..e305f08925cc6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; +// Marks the entry into a whole wave function. +def AMDGPUwhole_wave_setup : SDNode< + "AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +// Marks the return from a whole wave function. +def AMDGPUwhole_wave_return : SDNode< + "AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ SDTCisInt<0>, // i8 tgt diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index d161c035ac295..8975486caa770 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4160,6 +4160,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return true; case AMDGPU::G_AMDGPU_WAVE_ADDRESS: return selectWaveAddress(I); + case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: { + I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN)); + return true; + } case AMDGPU::G_STACKRESTORE: return selectStackRestore(I); case AMDGPU::G_PHI: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bf2f37bddb9ed..b54cccead9781 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5540,6 +5540,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_PREFETCH: OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); break; + case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP: + case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; } return getInstructionMapping(/*ID*/1, /*Cost*/1, diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index bbed828b4fed3..c4a3be44fc72d 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3206,7 +3206,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { // Check entry priority at each export (as there will only be a few). // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. 
bool Changed = false; - if (CC != CallingConv::AMDGPU_Gfx) + if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave) Changed = ensureEntrySetPrio(MF, NormalPriority, TII); auto NextMI = std::next(It); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6a3867937d57f..11552b3a9a438 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); - ScratchExecCopy = findScratchNonCalleeSaveRegister( - MRI, LiveUnits, *TRI.getWaveMaskRegClass()); + if (FuncInfo->isWholeWaveFunction()) { + // Whole wave functions already have a copy of the original EXEC mask that + // we can use. + assert(IsProlog && "Epilog should look at return, not setup"); + ScratchExecCopy = + TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg(); + assert(ScratchExecCopy && "Couldn't find copy of EXEC"); + } else { + ScratchExecCopy = findScratchNonCalleeSaveRegister( + MRI, LiveUnits, *TRI.getWaveMaskRegClass()); + } + if (!ScratchExecCopy) report_fatal_error("failed to find free scratch register"); @@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores( }; StoreWWMRegisters(WWMScratchRegs); + + auto EnableAllLanes = [&]() { + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + }; + if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + EnableAllLanes(); } else { ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, @@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores( } StoreWWMRegisters(WWMCalleeSavedRegs); - if (ScratchExecCopy) { + if (FuncInfo->isWholeWaveFunction()) { + // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove + // it now. If we have already saved some WWM CSR registers, then the EXEC is + // already -1 and we don't need to do anything else. Otherwise, set EXEC to + // -1 here. + if (!ScratchExecCopy) + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, + /*EnableInactiveLanes*/ true); + else if (WWMCalleeSavedRegs.empty()) + EnableAllLanes(); + TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); + } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) @@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores( Register ScratchExecCopy; SmallVector, 2> WWMCalleeSavedRegs, WWMScratchRegs; FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); - if (!WWMScratchRegs.empty()) - ScratchExecCopy = - buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, - /*IsProlog*/ false, /*EnableInactiveLanes*/ true); - auto RestoreWWMRegisters = [&](SmallVectorImpl> &WWMRegs) { for (const auto &Reg : WWMRegs) { @@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores( } }; + if (FuncInfo->isWholeWaveFunction()) { + // For whole wave functions, the EXEC is already -1 at this point. + // Therefore, we can restore the CSR WWM registers right away. 
+ RestoreWWMRegisters(WWMCalleeSavedRegs); + + // The original EXEC is the first operand of the return instruction. + const MachineInstr &Return = MBB.instr_back(); + assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN && + "Unexpected return inst"); + Register OrigExec = Return.getOperand(0).getReg(); + + if (!WWMScratchRegs.empty()) { + unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; + BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec()) + .addReg(OrigExec) + .addImm(-1); + RestoreWWMRegisters(WWMScratchRegs); + } + + // Restore original EXEC. + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec); + return; + } + + if (!WWMScratchRegs.empty()) { + ScratchExecCopy = + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, + /*IsProlog=*/false, /*EnableInactiveLanes=*/true); + } RestoreWWMRegisters(WWMScratchRegs); if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { @@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || (MFI->isChainFunction() && TII->isChainCallOpcode(MI.getOpcode()))) { // We expect all return to be the same size. @@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MFI->isEntryFunction()) return; + if (MFI->isWholeWaveFunction()) { + // In practice, all the VGPRs are WWM registers, and we will need to save at + // least their inactive lanes. Add them to WWMReservedRegs. + assert(!NeedExecCopyReservedReg && + "Whole wave functions can use the reg mapped for their i1 argument"); + + // FIXME: Be more efficient! + for (MCRegister Reg : AMDGPU::VGPR_32RegClass) + if (MF.getRegInfo().isPhysRegModified(Reg)) { + MFI->reserveWWMRegister(Reg); + MF.begin()->addLiveIn(Reg); + } + MF.begin()->sortUniqueLiveIns(); + } + // Remove any VGPRs used in the return value because these do not need to be saved. // This prevents CSR restore from clobbering return VGPRs. 
if (ReturnMI) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0c76ff2ec5ea7..d4e3fa71ada85 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2260,7 +2260,8 @@ SDValue SITargetLowering::getPreloadedValue( const ArgDescriptor WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); if (Subtarget->hasArchitectedSGPRs() && - (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { + (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx || + CC == CallingConv::AMDGPU_Gfx_WholeWave)) { switch (PVID) { case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: Reg = &WorkGroupIDX; @@ -2942,12 +2943,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!Subtarget->enableFlatScratch()) assert(!UserSGPRInfo.hasFlatScratchInit()); if ((CallConv != CallingConv::AMDGPU_CS && - CallConv != CallingConv::AMDGPU_Gfx) || + CallConv != CallingConv::AMDGPU_Gfx && + CallConv != CallingConv::AMDGPU_Gfx_WholeWave) || !Subtarget->hasArchitectedSGPRs()) assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ()); } + bool IsWholeWaveFunc = Info->isWholeWaveFunction(); + if (CallConv == CallingConv::AMDGPU_PS) { processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); @@ -2988,7 +2992,8 @@ SDValue SITargetLowering::LowerFormalArguments( } else if (IsKernel) { assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { - Splits.append(Ins.begin(), Ins.end()); + Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(), + Ins.end()); } if (IsKernel) @@ -3019,6 +3024,13 @@ SDValue SITargetLowering::LowerFormalArguments( SmallVector Chains; + if (IsWholeWaveFunc) { + SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL, + {MVT::i1, MVT::Other}, Chain); + InVals.push_back(Setup.getValue(0)); + Chains.push_back(Setup.getValue(1)); + } + // FIXME: This is the minimum kernel argument alignment. We should improve // this to the maximum alignment of the arguments. // @@ -3026,7 +3038,8 @@ SDValue SITargetLowering::LowerFormalArguments( // kern arg offset. const Align KernelArgBaseAlign = Align(16); - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e; + ++i) { const ISD::InputArg &Arg = Ins[i]; if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) { InVals.push_back(DAG.getPOISON(Arg.VT)); @@ -3374,7 +3387,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, unsigned Opc = AMDGPUISD::ENDPGM; if (!IsWaveEnd) - Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE; + Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN + : IsShader ? AMDGPUISD::RETURN_TO_EPILOG + : AMDGPUISD::RET_GLUE; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } @@ -3876,7 +3891,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { + if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) && + CallConv != CallingConv::AMDGPU_Gfx_WholeWave) { // With a fixed ABI, allocate fixed registers before user arguments. 
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -5890,6 +5906,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); return SplitBB; } + case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: { + assert(MFI->isWholeWaveFunction()); + + // During ISel, it's difficult to propagate the original EXEC mask to use as + // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead. + MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent()); + assert(Setup && "Couldn't find SI_WHOLE_WAVE_FUNC_SETUP"); + Register OriginalExec = Setup->getOperand(0).getReg(); + MF->getRegInfo().clearKillFlags(OriginalExec); + MI.getOperand(0).setReg(OriginalExec); + return BB; + } default: if (TII->isImage(MI) || TII->isMUBUF(MI)) { if (!MI.mayStore()) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 2af0a575a8885..9faf4974e3fd6 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1812,6 +1812,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::SI_RETURN || + MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c8935f0cb6034..e2a2525d909bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2472,6 +2472,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } + case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: case AMDGPU::SI_RETURN: { const MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget(); @@ -5757,6 +5758,19 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, Indexes->insertMachineInstrInMaps(*ExecRestoreMI); } +MachineInstr * +SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { + assert(MF.getInfo()->isWholeWaveFunction() && + "Not a whole wave func"); + MachineBasicBlock &MBB = *MF.begin(); + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP || + MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP) + return &MI; + + llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction"); +} + static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5e92921f3ea21..800ea9ab50b85 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1215,6 +1215,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes = nullptr) const; + MachineInstr *getWholeWaveFunctionSetup(MachineFunction &MF) const; + /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined /// in tablegen.
For generic instructions, like REG_SEQUENCE it will return diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 991d9f83e92e4..2230a431a0f26 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -644,6 +644,32 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI < let isConvergent = 1; } +// Sets EXEC to all lanes and returns the previous EXEC. +def SI_WHOLE_WAVE_FUNC_SETUP : SPseudoInstSI < + (outs SReg_1:$dst), (ins), [(set i1:$dst, (AMDGPUwhole_wave_setup))]> { + let Defs = [EXEC]; + let Uses = [EXEC]; + + let isConvergent = 1; +} + +// Restores the previous EXEC and otherwise behaves entirely like a SI_RETURN. +def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI < + (outs), (ins SReg_1:$orig_exec)> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; + let SchedRW = [WriteBranch]; + + // We're going to use custom handling to set the $orig_exec to the correct value. + let usesCustomInserter = 1; +} + +// Generate a SI_WHOLE_WAVE_FUNC_RETURN pseudo with a placeholder for its +// argument. It will be filled in by the custom inserter. +def : GCNPat< + (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>; + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -4300,6 +4326,20 @@ def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_WHOLE_WAVE_FUNC_SETUP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$origExec); + let InOperandList = (ins); + let isConvergent = 1; +} + +def G_AMDGPU_WHOLE_WAVE_FUNC_RETURN : AMDGPUGenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins type0:$origExec); + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; +} + // This is equivalent to the G_INTRINSIC*, but the operands may have // been legalized depending on the subtarget requirements. 
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 8c2e9b620ad16..f0be204cd9bdb 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -51,7 +51,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false), PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), - GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { + GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0), + IsWholeWaveFunction(F.getCallingConv() == + CallingConv::AMDGPU_Gfx_WholeWave) { const GCNSubtarget &ST = *STI; FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); @@ -99,7 +101,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, ImplicitArgPtr = false; } else if (!isEntryFunction()) { - if (CC != CallingConv::AMDGPU_Gfx) + if (CC != CallingConv::AMDGPU_Gfx && + CC != CallingConv::AMDGPU_Gfx_WholeWave) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; FrameOffsetReg = AMDGPU::SGPR33; @@ -732,6 +735,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()), + IsWholeWaveFunction(MFI.isWholeWaveFunction()), DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()), ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) @@ -778,6 +782,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; + IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; if (YamlMFI.ScavengeFI) { auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 274a60adb8d07..08b0206d244fb 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -298,6 +298,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { StringValue LongBranchReservedReg; bool HasInitWholeWave = false; + bool IsWholeWaveFunction = false; unsigned DynamicVGPRBlockSize = 0; unsigned ScratchReservedForDynamicVGPRs = 0; @@ -356,6 +357,7 @@ template <> struct MappingTraits { YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false); YamlIO.mapOptional("scratchReservedForDynamicVGPRs", MFI.ScratchReservedForDynamicVGPRs, 0); + YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false); } }; @@ -565,6 +567,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // the serialization easier. 
ReservedRegSet WWMReservedRegs; + bool IsWholeWaveFunction = false; + using PrologEpilogSGPRSpill = std::pair; // To track the SGPR spill method used for a CSR SGPR register during @@ -670,6 +674,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, return WWMReservedRegs.contains(Reg); } + bool isWholeWaveFunction() const { return IsWholeWaveFunction; } + ArrayRef getPrologEpilogSGPRSpills() const { assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first())); return PrologEpilogSGPRSpills; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index fa2b8db6ba55a..84cfa878276fd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -407,6 +407,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList : CSR_AMDGPU_SaveList; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList : CSR_AMDGPU_SI_Gfx_SaveList; case CallingConv::AMDGPU_CS_ChainPreserve: @@ -433,6 +434,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask : CSR_AMDGPU_RegMask; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask : CSR_AMDGPU_SI_Gfx_RegMask; case CallingConv::AMDGPU_CS_Chain: diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index c9d2c286bf237..2d344f41ff790 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1423,7 +1423,8 @@ constexpr bool isShader(CallingConv::ID CC) { LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC) { - return isShader(CC) || CC == CallingConv::AMDGPU_Gfx; + return isShader(CC) || CC == CallingConv::AMDGPU_Gfx || + CC == CallingConv::AMDGPU_Gfx_WholeWave; } LLVM_READNONE diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index e464470143e52..fd6253daa327a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -44,6 +44,7 @@ static const char *getStageName(CallingConv::ID CC) { case CallingConv::AMDGPU_LS: return ".ls"; case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: llvm_unreachable("Callable shader has no hardware stage"); default: return ".cs"; diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index 9cf3fdbe550b4..0b5ce08c00a23 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -564,6 +564,10 @@ declare riscv_vls_cc(32768) void @riscv_vls_cc_32768() ; CHECK: declare riscv_vls_cc(32768) void @riscv_vls_cc_32768() declare riscv_vls_cc(65536) void @riscv_vls_cc_65536() ; CHECK: declare riscv_vls_cc(65536) void @riscv_vls_cc_65536() +declare cc124 void @f.cc124(i1) +; CHECK: declare amdgpu_gfx_whole_wave void @f.cc124(i1) +declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1) +; CHECK: declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1) declare cc1023 void @f.cc1023() ; CHECK: declare cc1023 void @f.cc1023() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir new file mode 
100644 index 0000000000000..beca901945753 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +--- +name: basic_test +legalized: true +machineFunctionInfo: + isWholeWaveFunction: true +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: basic_test + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:vcc(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[COPY3]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %0:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + %12:_(s32) = G_CONSTANT i32 5 + %11:_(s32) = G_SELECT %0(s1), %1, %12 + %14:_(s32) = G_CONSTANT i32 3 + %13:_(s32) = G_SELECT %0(s1), %2, %14 + %15:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), %11(s32), %13(s32), 1, 1, 1, 0 + $vgpr0 = COPY %15(s32) + G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %0(s1), implicit $vgpr0 + +... 
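The comments added to SIFrameLowering above describe how the prologue and epilogue juggle EXEC for whole wave functions, and the whole-wave-functions-pei.mir test added later in this patch checks the generated code. As a reading aid only, a condensed, commented wave32 version of that sequence (for a function that clobbers just one non-CSR VGPR) looks roughly like this; the particular scratch SGPR and stack slot are whatever PEI happens to pick, not part of the ABI:

  $sgpr0 = S_XOR_SAVEEXEC_B32 -1             ; save the caller's EXEC, enable the inactive lanes
  SCRATCH_STORE_DWORD_SADDR $vgpr0, ...      ; spill the inactive lanes of the non-CSR VGPR
  $exec_lo = S_MOV_B32 -1                    ; run the body with all lanes enabled
  $vgpr0 = V_MOV_B32_e32 14, implicit $exec  ; function body
  $exec_lo = S_XOR_B32 $sgpr0, -1            ; switch back to the originally inactive lanes
  $vgpr0 = SCRATCH_LOAD_DWORD_SADDR ...      ; restore their contents
  $exec_lo = S_MOV_B32 $sgpr0                ; restore the caller's EXEC
  SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0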
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll new file mode 100644 index 0000000000000..b68786b579dd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: basic_test + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. 
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: unused_active + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[ICMP]](s1) + ; CHECK-NEXT: G_BRCOND [[INT]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.then: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.if.end: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[ADD]](s32), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[PHI]] + ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; CHECK-LABEL: name: ret_64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s64), [[SELECT1]](s64), 1, 1, 1, 0 + ; CHECK-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll new file mode 100644 index 0000000000000..3450d63ff7b4a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=DAGISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GISEL %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: basic_test + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: basic_test + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY3]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: 
SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. +define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: unused_active + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 14, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: unused_active + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14 + ; GISEL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: multiple_blocks + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[COPY]], implicit $exec + ; DAGISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: S_BRANCH %bb.1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: bb.1.if.then: + ; DAGISEL-NEXT: successors: %bb.2(0x80000000) + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: bb.2.if.end: + ; DAGISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_ADD_U32_e64_]], %bb.1 + ; DAGISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]] + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY1]], [[COPY3]], implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: multiple_blocks + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: 
[[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: S_BRANCH %bb.2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.2.if.then: + ; GISEL-NEXT: successors: %bb.3(0x80000000) + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.3.if.end: + ; GISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.1, [[V_ADD_U32_e64_]], %bb.2 + ; GISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; DAGISEL-LABEL: name: ret_64 + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed 
[[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; DAGISEL-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + ; + ; GISEL-LABEL: name: ret_64 + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_1]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_2]], 0, [[COPY2]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_3]], 0, [[COPY3]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir new file mode 100644 index 0000000000000..93f489170cea0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir @@ -0,0 +1,448 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=prologepilog -o - %s | FileCheck %s + +--- +name: save_inactive_lanes_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + 
frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_inactive_lanes_non_csr_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... +--- +name: save_all_lanes_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_all_lanes_csr_vgpr + ; CHECK: liveins: $vgpr40 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + +... 
+--- +name: save_csr_sgpr_to_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_non_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191, $vgpr192 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr192, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr192 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... +--- +name: save_csr_sgpr_to_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr191, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + ; CHECK-NEXT: $vgpr191 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... 
+--- +name: vgpr_and_sgpr_csr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: vgpr_and_sgpr_csr + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... 
+--- +name: split_orig_exec +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: split_orig_exec + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + ; CHECK-NEXT: $sgpr3 = COPY $vcc_lo + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr3, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr3 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + $sgpr3 = COPY $vcc_lo + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + +... 
+--- +name: vgpr_superregs +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: vgpr_superregs + ; CHECK: liveins: $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr40, $vgpr41, $vgpr42 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr41, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr42, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 14, implicit 
$exec + S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... +--- +name: dont_restore_used_vgprs +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr20' } + - { reg: '$vgpr40' } +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr20, $vgpr40 + + ; CHECK-LABEL: name: dont_restore_used_vgprs + ; CHECK: liveins: $vgpr0, $vgpr20, $vgpr40 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40 + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... +--- +name: multiple_blocks +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; CHECK-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc + ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr1 = 
SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0 + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr1 + + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr1 = S_MOV_B32 $exec_lo + V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + + renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + + bb.2: + liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + + $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc + renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll new file mode 100644 index 0000000000000..53d02925fb1c2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -0,0 +1,2414 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL64 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL64 %s + +; Make sure the i1 %active is passed through EXEC. +; The EXEC mask should be set to -1 for the duration of the function +; and restored to its original value in the epilogue. +; We will also need to restore the inactive lanes for any allocated VGPRs. 
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: basic_test: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: basic_test: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: basic_test: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: basic_test: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; 
GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if there's only one use for %active. +define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: single_use_of_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: single_use_of_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: single_use_of_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; 
DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: single_use_of_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %y = select i1 %active, i32 %b, i32 17 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. +define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: unused_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: unused_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_mov_b32_e32 v0, 14 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: unused_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] 
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: unused_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_mov_b32_e32 v0, 14 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + ret i32 14 +} + +; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes. +; For CSR VGPRs, we need to restore all lanes. +define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber non-CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xf1ff +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber non-CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 
v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xf1ff +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber non-CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xf1ff +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber non-CSR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xf1ff +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"() + call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"() + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Save and restore all lanes of v40. +define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr_vgpr_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR VGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr_vgpr_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR VGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr_vgpr_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR VGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr_vgpr_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR VGPR +; GISEL64-NEXT: ;;#ASMEND +; 
GISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR VGPR", "~{v40}"() + ret void +} + +define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: sgpr_spill_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR SGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sgpr_spill_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_writelane_b32 v0, s68, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR SGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: sgpr_spill_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR SGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: sgpr_spill_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_writelane_b32 v0, s68, 0 +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR SGPR +; GISEL64-NEXT: 
;;#ASMEND +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR SGPR", "~{s68}"() + ret void +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: multiple_blocks: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; DAGISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL-NEXT: ; %bb.1: ; %if.then +; DAGISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; DAGISEL-NEXT: ; %bb.2: ; %if.end +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: multiple_blocks: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s1, exec_lo +; GISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL-NEXT: ; %bb.1: ; %if.then +; GISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL-NEXT: ; %bb.2: ; %if.end +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: multiple_blocks: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL64-NEXT: s_mov_b64 s[2:3], exec +; DAGISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL64-NEXT: ; %bb.1: ; %if.then +; DAGISEL64-NEXT: v_add_nc_u32_e32 v1, 
v0, v1 +; DAGISEL64-NEXT: ; %bb.2: ; %if.end +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: multiple_blocks: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL64-NEXT: s_mov_b64 s[2:3], exec +; GISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL64-NEXT: ; %bb.1: ; %if.then +; GISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL64-NEXT: ; %bb.2: ; %if.end +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { +; DAGISEL-LABEL: ret_64: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0 +; DAGISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: ret_64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: 
s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1 +; GISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: ret_64: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: ret_64: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; GISEL64-NEXT: 
v_cndmask_b32_e32 v3, 0, v3, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + +define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) { +; DAGISEL-LABEL: inreg_args: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9 +; DAGISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; DAGISEL-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: inreg_args: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_mov_b32 s0, s5 +; GISEL-NEXT: s_mov_b32 s1, s6 +; GISEL-NEXT: s_mov_b32 s2, s7 +; GISEL-NEXT: s_mov_b32 s3, s8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v3, s3 +; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: scratch_store_b32 off, v4, s10 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL-NEXT: scratch_store_b32 off, v5, s11 +; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, s34 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: inreg_args: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v4, s4 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, s5 +; DAGISEL64-NEXT: v_mov_b32_e32 v1, s6 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, s7 +; DAGISEL64-NEXT: v_mov_b32_e32 v3, s8 +; DAGISEL64-NEXT: v_mov_b32_e32 v5, s9 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: inreg_args: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_mov_b32 s0, s5 +; GISEL64-NEXT: s_mov_b32 s1, s6 +; GISEL64-NEXT: s_mov_b32 s2, s7 +; GISEL64-NEXT: s_mov_b32 s3, s8 +; GISEL64-NEXT: v_mov_b32_e32 v4, s4 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_mov_b32_e32 v0, s0 +; GISEL64-NEXT: v_mov_b32_e32 v1, s1 +; GISEL64-NEXT: v_mov_b32_e32 v2, s2 +; GISEL64-NEXT: 
v_mov_b32_e32 v3, s3 +; GISEL64-NEXT: v_mov_b32_e32 v5, s9 +; GISEL64-NEXT: scratch_store_b32 off, v4, s10 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL64-NEXT: scratch_store_b32 off, v5, s11 +; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, s[34:35] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + store i32 %i32, ptr addrspace(5) %ptr + store <4 x i32> %v4i32, ptr addrspace(5) %ptr2 + store float %float, ptr addrspace(5) %ptr2 + ret void +} + +declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y) + +define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) { +; DAGISEL-LABEL: call_gfx_from_whole_wave: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_mov_b32 s0, s33 +; DAGISEL-NEXT: s_mov_b32 s33, s32 +; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; DAGISEL-NEXT: 
scratch_store_b32 off, v34, s33 offset:140 +; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; DAGISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; DAGISEL-NEXT: scratch_store_b32 
off, v148, s33 offset:372 +; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; DAGISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; DAGISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 +; DAGISEL-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL-NEXT: v_swap_b32 v0, v1 +; DAGISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0 
+; DAGISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250 +; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1 +; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2 +; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1 +; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0 +; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3 +; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 s32, s33 +; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; DAGISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196 
+; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; DAGISEL-NEXT: 
scratch_load_b32 v179, off, s33 offset:432 +; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; DAGISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s4 +; DAGISEL-NEXT: s_mov_b32 s33, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: call_gfx_from_whole_wave: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_mov_b32 s0, s33 +; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_xor_saveexec_b32 s4, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; GISEL-NEXT: scratch_store_b32 
off, v13, s33 offset:56 +; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; GISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; GISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; GISEL-NEXT: scratch_store_b32 off, v113, s33 
offset:296 +; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; 
GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_writelane_b32 v40, s0, 3 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_swap_b32 v0, v1 +; GISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; GISEL-NEXT: v_writelane_b32 v40, s4, 0 +; GISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; GISEL-NEXT: s_addk_co_i32 s32, 0x250 +; GISEL-NEXT: v_writelane_b32 v40, s30, 1 +; GISEL-NEXT: v_writelane_b32 v40, s31, 2 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s31, v40, 2 +; GISEL-NEXT: v_readlane_b32 s30, v40, 1 +; GISEL-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL-NEXT: v_readlane_b32 s0, v40, 3 +; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 s32, s33 +; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; GISEL-NEXT: scratch_load_b32 v33, off, s33 
offset:136 +; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; GISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; GISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; 
GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; GISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; GISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; GISEL-NEXT: s_mov_b32 exec_lo, s4 +; GISEL-NEXT: s_mov_b32 s33, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: call_gfx_from_whole_wave: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_mov_b32 s0, s33 +; DAGISEL64-NEXT: s_mov_b32 s33, s32 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; 
DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; DAGISEL64-NEXT: 
scratch_store_b32 off, v82, s33 offset:236 +; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; DAGISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; DAGISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 
offset:456 +; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_writelane_b32 v40, s0, 4 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL64-NEXT: v_swap_b32 v0, v1 +; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0 +; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250 +; DAGISEL64-NEXT: v_writelane_b32 v40, s5, 1 +; DAGISEL64-NEXT: v_writelane_b32 v40, s30, 2 +; DAGISEL64-NEXT: v_writelane_b32 v40, s31, 3 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3 +; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1 +; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0 +; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4 +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b32 s32, s33 +; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 
offset:32 +; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; DAGISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; DAGISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; DAGISEL64-NEXT: 
scratch_load_b32 v97, off, s33 offset:264 +; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; DAGISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; 
DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5] +; DAGISEL64-NEXT: s_mov_b32 s33, s0 +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: call_gfx_from_whole_wave: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_mov_b32 s0, s33 +; GISEL64-NEXT: s_mov_b32 s33, s32 +; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; GISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; GISEL64-NEXT: scratch_store_b32 off, 
v26, s33 offset:108 +; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; GISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; GISEL64-NEXT: scratch_store_b32 
off, v132, s33 offset:340 +; GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; GISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; GISEL64-NEXT: scratch_store_b32 off, v245, s33 
offset:568 +; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_writelane_b32 v40, s0, 4 +; GISEL64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL64-NEXT: v_swap_b32 v0, v1 +; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; GISEL64-NEXT: v_writelane_b32 v40, s4, 0 +; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; GISEL64-NEXT: s_addk_co_i32 s32, 0x250 +; GISEL64-NEXT: v_writelane_b32 v40, s5, 1 +; GISEL64-NEXT: v_writelane_b32 v40, s30, 2 +; GISEL64-NEXT: v_writelane_b32 v40, s31, 3 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_readlane_b32 s31, v40, 3 +; GISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; GISEL64-NEXT: v_readlane_b32 s5, v40, 1 +; GISEL64-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL64-NEXT: v_readlane_b32 s0, v40, 4 +; GISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b32 s32, s33 +; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; 
GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; GISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; GISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; GISEL64-NEXT: scratch_load_b32 v161, off, 
s33 offset:392 +; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; GISEL64-NEXT: s_mov_b64 exec, s[4:5] +; GISEL64-NEXT: s_mov_b32 s33, s0 +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent + ret <2 x half> %ret +} diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index b514c49394d21..278cf0150c2f7 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: 
scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 { entry: @@ -315,6 +316,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 { entry: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index fc730f9e88454..890ea44081ce7 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -46,6 +46,7 @@ ; AFTER-PEI-NEXT: hasInitWholeWave: false ; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0 ; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0 +; AFTER-PEI-NEXT: isWholeWaveFunction: false ; AFTER-PEI-NEXT: body: define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { %wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index 5adef1433079d..f84ef8a3844dd 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 { bb0: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index fa40164aa02f0..cc834d017c149 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 { bb0: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 24565e4423d04..06c580ec6f6b4 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -55,6 +55,7 @@ # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -162,6 +163,7 @@ body: | # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -240,6 +242,7 @@ body: | # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: 
machineFunctionInfo: @@ -319,6 +322,7 @@ body: | # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index a15271382f37d..427154651a381 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -56,6 +56,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0 @@ -105,6 +106,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0 @@ -178,6 +180,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 { ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define void @function() { ret void @@ -233,6 +236,7 @@ define void @function() { ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define void @function_nsz() #0 { ret void diff --git a/llvm/test/Verifier/amdgpu-cc.ll b/llvm/test/Verifier/amdgpu-cc.ll index aec09771d2e4f..e86825e088753 100644 --- a/llvm/test/Verifier/amdgpu-cc.ll +++ b/llvm/test/Verifier/amdgpu-cc.ll @@ -217,3 +217,36 @@ define amdgpu_cs_chain_preserve void @preallocated_cc_amdgpu_cs_chain_preserve(p define amdgpu_cs_chain_preserve void @inalloca_cc_amdgpu_cs_chain_preserve(ptr inalloca(i32) %ptr) { ret void } + +; CHECK: Calling convention requires first argument to be i1 +; CHECK-NEXT: ptr @whole_wave_no_args +define amdgpu_gfx_whole_wave void @whole_wave_no_args() { + ret void +} + +; CHECK: Calling convention requires first argument to be i1 +; CHECK-NEXT: ptr @whole_wave_must_have_i1_active +define amdgpu_gfx_whole_wave void @whole_wave_must_have_i1_active(i32 %x) { + ret void +} + +; CHECK: Calling convention requires first argument to not be inreg +; CHECK-NEXT: ptr @whole_wave_i1_active_inreg +define amdgpu_gfx_whole_wave void @whole_wave_i1_active_inreg(i1 inreg %active) { + ret void +} + +; CHECK: Calling convention does not support varargs +; CHECK-NEXT: ptr @whole_wave_varargs +define amdgpu_gfx_whole_wave void @whole_wave_varargs(i1 %active, i32 %x, ...) { + ret void +} + +declare amdgpu_gfx_whole_wave void @whole_wave_callee(i1 %active) + +; CHECK: calling convention does not permit calls +; CHECK-NEXT: call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true) +define amdgpu_cs void @cant_call_whole_wave_func() { + call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true) + ret void +}
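
For context, a minimal sketch (illustrative only, not part of the patch) of an IR definition that the verifier checks added above would accept: the leading parameter is an i1 that is not marked inreg, and the signature has no varargs. The function and value names below are hypothetical.

; Sketch of a well-formed amdgpu_gfx_whole_wave definition.
define amdgpu_gfx_whole_wave i32 @valid_whole_wave(i1 %active, i32 %x) {
  ; %active is the required leading i1 argument and may be used like any
  ; other per-lane value in the body.
  %v = select i1 %active, i32 %x, i32 0
  ret i32 %v
}

As the final test above (@cant_call_whole_wave_func) shows, a plain call instruction to such a function is rejected by the verifier, so the sketch only demonstrates a definition, not a call site.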