diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index c612f8de7b50b..81d86ba867db8 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1080,7 +1080,7 @@ MachinePointerInfo MachinePointerInfo::getStack(MachineFunction &MF,
 }
 
 MachinePointerInfo MachinePointerInfo::getUnknownStack(MachineFunction &MF) {
-  return MachinePointerInfo(MF.getDataLayout().getAllocaAddrSpace());
+  return MachinePointerInfo(MF.getPSVManager().getStack()->getAddressSpace());
 }
 
 MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 48007be924bda..07917217405b2 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -119,6 +119,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ModRef.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -4500,6 +4501,13 @@ void Verifier::visitAllocaInst(AllocaInst &AI) {
           "alloca on amdgpu must be in addrspace(5)", &AI);
   }
 
+  if (TT.isNVPTX()) {
+    Check(AI.getAddressSpace() == NVPTXAS::ADDRESS_SPACE_LOCAL ||
+              AI.getAddressSpace() == NVPTXAS::ADDRESS_SPACE_GENERIC,
+          "AllocaInst can only be in Generic or Local address space for NVPTX.",
+          &AI);
+  }
+
   visitInstruction(AI);
 }
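
Note on the Verifier change above: on NVPTX an alloca may now only carry the
generic or the local address space. A minimal IR sketch of what the new rule
accepts and rejects (illustrative, not taken from the patch):

    target triple = "nvptx64-nvidia-cuda"

    define void @ok() {
      %a = alloca i32                ; generic (addrspace 0) - accepted
      %b = alloca i32, addrspace(5)  ; local - accepted
      ret void
    }

    ; An alloca in any other space, e.g. shared:
    ;   %c = alloca i32, addrspace(3)
    ; is now a verifier error: "AllocaInst can only be in Generic or Local
    ; address space for NVPTX."
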
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 14ca867023e2a..6785554bc5418 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -80,6 +80,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
 #include "llvm/Support/NativeFormatting.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -1480,14 +1481,12 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   int64_t NumBytes = MFI.getStackSize();
   if (NumBytes) {
     O << "\t.local .align " << MFI.getMaxAlign().value() << " .b8 \t"
-      << DEPOTNAME << getFunctionNumber() << "[" << NumBytes << "];\n";
-    if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
-      O << "\t.reg .b64 \t%SP;\n"
-        << "\t.reg .b64 \t%SPL;\n";
-    } else {
-      O << "\t.reg .b32 \t%SP;\n"
-        << "\t.reg .b32 \t%SPL;\n";
-    }
+      << DEPOTNAME << getFunctionNumber() << "[" << NumBytes << "];\n"
+      << "\t.reg .b"
+      << MF.getTarget().getPointerSizeInBits(ADDRESS_SPACE_GENERIC)
+      << " \t%SP;\n"
+      << "\t.reg .b" << MF.getTarget().getPointerSizeInBits(ADDRESS_SPACE_LOCAL)
+      << " \t%SPL;\n";
   }
 
   // Go through all virtual registers to establish the mapping between the
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 47bc15f52bb96..ccb475d8efc63 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -48,25 +48,45 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
     // mov %SPL, %depot;
     // cvta.local %SP, %SPL;
     // for local address accesses in MF.
-    bool Is64Bit =
-        static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit();
+    // If the generic and local address spaces are different,
+    // it emits:
+    // mov %SPL, %depot;
+    // cvt.u64.u32 %SP, %SPL;
+    // cvta.local %SP, %SP;
+
+    if (MR.use_empty(NRI->getFrameLocalRegister(MF)))
+      // If %SPL is not used, do not bother emitting anything
+      return;
+    bool IsLocal64Bit =
+        MF.getTarget().getPointerSize(NVPTXAS::ADDRESS_SPACE_LOCAL) == 8;
+    bool IsGeneric64Bit =
+        MF.getTarget().getPointerSize(NVPTXAS::ADDRESS_SPACE_GENERIC) == 8;
+    bool NeedsCast = IsGeneric64Bit != IsLocal64Bit;
+    Register SourceReg = NRI->getFrameLocalRegister(MF);
+    if (NeedsCast)
+      SourceReg = NRI->getFrameRegister(MF);
+
     unsigned CvtaLocalOpcode =
-        (Is64Bit ? NVPTX::cvta_local_64 : NVPTX::cvta_local);
-    unsigned MovDepotOpcode =
-        (Is64Bit ? NVPTX::MOV_DEPOT_ADDR_64 : NVPTX::MOV_DEPOT_ADDR);
-    if (!MR.use_empty(NRI->getFrameRegister(MF))) {
-      // If %SP is not used, do not bother emitting "cvta.local %SP, %SPL".
+        (IsGeneric64Bit ? NVPTX::cvta_local_64 : NVPTX::cvta_local);
+
+    MBBI = BuildMI(MBB, MBBI, dl,
+                   MF.getSubtarget().getInstrInfo()->get(CvtaLocalOpcode),
+                   NRI->getFrameRegister(MF))
+               .addReg(SourceReg);
+
+    if (NeedsCast)
       MBBI = BuildMI(MBB, MBBI, dl,
-                     MF.getSubtarget().getInstrInfo()->get(CvtaLocalOpcode),
+                     MF.getSubtarget().getInstrInfo()->get(NVPTX::CVT_u64_u32),
                      NRI->getFrameRegister(MF))
-                 .addReg(NRI->getFrameLocalRegister(MF));
-    }
-    if (!MR.use_empty(NRI->getFrameLocalRegister(MF))) {
-      BuildMI(MBB, MBBI, dl,
-              MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
-              NRI->getFrameLocalRegister(MF))
-          .addImm(MF.getFunctionNumber());
-    }
+                 .addReg(NRI->getFrameLocalRegister(MF))
+                 .addImm(NVPTX::PTXCvtMode::NONE);
+
+    unsigned MovDepotOpcode =
+        (IsLocal64Bit ? NVPTX::MOV_DEPOT_ADDR_64 : NVPTX::MOV_DEPOT_ADDR);
+    BuildMI(MBB, MBBI, dl,
+            MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
+            NRI->getFrameLocalRegister(MF))
+        .addImm(MF.getFunctionNumber());
   }
 }
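
Sketch of the prologue this now produces when the local and generic pointer
sizes differ (the -nvptx-short-ptr configuration: 32-bit local, 64-bit
generic; exact PTX below is illustrative):

    .reg .b64 %SP;                    // sized by the generic address space
    .reg .b32 %SPL;                   // sized by the local address space
    mov.b32 %SPL, __local_depot0;     // MOV_DEPOT_ADDR
    cvt.u64.u32 %SP, %SPL;            // widen local pointer to generic width
    cvta.local.u64 %SP, %SP;          // local -> generic

When the two sizes match, the prologue keeps the old two-instruction form, and
nothing is emitted at all if %SPL has no uses.
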
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d3fb657851fe2..9ad16151014f8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1116,7 +1116,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::FMINNUM3)
     MAKE_CASE(NVPTXISD::FMAXIMUM3)
     MAKE_CASE(NVPTXISD::FMINIMUM3)
-    MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
     MAKE_CASE(NVPTXISD::STACKRESTORE)
     MAKE_CASE(NVPTXISD::STACKSAVE)
     MAKE_CASE(NVPTXISD::SETP_F16X2)
@@ -1781,10 +1780,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
     const Function &Fn = DAG.getMachineFunction().getFunction();
-
     DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
         Fn,
         "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
@@ -1795,28 +1792,7 @@ SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     return DAG.getMergeValues(Ops, SDLoc());
   }
 
-  SDLoc DL(Op.getNode());
-  SDValue Chain = Op.getOperand(0);
-  SDValue Size = Op.getOperand(1);
-  uint64_t Align = Op.getConstantOperandVal(2);
-
-  // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
-  // the default stack alignment should be used.
-  if (Align == 0)
-    Align = DAG.getSubtarget().getFrameLowering()->getStackAlign().value();
-
-  // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
-  const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
-
-  SDValue Alloc =
-      DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
-                  {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
-                   DAG.getTargetConstant(Align, DL, MVT::i32)});
-
-  SDValue ASC = DAG.getAddrSpaceCast(
-      DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
-
-  return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
+  return Op;
 }
 
 SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 03b3edc902e54..6033f3dc8df15 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -69,7 +69,6 @@ enum NodeType : unsigned {
   FMAXIMUM3,
   FMINIMUM3,
-  DYNAMIC_STACKALLOC,
   STACKRESTORE,
   STACKSAVE,
   BrxStart,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4e38e026e6bda..2ff82a1c96aca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2259,22 +2259,31 @@ def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[
 // brkpt instruction
 def debugtrapinst : BasicNVPTXInst<(outs), (ins), "brkpt", [(debugtrap)]>;
 
-def SDTDynAllocaOp :
-  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>;
+def SDTDynAllocaOp
+    : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>;
 
-def dyn_alloca :
-  SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
-         [SDNPHasChain, SDNPSideEffect]>;
+def getAllocaAlign : SDNodeXForm<imm, [{
+  if (uint64_t NV = N->getZExtValue())
+    return CurDAG->getTargetConstant(NV, SDLoc(N), N->getValueType(0));
+  return CurDAG->getTargetConstant(CurDAG->getSubtarget().getFrameLowering()->getStackAlign().value(), SDLoc(N), N->getValueType(0));
+}]>;
 
-foreach t = [I32RT, I64RT] in {
-  def DYNAMIC_STACKALLOC # t.Size :
-    BasicNVPTXInst<(outs t.RC:$ptr),
-                   (ins t.RC:$size, i32imm:$align),
-                   "alloca.u" # t.Size,
-                   [(set t.Ty:$ptr, (dyn_alloca t.Ty:$size, timm:$align))]>,
-    Requires<[hasPTX<73>, hasSM<52>]>;
+def dyn_alloca : SDNode<"ISD::DYNAMIC_STACKALLOC",
+                        SDTDynAllocaOp, [SDNPHasChain, SDNPSideEffect]>;
+
+let Predicates = [hasPTX<73>, hasSM<52>] in {
+  foreach t = [I32RT, I64RT] in {
+    def DYNAMIC_STACKALLOC_#t.Size
+        : BasicNVPTXInst<(outs t.RC:$ptr), (ins t.RC:$size, i32imm:$align),
+                         "alloca.u"#t.Size>;
+  }
 }
 
+def : Pat<(i32(dyn_alloca i32:$size, imm:$align)),
+          (DYNAMIC_STACKALLOC_32 $size, (getAllocaAlign imm:$align))>;
+def : Pat<(i64(dyn_alloca i64:$size, imm:$align)),
+          (DYNAMIC_STACKALLOC_64 $size, (getAllocaAlign imm:$align))>;
+
 //
 // BRX
 //
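
With the custom NVPTXISD::DYNAMIC_STACKALLOC node gone, the generic
ISD::DYNAMIC_STACKALLOC node is matched directly by the patterns above, and
getAllocaAlign substitutes the target's default stack alignment when the node
carries alignment 0. A rough IR-level sketch of what this selects (register
names illustrative, not from the patch):

    define ptr @grow(i64 %n) {
      %p = alloca i8, i64 %n, align 16   ; dynamic alloca
      ret ptr %p
    }

    ; expected selection:
    ;   alloca.u64     %rd2, %rd1, 16;   // DYNAMIC_STACKALLOC_64
    ;   cvta.local.u64 %rd3, %rd2;       // local -> generic, now produced by
    ;                                    // ordinary address-space lowering
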
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index 88bc000f39bf7..6ba29da911bc0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -6,16 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// For all alloca instructions, and add a pair of cast to local address for
-// each of them. For example,
+// Change the address space of each alloca to local and add an addrspacecast to
+// generic address space. For example,
 //
 //   %A = alloca i32
 //   store i32 0, i32* %A ; emits st.u32
 //
 // will be transformed to
 //
-//   %A = alloca i32
-//   %Local = addrspacecast i32* %A to i32 addrspace(5)*
+//   %A = alloca i32, addrspace(5)
 //   %Generic = addrspacecast i32 addrspace(5)* %A to i32*
 //   store i32 0, i32 addrspace(5)* %Generic ; emits st.local.u32
 //
@@ -24,22 +23,31 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
 
 using namespace llvm;
+using namespace NVPTXAS;
 
 namespace {
 class NVPTXLowerAlloca : public FunctionPass {
-  bool runOnFunction(Function &F) override;
+  bool lowerFunctionAllocas(Function &F);
 
 public:
-  static char ID; // Pass identification, replacement for typeid
+  static char ID;
   NVPTXLowerAlloca() : FunctionPass(ID) {}
+  bool runOnFunction(Function &F) override;
   StringRef getPassName() const override {
     return "convert address space of alloca'ed memory to local";
   }
@@ -51,84 +59,47 @@ char NVPTXLowerAlloca::ID = 1;
 INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca", "Lower Alloca",
                 false, false)
 
-// =============================================================================
-// Main function for this pass.
-// =============================================================================
 bool NVPTXLowerAlloca::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  bool Changed = false;
-  for (auto &BB : F)
-    for (auto &I : BB) {
-      if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
-        Changed = true;
+  SmallVector<AllocaInst *> Allocas;
+  for (auto &I : instructions(F))
+    if (auto *Alloca = dyn_cast<AllocaInst>(&I))
+      if (Alloca->getAddressSpace() != ADDRESS_SPACE_LOCAL)
+        Allocas.push_back(Alloca);
 
-        PointerType *AllocInstPtrTy =
-            cast<PointerType>(allocaInst->getType()->getScalarType());
-        unsigned AllocAddrSpace = AllocInstPtrTy->getAddressSpace();
-        assert((AllocAddrSpace == ADDRESS_SPACE_GENERIC ||
-                AllocAddrSpace == ADDRESS_SPACE_LOCAL) &&
-               "AllocaInst can only be in Generic or Local address space for "
-               "NVPTX.");
-
-        Instruction *AllocaInLocalAS = allocaInst;
-        auto ETy = allocaInst->getAllocatedType();
-
-        // We need to make sure that LLVM has info that alloca needs to go to
-        // ADDRESS_SPACE_LOCAL for InferAddressSpace pass.
-        //
-        // For allocas in ADDRESS_SPACE_GENERIC, we add addrspacecast to
-        // ADDRESS_SPACE_LOCAL and back to ADDRESS_SPACE_GENERIC, so that
-        // the alloca's users still use a generic pointer to operate on.
-        //
-        // For allocas already in ADDRESS_SPACE_LOCAL, we just need
-        // addrspacecast to ADDRESS_SPACE_GENERIC.
-        if (AllocAddrSpace == ADDRESS_SPACE_GENERIC) {
-          auto ASCastToLocalAS = new AddrSpaceCastInst(
-              allocaInst,
-              PointerType::get(ETy->getContext(), ADDRESS_SPACE_LOCAL), "");
-          ASCastToLocalAS->insertAfter(allocaInst->getIterator());
-          AllocaInLocalAS = ASCastToLocalAS;
-        }
+  if (Allocas.empty())
+    return false;
 
-        auto AllocaInGenericAS = new AddrSpaceCastInst(
-            AllocaInLocalAS,
-            PointerType::get(ETy->getContext(), ADDRESS_SPACE_GENERIC), "");
-        AllocaInGenericAS->insertAfter(AllocaInLocalAS->getIterator());
+  IRBuilder<> Builder(F.getContext());
+  for (AllocaInst *Alloca : Allocas) {
+    Builder.SetInsertPoint(Alloca);
+    auto *NewAlloca =
+        Builder.CreateAlloca(Alloca->getAllocatedType(), ADDRESS_SPACE_LOCAL,
+                             Alloca->getArraySize(), Alloca->getName());
+    NewAlloca->setAlignment(Alloca->getAlign());
+    auto *Cast = Builder.CreateAddrSpaceCast(
+        NewAlloca,
+        PointerType::get(Alloca->getAllocatedType()->getContext(),
+                         ADDRESS_SPACE_GENERIC),
+        "");
+    for (auto &U : llvm::make_early_inc_range(Alloca->uses())) {
+      auto *II = dyn_cast<IntrinsicInst>(U.getUser());
+      if (!II || !II->isLifetimeStartOrEnd())
+        continue;
 
-        for (Use &AllocaUse : llvm::make_early_inc_range(allocaInst->uses())) {
-          // Check Load, Store, GEP, and BitCast Uses on alloca and make them
-          // use the converted generic address, in order to expose non-generic
-          // addrspacecast to NVPTXInferAddressSpaces. For other types
-          // of instructions this is unnecessary and may introduce redundant
-          // address cast.
-          auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
-          if (LI && LI->getPointerOperand() == allocaInst &&
-              !LI->isVolatile()) {
-            LI->setOperand(LI->getPointerOperandIndex(), AllocaInGenericAS);
-            continue;
-          }
-          auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
-          if (SI && SI->getPointerOperand() == allocaInst &&
-              !SI->isVolatile()) {
-            SI->setOperand(SI->getPointerOperandIndex(), AllocaInGenericAS);
-            continue;
-          }
-          auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
-          if (GI && GI->getPointerOperand() == allocaInst) {
-            GI->setOperand(GI->getPointerOperandIndex(), AllocaInGenericAS);
-            continue;
-          }
-          auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
-          if (BI && BI->getOperand(0) == allocaInst) {
-            BI->setOperand(0, AllocaInGenericAS);
-            continue;
-          }
-        }
-      }
+      Builder.SetInsertPoint(II);
+      Builder.CreateIntrinsic(II->getIntrinsicID(), {NewAlloca->getType()},
+                              {NewAlloca});
+      II->eraseFromParent();
     }
-  return Changed;
+
+    SmallVector<DbgVariableRecord *> DbgVariableUses;
+    findDbgValues(Alloca, DbgVariableUses);
+    for (auto *Dbg : DbgVariableUses)
+      Dbg->replaceVariableLocationOp(Alloca, NewAlloca);
+
+    Alloca->replaceAllUsesWith(Cast);
+    Alloca->eraseFromParent();
+  }
+  return true;
 }
 
 FunctionPass *llvm::createNVPTXLowerAllocaPass() {
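
A rough before/after sketch of the rewritten pass (hypothetical IR). Note that
the patch retargets lifetime intrinsics at the new local alloca itself, while
every other user goes through the generic cast:

    ; before
    %A = alloca i32
    store i32 0, ptr %A

    ; after
    %A1 = alloca i32, align 4, addrspace(5)
    %gen = addrspacecast ptr addrspace(5) %A1 to ptr
    store i32 0, ptr %gen    ; InferAddressSpaces can later rewrite this to a
                             ; store through the addrspace(5) pointer
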
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index e2bbe57c0085c..050ba38b7fcd2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -529,7 +529,9 @@ void copyByValParam(Function &F, Argument &Arg) {
   // the use of the byval parameter with this alloca instruction.
   AllocA->setAlignment(
       Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
-  Arg.replaceAllUsesWith(AllocA);
+  auto *AddressSpaceCast =
+      IRB.CreateAddrSpaceCast(AllocA, Arg.getType(), Arg.getName());
+  Arg.replaceAllUsesWith(AddressSpaceCast);
 
   CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg);
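
Sketch of what copyByValParam now produces (hypothetical names): the local
copy of the byval argument is itself an addrspace(5) alloca, so the
argument's generic-pointer users are fed through a cast:

    %copy = alloca %struct.s, align 4, addrspace(5)
    %copy.gen = addrspacecast ptr addrspace(5) %copy to ptr
    ; all uses of the byval Argument are replaced with %copy.gen
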
diff --git a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
index e9b0aaeca4964..9c0ca5d9c5e62 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -12,22 +12,25 @@
 //    mov %SPL, %depot
 //    cvta.local %SP, %SPL
 //
-// Because Frame Index is a generic address and alloca can only return generic
-// pointer, without this pass the instructions producing alloca'ed address will
-// be based on %SP. NVPTXLowerAlloca tends to help replace store and load on
-// this address with their .local versions, but this may introduce a lot of
-// cvta.to.local instructions. Performance can be improved if we avoid casting
-// address back and forth and directly calculate local address based on %SPL.
+// Allocas are local addresses; if multiple alloca addresses need to be
+// converted to generic ones, then multiple cvta.local instructions will be
+// emitted. To eliminate these redundant cvta.local instructions, we need to
+// combine them into a single cvta.local instruction.
+//
 // This peephole pass optimizes these cases, for example
 //
 // It will transform the following pattern
-//    %0 = LEA_ADDRi64 %VRFrame64, 4
-//    %1 = cvta_to_local_64 %0
+//    %0 = LEA_ADDRi64 %VRFrameLocal64, 0
+//    %1 = LEA_ADDRi64 %VRFrameLocal64, 4
+//    %2 = cvta_to_local_64 %0
+//    %3 = cvta_to_local_64 %1
 //
 // into
-//    %1 = LEA_ADDRi64 %VRFrameLocal64, 4
+//    %0 = LEA_ADDRi64 %VRFrame64, 0
+//    %1 = LEA_ADDRi64 %VRFrame64, 4
 //
 // %VRFrameLocal64 is the virtual register name of %SPL
+// %VRFrame64 is the virtual register name of %SP
 //
 //===----------------------------------------------------------------------===//
 
@@ -66,25 +69,36 @@ char NVPTXPeephole::ID = 0;
 
 INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false)
 
-static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
+static bool isCVTALocalCombinationCandidate(MachineInstr &Root) {
   auto &MBB = *Root.getParent();
   auto &MF = *MBB.getParent();
-  // Check current instruction is cvta.to.local
-  if (Root.getOpcode() != NVPTX::cvta_to_local_64 &&
-      Root.getOpcode() != NVPTX::cvta_to_local)
+  // Check current instruction is cvta.local
+  if (Root.getOpcode() != NVPTX::cvta_local_64 &&
+      Root.getOpcode() != NVPTX::cvta_local)
     return false;
 
   auto &Op = Root.getOperand(1);
   const auto &MRI = MF.getRegInfo();
-  MachineInstr *GenericAddrDef = nullptr;
+  MachineInstr *LocalAddrDef = nullptr;
   if (Op.isReg() && Op.getReg().isVirtual()) {
-    GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
+    LocalAddrDef = MRI.getUniqueVRegDef(Op.getReg());
+  }
+
+  if (!LocalAddrDef || LocalAddrDef->getParent() != &MBB)
+    return false;
+
+  // With -nvptx-short-ptr there's an extra cvt.u64.u32 instruction
+  // between the LEA_ADDRi and the cvta.local.
+  if (LocalAddrDef->getOpcode() == NVPTX::CVT_u64_u32) {
+    auto &Op = LocalAddrDef->getOperand(1);
+    if (Op.isReg() && Op.getReg().isVirtual())
+      LocalAddrDef = MRI.getUniqueVRegDef(Op.getReg());
   }
 
   // Check the register operand is uniquely defined by LEA_ADDRi instruction
-  if (!GenericAddrDef || GenericAddrDef->getParent() != &MBB ||
-      (GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 &&
-       GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) {
+  if (!LocalAddrDef || LocalAddrDef->getParent() != &MBB ||
+      (LocalAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 &&
+       LocalAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) {
     return false;
   }
 
@@ -92,37 +106,63 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
       MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo();
 
   // Check the LEA_ADDRi operand is Frame index
-  auto &BaseAddrOp = GenericAddrDef->getOperand(1);
-  if (BaseAddrOp.isReg() && BaseAddrOp.getReg() == NRI->getFrameRegister(MF)) {
+  auto &BaseAddrOp = LocalAddrDef->getOperand(1);
+  if (BaseAddrOp.isReg() &&
+      BaseAddrOp.getReg() == NRI->getFrameLocalRegister(MF)) {
    return true;
   }
 
   return false;
 }
 
-static void CombineCVTAToLocal(MachineInstr &Root) {
-  auto &MBB = *Root.getParent();
+static void CombineCVTALocal(MachineInstr &CVTALocalInstr) {
+  auto &MBB = *CVTALocalInstr.getParent();
   auto &MF = *MBB.getParent();
   const auto &MRI = MF.getRegInfo();
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  auto *LeaInstr = MRI.getUniqueVRegDef(CVTALocalInstr.getOperand(1).getReg());
+  MachineInstr *CVTInstr = nullptr;
+  if (LeaInstr->getOpcode() == NVPTX::CVT_u64_u32) {
+    CVTInstr = LeaInstr;
+    LeaInstr = MRI.getUniqueVRegDef(LeaInstr->getOperand(1).getReg());
+    if ((LeaInstr->getOpcode() != NVPTX::LEA_ADDRi64 &&
+         LeaInstr->getOpcode() != NVPTX::LEA_ADDRi)) {
+      LLVM_DEBUG(dbgs() << "NVPTXPeephole: Expected LEA_ADDRi64 or LEA_ADDRi");
+
+      return;
+    }
+  }
 
   const NVPTXRegisterInfo *NRI =
       MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo();
 
   MachineInstrBuilder MIB =
-      BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
-              Root.getOperand(0).getReg())
-          .addReg(NRI->getFrameLocalRegister(MF))
-          .add(Prev.getOperand(2));
-
-  MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
-
-  // Check if MRI has only one non dbg use, which is Root
-  if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) {
-    Prev.eraseFromParent();
+      BuildMI(MF, CVTALocalInstr.getDebugLoc(), TII->get(LeaInstr->getOpcode()),
+              CVTALocalInstr.getOperand(0).getReg())
+          .addReg(NRI->getFrameRegister(MF))
+          .add(LeaInstr->getOperand(2));
+
+  MBB.insert((MachineBasicBlock::iterator)&CVTALocalInstr, MIB);
+
+  // Check if we can erase the cvt.u64.u32 or LEA_ADDRi instructions
+  if (CVTInstr) {
+    // Check if the cvt.u64.u32 instruction has only one non dbg use
+    // which is the cvta.local instruction.
+    if (MRI.hasOneNonDBGUse(CVTInstr->getOperand(0).getReg()))
+      CVTInstr->eraseFromParent();
+
+    // Check if the LEA_ADDRi instruction has no other non dbg uses
+    // (i.e. cvt.u64.u32 was the only non dbg use)
+    if (MRI.use_nodbg_empty(CVTInstr->getOperand(0).getReg()))
+      CVTInstr->eraseFromParent();
+
+  } else if (MRI.hasOneNonDBGUse(LeaInstr->getOperand(0).getReg())) {
+    // Check if the LEA_ADDRi instruction has only one non dbg use
+    // which is the cvta.local instruction.
+    LeaInstr->eraseFromParent();
   }
-  Root.eraseFromParent();
+
+  CVTALocalInstr.eraseFromParent();
 }
 
 bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
@@ -137,8 +177,8 @@ bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
     while (BlockIter != MBB.end()) {
       auto &MI = *BlockIter++;
 
-      if (isCVTAToLocalCombinationCandidate(MI)) {
-        CombineCVTAToLocal(MI);
+      if (isCVTALocalCombinationCandidate(MI)) {
+        CombineCVTALocal(MI);
         Changed = true;
       }
     } // Instruction
@@ -147,13 +187,24 @@ bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
   const NVPTXRegisterInfo *NRI =
       MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo();
 
-  // Remove unnecessary %VRFrame = cvta.local %VRFrameLocal
   const auto &MRI = MF.getRegInfo();
-  if (MRI.use_empty(NRI->getFrameRegister(MF))) {
-    if (auto MI = MRI.getUniqueVRegDef(NRI->getFrameRegister(MF))) {
+
+  // Remove unnecessary %VRFrame = cvta.local %VRFrame
+  if (MRI.hasOneNonDBGUse(NRI->getFrameRegister(MF)))
+    if (auto *MI = MRI.getOneNonDBGUser(NRI->getFrameRegister(MF)))
+      if (MI->getOpcode() == NVPTX::cvta_local_64 ||
+          MI->getOpcode() == NVPTX::cvta_local)
+        MI->eraseFromParent();
+
+  // Remove unnecessary %VRFrame = cvt.u64.u32 %VRFrameLocal
+  if (MRI.use_empty(NRI->getFrameRegister(MF)))
+    if (auto *MI = MRI.getUniqueVRegDef(NRI->getFrameRegister(MF)))
+      MI->eraseFromParent();
+
+  // Remove unnecessary %VRFrameLocal = LEA_ADDRi %depot
+  if (MRI.use_empty(NRI->getFrameLocalRegister(MF)))
+    if (auto *MI = MRI.getUniqueVRegDef(NRI->getFrameLocalRegister(MF)))
      MI->eraseFromParent();
-    }
-  }
 
   return Changed;
 }
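
Using the same notation as the pass comment, the new short-ptr case matches
one extra widening step (a sketch; register names illustrative):

    %0 = LEA_ADDRi %VRFrameLocal32, 4     ; 32-bit local address
    %1 = CVT_u64_u32 %0                   ; widen under -nvptx-short-ptr
    %2 = cvta_local_64 %1                 ; local -> generic

and collapses it into a single LEA based on %SP (the generic frame register),
erasing the intermediate cvt/LEA when they have no remaining non-debug uses.
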
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 646b554878c70..709a5832bb784 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -119,7 +119,8 @@ bool NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                    MI.getOperand(FIOperandNum + 1).getImm();
 
   // Using I0 as the frame pointer
-  MI.getOperand(FIOperandNum).ChangeToRegister(getFrameRegister(MF), false);
+  MI.getOperand(FIOperandNum)
+      .ChangeToRegister(getFrameLocalRegister(MF), false);
   MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
   return false;
 }
@@ -127,14 +128,18 @@ bool NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 Register NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const NVPTXTargetMachine &TM =
       static_cast<const NVPTXTargetMachine &>(MF.getTarget());
-  return TM.is64Bit() ? NVPTX::VRFrame64 : NVPTX::VRFrame32;
+  return TM.getPointerSize(NVPTXAS::ADDRESS_SPACE_GENERIC) == 8
+             ? NVPTX::VRFrame64
+             : NVPTX::VRFrame32;
 }
 
 Register
 NVPTXRegisterInfo::getFrameLocalRegister(const MachineFunction &MF) const {
   const NVPTXTargetMachine &TM =
       static_cast<const NVPTXTargetMachine &>(MF.getTarget());
-  return TM.is64Bit() ? NVPTX::VRFrameLocal64 : NVPTX::VRFrameLocal32;
+  return TM.getPointerSize(NVPTXAS::ADDRESS_SPACE_LOCAL) == 8
+             ? NVPTX::VRFrameLocal64
+             : NVPTX::VRFrameLocal32;
 }
 
 void NVPTXRegisterInfo::clearDebugRegisterMap() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 833f014a4c870..1ca514c42ede9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -216,9 +216,6 @@ class NVPTXPassConfig : public TargetPassConfig {
   // function is only called in opt mode.
   void addEarlyCSEOrGVNPass();
 
-  // Add passes that propagate special memory spaces.
-  void addAddressSpaceInferencePasses();
-
   // Add passes that perform straight-line scalar optimizations.
   void addStraightLineScalarOptimizationPasses();
 };
@@ -304,17 +301,6 @@ void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
     addPass(createEarlyCSEPass());
 }
 
-void NVPTXPassConfig::addAddressSpaceInferencePasses() {
-  // NVPTXLowerArgs emits alloca for byval parameters which can often
-  // be eliminated by SROA.
-  addPass(createSROAPass());
-  addPass(createNVPTXLowerAllocaPass());
-  // TODO: Consider running InferAddressSpaces during opt, earlier in the
-  // compilation flow.
-  addPass(createInferAddressSpacesPass());
-  addPass(createNVPTXAtomicLowerPass());
-}
-
 void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
   addPass(createSeparateConstOffsetFromGEPPass());
   addPass(createSpeculativeExecutionPass());
@@ -368,13 +354,22 @@ void NVPTXPassConfig::addIRPasses() {
   // NVPTXLowerArgs is required for correctness and should be run right
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass());
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
+
+  if (getOptLevel() != CodeGenOptLevel::None)
+    // NVPTXLowerArgs may emit alloca for byval parameters which can often
+    // be eliminated by SROA.
+    addPass(createSROAPass());
+  addPass(createNVPTXLowerAllocaPass());
   if (getOptLevel() != CodeGenOptLevel::None) {
-    addAddressSpaceInferencePasses();
+    // TODO: Consider running InferAddressSpaces during opt, earlier in the
+    // compilation flow.
+    addPass(createInferAddressSpacesPass());
+    addPass(createNVPTXAtomicLowerPass());
     addStraightLineScalarOptimizationPasses();
   }
 
   addPass(createAtomicExpandLegacyPass());
-  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
   addPass(createNVPTXCtorDtorLoweringLegacyPass());
 
   // === LSR and other generic IR passes ===
@@ -502,3 +497,12 @@ void NVPTXPassConfig::addMachineSSAOptimization() {
   addPass(&PeepholeOptimizerLegacyID);
   printAndVerify("After codegen peephole optimization pass");
 }
+
+unsigned
+NVPTXTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
+  if (Kind == PseudoSourceValue::FixedStack ||
+      Kind == PseudoSourceValue::Stack) {
+    return ADDRESS_SPACE_LOCAL;
+  }
+  return CodeGenTargetMachineImpl::getAddressSpaceForPseudoSourceKind(Kind);
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 118a01a0352f5..802c63ffca7f6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -76,6 +76,8 @@ class NVPTXTargetMachine : public CodeGenTargetMachineImpl {
 
   std::pair<const Value *, unsigned>
   getPredicatedAddrSpace(const Value *V) const override;
+
+  unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
 }; // NVPTXTargetMachine.
 
 class NVPTXTargetMachine32 : public NVPTXTargetMachine {
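
The new override means stack slots created during codegen (spills, fixed
stack objects) report the local address space. An illustrative use of the
resulting pointer info (not from the patch):

    // Illustrative only: pointer info for a frame index on NVPTX now
    // reports addrspace(5), so isel picks ld.local/st.local for it.
    MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
    assert(PtrInfo.getAddrSpace() == NVPTXAS::ADDRESS_SPACE_LOCAL);
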
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 0eb7f6462f6fa..eee600a8c98fe 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -25,9 +25,9 @@ entry:
 ; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
 ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
-; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
+; CHECK: add.u64 %rd[[SP_REG0:[0-9]+]], %SP, 0
 ; CHECK: ld.global.b32 %r[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
-; CHECK: st.local.b32 [{{%rd[0-9]+}}], %r[[A0_REG]]
+; CHECK: st.local.b32 [%SPL], %r[[A0_REG]]
 
   %0 = load float, ptr %a, align 4
   store float %0, ptr %buf, align 4
@@ -47,7 +47,7 @@ entry:
 ; CHECK-DAG: .param .b64 param0;
 ; CHECK-DAG: .param .b64 param1;
 ; CHECK-DAG: st.param.b64 [param0], %rd[[A_REG]]
-; CHECK-DAG: st.param.b64 [param1], %rd[[SP_REG]]
+; CHECK-DAG: st.param.b64 [param1], %rd[[SP_REG0]]
 ; CHECK: call.uni callee,
 
   call void @callee(ptr %a, ptr %buf) #2
diff --git a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
index 0474d82556c1e..34702f1c177c5 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
@@ -13,13 +13,13 @@ define void @foo(i64 %a, ptr %p0, ptr %p1) {
; CHECK-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-NEXT:    and.b64 %rd3, %rd2, -8;
 ; CHECK-NEXT:    alloca.u64 %rd4, %rd3, 16;
-; CHECK-NEXT:    cvta.local.u64 %rd5, %rd4;
-; CHECK-NEXT:    ld.param.b64 %rd6, [foo_param_1];
-; CHECK-NEXT:    alloca.u64 %rd7, %rd3, 16;
-; CHECK-NEXT:    cvta.local.u64 %rd8, %rd7;
-; CHECK-NEXT:    ld.param.b64 %rd9, [foo_param_2];
-; CHECK-NEXT:    st.b64 [%rd6], %rd5;
-; CHECK-NEXT:    st.b64 [%rd9], %rd8;
+; CHECK-NEXT:    ld.param.b64 %rd5, [foo_param_1];
+; CHECK-NEXT:    cvta.local.u64 %rd6, %rd4;
+; CHECK-NEXT:    ld.param.b64 %rd7, [foo_param_2];
+; CHECK-NEXT:    alloca.u64 %rd8, %rd3, 16;
+; CHECK-NEXT:    cvta.local.u64 %rd9, %rd8;
+; CHECK-NEXT:    st.b64 [%rd5], %rd6;
+; CHECK-NEXT:    st.b64 [%rd7], %rd9;
 ; CHECK-NEXT:    ret;
   %b = alloca i8, i64 %a, align 16
   %c = alloca i8, i64 %a, align 16
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 217bb483682ff..7a594904a35e8 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -89,12 +89,12 @@ define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
 ; CHECK-NOF32X2-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_extract_i_param_0];
 ; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_extract_i_param_1];
-; CHECK-NOF32X2-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NOF32X2-NEXT:    st.local.v2.b32 [%SPL], {%r1, %r2};
 ; CHECK-NOF32X2-NEXT:    and.b64 %rd2, %rd1, 1;
 ; CHECK-NOF32X2-NEXT:    shl.b64 %rd3, %rd2, 2;
-; CHECK-NOF32X2-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NOF32X2-NEXT:    add.u64 %rd4, %SPL, 0;
 ; CHECK-NOF32X2-NEXT:    or.b64 %rd5, %rd4, %rd3;
-; CHECK-NOF32X2-NEXT:    ld.b32 %r3, [%rd5];
+; CHECK-NOF32X2-NEXT:    ld.local.b32 %r3, [%rd5];
 ; CHECK-NOF32X2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
index e1fecdb76bd4d..83cde60b31655 100644
--- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -17,23 +17,22 @@ define internal i32 @foo() {
 ; CHECK-NEXT:    .reg .b64 %SPL;
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.b64 %SPL, __local_depot0;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT:    ld.global.b64 %rd1, [ptr];
+; CHECK-NEXT:    add.u64 %rd1, %SP, 0;
+; CHECK-NEXT:    ld.global.b64 %rd2, [ptr];
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 1 .b8 param0[1];
 ; CHECK-NEXT:    .param .b64 param1;
 ; CHECK-NEXT:    .param .b32 retval0;
-; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
-; CHECK-NEXT:    st.param.b64 [param1], %rd2;
-; CHECK-NEXT:    add.u64 %rd3, %SPL, 1;
-; CHECK-NEXT:    ld.local.b8 %rs1, [%rd3];
+; CHECK-NEXT:    st.param.b64 [param1], %rd1;
+; CHECK-NEXT:    ld.local.b8 %rs1, [%SPL+1];
 ; CHECK-NEXT:    st.param.b8 [param0], %rs1;
 ; CHECK-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
-; CHECK-NEXT:    call (retval0), %rd1, (param0, param1), prototype_0;
+; CHECK-NEXT:    call (retval0), %rd2, (param0, param1), prototype_0;
 ; CHECK-NEXT:    ld.param.b32 %r1, [retval0];
 ; CHECK-NEXT:    } // callseq 0
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -54,23 +53,22 @@ define internal i32 @bar() {
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.b64 %SPL, __local_depot1;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT:    ld.global.b64 %rd1, [ptr];
+; CHECK-NEXT:    add.u64 %rd1, %SP, 0;
+; CHECK-NEXT:    ld.global.b64 %rd2, [ptr];
 ; CHECK-NEXT:    { // callseq 1, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    .param .b64 param1;
 ; CHECK-NEXT:    .param .b32 retval0;
-; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
-; CHECK-NEXT:    st.param.b64 [param1], %rd2;
-; CHECK-NEXT:    add.u64 %rd3, %SPL, 8;
-; CHECK-NEXT:    ld.local.b64 %rd4, [%rd3];
-; CHECK-NEXT:    st.param.b64 [param0], %rd4;
+; CHECK-NEXT:    st.param.b64 [param1], %rd1;
+; CHECK-NEXT:    ld.local.b64 %rd3, [%SPL+8];
+; CHECK-NEXT:    st.param.b64 [param0], %rd3;
 ; CHECK-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
-; CHECK-NEXT:    call (retval0), %rd1, (param0, param1), prototype_1;
+; CHECK-NEXT:    call (retval0), %rd2, (param0, param1), prototype_1;
 ; CHECK-NEXT:    ld.param.b32 %r1, [retval0];
 ; CHECK-NEXT:    } // callseq 1
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index 9dac46cb49005..88a897ebd06fb 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -12,13 +12,12 @@ define void @foo(i32 %a) {
 ; PTX32-NEXT:    .local .align 4 .b8 __local_depot0[4];
 ; PTX32-NEXT:    .reg .b32 %SP;
 ; PTX32-NEXT:    .reg .b32 %SPL;
-; PTX32-NEXT:    .reg .b32 %r<3>;
+; PTX32-NEXT:    .reg .b32 %r<2>;
 ; PTX32-EMPTY:
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot0;
 ; PTX32-NEXT:    ld.param.b32 %r1, [foo_param_0];
-; PTX32-NEXT:    add.u32 %r2, %SPL, 0;
-; PTX32-NEXT:    st.local.b32 [%r2], %r1;
+; PTX32-NEXT:    st.local.b32 [%SPL], %r1;
 ; PTX32-NEXT:    ret;
 ;
 ; PTX64-LABEL: foo(
@@ -27,13 +26,11 @@ define void @foo(i32 %a) {
 ; PTX64-NEXT:    .reg .b64 %SP;
 ; PTX64-NEXT:    .reg .b64 %SPL;
 ; PTX64-NEXT:    .reg .b32 %r<2>;
-; PTX64-NEXT:    .reg .b64 %rd<2>;
 ; PTX64-EMPTY:
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot0;
 ; PTX64-NEXT:    ld.param.b32 %r1, [foo_param_0];
-; PTX64-NEXT:    add.u64 %rd1, %SPL, 0;
-; PTX64-NEXT:    st.local.b32 [%rd1], %r1;
+; PTX64-NEXT:    st.local.b32 [%SPL], %r1;
 ; PTX64-NEXT:    ret;
   %local = alloca i32, align 4
   store volatile i32 %a, ptr %local
@@ -46,15 +43,14 @@ define ptx_kernel void @foo2(i32 %a) {
 ; PTX32-NEXT:    .local .align 4 .b8 __local_depot1[4];
 ; PTX32-NEXT:    .reg .b32 %SP;
 ; PTX32-NEXT:    .reg .b32 %SPL;
-; PTX32-NEXT:    .reg .b32 %r<4>;
+; PTX32-NEXT:    .reg .b32 %r<3>;
 ; PTX32-EMPTY:
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot1;
 ; PTX32-NEXT:    cvta.local.u32 %SP, %SPL;
 ; PTX32-NEXT:    ld.param.b32 %r1, [foo2_param_0];
 ; PTX32-NEXT:    add.u32 %r2, %SP, 0;
-; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
-; PTX32-NEXT:    st.local.b32 [%r3], %r1;
+; PTX32-NEXT:    st.local.b32 [%SPL], %r1;
 ; PTX32-NEXT:    { // callseq 0, 0
 ; PTX32-NEXT:    .param .b32 param0;
 ; PTX32-NEXT:    st.param.b32 [param0], %r2;
@@ -68,15 +64,14 @@ define ptx_kernel void @foo2(i32 %a) {
 ; PTX64-NEXT:    .reg .b64 %SP;
 ; PTX64-NEXT:    .reg .b64 %SPL;
 ; PTX64-NEXT:    .reg .b32 %r<2>;
-; PTX64-NEXT:    .reg .b64 %rd<3>;
+; PTX64-NEXT:    .reg .b64 %rd<2>;
 ; PTX64-EMPTY:
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot1;
 ; PTX64-NEXT:    cvta.local.u64 %SP, %SPL;
 ; PTX64-NEXT:    ld.param.b32 %r1, [foo2_param_0];
 ; PTX64-NEXT:    add.u64 %rd1, %SP, 0;
-; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
+; PTX64-NEXT:    st.local.b32 [%SPL], %r1;
 ; PTX64-NEXT:    { // callseq 0, 0
 ; PTX64-NEXT:    .param .b64 param0;
 ; PTX64-NEXT:    st.param.b64 [param0], %rd1;
@@ -102,9 +97,9 @@ define void @foo3(i32 %a) {
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot2;
 ; PTX32-NEXT:    ld.param.b32 %r1, [foo3_param_0];
-; PTX32-NEXT:    add.u32 %r2, %SPL, 0;
-; PTX32-NEXT:    shl.b32 %r3, %r1, 2;
-; PTX32-NEXT:    add.s32 %r4, %r2, %r3;
+; PTX32-NEXT:    shl.b32 %r2, %r1, 2;
+; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
+; PTX32-NEXT:    add.s32 %r4, %r3, %r2;
 ; PTX32-NEXT:    st.local.b32 [%r4], %r1;
 ; PTX32-NEXT:    ret;
 ;
@@ -119,9 +114,9 @@ define void @foo3(i32 %a) {
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot2;
 ; PTX64-NEXT:    ld.param.b32 %r1, [foo3_param_0];
-; PTX64-NEXT:    add.u64 %rd1, %SPL, 0;
-; PTX64-NEXT:    mul.wide.s32 %rd2, %r1, 4;
-; PTX64-NEXT:    add.s64 %rd3, %rd1, %rd2;
+; PTX64-NEXT:    mul.wide.s32 %rd1, %r1, 4;
+; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX64-NEXT:    add.s64 %rd3, %rd2, %rd1;
 ; PTX64-NEXT:    st.local.b32 [%rd3], %r1;
 ; PTX64-NEXT:    ret;
   %local = alloca [3 x i32], align 4
@@ -136,17 +131,15 @@ define void @foo4() {
 ; PTX32-NEXT:    .local .align 4 .b8 __local_depot3[8];
 ; PTX32-NEXT:    .reg .b32 %SP;
 ; PTX32-NEXT:    .reg .b32 %SPL;
-; PTX32-NEXT:    .reg .b32 %r<5>;
+; PTX32-NEXT:    .reg .b32 %r<3>;
 ; PTX32-EMPTY:
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot3;
 ; PTX32-NEXT:    cvta.local.u32 %SP, %SPL;
 ; PTX32-NEXT:    add.u32 %r1, %SP, 0;
-; PTX32-NEXT:    add.u32 %r2, %SPL, 0;
-; PTX32-NEXT:    add.u32 %r3, %SP, 4;
-; PTX32-NEXT:    add.u32 %r4, %SPL, 4;
-; PTX32-NEXT:    st.local.b32 [%r2], 0;
-; PTX32-NEXT:    st.local.b32 [%r4], 0;
+; PTX32-NEXT:    add.u32 %r2, %SP, 4;
+; PTX32-NEXT:    st.local.b32 [%SPL], 0;
+; PTX32-NEXT:    st.local.b32 [%SPL+4], 0;
 ; PTX32-NEXT:    { // callseq 1, 0
 ; PTX32-NEXT:    .param .b32 param0;
 ; PTX32-NEXT:    st.param.b32 [param0], %r1;
@@ -154,7 +147,7 @@ define void @foo4() {
 ; PTX32-NEXT:    } // callseq 1
 ; PTX32-NEXT:    { // callseq 2, 0
 ; PTX32-NEXT:    .param .b32 param0;
-; PTX32-NEXT:    st.param.b32 [param0], %r3;
+; PTX32-NEXT:    st.param.b32 [param0], %r2;
 ; PTX32-NEXT:    call.uni bar, (param0);
 ; PTX32-NEXT:    } // callseq 2
 ; PTX32-NEXT:    ret;
@@ -164,17 +157,15 @@ define void @foo4() {
 ; PTX64-NEXT:    .local .align 4 .b8 __local_depot3[8];
 ; PTX64-NEXT:    .reg .b64 %SP;
 ; PTX64-NEXT:    .reg .b64 %SPL;
-; PTX64-NEXT:    .reg .b64 %rd<5>;
+; PTX64-NEXT:    .reg .b64 %rd<3>;
 ; PTX64-EMPTY:
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot3;
 ; PTX64-NEXT:    cvta.local.u64 %SP, %SPL;
 ; PTX64-NEXT:    add.u64 %rd1, %SP, 0;
-; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT:    add.u64 %rd3, %SP, 4;
-; PTX64-NEXT:    add.u64 %rd4, %SPL, 4;
-; PTX64-NEXT:    st.local.b32 [%rd2], 0;
-; PTX64-NEXT:    st.local.b32 [%rd4], 0;
+; PTX64-NEXT:    add.u64 %rd2, %SP, 4;
+; PTX64-NEXT:    st.local.b32 [%SPL], 0;
+; PTX64-NEXT:    st.local.b32 [%SPL+4], 0;
 ; PTX64-NEXT:    { // callseq 1, 0
 ; PTX64-NEXT:    .param .b64 param0;
 ; PTX64-NEXT:    st.param.b64 [param0], %rd1;
@@ -182,7 +173,7 @@ define void @foo4() {
 ; PTX64-NEXT:    } // callseq 1
 ; PTX64-NEXT:    { // callseq 2, 0
 ; PTX64-NEXT:    .param .b64 param0;
-; PTX64-NEXT:    st.param.b64 [param0], %rd3;
+; PTX64-NEXT:    st.param.b64 [param0], %rd2;
 ; PTX64-NEXT:    call.uni bar, (param0);
 ; PTX64-NEXT:    } // callseq 2
 ; PTX64-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 57c1e5826c89a..7bc90c42ad12f 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -7,28 +7,28 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "nvptx64-unknown-unknown"
 
 define ptx_kernel void @kernel() {
-; LABEL: @lower_alloca
+; CHECK-LABEL: @kernel
 ; PTX-LABEL: .visible .entry kernel(
   %A = alloca i32
-; CHECK: addrspacecast ptr %A to ptr addrspace(5)
-; CHECK: store i32 0, ptr addrspace(5) {{%.+}}
-; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr %A to ptr addrspace(5)
-; LOWERALLOCAONLY: [[V2:%.*]] = addrspacecast ptr addrspace(5) [[V1]] to ptr
+; CHECK: %A1 = alloca i32, align 4, addrspace(5)
+; CHECK: store i32 0, ptr addrspace(5) %A1
+; LOWERALLOCAONLY: %A1 = alloca i32, align 4, addrspace(5)
+; LOWERALLOCAONLY: [[V2:%.*]] = addrspacecast ptr addrspace(5) %A1 to ptr
 ; LOWERALLOCAONLY: store i32 0, ptr [[V2]], align 4
-; PTX: st.local.b32 [{{%rd[0-9]+}}], 0
+; PTX: st.local.b32 [%SPL], 0
   store i32 0, ptr %A
   call void @callee(ptr %A)
   ret void
 }
 
 define void @alloca_in_explicit_local_as() {
-; LABEL: @lower_alloca_addrspace5
+; CHECK-LABEL: @alloca_in_explicit_local_as
 ; PTX-LABEL: .visible .func alloca_in_explicit_local_as(
   %A = alloca i32, addrspace(5)
-; CHECK: store i32 0, ptr addrspace(5) {{%.+}}
-; PTX: st.local.b32 [%SP], 0
-; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr addrspace(5) %A to ptr
-; LOWERALLOCAONLY: store i32 0, ptr [[V1]], align 4
+; CHECK: store i32 0, ptr addrspace(5) %A, align 4
+; PTX: st.local.b32 [%SPL], 0
+; LOWERALLOCAONLY: %A = alloca i32, align 4, addrspace(5)
+; LOWERALLOCAONLY: store i32 0, ptr addrspace(5) %A
   store i32 0, ptr addrspace(5) %A
   call void @callee(ptr addrspace(5) %A)
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 01ab47145940c..2dd028101abdf 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -152,7 +152,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<2>;
-; PTX-NEXT:    .reg .b64 %rd<8>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot4;
@@ -163,8 +163,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT:    cvta.param.u64 %rd3, %rd2;
 ; PTX-NEXT:    cvta.param.u64 %rd4, %rd1;
 ; PTX-NEXT:    add.u64 %rd5, %SP, 0;
-; PTX-NEXT:    add.u64 %rd6, %SPL, 0;
-; PTX-NEXT:    st.local.b32 [%rd6], %r1;
+; PTX-NEXT:    st.local.b32 [%SPL], %r1;
 ; PTX-NEXT:    { // callseq 1, 0
 ; PTX-NEXT:    .param .b64 param0;
 ; PTX-NEXT:    .param .b64 param1;
@@ -174,8 +173,8 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT:    st.param.b64 [param1], %rd5;
 ; PTX-NEXT:    st.param.b64 [param0], %rd4;
 ; PTX-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
-; PTX-NEXT:    mov.b64 %rd7, escape3;
-; PTX-NEXT:    call (retval0), %rd7, (param0, param1, param2), prototype_1;
+; PTX-NEXT:    mov.b64 %rd6, escape3;
+; PTX-NEXT:    call (retval0), %rd6, (param0, param1, param2), prototype_1;
 ; PTX-NEXT:    } // callseq 1
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
%rd1, [escape_ptrtoint_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; ; PTX-NEXT: add.u64 %rd3, %SP, 0; -; PTX-NEXT: add.u64 %rd4, %SPL, 0; -; PTX-NEXT: ld.param.b32 %r1, [escape_ptrtoint_param_1+4]; -; PTX-NEXT: st.local.b32 [%rd4+4], %r1; -; PTX-NEXT: ld.param.b32 %r2, [escape_ptrtoint_param_1]; -; PTX-NEXT: st.local.b32 [%rd4], %r2; +; PTX-NEXT: ld.param.b32 %rd4, [escape_ptrtoint_param_1+4]; +; PTX-NEXT: shl.b64 %rd5, %rd4, 32; +; PTX-NEXT: ld.param.b32 %rd6, [escape_ptrtoint_param_1]; +; PTX-NEXT: or.b64 %rd7, %rd5, %rd6; +; PTX-NEXT: st.local.b64 [%SPL], %rd7; ; PTX-NEXT: st.global.b64 [%rd2], %rd3; ; PTX-NEXT: ret; entry: @@ -455,64 +450,51 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly ; PTX-NEXT: .local .align 8 .b8 __local_depot9[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; -; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<47>; +; PTX-NEXT: .reg .b16 %rs<17>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot9; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0]; -; PTX-NEXT: add.u64 %rd2, %SPL, 0; -; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4]; -; PTX-NEXT: st.local.b32 [%rd2+4], %r1; -; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1]; -; PTX-NEXT: st.local.b32 [%rd2], %r2; -; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1]; -; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1]; -; PTX-NEXT: shl.b64 %rd5, %rd4, 8; -; PTX-NEXT: or.b64 %rd6, %rd5, %rd3; -; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2]; -; PTX-NEXT: shl.b64 %rd8, %rd7, 16; -; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3]; -; PTX-NEXT: shl.b64 %rd10, %rd9, 24; -; PTX-NEXT: or.b64 %rd11, %rd10, %rd8; -; PTX-NEXT: or.b64 %rd12, %rd11, %rd6; -; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4]; -; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5]; -; PTX-NEXT: shl.b64 %rd15, %rd14, 8; -; PTX-NEXT: or.b64 %rd16, %rd15, %rd13; -; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6]; -; PTX-NEXT: shl.b64 %rd18, %rd17, 16; -; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7]; -; PTX-NEXT: shl.b64 %rd20, %rd19, 24; -; PTX-NEXT: or.b64 %rd21, %rd20, %rd18; -; PTX-NEXT: or.b64 %rd22, %rd21, %rd16; -; PTX-NEXT: shl.b64 %rd23, %rd22, 32; -; PTX-NEXT: or.b64 %rd24, %rd23, %rd12; -; PTX-NEXT: st.volatile.b64 [%SP], %rd24; -; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8]; -; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9]; -; PTX-NEXT: shl.b64 %rd27, %rd26, 8; -; PTX-NEXT: or.b64 %rd28, %rd27, %rd25; -; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10]; -; PTX-NEXT: shl.b64 %rd30, %rd29, 16; -; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11]; -; PTX-NEXT: shl.b64 %rd32, %rd31, 24; -; PTX-NEXT: or.b64 %rd33, %rd32, %rd30; -; PTX-NEXT: or.b64 %rd34, %rd33, %rd28; -; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12]; -; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13]; -; PTX-NEXT: shl.b64 %rd37, %rd36, 8; -; PTX-NEXT: or.b64 %rd38, %rd37, %rd35; -; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14]; -; PTX-NEXT: shl.b64 %rd40, %rd39, 16; -; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15]; -; PTX-NEXT: shl.b64 %rd42, %rd41, 24; -; PTX-NEXT: or.b64 %rd43, %rd42, %rd40; -; PTX-NEXT: or.b64 %rd44, %rd43, %rd38; -; PTX-NEXT: shl.b64 %rd45, %rd44, 32; -; PTX-NEXT: or.b64 %rd46, %rd45, %rd34; -; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46; +; PTX-NEXT: add.u64 %rd2, %SP, 0; +; PTX-NEXT: ld.param.b32 %rd3, [memcpy_to_param_param_1+4]; +; PTX-NEXT: shl.b64 %rd4, %rd3, 32; +; PTX-NEXT: ld.param.b32 %rd5, [memcpy_to_param_param_1]; +; PTX-NEXT: or.b64 %rd6, %rd4, %rd5; +; PTX-NEXT: 
st.local.b64 [%SPL], %rd6; +; PTX-NEXT: ld.volatile.b8 %rs1, [%rd1]; +; PTX-NEXT: st.volatile.b8 [%rd2], %rs1; +; PTX-NEXT: ld.volatile.b8 %rs2, [%rd1+1]; +; PTX-NEXT: st.volatile.b8 [%rd2+1], %rs2; +; PTX-NEXT: ld.volatile.b8 %rs3, [%rd1+2]; +; PTX-NEXT: st.volatile.b8 [%rd2+2], %rs3; +; PTX-NEXT: ld.volatile.b8 %rs4, [%rd1+3]; +; PTX-NEXT: st.volatile.b8 [%rd2+3], %rs4; +; PTX-NEXT: ld.volatile.b8 %rs5, [%rd1+4]; +; PTX-NEXT: st.volatile.b8 [%rd2+4], %rs5; +; PTX-NEXT: ld.volatile.b8 %rs6, [%rd1+5]; +; PTX-NEXT: st.volatile.b8 [%rd2+5], %rs6; +; PTX-NEXT: ld.volatile.b8 %rs7, [%rd1+6]; +; PTX-NEXT: st.volatile.b8 [%rd2+6], %rs7; +; PTX-NEXT: ld.volatile.b8 %rs8, [%rd1+7]; +; PTX-NEXT: st.volatile.b8 [%rd2+7], %rs8; +; PTX-NEXT: ld.volatile.b8 %rs9, [%rd1+8]; +; PTX-NEXT: st.volatile.b8 [%rd2+8], %rs9; +; PTX-NEXT: ld.volatile.b8 %rs10, [%rd1+9]; +; PTX-NEXT: st.volatile.b8 [%rd2+9], %rs10; +; PTX-NEXT: ld.volatile.b8 %rs11, [%rd1+10]; +; PTX-NEXT: st.volatile.b8 [%rd2+10], %rs11; +; PTX-NEXT: ld.volatile.b8 %rs12, [%rd1+11]; +; PTX-NEXT: st.volatile.b8 [%rd2+11], %rs12; +; PTX-NEXT: ld.volatile.b8 %rs13, [%rd1+12]; +; PTX-NEXT: st.volatile.b8 [%rd2+12], %rs13; +; PTX-NEXT: ld.volatile.b8 %rs14, [%rd1+13]; +; PTX-NEXT: st.volatile.b8 [%rd2+13], %rs14; +; PTX-NEXT: ld.volatile.b8 %rs15, [%rd1+14]; +; PTX-NEXT: st.volatile.b8 [%rd2+14], %rs15; +; PTX-NEXT: ld.volatile.b8 %rs16, [%rd1+15]; +; PTX-NEXT: st.volatile.b8 [%rd2+15], %rs16; ; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) @@ -655,17 +637,16 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %bb ; PTX-NEXT: mov.b64 %SPL, __local_depot12; -; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b8 %rs1, [test_select_write_param_3]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; ; PTX-NEXT: ld.param.b32 %r1, [test_select_write_param_1]; -; PTX-NEXT: st.b32 [%SP], %r1; +; PTX-NEXT: st.local.b32 [%SPL], %r1; ; PTX-NEXT: ld.param.b32 %r2, [test_select_write_param_0]; -; PTX-NEXT: st.b32 [%SP+4], %r2; -; PTX-NEXT: add.u64 %rd1, %SPL, 4; -; PTX-NEXT: add.u64 %rd2, %SPL, 0; -; PTX-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; PTX-NEXT: st.local.b32 [%SPL+4], %r2; +; PTX-NEXT: add.u64 %rd1, %SPL, 0; +; PTX-NEXT: add.u64 %rd2, %SPL, 4; +; PTX-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; ; PTX-NEXT: st.local.b32 [%rd3], 1; ; PTX-NEXT: ret; bb: @@ -830,25 +811,23 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b16 %rs<3>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %bb ; PTX-NEXT: mov.b64 %SPL, __local_depot14; -; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b8 %rs1, [test_phi_write_param_2]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX-NEXT: add.u64 %rd1, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [test_phi_write_param_1+4]; -; PTX-NEXT: st.b32 [%SP], %r1; -; PTX-NEXT: add.u64 %rd2, %SPL, 4; +; PTX-NEXT: st.local.b32 [%SPL], %r1; ; PTX-NEXT: ld.param.b32 %r2, [test_phi_write_param_0]; -; PTX-NEXT: st.b32 [%SP+4], %r2; +; PTX-NEXT: st.local.b32 [%SPL+4], %r2; +; PTX-NEXT: add.u64 %rd1, %SPL, 4; ; PTX-NEXT: @%p1 bra $L__BB14_2; ; PTX-NEXT: // %bb.1: // %second -; PTX-NEXT: mov.b64 %rd2, %rd1; +; PTX-NEXT: add.u64 %rd1, %SPL, 0; ; PTX-NEXT: $L__BB14_2: // %merge -; PTX-NEXT: st.local.b32 [%rd2], 1; +; 
+; PTX-NEXT: st.local.b32 [%rd1], 1;
 ; PTX-NEXT: ret;
 bb:
   br i1 %cond, label %first, label %second
@@ -882,13 +861,11 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
 ; PTX-NEXT: .reg .b64 %SP;
 ; PTX-NEXT: .reg .b64 %SPL;
 ; PTX-NEXT: .reg .b32 %r<2>;
-; PTX-NEXT: .reg .b64 %rd<2>;
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: mov.b64 %SPL, __local_depot15;
-; PTX-NEXT: add.u64 %rd1, %SPL, 0;
 ; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0];
-; PTX-NEXT: st.local.b32 [%rd1], %r1;
+; PTX-NEXT: st.local.b32 [%SPL], %r1;
 ; PTX-NEXT: { // callseq 2, 0
 ; PTX-NEXT: .param .align 4 .b8 param0[4];
 ; PTX-NEXT: st.param.b32 [param0], %r1;
diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index a6b1bdda22e3c..d36d13268490e 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -17,55 +17,55 @@ entry:
 ; Test va_start
 ; CHECK: .param .align 8 .b8 foo_vararg[]
 ; CHECK: mov.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], foo_vararg;
-; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR]];
+; CHECK-NEXT: st.b[[BITS]] [[[SP1:%(r|rd)[0-9]+]]], [[VA_PTR]]
 call void @llvm.va_start(ptr %al)
 ; Test va_copy()
-; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
-; CHECK-NEXT: st.b[[BITS]] [%SP+{{[0-9]+}}], [[VA_PTR]];
+; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [[[SP1]]];
+; CHECK-NEXT: st.b[[BITS]] [[[SP2:%(r|rd)[0-9]+]]], [[VA_PTR]];
 call void @llvm.va_copy(ptr %al2, ptr %al)
 ; Test va_arg(ap, int32_t)
-; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [[[SP1]]];
 ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 3;
 ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -4;
 ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 4;
-; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT: st.b[[BITS]] [[[SP1]]], [[VA_PTR_NEXT]];
 ; CHECK-NEXT: ld.local.b32 %r{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
 %0 = va_arg ptr %al, i32
 ; Test va_arg(ap, int64_t)
-; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [[[SP1]]];
 ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7;
 ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8;
 ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8;
-; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT: st.b[[BITS]] [[[SP1]]], [[VA_PTR_NEXT]];
 ; CHECK-NEXT: ld.local.b64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
 %1 = va_arg ptr %al, i64
 ; Test va_arg(ap, double)
-; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [[[SP1]]];
 ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7;
 ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8;
 ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8;
-; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT: st.b[[BITS]] [[[SP1]]], [[VA_PTR_NEXT]];
 ; CHECK-NEXT: ld.local.b64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
 %2 = va_arg ptr %al, double
 ; Test va_arg(ap, ptr)
-; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [[[SP1]]];
 ; CHECK32-NEXT: add.s32 [[VA_PTR_TMP:%r[0-9]+]], [[VA_PTR]], 3;
 ; CHECK64-NEXT: add.s64 [[VA_PTR_TMP:%rd[0-9]+]], [[VA_PTR]], 7;
 ; CHECK32-NEXT: and.b32 [[VA_PTR_ALIGN:%r[0-9]+]], [[VA_PTR_TMP]], -4;
 ; CHECK64-NEXT: and.b64 [[VA_PTR_ALIGN:%rd[0-9]+]], [[VA_PTR_TMP]], -8;
 ; CHECK32-NEXT: add.s32 [[VA_PTR_NEXT:%r[0-9]+]], [[VA_PTR_ALIGN]], 4;
 ; CHECK64-NEXT: add.s64 [[VA_PTR_NEXT:%rd[0-9]+]], [[VA_PTR_ALIGN]], 8;
-; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT: st.b[[BITS]] [[[SP1]]], [[VA_PTR_NEXT]];
 ; CHECK-NEXT: ld.local.b[[BITS]] %{{(r|rd)[0-9]+}}, [[[VA_PTR_ALIGN]]];
 %3 = va_arg ptr %al, ptr
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 61ff80632c789..9e3434eae0894 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -110,16 +110,16 @@ define dso_local i32 @foo() {
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1;
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297;
-; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
-; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
-; CHECK-PTX-NEXT: st.b64 [%SP+32], 4607182418800017408;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL], 4294967297;
+; CHECK-PTX-NEXT: st.local.b32 [%SPL+8], 1;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL+16], 1;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL+24], 4607182418800017408;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL+32], 4607182418800017408;
 ; CHECK-PTX-NEXT: { // callseq 0, 0
 ; CHECK-PTX-NEXT: .param .b32 param0;
 ; CHECK-PTX-NEXT: .param .b64 param1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
 ; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
 ; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics1, (param0, param1);
@@ -138,34 +138,34 @@ entry:
 define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-LABEL: variadics2(
 ; CHECK-PTX: {
-; CHECK-PTX-NEXT: .local .align 1 .b8 __local_depot2[3];
+; CHECK-PTX-NEXT: .local .align 2 .b8 __local_depot2[4];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
+; CHECK-PTX-NEXT: .reg .b16 %rs<6>;
 ; CHECK-PTX-NEXT: .reg .b32 %r<6>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot2;
 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [variadics2_param_0];
 ; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics2_param_1];
-; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT: add.s64 %rd3, %rd1, 7;
-; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8;
-; CHECK-PTX-NEXT: ld.b32 %r2, [%rd4];
-; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4+4];
-; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd4+7];
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
-; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd4+6];
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
-; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd4+5];
-; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
-; CHECK-PTX-NEXT: ld.b64 %rd5, [%rd4+8];
+; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
+; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
+; CHECK-PTX-NEXT: ld.b32 %r2, [%rd3];
+; CHECK-PTX-NEXT: ld.s8 %r3, [%rd3+4];
+; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd3+7];
+; CHECK-PTX-NEXT: st.local.b8 [%SPL+2], %rs1;
+; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd3+5];
+; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd3+6];
+; CHECK-PTX-NEXT: shl.b16 %rs4, %rs3, 8;
+; CHECK-PTX-NEXT: or.b16 %rs5, %rs4, %rs2;
+; CHECK-PTX-NEXT: st.local.b16 [%SPL], %rs5;
+; CHECK-PTX-NEXT: ld.b64 %rd4, [%rd3+8];
 ; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT: cvt.u64.u32 %rd6, %r5;
-; CHECK-PTX-NEXT: add.s64 %rd7, %rd6, %rd5;
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd7;
+; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r5;
+; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd6;
 ; CHECK-PTX-NEXT: ret;
 entry:
   %vlist = alloca ptr, align 8
@@ -201,29 +201,29 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
+; CHECK-PTX-NEXT: .reg .b16 %rs<6>;
 ; CHECK-PTX-NEXT: .reg .b32 %r<2>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<2>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot3;
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 8;
 ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
-; CHECK-PTX-NEXT: st.local.b8 [%rd1+2], %rs1;
+; CHECK-PTX-NEXT: st.local.b8 [%SPL+2], %rs1;
 ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
-; CHECK-PTX-NEXT: st.local.b8 [%rd1+1], %rs2;
-; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
-; CHECK-PTX-NEXT: st.local.b8 [%rd1], %rs3;
-; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
-; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
-; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
+; CHECK-PTX-NEXT: shl.b16 %rs3, %rs2, 8;
+; CHECK-PTX-NEXT: ld.global.nc.b8 %rs4, [__const_$_bar_$_s1+5];
+; CHECK-PTX-NEXT: or.b16 %rs5, %rs3, %rs4;
+; CHECK-PTX-NEXT: st.local.b16 [%SPL], %rs5;
+; CHECK-PTX-NEXT: st.local.b32 [%SPL+8], 1;
+; CHECK-PTX-NEXT: st.local.b8 [%SPL+12], 1;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL+16], 1;
 ; CHECK-PTX-NEXT: { // callseq 1, 0
 ; CHECK-PTX-NEXT: .param .b32 param0;
 ; CHECK-PTX-NEXT: .param .b64 param1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
-; CHECK-PTX-NEXT: add.u64 %rd2, %SP, 8;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
 ; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1);
 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
@@ -288,12 +288,12 @@ define dso_local i32 @baz() {
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot5;
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: st.v4.b32 [%SP], {1, 1, 1, 1};
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.local.v4.b32 [%SPL], {1, 1, 1, 1};
 ; CHECK-PTX-NEXT: { // callseq 2, 0
 ; CHECK-PTX-NEXT: .param .b32 param0;
 ; CHECK-PTX-NEXT: .param .b64 param1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
 ; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
 ; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics3, (param0, param1);
@@ -348,27 +348,24 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot7;
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 16;
 ; CHECK-PTX-NEXT: ld.global.nc.b64 %rd2, [__const_$_qux_$_s+8];
-; CHECK-PTX-NEXT: st.local.b64 [%rd1+8], %rd2;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL+8], %rd2;
 ; CHECK-PTX-NEXT: ld.global.nc.b64 %rd3, [__const_$_qux_$_s];
-; CHECK-PTX-NEXT: st.local.b64 [%rd1], %rd3;
-; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL], %rd3;
+; CHECK-PTX-NEXT: st.local.b64 [%SPL+16], 1;
 ; CHECK-PTX-NEXT: { // callseq 3, 0
 ; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
 ; CHECK-PTX-NEXT: .param .b64 param1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
-; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 16;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4;
-; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd1+8];
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd5;
-; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd1];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd6;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd3;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1);
 ; CHECK-PTX-NEXT: } // callseq 3
 ; CHECK-PTX-NEXT: ret;
diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
index fa42481016540..d41b5aefec58a 100644
--- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
+++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
@@ -9,7 +9,6 @@
 ; CHECK: .loc 1 5 3 // t.c:5:3
 ; CHECK: { // callseq 0, 0
 ; CHECK: .param .b64 param0;
-; CHECK: add.u64 %rd1, %SP, 0;
 ; CHECK: st.param.b64 [param0], %rd1;
 ; CHECK: call.uni escape_foo, (param0);
 ; CHECK: } // callseq 0
diff --git a/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll b/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
index 6ca906ad3ef25..406af7ea29b5e 100644
--- a/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
+++ b/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
@@ -22,7 +22,7 @@
 ; CHECK: DEBUG_VALUE: foo:i <- 3
 ; CHECK: DEBUG_VALUE: foo:i <- 7
 ; CHECK: DEBUG_VALUE: foo:i <- %
-; CHECK: DEBUG_VALUE: foo:i <- [DW_OP_deref] $vrdepot
+; CHECK: DEBUG_VALUE: foo:i <- [DW_OP_deref] $vrdepot
 ; Function Attrs: nounwind ssp uwtable
 define i32 @foo() #0 !dbg !4 {