diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d817a3c6a8777..bb0aeb493ed48 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -390,35 +391,27 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, /// and promote them to a larger size if they're not. /// /// The promoted type is placed in \p PromoteVT if the function returns true. -static std::optional PromoteScalarIntegerPTX(const EVT &VT) { +static EVT promoteScalarIntegerPTX(const EVT VT) { if (VT.isScalarInteger()) { - MVT PromotedVT; switch (PowerOf2Ceil(VT.getFixedSizeInBits())) { default: llvm_unreachable( "Promotion is not suitable for scalars of size larger than 64-bits"); case 1: - PromotedVT = MVT::i1; - break; + return MVT::i1; case 2: case 4: case 8: - PromotedVT = MVT::i8; - break; + return MVT::i8; case 16: - PromotedVT = MVT::i16; - break; + return MVT::i16; case 32: - PromotedVT = MVT::i32; - break; + return MVT::i32; case 64: - PromotedVT = MVT::i64; - break; + return MVT::i64; } - if (VT != PromotedVT) - return PromotedVT; } - return std::nullopt; + return VT; } // Check whether we can merge loads/stores of some of the pieces of a @@ -1053,10 +1046,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { break; MAKE_CASE(NVPTXISD::RET_GLUE) - MAKE_CASE(NVPTXISD::DeclareParam) + MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) - MAKE_CASE(NVPTXISD::DeclareRet) - MAKE_CASE(NVPTXISD::DeclareRetParam) MAKE_CASE(NVPTXISD::CALL) MAKE_CASE(NVPTXISD::LoadParam) MAKE_CASE(NVPTXISD::LoadParamV2) @@ -1162,8 +1153,8 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, } std::string NVPTXTargetLowering::getPrototype( - const DataLayout &DL, Type *retTy, const ArgListTy &Args, - const SmallVectorImpl &Outs, MaybeAlign RetAlign, + const DataLayout &DL, Type *RetTy, const ArgListTy &Args, + const SmallVectorImpl &Outs, std::optional FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const { auto PtrVT = getPointerTy(DL); @@ -1172,22 +1163,22 @@ std::string NVPTXTargetLowering::getPrototype( raw_string_ostream O(Prototype); O << "prototype_" << UniqueCallSite << " : .callprototype "; - if (retTy->isVoidTy()) { + if (RetTy->isVoidTy()) { O << "()"; } else { O << "("; - if (shouldPassAsArray(retTy)) { - assert(RetAlign && "RetAlign must be set for non-void return types"); - O << ".param .align " << RetAlign->value() << " .b8 _[" - << DL.getTypeAllocSize(retTy) << "]"; - } else if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { + if (shouldPassAsArray(RetTy)) { + const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL); + O << ".param .align " << RetAlign.value() << " .b8 _[" + << DL.getTypeAllocSize(RetTy) << "]"; + } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) { unsigned size = 0; - if (auto *ITy = dyn_cast(retTy)) { + if (auto *ITy = dyn_cast(RetTy)) { size = ITy->getBitWidth(); } else { - assert(retTy->isFloatingPointTy() && + assert(RetTy->isFloatingPointTy() && "Floating point type expected here"); - size = retTy->getPrimitiveSizeInBits(); + size = RetTy->getPrimitiveSizeInBits(); } // PTX ABI requires all scalar return values to be at least 32 // bits in size. fp16 normally uses .b16 as its storage type in @@ -1195,7 +1186,7 @@ std::string NVPTXTargetLowering::getPrototype( size = promoteScalarArgumentSize(size); O << ".param .b" << size << " _"; - } else if (isa(retTy)) { + } else if (isa(RetTy)) { O << ".param .b" << PtrVT.getSizeInBits() << " _"; } else { llvm_unreachable("Unknown return type"); @@ -1256,7 +1247,7 @@ std::string NVPTXTargetLowering::getPrototype( if (FirstVAArg) O << (first ? "" : ",") << " .param .align " - << STI.getMaxRequiredAlignment() << " .b8 _[]\n"; + << STI.getMaxRequiredAlignment() << " .b8 _[]"; O << ")"; if (shouldEmitPTXNoReturn(&CB, *nvTM)) O << " .noreturn"; @@ -1442,6 +1433,21 @@ static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) { return ISD::ANY_EXTEND; } +static SDValue correctParamType(SDValue V, EVT ExpectedVT, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + SDLoc dl) { + const EVT ActualVT = V.getValueType(); + assert((ActualVT == ExpectedVT || + (ExpectedVT.isInteger() && ActualVT.isInteger())) && + "Non-integer argument type size mismatch"); + if (ExpectedVT.bitsGT(ActualVT)) + return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V); + if (ExpectedVT.bitsLT(ActualVT)) + return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V); + + return V; +} + SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -1505,9 +1511,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, "Outs and OutVals must be the same size"); // Declare the .params or .reg need to pass values // to the function - for (const auto [ArgI, Arg] : llvm::enumerate(Args)) { - const auto ArgOuts = AllOuts.take_while( - [ArgI = ArgI](auto O) { return O.OrigArgIndex == ArgI; }); + for (const auto E : llvm::enumerate(Args)) { + const auto ArgI = E.index(); + const auto Arg = E.value(); + const auto ArgOuts = + AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; }); const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size()); AllOuts = AllOuts.drop_front(ArgOuts.size()); AllOutVals = AllOutVals.drop_front(ArgOuts.size()); @@ -1515,6 +1523,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const bool IsVAArg = (ArgI >= FirstVAArg); const bool IsByVal = Arg.IsByVal; + const SDValue ParamSymbol = + getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32); + SmallVector VTs; SmallVector Offsets; @@ -1525,38 +1536,43 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(VTs.size() == Offsets.size() && "Size mismatch"); assert((IsByVal || VTs.size() == ArgOuts.size()) && "Size mismatch"); - Align ArgAlign; - if (IsByVal) { - // The ByValAlign in the Outs[OIdx].Flags is always set at this point, - // so we don't need to worry whether it's naturally aligned or not. - // See TargetLowering::LowerCallTo(). - Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign(); - ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, - InitialAlign, DL); - if (IsVAArg) - VAOffset = alignTo(VAOffset, ArgAlign); - } else { - ArgAlign = getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL); - } + const Align ArgAlign = [&]() { + if (IsByVal) { + // The ByValAlign in the Outs[OIdx].Flags is always set at this point, + // so we don't need to worry whether it's naturally aligned or not. + // See TargetLowering::LowerCallTo(). + const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign(); + const Align ByValAlign = getFunctionByValParamAlign( + CB->getCalledFunction(), ETy, InitialAlign, DL); + if (IsVAArg) + VAOffset = alignTo(VAOffset, ByValAlign); + return ByValAlign; + } + return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL); + }(); const unsigned TypeSize = DL.getTypeAllocSize(ETy); assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && "type size mismatch"); - const bool PassAsArray = IsByVal || shouldPassAsArray(Arg.Ty); - if (IsVAArg) { - if (ArgI == FirstVAArg) { - VADeclareParam = Chain = - DAG.getNode(NVPTXISD::DeclareParam, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(STI.getMaxRequiredAlignment()), - GetI32(ArgI), GetI32(1), InGlue}); + const std::optional ArgDeclare = [&]() -> std::optional { + if (IsVAArg) { + if (ArgI == FirstVAArg) { + VADeclareParam = DAG.getNode( + NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, + {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()), + GetI32(0), InGlue}); + return VADeclareParam; + } + return std::nullopt; + } + if (IsByVal || shouldPassAsArray(Arg.Ty)) { + // declare .param .align .b8 .param[]; + return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, + {MVT::Other, MVT::Glue}, + {Chain, ParamSymbol, GetI32(ArgAlign.value()), + GetI32(TypeSize), InGlue}); } - } else if (PassAsArray) { - // declare .param .align .b8 .param[]; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(ArgAlign.value()), GetI32(ArgI), - GetI32(TypeSize), InGlue}); - } else { assert(ArgOuts.size() == 1 && "We must pass only one value as non-array"); // declare .param .b .param; @@ -1568,11 +1584,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ? promoteScalarArgumentSize(TypeSize * 8) : TypeSize * 8; - Chain = - DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(ArgI), GetI32(PromotedSize), InGlue}); + return DAG.getNode(NVPTXISD::DeclareScalarParam, dl, + {MVT::Other, MVT::Glue}, + {Chain, ParamSymbol, GetI32(PromotedSize), InGlue}); + }(); + if (ArgDeclare) { + Chain = ArgDeclare->getValue(0); + InGlue = ArgDeclare->getValue(1); } - InGlue = Chain.getValue(1); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter // than 32-bits are sign extended or zero extended, depending on @@ -1594,8 +1613,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else { StVal = ArgOutVals[I]; - if (auto PromotedVT = PromoteScalarIntegerPTX(StVal.getValueType())) { - StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, *PromotedVT, + auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType()); + if (PromotedVT != StVal.getValueType()) { + StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT, StVal); } } @@ -1619,12 +1639,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned J = 0; for (const unsigned NumElts : VectorInfo) { const int CurOffset = Offsets[J]; - EVT EltVT = VTs[J]; + EVT EltVT = promoteScalarIntegerPTX(VTs[J]); const Align PartAlign = commonAlignment(ArgAlign, CurOffset); - if (auto PromotedVT = PromoteScalarIntegerPTX(EltVT)) - EltVT = *PromotedVT; - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a // scalar store. In such cases, fall back to byte stores. if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) { @@ -1695,27 +1712,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } GlobalAddressSDNode *Func = dyn_cast(Callee.getNode()); - MaybeAlign RetAlign = std::nullopt; // Handle Result if (!Ins.empty()) { - RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); - - // Declare - // .param .align N .b8 retval0[], or - // .param .b retval0 - const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); - if (!shouldPassAsArray(RetTy)) { - const unsigned PromotedResultSize = promoteScalarArgumentSize(ResultSize); - Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(PromotedResultSize), InGlue}); - InGlue = Chain.getValue(1); - } else { - Chain = DAG.getNode( - NVPTXISD::DeclareRetParam, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(RetAlign->value()), GetI32(ResultSize / 8), InGlue}); - InGlue = Chain.getValue(1); - } + const SDValue RetDeclare = [&]() { + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); + if (shouldPassAsArray(RetTy)) { + const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); + return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, + {MVT::Other, MVT::Glue}, + {Chain, RetSymbol, GetI32(RetAlign.value()), + GetI32(ResultSize / 8), InGlue}); + } + const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize); + return DAG.getNode( + NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, + {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue}); + }(); + Chain = RetDeclare.getValue(0); + InGlue = RetDeclare.getValue(1); } const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); @@ -1760,7 +1776,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. std::string Proto = - getPrototype(DL, RetTy, Args, CLI.Outs, RetAlign, + getPrototype(DL, RetTy, Args, CLI.Outs, HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB, UniqueCallSite); const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); @@ -1773,11 +1789,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ConvertToIndirectCall) { // Copy the function ptr to a ptx register and use the register to call the // function. - EVT DestVT = Callee.getValueType(); - MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo(); + const MVT DestVT = Callee.getValueType().getSimpleVT(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned DestReg = - RegInfo.createVirtualRegister(TLI.getRegClassFor(DestVT.getSimpleVT())); + Register DestReg = MRI.createVirtualRegister(TLI.getRegClassFor(DestVT)); auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee); Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT); } @@ -1810,7 +1825,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); assert(VTs.size() == Ins.size() && "Bad value decomposition"); - assert(RetAlign && "RetAlign is guaranteed to be set"); + const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -1818,17 +1833,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const bool ExtendIntegerRetVal = RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; - const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, *RetAlign); + const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; for (const unsigned VectorizedSize : VectorInfo) { - EVT TheLoadType = VTs[I]; + EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]); EVT EltType = Ins[I].VT; - const Align EltAlign = commonAlignment(*RetAlign, Offsets[I]); + const Align EltAlign = commonAlignment(RetAlign, Offsets[I]); - if (auto PromotedVT = PromoteScalarIntegerPTX(TheLoadType)) { - TheLoadType = *PromotedVT; - EltType = *PromotedVT; - } + if (TheLoadType != VTs[I]) + EltType = TheLoadType; if (ExtendIntegerRetVal) { TheLoadType = MVT::i32; @@ -1898,13 +1911,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, continue; } - SDValue Ret = DAG.getNode( - NVPTXISD::ProxyReg, dl, - {ProxyRegOps[I].getSimpleValueType(), MVT::Other, MVT::Glue}, - {Chain, ProxyRegOps[I], InGlue}); - - Chain = Ret.getValue(1); - InGlue = Ret.getValue(2); + SDValue Ret = + DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(), + {Chain, ProxyRegOps[I]}); const EVT ExpectedVT = Ins[I].VT; if (!Ret.getValueType().bitsEq(ExpectedVT)) { @@ -1914,14 +1923,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } for (SDValue &T : TempProxyRegOps) { - SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, - {T.getSimpleValueType(), MVT::Other, MVT::Glue}, - {Chain, T.getOperand(0), InGlue}); + SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(), + {Chain, T.getOperand(0)}); DAG.ReplaceAllUsesWith(T, Repl); DAG.RemoveDeadNode(T.getNode()); - - Chain = Repl.getValue(1); - InGlue = Repl.getValue(2); } // set isTailCall to false for now, until we figure out how to express @@ -3293,11 +3298,17 @@ bool NVPTXTargetLowering::splitValueIntoRegisterParts( // Name of the symbol is composed from its index and the function name. // Negative index corresponds to special parameter (unsized array) used for // passing variable arguments. -SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, - EVT v) const { +SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I, + EVT T) const { StringRef SavedStr = nvTM->getStrPool().save( - getParamName(&DAG.getMachineFunction().getFunction(), idx)); - return DAG.getExternalSymbol(SavedStr.data(), v); + getParamName(&DAG.getMachineFunction().getFunction(), I)); + return DAG.getExternalSymbol(SavedStr.data(), T); +} + +SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I, + EVT T) const { + const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I)); + return DAG.getExternalSymbol(SavedStr.data(), T); } SDValue NVPTXTargetLowering::LowerFormalArguments( @@ -3394,8 +3405,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( const unsigned PackingAmt = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - const EVT VecVT = EVT::getVectorVT( - F->getContext(), LoadVT.getScalarType(), NumElts * PackingAmt); + const EVT VecVT = + NumElts == 1 + ? LoadVT + : EVT::getVectorVT(F->getContext(), LoadVT.getScalarType(), + NumElts * PackingAmt); SDValue VecAddr = DAG.getObjectPtrOffset( dl, ArgSymbol, TypeSize::getFixed(Offsets[I])); @@ -3409,22 +3423,16 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (P.getNode()) P.getNode()->setIROrder(Arg.getArgNo() + 1); for (const unsigned J : llvm::seq(NumElts)) { - SDValue Elt = DAG.getNode( - LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR - : ISD::EXTRACT_VECTOR_ELT, - dl, LoadVT, P, DAG.getVectorIdxConstant(J * PackingAmt, dl)); - - // Extend or truncate the element if necessary (e.g. an i8 is loaded - // into an i16 register) - const EVT ExpectedVT = ArgIns[I + J].VT; - assert((Elt.getValueType() == ExpectedVT || - (ExpectedVT.isInteger() && Elt.getValueType().isInteger())) && - "Non-integer argument type size mismatch"); - if (ExpectedVT.bitsGT(Elt.getValueType())) - Elt = DAG.getNode(getExtOpcode(ArgIns[I + J].Flags), dl, ExpectedVT, - Elt); - else if (ExpectedVT.bitsLT(Elt.getValueType())) - Elt = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Elt); + SDValue Elt = + NumElts == 1 + ? P + : DAG.getNode(LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR + : ISD::EXTRACT_VECTOR_ELT, + dl, LoadVT, P, + DAG.getVectorIdxConstant(J * PackingAmt, dl)); + + Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags, + DAG, dl); InVals.push_back(Elt); } I += NumElts; @@ -3467,25 +3475,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const auto GetRetVal = [&](unsigned I) -> SDValue { SDValue RetVal = OutVals[I]; - assert(!PromoteScalarIntegerPTX(RetVal.getValueType()) && + assert(promoteScalarIntegerPTX(RetVal.getValueType()) == + RetVal.getValueType() && "OutVal type should always be legal"); - EVT VTI = VTs[I]; - if (const auto PromotedVT = PromoteScalarIntegerPTX(VTI)) - VTI = *PromotedVT; - + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); const EVT StoreVT = ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); - - assert((RetVal.getValueType() == StoreVT || - (StoreVT.isInteger() && RetVal.getValueType().isInteger())) && - "Non-integer argument type size mismatch"); - if (StoreVT.bitsGT(RetVal.getValueType())) { - RetVal = DAG.getNode(getExtOpcode(Outs[I].Flags), dl, StoreVT, RetVal); - } else if (StoreVT.bitsLT(RetVal.getValueType())) { - RetVal = DAG.getNode(ISD::TRUNCATE, dl, StoreVT, RetVal); - } - return RetVal; + return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl); }; const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL); @@ -3500,7 +3497,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (NumElts == 1) { Val = GetRetVal(I); } else { - SmallVector StoreVals; + SmallVector StoreVals; for (const unsigned J : llvm::seq(NumElts)) { SDValue ValJ = GetRetVal(I + J); if (ValJ.getValueType().isVector()) @@ -3514,7 +3511,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Val = DAG.getBuildVector(VT, dl, StoreVals); } - SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32); + const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32); SDValue Ptr = DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 3a8091fecfde1..2477e1fb61595 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -25,10 +25,15 @@ enum NodeType : unsigned { // Start the numbering from where ISD NodeType finishes. FIRST_NUMBER = ISD::BUILTIN_OP_END, RET_GLUE, - DeclareParam, + + /// These nodes represent a parameter declaration. In PTX this will look like: + /// .param .align 16 .b8 param0[1024]; + /// .param .b32 retval0; + /// + /// DeclareArrayParam(Chain, Externalsym, Align, Size, Glue) + /// DeclareScalarParam(Chain, Externalsym, Size, Glue) DeclareScalarParam, - DeclareRetParam, - DeclareRet, + DeclareArrayParam, /// This node represents a PTX call instruction. It's operands are as follows: /// @@ -174,7 +179,6 @@ class NVPTXTargetLowering : public TargetLowering { std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl &, - MaybeAlign RetAlign, std::optional FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const; @@ -272,8 +276,8 @@ class NVPTXTargetLowering : public TargetLowering { const NVPTXSubtarget &STI; // cache the subtarget here mutable unsigned GlobalUniqueCallSite; - SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; - + SDValue getParamSymbol(SelectionDAG &DAG, int I, EVT T) const; + SDValue getCallParamSymbol(SelectionDAG &DAG, int I, EVT T) const; SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 1a2515b7f66f3..441ddeeb7d667 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1990,9 +1990,9 @@ defm FSetNE : FSET_FORMAT; defm FSetNUM : FSET_FORMAT; defm FSetNAN : FSET_FORMAT; -def SDTDeclareParamProfile : +def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; -def SDTDeclareScalarParamProfile : +def SDTDeclareScalarParam : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; @@ -2001,22 +2001,17 @@ def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; -def SDTProxyRegProfile : SDTypeProfile<1, 1, []>; -def DeclareParam : - SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareScalarParam : - SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareRetParam : - SDNode<"NVPTXISD::DeclareRetParam", - SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, +def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; + + +def declare_array_param : + SDNode<"NVPTXISD::DeclareArrayParam", SDTDeclareArrayParam, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareRet : - SDNode<"NVPTXISD::DeclareRet", - SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>, +def declare_scalar_param : + SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; + def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; @@ -2037,9 +2032,8 @@ def StoreParamV4 : [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; -def ProxyReg : - SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def proxy_reg : + SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, /// NumParams, Callee, Proto, InGlue) @@ -2188,23 +2182,17 @@ defm StoreParamV2F64 : StoreParamV2Inst; defm StoreParamV4F32 : StoreParamV4Inst; -def DeclareRetMemInst : - NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size), - ".param .align $align .b8 retval0[$size];", - [(DeclareRetParam imm:$align, imm:$size)]>; -def DeclareRetScalarInst : - NVPTXInst<(outs), (ins i32imm:$size), - ".param .b$size retval0;", - [(DeclareRet imm:$size)]>; - -def DeclareParamInst : - NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), - ".param .align $align .b8 param$a[$size];", - [(DeclareParam imm:$align, imm:$a, imm:$size)]>; -def DeclareScalarParamInst : +def DECLARE_PARAM_array : + NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), + ".param .align $align .b8 \t$a[$size];", []>; +def DECLARE_PARAM_scalar : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".param .b$size param$a;", - [(DeclareScalarParam imm:$a, imm:$size)]>; + ".param .b$size \t$a;", []>; + +def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size), + (DECLARE_PARAM_array (to_texternsym $a), imm:$align, imm:$size)>; +def : Pat<(declare_scalar_param externalsym:$a, imm:$size), + (DECLARE_PARAM_scalar (to_texternsym $a), imm:$size)>; foreach t = [I32RT, I64RT] in { defvar inst_name = "MOV" # t.Size # "_PARAM"; @@ -2217,7 +2205,7 @@ multiclass ProxyRegInst { def NAME : BasicNVPTXInst<(outs rc:$dst), (ins rc:$src), "mov." # SzStr>; foreach vt = rc.RegTypes in - def : Pat<(vt (ProxyReg vt:$src)), (!cast(NAME) $src)>; + def : Pat<(vt (proxy_reg vt:$src)), (!cast(NAME) $src)>; } defm ProxyRegB1 : ProxyRegInst<"pred", B1>; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 65a077d67e4ba..c99860cc5cc1b 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB0_1; ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -149,17 +149,17 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -177,7 +177,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -195,17 +195,17 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -223,7 +223,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -241,17 +241,17 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop @@ -269,7 +269,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -287,18 +287,18 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop @@ -316,7 +316,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -334,18 +334,18 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop @@ -363,7 +363,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -381,18 +381,18 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -410,7 +410,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -428,17 +428,17 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -456,7 +456,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -474,17 +474,17 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop @@ -502,7 +502,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -520,17 +520,17 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop @@ -548,7 +548,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -566,17 +566,17 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop @@ -594,7 +594,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -612,17 +612,17 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -640,7 +640,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -658,17 +658,17 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop @@ -686,7 +686,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -704,18 +704,18 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop @@ -733,7 +733,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -751,18 +751,18 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop @@ -780,7 +780,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -798,18 +798,18 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -827,7 +827,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -845,18 +845,18 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -873,7 +873,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -891,18 +891,18 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop @@ -919,7 +919,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -937,18 +937,18 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop @@ -965,7 +965,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -983,18 +983,18 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop @@ -1012,7 +1012,7 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1030,18 +1030,18 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1059,7 +1059,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1077,18 +1077,18 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop @@ -1106,7 +1106,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1124,18 +1124,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop @@ -1153,7 +1153,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1171,18 +1171,18 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop @@ -1200,7 +1200,7 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1218,18 +1218,18 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1247,7 +1247,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1265,18 +1265,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1294,7 +1294,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1312,18 +1312,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop @@ -1341,7 +1341,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1359,18 +1359,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop @@ -1388,7 +1388,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1406,18 +1406,18 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop @@ -1435,7 +1435,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB30_1; ; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1453,18 +1453,18 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1482,7 +1482,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1500,18 +1500,18 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop @@ -1529,7 +1529,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB32_1; ; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1547,18 +1547,18 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop @@ -1576,7 +1576,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB33_1; ; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1594,18 +1594,18 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop @@ -1623,7 +1623,7 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB34_1; ; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1641,18 +1641,18 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1670,7 +1670,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB35_1; ; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1688,18 +1688,18 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1717,7 +1717,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB36_1; ; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1735,18 +1735,18 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop @@ -1764,7 +1764,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB37_1; ; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1782,18 +1782,18 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop @@ -1811,7 +1811,7 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB38_1; ; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1829,18 +1829,18 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop @@ -1858,7 +1858,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB39_1; ; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1876,18 +1876,18 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1905,7 +1905,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB40_1; ; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1923,18 +1923,18 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop @@ -1952,7 +1952,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB41_1; ; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1970,18 +1970,18 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop @@ -1999,7 +1999,7 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB42_1; ; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -2017,18 +2017,18 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop @@ -2046,7 +2046,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -2064,18 +2064,18 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2093,7 +2093,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB44_1; ; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 7107fbcf6eb54..68de517f65bb9 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -149,17 +149,17 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -177,7 +177,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -195,17 +195,17 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -223,7 +223,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -241,17 +241,17 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop @@ -269,7 +269,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -287,18 +287,18 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop @@ -316,7 +316,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -334,18 +334,18 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop @@ -363,7 +363,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -381,18 +381,18 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -410,7 +410,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -428,17 +428,17 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -456,7 +456,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -474,17 +474,17 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop @@ -502,7 +502,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -520,17 +520,17 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop @@ -548,7 +548,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -566,17 +566,17 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop @@ -594,7 +594,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -612,17 +612,17 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -640,7 +640,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -658,17 +658,17 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop @@ -686,7 +686,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -704,18 +704,18 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop @@ -733,7 +733,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -751,18 +751,18 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop @@ -780,7 +780,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -798,18 +798,18 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -827,7 +827,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -845,18 +845,18 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -873,7 +873,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -891,18 +891,18 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop @@ -919,7 +919,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -937,18 +937,18 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop @@ -965,7 +965,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -983,18 +983,18 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop @@ -1012,7 +1012,7 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB21_1; ; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1030,18 +1030,18 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1059,7 +1059,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1077,18 +1077,18 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop @@ -1106,7 +1106,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1124,18 +1124,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop @@ -1153,7 +1153,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB24_1; ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1171,18 +1171,18 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop @@ -1200,7 +1200,7 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1218,18 +1218,18 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1247,7 +1247,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1265,18 +1265,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1294,7 +1294,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1312,18 +1312,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop @@ -1341,7 +1341,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1359,18 +1359,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop @@ -1388,7 +1388,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1406,18 +1406,18 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop @@ -1435,7 +1435,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB30_1; ; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1453,18 +1453,18 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1482,7 +1482,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB31_1; ; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1500,18 +1500,18 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop @@ -1529,7 +1529,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB32_1; ; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1547,18 +1547,18 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop @@ -1576,7 +1576,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB33_1; ; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1594,18 +1594,18 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop @@ -1623,7 +1623,7 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB34_1; ; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1641,18 +1641,18 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1670,7 +1670,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB35_1; ; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1688,18 +1688,18 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1717,7 +1717,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB36_1; ; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1735,18 +1735,18 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop @@ -1764,7 +1764,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB37_1; ; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1782,18 +1782,18 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop @@ -1811,7 +1811,7 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: @%p2 bra $L__BB38_1; ; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1829,18 +1829,18 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop @@ -1858,7 +1858,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB39_1; ; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1876,18 +1876,18 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1905,7 +1905,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB40_1; ; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1923,18 +1923,18 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop @@ -1952,7 +1952,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB41_1; ; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1970,18 +1970,18 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop @@ -1999,7 +1999,7 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB42_1; ; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -2017,18 +2017,18 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop @@ -2046,7 +2046,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB43_1; ; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -2064,18 +2064,18 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2093,7 +2093,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB44_1; ; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index f289c3cf3d509..e20f988577282 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB0_1; ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -149,17 +149,17 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -177,7 +177,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -195,17 +195,17 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -223,7 +223,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -241,17 +241,17 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop @@ -269,7 +269,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -287,18 +287,18 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop @@ -316,7 +316,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -334,18 +334,18 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop @@ -363,7 +363,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -381,18 +381,18 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -410,7 +410,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -428,17 +428,17 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -456,7 +456,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -474,17 +474,17 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop @@ -502,7 +502,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -520,17 +520,17 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop @@ -548,7 +548,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -566,17 +566,17 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop @@ -594,7 +594,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -612,17 +612,17 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -640,7 +640,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -658,17 +658,17 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop @@ -686,7 +686,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -704,18 +704,18 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop @@ -733,7 +733,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -751,18 +751,18 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop @@ -780,7 +780,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -798,18 +798,18 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -827,7 +827,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -845,18 +845,18 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -873,7 +873,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -891,18 +891,18 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop @@ -919,7 +919,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -937,18 +937,18 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop @@ -965,7 +965,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -983,18 +983,18 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop @@ -1012,7 +1012,7 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1030,18 +1030,18 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1059,7 +1059,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1077,18 +1077,18 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop @@ -1106,7 +1106,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -1124,18 +1124,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop @@ -1153,7 +1153,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1171,18 +1171,18 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop @@ -1200,7 +1200,7 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1218,18 +1218,18 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1247,7 +1247,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -1265,18 +1265,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1294,7 +1294,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1312,18 +1312,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop @@ -1341,7 +1341,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1359,18 +1359,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop @@ -1388,7 +1388,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -1406,18 +1406,18 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop @@ -1435,7 +1435,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB30_1; ; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1453,18 +1453,18 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1482,7 +1482,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB31_1; ; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1500,18 +1500,18 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop @@ -1529,7 +1529,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB32_1; ; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1547,18 +1547,18 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop @@ -1576,7 +1576,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB33_1; ; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1594,18 +1594,18 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop @@ -1623,7 +1623,7 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB34_1; ; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1641,18 +1641,18 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1670,7 +1670,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB35_1; ; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -1688,18 +1688,18 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1717,7 +1717,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB36_1; ; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1735,18 +1735,18 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop @@ -1764,7 +1764,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB37_1; ; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1782,18 +1782,18 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop @@ -1811,7 +1811,7 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB38_1; ; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -1829,18 +1829,18 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop @@ -1858,7 +1858,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB39_1; ; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1876,18 +1876,18 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1905,7 +1905,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB40_1; ; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1923,18 +1923,18 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop @@ -1952,7 +1952,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB41_1; ; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -1970,18 +1970,18 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop @@ -1999,7 +1999,7 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB42_1; ; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -2017,18 +2017,18 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop @@ -2046,7 +2046,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB43_1; ; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -2064,18 +2064,18 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2093,7 +2093,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB44_1; ; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 9eeff9d7c2b75..85414a2ab04e8 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -21,17 +21,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -48,7 +48,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: mov.b32 %r20, %r8; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i8( @@ -62,17 +62,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -89,7 +89,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i8( ; SM90: { @@ -147,17 +147,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -175,7 +175,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: acquire_sys_i8( @@ -189,17 +189,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -217,7 +217,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i8( ; SM90: { @@ -276,18 +276,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -304,7 +304,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: mov.b32 %r20, %r8; ; SM30-NEXT: @%p2 bra $L__BB2_1; ; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: release_sys_i8( @@ -318,18 +318,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -346,7 +346,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i8( ; SM90: { @@ -405,18 +405,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -434,7 +434,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB3_1; ; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: acq_rel_sys_i8( @@ -448,18 +448,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -477,7 +477,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i8( ; SM90: { @@ -537,18 +537,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -566,7 +566,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB4_1; ; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: seq_cst_sys_i8( @@ -580,18 +580,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -609,7 +609,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i8( ; SM90: { diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll index 8a0c0f8c3b452..a2fc8da3f1e61 100644 --- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -24,11 +24,11 @@ define i16 @cvt_i16_i32(i32 %x) { define i16 @cvt_i16_i64(i64 %x) { ; CHECK-LABEL: cvt_i16_i64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [cvt_i16_i64_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; +; CHECK-NEXT: ld.param.b16 %r1, [cvt_i16_i64_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %a = trunc i64 %x to i16 ret i16 %a diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index b1eadf381d3b4..f37777ab954e2 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -158,27 +158,24 @@ define i16 @test_v8i8(i64 %a) { ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<16>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_v8i8_param_0]; -; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; } -; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs3, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs6, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs7, %r9; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs8, %r10; ; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2; ; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4; diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index 0a2cd81ac904c..321a6240df098 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -121,7 +121,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-LABEL: grid_const_escape( ; PTX: { -; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b32 %r<2>; ; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: @@ -153,7 +153,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: .local .align 4 .b8 __local_depot4[4]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; -; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg .b64 %rd<8>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: @@ -255,7 +255,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escape( ; PTX: { -; PTX-NEXT: .reg .b32 %r<5>; +; PTX-NEXT: .reg .b32 %r<4>; ; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: @@ -295,7 +295,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escapemem( ; PTX: { -; PTX-NEXT: .reg .b32 %r<6>; +; PTX-NEXT: .reg .b32 %r<5>; ; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 6f334b075241b..c165de7ffff03 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -31,7 +31,7 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 % ; PTX-LABEL: load_alignment( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; -; PTX-NEXT: .reg .b64 %rd<8>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0; @@ -76,7 +76,7 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { ; ; PTX-LABEL: load_padding( ; PTX: { -; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, load_padding_param_0; diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index 2e9eb6913ac0e..8401f457418d1 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -8,7 +8,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-LABEL: wombat( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<11>; -; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb ; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2]; @@ -27,11 +27,11 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; ; CHECK-NEXT: or.b32 %r8, %r4, %r7; ; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8; -; CHECK-NEXT: cvt.rn.f64.s32 %rd3, %r9; -; CHECK-NEXT: cvt.rn.f64.u32 %rd4, %r10; -; CHECK-NEXT: add.rn.f64 %rd5, %rd4, %rd3; -; CHECK-NEXT: mov.b64 %rd6, 0; -; CHECK-NEXT: st.global.b64 [%rd6], %rd5; +; CHECK-NEXT: cvt.rn.f64.s32 %rd2, %r9; +; CHECK-NEXT: cvt.rn.f64.u32 %rd3, %r10; +; CHECK-NEXT: add.rn.f64 %rd4, %rd3, %rd2; +; CHECK-NEXT: mov.b64 %rd5, 0; +; CHECK-NEXT: st.global.b64 [%rd5], %rd4; ; CHECK-NEXT: mov.b32 %r10, 1; ; CHECK-NEXT: bra.uni $L__BB0_1; bb: diff --git a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll index 3096b953e8d3a..160511387652c 100644 --- a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll +++ b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=nvptx64 -verify-machineinstrs < %s | FileCheck %s ; RUN: %if ptxas %{ llc -mtriple=nvptx64 -verify-machineinstrs < %s | %ptxas-verify %} @@ -10,9 +11,19 @@ ; value will be identical regardless of the boolean representation. ; Check that the optimization triggers in this case. -; CHECK-LABEL: @pow2_mask_cmp -; CHECK: bfe.u32 {{%r[0-9]+}}, {{%r[0-9]+}}, 3, 1 define i32 @pow2_mask_cmp(i32 %x) { +; CHECK-LABEL: pow2_mask_cmp( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [pow2_mask_cmp_param_0]; +; CHECK-NEXT: shr.u16 %rs2, %rs1, 3; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs2; +; CHECK-NEXT: and.b32 %r2, %r1, 1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %a = and i32 %x, 8 %cmp = icmp ne i32 %a, 0 %r = zext i1 %cmp to i32 diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll index 50d3e8049a947..6aa111932a4a5 100644 --- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll +++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll @@ -445,12 +445,12 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_2]; ; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_2]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_0]; ; CHECK-NEXT: { // callseq 24, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, %rs3}; +; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs3, %rs2, %rs1}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 24 ; CHECK-NEXT: ret; @@ -467,12 +467,12 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_2]; ; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_2]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_0]; ; CHECK-NEXT: { // callseq 25, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, %rs3}; +; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, 2, %rs2, %rs1}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 25 ; CHECK-NEXT: ret; @@ -489,12 +489,12 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_2]; ; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_2]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_0]; ; CHECK-NEXT: { // callseq 26, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, %rs3}; +; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, 3, %rs1}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 26 ; CHECK-NEXT: ret; @@ -511,12 +511,12 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_2]; ; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_2]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_0]; ; CHECK-NEXT: { // callseq 27, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, %rs3, 4}; +; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, %rs1, 4}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 27 ; CHECK-NEXT: ret; @@ -533,11 +533,11 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_1]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_0]; ; CHECK-NEXT: { // callseq 28, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, %rs2}; +; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs2, %rs1}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 28 ; CHECK-NEXT: ret; @@ -554,11 +554,11 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_1]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_0]; ; CHECK-NEXT: { // callseq 29, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, %rs2}; +; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, 3, %rs1}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 29 ; CHECK-NEXT: ret; @@ -575,11 +575,11 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_1]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_0]; ; CHECK-NEXT: { // callseq 30, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, 4}; +; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, %rs1, 4}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 30 ; CHECK-NEXT: ret; @@ -596,11 +596,11 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riir_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_1]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riir_param_0]; ; CHECK-NEXT: { // callseq 31, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, %rs2}; +; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, 3, %rs1}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 31 ; CHECK-NEXT: ret; @@ -617,11 +617,11 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_1]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_0]; ; CHECK-NEXT: { // callseq 32, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, 4}; +; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, %rs1, 4}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 32 ; CHECK-NEXT: ret; @@ -638,11 +638,11 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_1]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_0]; ; CHECK-NEXT: { // callseq 33, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, 4}; +; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, %rs1, 3, 4}; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 33 ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll index e4e668018d872..87e46b1505e31 100644 --- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll @@ -103,16 +103,16 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { ; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: cvt.u32.u16 %r16, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r18, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r19, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs3; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r17, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r18, %rs6; ; CHECK-NEXT: st.param.b32 [func_retval0], %r14; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r19; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r18; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r17; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %r16; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; ; CHECK-NEXT: ret; %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) @@ -185,48 +185,48 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { ; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16]; ; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24]; ; CHECK-NEXT: } // callseq 2 -; CHECK-NEXT: cvt.u64.u16 %rd35, %rs3; +; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3; +; CHECK-NEXT: and.b64 %rd34, %rd33, 255; +; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4; ; CHECK-NEXT: and.b64 %rd36, %rd35, 255; -; CHECK-NEXT: cvt.u64.u16 %rd37, %rs4; -; CHECK-NEXT: and.b64 %rd38, %rd37, 255; -; CHECK-NEXT: shl.b64 %rd39, %rd38, 8; -; CHECK-NEXT: or.b64 %rd40, %rd36, %rd39; -; CHECK-NEXT: cvt.u64.u16 %rd41, %rs5; -; CHECK-NEXT: and.b64 %rd42, %rd41, 255; -; CHECK-NEXT: shl.b64 %rd43, %rd42, 16; -; CHECK-NEXT: or.b64 %rd44, %rd40, %rd43; -; CHECK-NEXT: cvt.u64.u16 %rd45, %rs6; -; CHECK-NEXT: and.b64 %rd46, %rd45, 255; -; CHECK-NEXT: shl.b64 %rd47, %rd46, 24; -; CHECK-NEXT: or.b64 %rd48, %rd44, %rd47; -; CHECK-NEXT: cvt.u64.u16 %rd49, %rs7; -; CHECK-NEXT: and.b64 %rd50, %rd49, 255; -; CHECK-NEXT: shl.b64 %rd51, %rd50, 32; -; CHECK-NEXT: or.b64 %rd52, %rd48, %rd51; -; CHECK-NEXT: cvt.u64.u16 %rd53, %rs8; -; CHECK-NEXT: and.b64 %rd54, %rd53, 255; -; CHECK-NEXT: shl.b64 %rd55, %rd54, 40; -; CHECK-NEXT: or.b64 %rd56, %rd52, %rd55; -; CHECK-NEXT: cvt.u64.u16 %rd57, %rs9; -; CHECK-NEXT: and.b64 %rd58, %rd57, 255; -; CHECK-NEXT: shl.b64 %rd59, %rd58, 48; -; CHECK-NEXT: or.b64 %rd60, %rd56, %rd59; -; CHECK-NEXT: cvt.u64.u16 %rd61, %rs10; -; CHECK-NEXT: shl.b64 %rd62, %rd61, 56; -; CHECK-NEXT: or.b64 %rd63, %rd60, %rd62; +; CHECK-NEXT: shl.b64 %rd37, %rd36, 8; +; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37; +; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5; +; CHECK-NEXT: and.b64 %rd40, %rd39, 255; +; CHECK-NEXT: shl.b64 %rd41, %rd40, 16; +; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41; +; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6; +; CHECK-NEXT: and.b64 %rd44, %rd43, 255; +; CHECK-NEXT: shl.b64 %rd45, %rd44, 24; +; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45; +; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7; +; CHECK-NEXT: and.b64 %rd48, %rd47, 255; +; CHECK-NEXT: shl.b64 %rd49, %rd48, 32; +; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49; +; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8; +; CHECK-NEXT: and.b64 %rd52, %rd51, 255; +; CHECK-NEXT: shl.b64 %rd53, %rd52, 40; +; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53; +; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9; +; CHECK-NEXT: and.b64 %rd56, %rd55, 255; +; CHECK-NEXT: shl.b64 %rd57, %rd56, 48; +; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57; +; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10; +; CHECK-NEXT: shl.b64 %rd60, %rd59, 56; +; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd31; ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd45; -; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd41; -; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd37; -; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd35; -; CHECK-NEXT: shr.u64 %rd64, %rd52, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43; +; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39; +; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35; +; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33; +; CHECK-NEXT: shr.u64 %rd64, %rd50, 32; ; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64; -; CHECK-NEXT: shr.u64 %rd65, %rd56, 40; +; CHECK-NEXT: shr.u64 %rd65, %rd54, 40; ; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65; -; CHECK-NEXT: shr.u64 %rd66, %rd60, 48; +; CHECK-NEXT: shr.u64 %rd66, %rd58, 48; ; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66; -; CHECK-NEXT: shr.u64 %rd67, %rd63, 56; +; CHECK-NEXT: shr.u64 %rd67, %rd61, 56; ; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67; ; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32; ; CHECK-NEXT: ret; @@ -317,16 +317,16 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; ; CHECK-NEXT: } // callseq 4 -; CHECK-NEXT: cvt.u32.u16 %r16, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r18, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r19, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs3; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r17, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r18, %rs6; ; CHECK-NEXT: st.param.b32 [func_retval0], %r14; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r19; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r18; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r17; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %r16; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; ; CHECK-NEXT: ret; %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) @@ -376,16 +376,16 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; ; CHECK-NEXT: } // callseq 5 -; CHECK-NEXT: cvt.u32.u16 %r16, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r18, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r19, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs3; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r17, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r18, %rs6; ; CHECK-NEXT: st.param.b32 [func_retval0], %r14; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r19; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r18; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r17; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %r16; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; ; CHECK-NEXT: ret; %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) @@ -458,48 +458,48 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) { ; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16]; ; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24]; ; CHECK-NEXT: } // callseq 6 -; CHECK-NEXT: cvt.u64.u16 %rd35, %rs3; +; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3; +; CHECK-NEXT: and.b64 %rd34, %rd33, 255; +; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4; ; CHECK-NEXT: and.b64 %rd36, %rd35, 255; -; CHECK-NEXT: cvt.u64.u16 %rd37, %rs4; -; CHECK-NEXT: and.b64 %rd38, %rd37, 255; -; CHECK-NEXT: shl.b64 %rd39, %rd38, 8; -; CHECK-NEXT: or.b64 %rd40, %rd36, %rd39; -; CHECK-NEXT: cvt.u64.u16 %rd41, %rs5; -; CHECK-NEXT: and.b64 %rd42, %rd41, 255; -; CHECK-NEXT: shl.b64 %rd43, %rd42, 16; -; CHECK-NEXT: or.b64 %rd44, %rd40, %rd43; -; CHECK-NEXT: cvt.u64.u16 %rd45, %rs6; -; CHECK-NEXT: and.b64 %rd46, %rd45, 255; -; CHECK-NEXT: shl.b64 %rd47, %rd46, 24; -; CHECK-NEXT: or.b64 %rd48, %rd44, %rd47; -; CHECK-NEXT: cvt.u64.u16 %rd49, %rs7; -; CHECK-NEXT: and.b64 %rd50, %rd49, 255; -; CHECK-NEXT: shl.b64 %rd51, %rd50, 32; -; CHECK-NEXT: or.b64 %rd52, %rd48, %rd51; -; CHECK-NEXT: cvt.u64.u16 %rd53, %rs8; -; CHECK-NEXT: and.b64 %rd54, %rd53, 255; -; CHECK-NEXT: shl.b64 %rd55, %rd54, 40; -; CHECK-NEXT: or.b64 %rd56, %rd52, %rd55; -; CHECK-NEXT: cvt.u64.u16 %rd57, %rs9; -; CHECK-NEXT: and.b64 %rd58, %rd57, 255; -; CHECK-NEXT: shl.b64 %rd59, %rd58, 48; -; CHECK-NEXT: or.b64 %rd60, %rd56, %rd59; -; CHECK-NEXT: cvt.u64.u16 %rd61, %rs10; -; CHECK-NEXT: shl.b64 %rd62, %rd61, 56; -; CHECK-NEXT: or.b64 %rd63, %rd60, %rd62; +; CHECK-NEXT: shl.b64 %rd37, %rd36, 8; +; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37; +; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5; +; CHECK-NEXT: and.b64 %rd40, %rd39, 255; +; CHECK-NEXT: shl.b64 %rd41, %rd40, 16; +; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41; +; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6; +; CHECK-NEXT: and.b64 %rd44, %rd43, 255; +; CHECK-NEXT: shl.b64 %rd45, %rd44, 24; +; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45; +; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7; +; CHECK-NEXT: and.b64 %rd48, %rd47, 255; +; CHECK-NEXT: shl.b64 %rd49, %rd48, 32; +; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49; +; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8; +; CHECK-NEXT: and.b64 %rd52, %rd51, 255; +; CHECK-NEXT: shl.b64 %rd53, %rd52, 40; +; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53; +; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9; +; CHECK-NEXT: and.b64 %rd56, %rd55, 255; +; CHECK-NEXT: shl.b64 %rd57, %rd56, 48; +; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57; +; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10; +; CHECK-NEXT: shl.b64 %rd60, %rd59, 56; +; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd31; ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd45; -; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd41; -; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd37; -; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd35; -; CHECK-NEXT: shr.u64 %rd64, %rd52, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43; +; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39; +; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35; +; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33; +; CHECK-NEXT: shr.u64 %rd64, %rd50, 32; ; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64; -; CHECK-NEXT: shr.u64 %rd65, %rd56, 40; +; CHECK-NEXT: shr.u64 %rd65, %rd54, 40; ; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65; -; CHECK-NEXT: shr.u64 %rd66, %rd60, 48; +; CHECK-NEXT: shr.u64 %rd66, %rd58, 48; ; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66; -; CHECK-NEXT: shr.u64 %rd67, %rd63, 56; +; CHECK-NEXT: shr.u64 %rd67, %rd61, 56; ; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67; ; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 167d7faafe5b3..ad2e7044e93bc 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -348,7 +348,7 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24]; ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; -; CHECK-PTX-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-NEXT: .reg .b32 %r<2>; ; CHECK-PTX-NEXT: .reg .b64 %rd<8>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry