diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 2637b9fab0d50..a683726facd0c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -678,11 +678,8 @@ static bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F) { LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n"); for (Argument &Arg : F.args()) { - if (Arg.getType()->isPointerTy()) { - if (Arg.hasByValAttr()) - handleByValParam(TM, &Arg); - else if (TM.getDrvInterface() == NVPTX::CUDA) - markPointerAsGlobal(&Arg); + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) { + handleByValParam(TM, &Arg); } else if (Arg.getType()->isIntegerTy() && TM.getDrvInterface() == NVPTX::CUDA) { HandleIntToPtr(Arg); @@ -699,10 +696,9 @@ static bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F) { cast(TM.getSubtargetImpl()->getTargetLowering()); for (Argument &Arg : F.args()) - if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) { - markPointerAsAS(&Arg, ADDRESS_SPACE_LOCAL); + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) adjustByValArgAlignment(&Arg, &Arg, TLI); - } + return true; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index a89ca3037c7ff..e359735c20750 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -599,6 +599,21 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const { if (isa(V)) return ADDRESS_SPACE_LOCAL; + if (const Argument *Arg = dyn_cast(V)) { + if (isKernelFunction(*Arg->getParent())) { + const NVPTXTargetMachine &TM = + static_cast(getTLI()->getTargetMachine()); + if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr()) + return ADDRESS_SPACE_GLOBAL; + } else { + // We assume that all device parameters that are passed byval will be + // placed in the local AS. Very simple cases will be updated after ISel to + // use the device param space where possible. + if (Arg->hasByValAttr()) + return ADDRESS_SPACE_LOCAL; + } + } + return -1; } diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 73a3f5e4d3694..b65a08be75640 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -305,10 +305,16 @@ static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL, } // Returns true if V is an address expression. -// TODO: Currently, we consider only phi, bitcast, addrspacecast, and -// getelementptr operators. +// TODO: Currently, we only consider: +// - arguments +// - phi, bitcast, addrspacecast, and getelementptr operators static bool isAddressExpression(const Value &V, const DataLayout &DL, const TargetTransformInfo *TTI) { + + if (const Argument *Arg = dyn_cast(&V)) + return Arg->getType()->isPointerTy() && + TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace; + const Operator *Op = dyn_cast(&V); if (!Op) return false; @@ -341,6 +347,9 @@ static bool isAddressExpression(const Value &V, const DataLayout &DL, static SmallVector getPointerOperands(const Value &V, const DataLayout &DL, const TargetTransformInfo *TTI) { + if (isa(&V)) + return {}; + const Operator &Op = cast(V); switch (Op.getOpcode()) { case Instruction::PHI: { @@ -505,13 +514,11 @@ void InferAddressSpacesImpl::appendsFlatAddressExpressionToPostorderStack( if (Visited.insert(V).second) { PostorderStack.emplace_back(V, false); - Operator *Op = cast(V); - for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) { - if (ConstantExpr *CE = dyn_cast(Op->getOperand(I))) { - if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second) - PostorderStack.emplace_back(CE, false); - } - } + if (auto *Op = dyn_cast(V)) + for (auto &O : Op->operands()) + if (ConstantExpr *CE = dyn_cast(O)) + if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second) + PostorderStack.emplace_back(CE, false); } } } @@ -828,6 +835,18 @@ Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace( assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace && isAddressExpression(*V, *DL, TTI)); + if (auto *Arg = dyn_cast(V)) { + // Arguments are address space casted in the function body, as we do not + // want to change the function signature. + Function *F = Arg->getParent(); + BasicBlock::iterator Insert = F->getEntryBlock().getFirstNonPHIIt(); + + Type *NewPtrTy = PointerType::get(Arg->getContext(), NewAddrSpace); + auto *NewI = new AddrSpaceCastInst(Arg, NewPtrTy); + NewI->insertBefore(Insert); + return NewI; + } + if (Instruction *I = dyn_cast(V)) { Value *NewV = cloneInstructionWithNewAddressSpace( I, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, PoisonUsesToFix); @@ -966,8 +985,12 @@ bool InferAddressSpacesImpl::updateAddressSpace( // of all its pointer operands. unsigned NewAS = UninitializedAddressSpace; - const Operator &Op = cast(V); - if (Op.getOpcode() == Instruction::Select) { + // isAddressExpression should guarantee that V is an operator or an argument. + assert(isa(V) || isa(V)); + + if (isa(V) && + cast(V).getOpcode() == Instruction::Select) { + const Operator &Op = cast(V); Value *Src0 = Op.getOperand(1); Value *Src1 = Op.getOperand(2); @@ -1258,7 +1281,7 @@ void InferAddressSpacesImpl::performPointerReplacement( } // Otherwise, replaces the use with flat(NewV). - if (Instruction *VInst = dyn_cast(V)) { + if (isa(V) || isa(NewV)) { // Don't create a copy of the original addrspacecast. if (U == V && isa(V)) return; @@ -1268,7 +1291,7 @@ void InferAddressSpacesImpl::performPointerReplacement( if (Instruction *NewVInst = dyn_cast(NewV)) InsertPos = std::next(NewVInst->getIterator()); else - InsertPos = std::next(VInst->getIterator()); + InsertPos = std::next(cast(V)->getIterator()); while (isa(InsertPos)) ++InsertPos; diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll index f5f1dd9fcf0ea..44ac46db254a7 100644 --- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll +++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll @@ -12,14 +12,14 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) { ; CHECK: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK: ld.param.u64 %rd1, [foo_param_0]; -; CHECK: ld.param.u64 %rd2, [foo_param_1]; -; CHECK: cvta.to.global.u64 %rd3, %rd2; -; CHECK: cvta.to.global.u64 %rd4, %rd1; -; CHECK: ld.global.nc.u8 %rs1, [%rd4]; +; CHECK: cvta.to.global.u64 %rd2, %rd1; +; CHECK: ld.param.u64 %rd3, [foo_param_1]; +; CHECK: cvta.to.global.u64 %rd4, %rd3; +; CHECK: ld.global.nc.u8 %rs1, [%rd2]; ; CHECK: cvt.u32.u8 %r1, %rs1; ; CHECK: add.s32 %r2, %r1, 1; ; CHECK: and.b32 %r3, %r2, 1; -; CHECK: st.global.u32 [%rd3], %r3; +; CHECK: st.global.u32 [%rd4], %r3; ; CHECK: ret; %ld = load i1, ptr %ptr, align 1 %zext = zext i1 %ld to i32 diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index e4e1f40d0d8b2..38b7400696c54 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -12,9 +12,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly ; OPT-LABEL: define dso_local noundef i32 @non_kernel_function( ; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; OPT-NEXT: [[ENTRY:.*:]] -; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(5) -; OPT-NEXT: [[A1:%.*]] = addrspacecast ptr addrspace(5) [[A2]] to ptr -; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr) +; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A]], ptr addrspacecast (ptr addrspace(1) @gi to ptr) ; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64 ; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]] ; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1 @@ -74,12 +72,10 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_int( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr ; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] -; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4 +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4 ; OPT-NEXT: ret void %tmp = load i32, ptr %input1, align 4 %add = add i32 %tmp, %input2 @@ -105,15 +101,13 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_struct( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 ; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] -; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4 +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4 ; OPT-NEXT: ret void %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 @@ -233,11 +227,9 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) -; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr ; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8 +; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8 ; OPT-NEXT: ret void store ptr %input, ptr %addr, align 8 ret void @@ -263,14 +255,12 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 ; PTX-NOT .local ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1) -; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr ; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) ; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 -; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8 +; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT]], align 8 ; OPT-NEXT: ret void %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 @@ -311,13 +301,11 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape( ; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) ; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 ; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]] -; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4 +; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT]], align 4 ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) ; OPT-NEXT: ret void %val = load i32, ptr %input @@ -361,15 +349,13 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr ; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) ; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4 ; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4 -; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8 +; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT]], align 8 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] ; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) ; OPT-NEXT: ret i32 [[ADD]] @@ -407,11 +393,9 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: @@ -423,7 +407,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; OPT: [[MERGE]]: ; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; OPT-NEXT: ret void %val = load i32, ptr %inout @@ -470,13 +454,11 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) ; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: @@ -488,7 +470,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ; OPT: [[MERGE]]: ; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 @@ -531,17 +513,15 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_select( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) ; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] ; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index a1c0a86e9c4e4..8fa7d5c3e0cbc 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC @@ -17,12 +17,10 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 % ; IR-LABEL: define void @load_alignment( ; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) { ; IR-NEXT: [[ENTRY:.*:]] -; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5) -; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr -; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG1]], align 8 -; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 0, i32 1 +; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG]], align 8 +; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 0, i32 1 ; IR-NEXT: [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8 -; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 1 +; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 1 ; IR-NEXT: [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8 ; IR-NEXT: [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4 ; IR-NEXT: [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]] @@ -77,9 +75,7 @@ entry: define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { ; IR-LABEL: define void @load_padding( ; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) align 4 captures(none) [[ARG:%.*]]) { -; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5) -; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr -; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG1]]) +; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG]]) ; IR-NEXT: ret void ; ; PTX-LABEL: load_padding( @@ -108,21 +104,11 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { ; OpenCL can't make assumptions about incoming pointer, so we should generate ; generic pointers load/store. define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { -; IRC-LABEL: define ptx_kernel void @ptr_generic( -; IRC-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) { -; IRC-NEXT: [[IN3:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; IRC-NEXT: [[IN4:%.*]] = addrspacecast ptr addrspace(1) [[IN3]] to ptr -; IRC-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; IRC-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; IRC-NEXT: [[V:%.*]] = load i32, ptr [[IN4]], align 4 -; IRC-NEXT: store i32 [[V]], ptr [[OUT2]], align 4 -; IRC-NEXT: ret void -; -; IRO-LABEL: define ptx_kernel void @ptr_generic( -; IRO-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) { -; IRO-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4 -; IRO-NEXT: store i32 [[V]], ptr [[OUT]], align 4 -; IRO-NEXT: ret void +; IR-LABEL: define ptx_kernel void @ptr_generic( +; IR-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) { +; IR-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4 +; IR-NEXT: store i32 [[V]], ptr [[OUT]], align 4 +; IR-NEXT: ret void ; ; PTXC-LABEL: ptr_generic( ; PTXC: { diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index cfe934544eb3a..4631732b81ea6 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -1,9 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70 ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-copy-byval-args -S | FileCheck %s --check-prefixes=COMMON,COPY +; RUN: llc < %s -mcpu=sm_60 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_60 +; RUN: llc < %s -mcpu=sm_70 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_70 source_filename = "" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -27,25 +29,13 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 +; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @read_only( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { @@ -57,6 +47,17 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX-LABEL: read_only( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_param_1]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %i = load i32, ptr %s, align 4 store i32 %i, ptr %out, align 4 @@ -65,27 +66,14 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -98,6 +86,17 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX-LABEL: read_only_gep( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_param_1+4]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 %i = load i32, ptr %b, align 4 @@ -107,27 +106,14 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -141,6 +127,17 @@ define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeo ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX-LABEL: read_only_gep_asc( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc_param_1+4]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 %asc = addrspacecast ptr %b to ptr addrspace(101) @@ -151,49 +148,30 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_60-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; SM_60-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_70-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; SM_70-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; COPY-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; COPY-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; COMMON-NEXT: ret void ; +; PTX-LABEL: read_only_gep_asc0( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc0_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 %asc = addrspacecast ptr %b to ptr addrspace(101) @@ -208,23 +186,19 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out ; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr( ; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr( ; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]] ; SM_70-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr( @@ -233,9 +207,36 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out ; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) +; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] ; COPY-NEXT: ret void ; +; PTX-LABEL: escape_ptr( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot4[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot4; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: add.u64 %rd1, %SP, 0; +; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd2+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_param_1]; +; PTX-NEXT: st.local.u32 [%rd2], %r2; +; PTX-NEXT: { // callseq 0, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0], %rd1; +; PTX-NEXT: call.uni +; PTX-NEXT: _Z6escapePv, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: } // callseq 0 +; PTX-NEXT: ret; entry: call void @_Z6escapePv(ptr noundef nonnull %s) #0 ret void @@ -246,25 +247,21 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( ; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( ; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]] ; SM_70-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( @@ -274,9 +271,37 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) ; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] ; COPY-NEXT: ret void ; +; PTX-LABEL: escape_ptr_gep( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot5[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot5; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: add.u64 %rd1, %SP, 0; +; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd2+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_param_1]; +; PTX-NEXT: st.local.u32 [%rd2], %r2; +; PTX-NEXT: add.s64 %rd3, %rd1, 4; +; PTX-NEXT: { // callseq 1, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0], %rd3; +; PTX-NEXT: call.uni +; PTX-NEXT: _Z6escapePv, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: } // callseq 1 +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 call void @_Z6escapePv(ptr noundef nonnull %b) #0 @@ -285,37 +310,36 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_store( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_store( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_store( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: store ptr [[S1]], ptr [[OUT]], align 8 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_store( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8 +; COMMON-NEXT: ret void ; +; PTX-LABEL: escape_ptr_store( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot6[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot6; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: add.u64 %rd3, %SP, 0; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_store_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_store_param_1]; +; PTX-NEXT: st.local.u32 [%rd4], %r2; +; PTX-NEXT: st.global.u64 [%rd2], %rd3; +; PTX-NEXT: ret; entry: store ptr %s, ptr %out, align 8 ret void @@ -323,40 +347,38 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_60-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_70-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COPY-NEXT: store ptr [[B]], ptr [[OUT]], align 8 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8 +; COMMON-NEXT: ret void ; +; PTX-LABEL: escape_ptr_gep_store( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot7[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot7; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_gep_store_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: add.u64 %rd3, %SP, 0; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_store_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_store_param_1]; +; PTX-NEXT: st.local.u32 [%rd4], %r2; +; PTX-NEXT: add.s64 %rd5, %rd3, 4; +; PTX-NEXT: st.global.u64 [%rd2], %rd5; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 store ptr %b, ptr %out, align 8 @@ -365,40 +387,37 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 -; SM_60-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 -; SM_70-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64 -; COPY-NEXT: store i64 [[I]], ptr [[OUT]], align 8 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64 +; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8 +; COMMON-NEXT: ret void ; +; PTX-LABEL: escape_ptrtoint( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot8[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot8; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: add.u64 %rd3, %SP, 0; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptrtoint_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptrtoint_param_1]; +; PTX-NEXT: st.local.u32 [%rd4], %r2; +; PTX-NEXT: st.global.u64 [%rd2], %rd3; +; PTX-NEXT: ret; entry: %i = ptrtoint ptr %s to i64 store i64 %i, ptr %out, align 8 @@ -407,23 +426,12 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -434,6 +442,46 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo ; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) ; COPY-NEXT: ret void ; +; PTX-LABEL: memcpy_from_param( +; PTX: { +; PTX-NEXT: .reg .b16 %rs<17>; +; PTX-NEXT: .reg .b64 %rd<2>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_param_0]; +; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_param_1+15]; +; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; +; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_param_1+14]; +; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; +; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_param_1+13]; +; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; +; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_param_1+12]; +; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; +; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_param_1+11]; +; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; +; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_param_1+10]; +; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; +; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_param_1+9]; +; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; +; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_param_1+8]; +; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; +; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_param_1+7]; +; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; +; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_param_1+6]; +; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; +; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_param_1+5]; +; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; +; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_param_1+4]; +; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; +; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_param_1+3]; +; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; +; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_param_1+2]; +; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; +; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_param_1+1]; +; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; +; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_param_1]; +; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; +; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) ret void @@ -441,23 +489,12 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -468,6 +505,46 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound ; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) ; COPY-NEXT: ret void ; +; PTX-LABEL: memcpy_from_param_noalign( +; PTX: { +; PTX-NEXT: .reg .b16 %rs<17>; +; PTX-NEXT: .reg .b64 %rd<2>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_noalign_param_0]; +; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_noalign_param_1+15]; +; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; +; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_noalign_param_1+14]; +; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; +; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_noalign_param_1+13]; +; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; +; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_noalign_param_1+12]; +; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; +; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_noalign_param_1+11]; +; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; +; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_noalign_param_1+10]; +; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; +; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_noalign_param_1+9]; +; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; +; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_noalign_param_1+8]; +; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; +; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_noalign_param_1+7]; +; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; +; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_noalign_param_1+6]; +; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; +; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_noalign_param_1+5]; +; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; +; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_noalign_param_1+4]; +; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; +; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_noalign_param_1+3]; +; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; +; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_noalign_param_1+2]; +; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; +; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_noalign_param_1+1]; +; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; +; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_noalign_param_1]; +; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; +; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) ret void @@ -475,37 +552,79 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_to_param( -; SM_60-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; SM_60-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_to_param( -; SM_70-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; SM_70-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @memcpy_to_param( -; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true) -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param( +; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true) +; COMMON-NEXT: ret void ; +; PTX-LABEL: memcpy_to_param( +; PTX: { +; PTX-NEXT: .local .align 8 .b8 __local_depot11[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<48>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot11; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [memcpy_to_param_param_0]; +; PTX-NEXT: add.u64 %rd3, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [memcpy_to_param_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd3+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [memcpy_to_param_param_1]; +; PTX-NEXT: st.local.u32 [%rd3], %r2; +; PTX-NEXT: ld.volatile.u8 %rd4, [%rd1]; +; PTX-NEXT: ld.volatile.u8 %rd5, [%rd1+1]; +; PTX-NEXT: shl.b64 %rd6, %rd5, 8; +; PTX-NEXT: or.b64 %rd7, %rd6, %rd4; +; PTX-NEXT: ld.volatile.u8 %rd8, [%rd1+2]; +; PTX-NEXT: shl.b64 %rd9, %rd8, 16; +; PTX-NEXT: ld.volatile.u8 %rd10, [%rd1+3]; +; PTX-NEXT: shl.b64 %rd11, %rd10, 24; +; PTX-NEXT: or.b64 %rd12, %rd11, %rd9; +; PTX-NEXT: or.b64 %rd13, %rd12, %rd7; +; PTX-NEXT: ld.volatile.u8 %rd14, [%rd1+4]; +; PTX-NEXT: ld.volatile.u8 %rd15, [%rd1+5]; +; PTX-NEXT: shl.b64 %rd16, %rd15, 8; +; PTX-NEXT: or.b64 %rd17, %rd16, %rd14; +; PTX-NEXT: ld.volatile.u8 %rd18, [%rd1+6]; +; PTX-NEXT: shl.b64 %rd19, %rd18, 16; +; PTX-NEXT: ld.volatile.u8 %rd20, [%rd1+7]; +; PTX-NEXT: shl.b64 %rd21, %rd20, 24; +; PTX-NEXT: or.b64 %rd22, %rd21, %rd19; +; PTX-NEXT: or.b64 %rd23, %rd22, %rd17; +; PTX-NEXT: shl.b64 %rd24, %rd23, 32; +; PTX-NEXT: or.b64 %rd25, %rd24, %rd13; +; PTX-NEXT: st.volatile.u64 [%SP], %rd25; +; PTX-NEXT: ld.volatile.u8 %rd26, [%rd1+8]; +; PTX-NEXT: ld.volatile.u8 %rd27, [%rd1+9]; +; PTX-NEXT: shl.b64 %rd28, %rd27, 8; +; PTX-NEXT: or.b64 %rd29, %rd28, %rd26; +; PTX-NEXT: ld.volatile.u8 %rd30, [%rd1+10]; +; PTX-NEXT: shl.b64 %rd31, %rd30, 16; +; PTX-NEXT: ld.volatile.u8 %rd32, [%rd1+11]; +; PTX-NEXT: shl.b64 %rd33, %rd32, 24; +; PTX-NEXT: or.b64 %rd34, %rd33, %rd31; +; PTX-NEXT: or.b64 %rd35, %rd34, %rd29; +; PTX-NEXT: ld.volatile.u8 %rd36, [%rd1+12]; +; PTX-NEXT: ld.volatile.u8 %rd37, [%rd1+13]; +; PTX-NEXT: shl.b64 %rd38, %rd37, 8; +; PTX-NEXT: or.b64 %rd39, %rd38, %rd36; +; PTX-NEXT: ld.volatile.u8 %rd40, [%rd1+14]; +; PTX-NEXT: shl.b64 %rd41, %rd40, 16; +; PTX-NEXT: ld.volatile.u8 %rd42, [%rd1+15]; +; PTX-NEXT: shl.b64 %rd43, %rd42, 24; +; PTX-NEXT: or.b64 %rd44, %rd43, %rd41; +; PTX-NEXT: or.b64 %rd45, %rd44, %rd39; +; PTX-NEXT: shl.b64 %rd46, %rd45, 32; +; PTX-NEXT: or.b64 %rd47, %rd46, %rd35; +; PTX-NEXT: st.volatile.u64 [%SP+8], %rd47; +; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) ret void @@ -513,40 +632,22 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @copy_on_store( -; SM_60-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[S3]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @copy_on_store( -; SM_70-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[S3]], align 4 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @copy_on_store( -; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[BB:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4 -; COPY-NEXT: store i32 [[I]], ptr [[S1]], align 4 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @copy_on_store( +; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4 +; COMMON-NEXT: ret void ; +; PTX-LABEL: copy_on_store( +; PTX: { +; PTX-EMPTY: +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %bb +; PTX-NEXT: ret; bb: %i = load i32, ptr %in, align 4 store i32 %i, ptr %s, align 4 @@ -557,8 +658,6 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; SM_60-LABEL: define ptx_kernel void @test_select( ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false) @@ -567,21 +666,19 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define ptx_kernel void @test_select( ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) ; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) ; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 ; SM_70-NEXT: ret void ; ; COPY-LABEL: define ptx_kernel void @test_select( @@ -598,6 +695,48 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX_60-LABEL: test_select( +; PTX_60: { +; PTX_60-NEXT: .reg .pred %p<2>; +; PTX_60-NEXT: .reg .b16 %rs<3>; +; PTX_60-NEXT: .reg .b32 %r<4>; +; PTX_60-NEXT: .reg .b64 %rd<3>; +; PTX_60-EMPTY: +; PTX_60-NEXT: // %bb.0: // %bb +; PTX_60-NEXT: ld.param.u8 %rs1, [test_select_param_3]; +; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_60-NEXT: ld.param.u64 %rd1, [test_select_param_2]; +; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX_60-NEXT: ld.param.u32 %r1, [test_select_param_1]; +; PTX_60-NEXT: ld.param.u32 %r2, [test_select_param_0]; +; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1; +; PTX_60-NEXT: st.global.u32 [%rd2], %r3; +; PTX_60-NEXT: ret; +; +; PTX_70-LABEL: test_select( +; PTX_70: { +; PTX_70-NEXT: .reg .pred %p<2>; +; PTX_70-NEXT: .reg .b16 %rs<3>; +; PTX_70-NEXT: .reg .b32 %r<2>; +; PTX_70-NEXT: .reg .b64 %rd<10>; +; PTX_70-EMPTY: +; PTX_70-NEXT: // %bb.0: // %bb +; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3]; +; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0; +; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2]; +; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1; +; PTX_70-NEXT: mov.b64 %rd5, %rd4; +; PTX_70-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX_70-NEXT: mov.b64 %rd7, %rd1; +; PTX_70-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX_70-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; +; PTX_70-NEXT: ld.u32 %r1, [%rd9]; +; PTX_70-NEXT: st.global.u32 [%rd3], %r1; +; PTX_70-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 %valloaded = load i32, ptr %ptrnew, align 4 @@ -606,49 +745,45 @@ bb: } define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { -; SM_60-LABEL: define ptx_kernel void @test_select_write( -; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { -; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr -; SM_60-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; SM_60-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) -; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) -; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] -; SM_60-NEXT: store i32 1, ptr [[PTRNEW]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define ptx_kernel void @test_select_write( -; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { -; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr -; SM_70-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; SM_70-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) -; SM_70-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; SM_70-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) -; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] -; SM_70-NEXT: store i32 1, ptr [[PTRNEW]], align 4 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define ptx_kernel void @test_select_write( -; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { -; COPY-NEXT: [[BB:.*:]] -; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) -; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) -; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] -; COPY-NEXT: store i32 1, ptr [[PTRNEW]], align 4 -; COPY-NEXT: ret void +; COMMON-LABEL: define ptx_kernel void @test_select_write( +; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) +; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) +; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void ; +; PTX-LABEL: test_select_write( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot14[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b16 %rs<3>; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %bb +; PTX-NEXT: mov.b64 %SPL, __local_depot14; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u8 %rs1, [test_select_write_param_3]; +; PTX-NEXT: and.b16 %rs2, %rs1, 1; +; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX-NEXT: ld.param.u32 %r1, [test_select_write_param_1]; +; PTX-NEXT: st.u32 [%SP], %r1; +; PTX-NEXT: ld.param.u32 %r2, [test_select_write_param_0]; +; PTX-NEXT: st.u32 [%SP+4], %r2; +; PTX-NEXT: add.u64 %rd2, %SPL, 4; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1; +; PTX-NEXT: mov.b32 %r3, 1; +; PTX-NEXT: st.local.u32 [%rd5], %r3; +; PTX-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 store i32 1, ptr %ptrnew, align 4 @@ -659,8 +794,6 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_60-LABEL: define ptx_kernel void @test_phi( ; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) @@ -677,14 +810,12 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_60: [[MERGE]]: ; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define ptx_kernel void @test_phi( ; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) ; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) @@ -699,7 +830,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_70: [[MERGE]]: ; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; SM_70-NEXT: ret void ; ; COPY-LABEL: define ptx_kernel void @test_phi( @@ -724,6 +855,53 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; COPY-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; COPY-NEXT: ret void ; +; PTX_60-LABEL: test_phi( +; PTX_60: { +; PTX_60-NEXT: .reg .pred %p<2>; +; PTX_60-NEXT: .reg .b16 %rs<3>; +; PTX_60-NEXT: .reg .b32 %r<5>; +; PTX_60-NEXT: .reg .b64 %rd<3>; +; PTX_60-EMPTY: +; PTX_60-NEXT: // %bb.0: // %bb +; PTX_60-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; +; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_60-NEXT: ld.param.u64 %rd2, [test_phi_param_2]; +; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2; +; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_0]; +; PTX_60-NEXT: @%p1 bra $L__BB15_2; +; PTX_60-NEXT: // %bb.1: // %second +; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_1+4]; +; PTX_60-NEXT: $L__BB15_2: // %merge +; PTX_60-NEXT: st.global.u32 [%rd1], %r4; +; PTX_60-NEXT: ret; +; +; PTX_70-LABEL: test_phi( +; PTX_70: { +; PTX_70-NEXT: .reg .pred %p<2>; +; PTX_70-NEXT: .reg .b16 %rs<3>; +; PTX_70-NEXT: .reg .b32 %r<2>; +; PTX_70-NEXT: .reg .b64 %rd<12>; +; PTX_70-EMPTY: +; PTX_70-NEXT: // %bb.0: // %bb +; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; +; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_70-NEXT: mov.b64 %rd6, test_phi_param_0; +; PTX_70-NEXT: ld.param.u64 %rd7, [test_phi_param_2]; +; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd7; +; PTX_70-NEXT: mov.b64 %rd10, %rd6; +; PTX_70-NEXT: cvta.param.u64 %rd11, %rd10; +; PTX_70-NEXT: @%p1 bra $L__BB15_2; +; PTX_70-NEXT: // %bb.1: // %second +; PTX_70-NEXT: mov.b64 %rd8, test_phi_param_1; +; PTX_70-NEXT: mov.b64 %rd9, %rd8; +; PTX_70-NEXT: cvta.param.u64 %rd2, %rd9; +; PTX_70-NEXT: add.s64 %rd11, %rd2, 4; +; PTX_70-NEXT: $L__BB15_2: // %merge +; PTX_70-NEXT: ld.u32 %r1, [%rd11]; +; PTX_70-NEXT: st.global.u32 [%rd1], %r1; +; PTX_70-NEXT: ret; bb: br i1 %cond, label %first, label %second @@ -744,7 +922,7 @@ merge: ; preds = %second, %first define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) { ; COMMON-LABEL: define ptx_kernel void @test_phi_write( -; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) @@ -764,6 +942,35 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 ; COMMON-NEXT: ret void ; +; PTX-LABEL: test_phi_write( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot16[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b16 %rs<3>; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %bb +; PTX-NEXT: mov.b64 %SPL, __local_depot16; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u8 %rs1, [test_phi_write_param_2]; +; PTX-NEXT: and.b16 %rs2, %rs1, 1; +; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX-NEXT: add.u64 %rd1, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [test_phi_write_param_1+4]; +; PTX-NEXT: st.u32 [%SP], %r1; +; PTX-NEXT: add.u64 %rd6, %SPL, 4; +; PTX-NEXT: ld.param.u32 %r2, [test_phi_write_param_0]; +; PTX-NEXT: st.u32 [%SP+4], %r2; +; PTX-NEXT: @%p1 bra $L__BB16_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: mov.b64 %rd6, %rd1; +; PTX-NEXT: $L__BB16_2: // %merge +; PTX-NEXT: mov.b32 %r3, 1; +; PTX-NEXT: st.local.u32 [%rd6], %r3; +; PTX-NEXT: ret; bb: br i1 %cond, label %first, label %second diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll index 82301e42f7d06..a257b6cfd77b7 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll @@ -169,19 +169,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 0 // EOM(1) ; CHECK-NEXT: .b8 0 // EOM(2) ; CHECK-NEXT: .b8 6 // Abbreviation Code -; CHECK-NEXT: .b8 5 // DW_TAG_formal_parameter -; CHECK-NEXT: .b8 0 // DW_CHILDREN_no -; CHECK-NEXT: .b8 3 // DW_AT_name -; CHECK-NEXT: .b8 8 // DW_FORM_string -; CHECK-NEXT: .b8 58 // DW_AT_decl_file -; CHECK-NEXT: .b8 11 // DW_FORM_data1 -; CHECK-NEXT: .b8 59 // DW_AT_decl_line -; CHECK-NEXT: .b8 11 // DW_FORM_data1 -; CHECK-NEXT: .b8 73 // DW_AT_type -; CHECK-NEXT: .b8 19 // DW_FORM_ref4 -; CHECK-NEXT: .b8 0 // EOM(1) -; CHECK-NEXT: .b8 0 // EOM(2) -; CHECK-NEXT: .b8 7 // Abbreviation Code ; CHECK-NEXT: .b8 15 // DW_TAG_pointer_type ; CHECK-NEXT: .b8 0 // DW_CHILDREN_no ; CHECK-NEXT: .b8 73 // DW_AT_type @@ -192,12 +179,12 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: } ; CHECK-NEXT: .section .debug_info ; CHECK-NEXT: { -; CHECK-NEXT: .b32 238 // Length of Unit +; CHECK-NEXT: .b32 254 // Length of Unit ; CHECK-NEXT: .b8 2 // DWARF version number ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b32 .debug_abbrev // Offset Into Abbrev. Section ; CHECK-NEXT: .b8 8 // Address Size (in bytes) -; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit +; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0xf7 DW_TAG_compile_unit ; CHECK-NEXT: .b8 99 // DW_AT_producer ; CHECK-NEXT: .b8 108 ; CHECK-NEXT: .b8 97 @@ -307,7 +294,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 9 // DW_AT_location ; CHECK-NEXT: .b8 3 ; CHECK-NEXT: .b64 SHARED -; CHECK-NEXT: .b8 4 // Abbrev [4] 0x90:0x53 DW_TAG_subprogram +; CHECK-NEXT: .b8 4 // Abbrev [4] 0x90:0x63 DW_TAG_subprogram ; CHECK-NEXT: .b64 $L__func_begin0 // DW_AT_low_pc ; CHECK-NEXT: .b64 $L__func_end0 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_frame_base @@ -337,20 +324,36 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 227 // DW_AT_type -; CHECK-NEXT: .b8 6 // Abbrev [6] 0xc0:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 248 // DW_AT_type +; CHECK-NEXT: .b8 5 // Abbrev [5] 0xc0:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 177 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 120 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 236 // DW_AT_type -; CHECK-NEXT: .b8 6 // Abbrev [6] 0xc9:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 243 // DW_AT_type +; CHECK-NEXT: .b8 5 // Abbrev [5] 0xd1:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 179 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 121 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 236 // DW_AT_type -; CHECK-NEXT: .b8 5 // Abbrev [5] 0xd2:0x10 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 243 // DW_AT_type +; CHECK-NEXT: .b8 5 // Abbrev [5] 0xe2:0x10 DW_TAG_formal_parameter ; CHECK-NEXT: .b8 2 // DW_AT_address_class ; CHECK-NEXT: .b8 5 // DW_AT_location ; CHECK-NEXT: .b8 144 @@ -364,7 +367,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 6 // DW_AT_decl_line ; CHECK-NEXT: .b32 111 // DW_AT_type ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 3 // Abbrev [3] 0xe3:0x9 DW_TAG_base_type +; CHECK-NEXT: .b8 6 // Abbrev [6] 0xf3:0x5 DW_TAG_pointer_type +; CHECK-NEXT: .b32 248 // DW_AT_type +; CHECK-NEXT: .b8 3 // Abbrev [3] 0xf8:0x9 DW_TAG_base_type ; CHECK-NEXT: .b8 102 // DW_AT_name ; CHECK-NEXT: .b8 108 ; CHECK-NEXT: .b8 111 @@ -373,8 +378,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 4 // DW_AT_encoding ; CHECK-NEXT: .b8 4 // DW_AT_byte_size -; CHECK-NEXT: .b8 7 // Abbrev [7] 0xec:0x5 DW_TAG_pointer_type -; CHECK-NEXT: .b32 227 // DW_AT_type ; CHECK-NEXT: .b8 0 // End Of Children Mark ; CHECK-NEXT: } ; CHECK-NEXT: .section .debug_macinfo { } diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll index 62b30a1f15aff..fa2925af37971 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -100,8 +100,8 @@ if.end: ; preds = %if.then, %entry ; CHECK: .section .debug_loc ; CHECK-NEXT: { ; CHECK-NEXT: $L__debug_loc0: -; CHECK-NEXT: .b64 $L__tmp8 ; CHECK-NEXT: .b64 $L__tmp10 +; CHECK-NEXT: .b64 $L__tmp12 ; CHECK-NEXT: .b8 5 // Loc expr size ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 144 // DW_OP_regx @@ -112,7 +112,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b64 0 ; CHECK-NEXT: .b64 0 ; CHECK-NEXT: $L__debug_loc1: -; CHECK-NEXT: .b64 $L__tmp5 +; CHECK-NEXT: .b64 $L__tmp7 ; CHECK-NEXT: .b64 $L__func_end0 ; CHECK-NEXT: .b8 5 // Loc expr size ; CHECK-NEXT: .b8 0 @@ -586,12 +586,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: } ; CHECK-NEXT: .section .debug_info ; CHECK-NEXT: { -; CHECK-NEXT: .b32 2388 // Length of Unit +; CHECK-NEXT: .b32 2404 // Length of Unit ; CHECK-NEXT: .b8 2 // DWARF version number ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b32 .debug_abbrev // Offset Into Abbrev. Section ; CHECK-NEXT: .b8 8 // Address Size (in bytes) -; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0x94d DW_TAG_compile_unit +; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0x95d DW_TAG_compile_unit ; CHECK-NEXT: .b8 0 // DW_AT_producer ; CHECK-NEXT: .b8 4 // DW_AT_language ; CHECK-NEXT: .b8 0 @@ -2481,7 +2481,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 4 // DW_AT_byte_size ; CHECK-NEXT: .b8 12 // Abbrev [12] 0x83d:0x5 DW_TAG_pointer_type ; CHECK-NEXT: .b32 2100 // DW_AT_type -; CHECK-NEXT: .b8 23 // Abbrev [23] 0x842:0xd5 DW_TAG_subprogram +; CHECK-NEXT: .b8 23 // Abbrev [23] 0x842:0xe5 DW_TAG_subprogram ; CHECK-NEXT: .b64 $L__func_begin0 // DW_AT_low_pc ; CHECK-NEXT: .b64 $L__func_end0 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_frame_base @@ -2522,7 +2522,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line -; CHECK-NEXT: .b32 2384 // DW_AT_type +; CHECK-NEXT: .b32 2400 // DW_AT_type ; CHECK-NEXT: .b8 25 // Abbrev [25] 0x87d:0xd DW_TAG_formal_parameter ; CHECK-NEXT: .b32 $L__debug_loc0 // DW_AT_location ; CHECK-NEXT: .b8 97 // DW_AT_name @@ -2530,54 +2530,70 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line ; CHECK-NEXT: .b32 2100 // DW_AT_type -; CHECK-NEXT: .b8 22 // Abbrev [22] 0x88a:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 24 // Abbrev [24] 0x88a:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 179 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 120 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line ; CHECK-NEXT: .b32 2109 // DW_AT_type -; CHECK-NEXT: .b8 22 // Abbrev [22] 0x893:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 24 // Abbrev [24] 0x89b:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 180 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 121 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line ; CHECK-NEXT: .b32 2109 // DW_AT_type -; CHECK-NEXT: .b8 26 // Abbrev [26] 0x89c:0xd DW_TAG_variable +; CHECK-NEXT: .b8 26 // Abbrev [26] 0x8ac:0xd DW_TAG_variable ; CHECK-NEXT: .b32 $L__debug_loc1 // DW_AT_location ; CHECK-NEXT: .b8 105 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 2384 // DW_AT_type -; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8a9:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b32 2400 // DW_AT_type +; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8b9:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 691 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp1 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp2 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 6 // DW_AT_call_line ; CHECK-NEXT: .b8 11 // DW_AT_call_column -; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8c1:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8d1:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 1450 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp2 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp5 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 6 // DW_AT_call_line ; CHECK-NEXT: .b8 24 // DW_AT_call_column -; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8d9:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8e9:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 2044 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp5 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp6 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 6 // DW_AT_call_line ; CHECK-NEXT: .b8 37 // DW_AT_call_column -; CHECK-NEXT: .b8 28 // Abbrev [28] 0x8f1:0x25 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b8 28 // Abbrev [28] 0x901:0x25 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 2050 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp9 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp10 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp11 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp12 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 8 // DW_AT_call_line ; CHECK-NEXT: .b8 5 // DW_AT_call_column -; CHECK-NEXT: .b8 29 // Abbrev [29] 0x909:0xc DW_TAG_formal_parameter +; CHECK-NEXT: .b8 29 // Abbrev [29] 0x919:0xc DW_TAG_formal_parameter ; CHECK-NEXT: .b8 2 // DW_AT_address_class ; CHECK-NEXT: .b8 5 // DW_AT_location ; CHECK-NEXT: .b8 144 @@ -2588,17 +2604,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b32 2079 // DW_AT_abstract_origin ; CHECK-NEXT: .b8 0 // End Of Children Mark ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 30 // Abbrev [30] 0x917:0xd DW_TAG_namespace +; CHECK-NEXT: .b8 30 // Abbrev [30] 0x927:0xd DW_TAG_namespace ; CHECK-NEXT: .b8 115 // DW_AT_name ; CHECK-NEXT: .b8 116 ; CHECK-NEXT: .b8 100 ; CHECK-NEXT: .b8 0 -; CHECK-NEXT: .b8 31 // Abbrev [31] 0x91c:0x7 DW_TAG_imported_declaration +; CHECK-NEXT: .b8 31 // Abbrev [31] 0x92c:0x7 DW_TAG_imported_declaration ; CHECK-NEXT: .b8 4 // DW_AT_decl_file ; CHECK-NEXT: .b8 202 // DW_AT_decl_line -; CHECK-NEXT: .b32 2340 // DW_AT_import +; CHECK-NEXT: .b32 2356 // DW_AT_import ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 32 // Abbrev [32] 0x924:0x1b DW_TAG_subprogram +; CHECK-NEXT: .b8 32 // Abbrev [32] 0x934:0x1b DW_TAG_subprogram ; CHECK-NEXT: .b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT: .b8 90 ; CHECK-NEXT: .b8 76 @@ -2614,12 +2630,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 4 // DW_AT_decl_file ; CHECK-NEXT: .b8 44 // DW_AT_decl_line -; CHECK-NEXT: .b32 2367 // DW_AT_type +; CHECK-NEXT: .b32 2383 // DW_AT_type ; CHECK-NEXT: .b8 1 // DW_AT_declaration -; CHECK-NEXT: .b8 7 // Abbrev [7] 0x939:0x5 DW_TAG_formal_parameter -; CHECK-NEXT: .b32 2367 // DW_AT_type +; CHECK-NEXT: .b8 7 // Abbrev [7] 0x949:0x5 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 2383 // DW_AT_type ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 10 // Abbrev [10] 0x93f:0x11 DW_TAG_base_type +; CHECK-NEXT: .b8 10 // Abbrev [10] 0x94f:0x11 DW_TAG_base_type ; CHECK-NEXT: .b8 108 // DW_AT_name ; CHECK-NEXT: .b8 111 ; CHECK-NEXT: .b8 110 @@ -2636,7 +2652,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 5 // DW_AT_encoding ; CHECK-NEXT: .b8 8 // DW_AT_byte_size -; CHECK-NEXT: .b8 10 // Abbrev [10] 0x950:0x7 DW_TAG_base_type +; CHECK-NEXT: .b8 10 // Abbrev [10] 0x960:0x7 DW_TAG_base_type ; CHECK-NEXT: .b8 105 // DW_AT_name ; CHECK-NEXT: .b8 110 ; CHECK-NEXT: .b8 116 diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll new file mode 100644 index 0000000000000..dbd2662de4274 --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + + +define ptx_kernel i32 @test_kernel(ptr %a, ptr byval(i32) %b) { +; CHECK-LABEL: define ptx_kernel i32 @test_kernel( +; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[B]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %v1 = load i32, ptr %a + %v2 = load i32, ptr %b + %sum = add i32 %v1, %v2 + ret i32 %sum +} + +define i32 @test_device(ptr %a, ptr byval(i32) %b) { +; CHECK-LABEL: define i32 @test_device( +; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(5) +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %v1 = load i32, ptr %a + %v2 = load i32, ptr %b + %sum = add i32 %v1, %v2 + ret i32 %sum +}