From fc56a3cd26e7403625f0b98e81f495b06b9d29c0 Mon Sep 17 00:00:00 2001
From: Michael Kuron
Date: Sat, 28 Sep 2024 12:57:43 +0200
Subject: [PATCH 1/3] [NVPTX] Copy kernel arguments as byte array

Ensures that struct padding is not skipped, as it may contain actual
data if the struct is really a union.

Fixes #53710
---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp    | 11 ++-
 llvm/test/CodeGen/NVPTX/lower-args.ll       |  2 +-
 llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 84 +++++++++++++--------
 3 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 082546c4dd72f..7fc6b8949f8a6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -626,10 +626,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
   // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
   // addrspacecast preserves alignment. Since params are constant, this load
   // is definitely not volatile.
+  const auto StructBytes = *AllocA->getAllocationSize(DL);
+  const auto ChunkBytes = (StructBytes % 8 == 0) ? 8 :
+                          (StructBytes % 4 == 0) ? 4 :
+                          (StructBytes % 2 == 0) ? 2 : 1;
+  Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes);
+  Type *OpaqueType = ArrayType::get(ChunkType, StructBytes / ChunkBytes);
   LoadInst *LI =
-      new LoadInst(StructType, ArgInParam, Arg->getName(),
+      new LoadInst(OpaqueType, ArgInParam, Arg->getName(),
                    /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
-  new StoreInst(LI, AllocA, FirstInst);
+  new StoreInst(LI, AllocA,
+                /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
 }
 }
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 029f1944d596b..9a306036044be 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -14,7 +14,7 @@ target triple = "nvptx64-nvidia-cuda"
 ; COMMON-LABEL: load_alignment
 define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
 entry:
-; IR: load %class.outer, ptr addrspace(101)
+; IR: load [3 x i64], ptr addrspace(101)
 ; IR-SAME: align 8
 ; PTX: ld.param.u64
 ; PTX-NOT: ld.param.u8
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index a414a6c41cd5b..5b55e3c5b7280 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -88,8 +88,8 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
 ; COMMON-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT:    [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT:    store [1 x i64] [[S5]], ptr [[S3]], align 4
 ; COMMON-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
 ; COMMON-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
 ; COMMON-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -115,8 +115,8 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
 ; COMMON-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], 
ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) @@ -134,8 +134,8 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -155,8 +155,8 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 @@ -174,8 +174,8 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -195,8 +195,8 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 @@ -232,8 +232,8 @@ define dso_local 
void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) ; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr ; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) @@ -251,8 +251,8 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) ; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr ; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 @@ -273,12 +273,12 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 -; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4 +; SM_60-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4 ; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 -; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4 ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 ; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 @@ -313,12 +313,12 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i ; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 -; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4 +; COMMON-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4 ; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COMMON-NEXT: 
[[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 -; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4 ; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 ; COMMON-NEXT: ret void @@ -337,12 +337,12 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) ; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 -; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8 +; SM_60-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 8 ; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 -; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4 ; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; SM_60: [[FIRST]]: ; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 @@ -402,12 +402,12 @@ define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%str ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 -; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8 +; COMMON-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 8 ; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 -; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4 ; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; COMMON: [[FIRST]]: ; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 @@ -437,6 +437,26 @@ merge: ; preds = %second, %first ret void } +%union.U = type { %struct.P } +%struct.P = type { i8, i32 } + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @padding(ptr nocapture noundef readnone %out, ptr noundef byval(%union.U) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @padding( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[UNION_U:%.*]]) align 4 [[S:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[UNION_U]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S3:%.*]] = load [1 x i64], ptr addrspace(101) [[S2]], align 4 +; COMMON-NEXT: store [1 x i64] [[S3]], ptr [[S1]], align 4 +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) +; COMMON-NEXT: ret void +; +entry: + call void @_Z6escapePv(ptr noundef nonnull %s) #0 + ret void +} + attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" } attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } From 35bcad64b060ab8252ced5c0bf65820b5d0e7f14 Mon Sep 17 00:00:00 2001 From: Michael Kuron Date: Sun, 29 Sep 2024 13:44:13 +0200 Subject: [PATCH 2/3] [NVPTX] Coalesce kernel argument copies --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 25 ++++-- llvm/test/CodeGen/NVPTX/lower-args.ll | 3 +- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 88 ++++++++++++------- .../Inputs/nvptx-basic.ll.expected | 16 ++-- 4 files changed, 83 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 7fc6b8949f8a6..8604cf68e4062 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -623,15 +623,28 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, Value *ArgInParam = new AddrSpaceCastInst( Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM), Arg->getName(), FirstInst); + // Create an opaque type of same size as StructType but without padding + // holes as this could have been a union. + const auto StructBytes = *AllocA->getAllocationSize(DL); + SmallVector ChunkTypes; + if (StructBytes >= 16) { + Type *IntType = Type::getInt64Ty(Func->getContext()); + Type *ChunkType = VectorType::get(IntType, 2, false); + Type *OpaqueType = StructBytes < 32 ? ChunkType : + ArrayType::get(ChunkType, StructBytes / 16); + ChunkTypes.push_back(OpaqueType); + } + for (const auto ChunkBytes: {8, 4, 2, 1}) { + if (StructBytes & ChunkBytes) { + Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes); + ChunkTypes.push_back(ChunkType); + } + } + Type * OpaqueType = ChunkTypes.size() == 1 ? ChunkTypes[0] : + StructType::create(ChunkTypes); // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX // addrspacecast preserves alignment. Since params are constant, this load // is definitely not volatile. - const auto StructBytes = *AllocA->getAllocationSize(DL); - const auto ChunkBytes = (StructBytes % 8 == 0) ? 8 : - (StructBytes % 4 == 0) ? 4 : - (StructBytes % 2 == 0) ? 
2 : 1; - Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes); - Type *OpaqueType = ArrayType::get(ChunkType, StructBytes / ChunkBytes); LoadInst *LI = new LoadInst(OpaqueType, ArgInParam, Arg->getName(), /*isVolatile=*/false, AllocA->getAlign(), FirstInst); diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 9a306036044be..17b04b5a60124 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -7,6 +7,7 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" +; IR: [[OPAQUE_OUTER:.*]] = type { <2 x i64>, i64 } %class.outer = type <{ %class.inner, i32, [4 x i8] }> %class.inner = type { ptr, ptr } @@ -14,7 +15,7 @@ target triple = "nvptx64-nvidia-cuda" ; COMMON-LABEL: load_alignment define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) { entry: -; IR: load [3 x i64], ptr addrspace(101) +; IR: load [[OPAQUE_OUTER]], ptr addrspace(101) ; IR-SAME: align 8 ; PTX: ld.param.u64 ; PTX-NOT: ld.param.u8 diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 5b55e3c5b7280..7dc5df76bd332 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -5,6 +5,7 @@ source_filename = "" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" +; COMMON: [[OPAQUE_C:.*]] = type { [2 x <2 x i64>], i64, i32, i8 } %struct.S = type { i32, i32 } ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) @@ -88,8 +89,8 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -115,8 +116,8 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) @@ -134,8 +135,8 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] 
[[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -155,8 +156,8 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 @@ -174,8 +175,8 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -195,8 +196,8 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 @@ -232,8 +233,8 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) ; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr ; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) @@ -251,8 +252,8 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca 
[[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 ; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) ; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr ; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 @@ -273,12 +274,12 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4 -; SM_60-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 ; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4 -; SM_60-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 ; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 @@ -313,12 +314,12 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i ; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4 -; COMMON-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 ; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4 -; COMMON-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 ; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 ; COMMON-NEXT: ret void @@ -337,12 +338,12 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) ; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8 -; SM_60-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 
8 +; SM_60-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8 +; SM_60-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8 ; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4 -; SM_60-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4 ; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; SM_60: [[FIRST]]: ; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 @@ -402,12 +403,12 @@ define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%str ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8 -; COMMON-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 8 +; COMMON-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8 +; COMMON-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8 ; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4 -; COMMON-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4 ; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; COMMON: [[FIRST]]: ; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 @@ -447,8 +448,27 @@ define dso_local void @padding(ptr nocapture noundef readnone %out, ptr noundef ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S1:%.*]] = alloca [[UNION_U]], align 4 ; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S3:%.*]] = load [1 x i64], ptr addrspace(101) [[S2]], align 4 -; COMMON-NEXT: store [1 x i64] [[S3]], ptr [[S1]], align 4 +; COMMON-NEXT: [[S3:%.*]] = load i64, ptr addrspace(101) [[S2]], align 4 +; COMMON-NEXT: store i64 [[S3]], ptr [[S1]], align 4 +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) +; COMMON-NEXT: ret void +; +entry: + call void @_Z6escapePv(ptr noundef nonnull %s) #0 + ret void +} + +%struct.C = type { [45 x i8] } + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @coalescing(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.C) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @coalescing( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_C:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_C]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S3:%.*]] = load [[OPAQUE_C]], ptr addrspace(101) [[S2]], align 4 +; COMMON-NEXT: store [[OPAQUE_C]] [[S3]], ptr [[S1]], align 4 ; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull 
[[S1]]) ; COMMON-NEXT: ret void ; diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index 3ac63d070933d..bc802363bf46d 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -17,14 +17,14 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: cvta.local.u32 %SP, %SPL; ; CHECK-NEXT: ld.param.u32 %r1, [caller_St8x4_param_1]; ; CHECK-NEXT: add.u32 %r3, %SPL, 0; -; CHECK-NEXT: ld.param.u64 %rd1, [caller_St8x4_param_0+24]; -; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0+16]; -; CHECK-NEXT: ld.param.u64 %rd3, [caller_St8x4_param_0+8]; -; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0]; -; CHECK-NEXT: st.local.u64 [%r3], %rd4; -; CHECK-NEXT: st.local.u64 [%r3+8], %rd3; -; CHECK-NEXT: st.local.u64 [%r3+16], %rd2; -; CHECK-NEXT: st.local.u64 [%r3+24], %rd1; +; CHECK-NEXT: ld.param.u64 %rd1, [caller_St8x4_param_0+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0+24]; +; CHECK-NEXT: ld.param.u64 %rd3, [caller_St8x4_param_0]; +; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0+8]; +; CHECK-NEXT: st.local.u64 [%r3+8], %rd4; +; CHECK-NEXT: st.local.u64 [%r3], %rd3; +; CHECK-NEXT: st.local.u64 [%r3+24], %rd2; +; CHECK-NEXT: st.local.u64 [%r3+16], %rd1; ; CHECK-NEXT: ld.u64 %rd5, [%SP+8]; ; CHECK-NEXT: ld.u64 %rd6, [%SP+0]; ; CHECK-NEXT: ld.u64 %rd7, [%SP+24]; From 88a395eaefe76913acd033b93a645856be2132c2 Mon Sep 17 00:00:00 2001 From: Michael Kuron Date: Sun, 6 Oct 2024 11:42:08 +0200 Subject: [PATCH 3/3] [NVTPX] Copy kernel arguments via memcpy intrinsic --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 28 +----- llvm/test/CodeGen/NVPTX/lower-args.ll | 35 +++++++- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 88 ++++--------------- .../Inputs/nvptx-basic.ll.expected | 41 ++++----- 4 files changed, 70 insertions(+), 122 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 8604cf68e4062..1b688fb4f0c2c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -623,33 +623,13 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, Value *ArgInParam = new AddrSpaceCastInst( Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM), Arg->getName(), FirstInst); - // Create an opaque type of same size as StructType but without padding - // holes as this could have been a union. - const auto StructBytes = *AllocA->getAllocationSize(DL); - SmallVector ChunkTypes; - if (StructBytes >= 16) { - Type *IntType = Type::getInt64Ty(Func->getContext()); - Type *ChunkType = VectorType::get(IntType, 2, false); - Type *OpaqueType = StructBytes < 32 ? ChunkType : - ArrayType::get(ChunkType, StructBytes / 16); - ChunkTypes.push_back(OpaqueType); - } - for (const auto ChunkBytes: {8, 4, 2, 1}) { - if (StructBytes & ChunkBytes) { - Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes); - ChunkTypes.push_back(ChunkType); - } - } - Type * OpaqueType = ChunkTypes.size() == 1 ? ChunkTypes[0] : - StructType::create(ChunkTypes); // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX // addrspacecast preserves alignment. Since params are constant, this load // is definitely not volatile. 
- LoadInst *LI = - new LoadInst(OpaqueType, ArgInParam, Arg->getName(), - /*isVolatile=*/false, AllocA->getAlign(), FirstInst); - new StoreInst(LI, AllocA, - /*isVolatile=*/false, AllocA->getAlign(), FirstInst); + const auto ArgSize = *AllocA->getAllocationSize(DL); + IRBuilder<> IRB(&*FirstInst); + IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(), + ArgSize); } } diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 17b04b5a60124..d1bec032ec3a9 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -7,16 +7,15 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; IR: [[OPAQUE_OUTER:.*]] = type { <2 x i64>, i64 } %class.outer = type <{ %class.inner, i32, [4 x i8] }> %class.inner = type { ptr, ptr } +%class.padded = type { i8, i32 } ; Check that nvptx-lower-args preserves arg alignment ; COMMON-LABEL: load_alignment define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) { entry: -; IR: load [[OPAQUE_OUTER]], ptr addrspace(101) -; IR-SAME: align 8 +; IR: call void @llvm.memcpy.p0.p101.i64(ptr align 8 ; PTX: ld.param.u64 ; PTX-NOT: ld.param.u8 %arg.idx.val = load ptr, ptr %arg, align 8 @@ -34,6 +33,36 @@ entry: ret void } +; Check that nvptx-lower-args copies padding as the struct may have been a union +; COMMON-LABEL: load_padding +define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { +; PTX: { +; PTX-NEXT: .local .align 8 .b8 __local_depot1[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.u64 %SPL, __local_depot1; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [load_padding_param_0]; +; PTX-NEXT: st.u64 [%SP+0], %rd1; +; PTX-NEXT: add.u64 %rd2, %SP, 0; +; PTX-NEXT: { // callseq 1, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd2; +; PTX-NEXT: .param .b64 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b64 %rd3, [retval0+0]; +; PTX-NEXT: } // callseq 1 +; PTX-NEXT: ret; + %tmp = call ptr @escape(ptr nonnull align 16 %arg) + ret void +} ; COMMON-LABEL: ptr_generic define void @ptr_generic(ptr %out, ptr %in) { diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 7dc5df76bd332..a7dbc4c1620a5 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -5,7 +5,6 @@ source_filename = "" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; COMMON: [[OPAQUE_C:.*]] = type { [2 x <2 x i64>], i64, i32, i8 } %struct.S = type { i32, i32 } ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) @@ -89,8 +88,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr 
addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -116,8 +114,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) @@ -135,8 +132,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -156,8 +152,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 @@ -175,8 +170,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 @@ -196,8 +190,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 
[[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 @@ -233,8 +226,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) ; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr ; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) @@ -252,8 +244,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4 -; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) ; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) ; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr ; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 @@ -274,12 +265,10 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 -; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false) ; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 -; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 ; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 @@ -314,12 +303,10 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i ; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 -; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false) 
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 -; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) ; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 ; COMMON-NEXT: ret void @@ -338,12 +325,10 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) ; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8 -; SM_60-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8 +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) ; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 ; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4 -; SM_60-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) ; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; SM_60: [[FIRST]]: ; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 @@ -403,12 +388,10 @@ define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%str ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8 -; COMMON-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) ; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 ; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COMMON-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4 -; COMMON-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) ; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; COMMON: [[FIRST]]: ; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 @@ -438,45 +421,6 @@ merge: ; preds = %second, %first ret void } -%union.U = type { %struct.P } -%struct.P = type { i8, i32 } - -; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define dso_local void @padding(ptr nocapture noundef readnone %out, ptr noundef byval(%union.U) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @padding( -; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[UNION_U:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: 
[[ENTRY:.*:]] -; COMMON-NEXT: [[S1:%.*]] = alloca [[UNION_U]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S3:%.*]] = load i64, ptr addrspace(101) [[S2]], align 4 -; COMMON-NEXT: store i64 [[S3]], ptr [[S1]], align 4 -; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) -; COMMON-NEXT: ret void -; -entry: - call void @_Z6escapePv(ptr noundef nonnull %s) #0 - ret void -} - -%struct.C = type { [45 x i8] } - -; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define dso_local void @coalescing(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.C) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @coalescing( -; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_C:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_C]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[S3:%.*]] = load [[OPAQUE_C]], ptr addrspace(101) [[S2]], align 4 -; COMMON-NEXT: store [[OPAQUE_C]] [[S3]], ptr [[S1]], align 4 -; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) -; COMMON-NEXT: ret void -; -entry: - call void @_Z6escapePv(ptr noundef nonnull %s) #0 - ret void -} - attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" } attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index bc802363bf46d..5c9af3bb44da2 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -9,43 +9,38 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: .local .align 8 .b8 __local_depot0[32]; ; CHECK-NEXT: .reg .b32 %SP; ; CHECK-NEXT: .reg .b32 %SPL; -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<17>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u32 %SPL, __local_depot0; ; CHECK-NEXT: cvta.local.u32 %SP, %SPL; ; CHECK-NEXT: ld.param.u32 %r1, [caller_St8x4_param_1]; -; CHECK-NEXT: add.u32 %r3, %SPL, 0; -; CHECK-NEXT: ld.param.u64 %rd1, [caller_St8x4_param_0+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0+24]; -; CHECK-NEXT: ld.param.u64 %rd3, [caller_St8x4_param_0]; -; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0+8]; -; CHECK-NEXT: st.local.u64 [%r3+8], %rd4; -; CHECK-NEXT: st.local.u64 [%r3], %rd3; -; CHECK-NEXT: st.local.u64 [%r3+24], %rd2; -; CHECK-NEXT: st.local.u64 [%r3+16], %rd1; -; CHECK-NEXT: ld.u64 %rd5, [%SP+8]; -; CHECK-NEXT: ld.u64 %rd6, [%SP+0]; -; CHECK-NEXT: ld.u64 %rd7, [%SP+24]; -; CHECK-NEXT: ld.u64 %rd8, [%SP+16]; +; CHECK-NEXT: ld.param.u64 %rd1, [caller_St8x4_param_0+24]; +; CHECK-NEXT: st.u64 [%SP+24], %rd1; +; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0+16]; +; CHECK-NEXT: st.u64 [%SP+16], %rd2; +; CHECK-NEXT: ld.param.u64 %rd3, 
[caller_St8x4_param_0+8]; +; CHECK-NEXT: st.u64 [%SP+8], %rd3; +; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0]; +; CHECK-NEXT: st.u64 [%SP+0], %rd4; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[32]; -; CHECK-NEXT: st.param.v2.b64 [param0+0], {%rd6, %rd5}; -; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd8, %rd7}; +; CHECK-NEXT: st.param.v2.b64 [param0+0], {%rd4, %rd3}; +; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd2, %rd1}; ; CHECK-NEXT: .param .align 16 .b8 retval0[32]; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: callee_St8x4, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.v2.b64 {%rd9, %rd10}, [retval0+0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd11, %rd12}, [retval0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0+0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.u64 [%r1], %rd9; -; CHECK-NEXT: st.u64 [%r1+8], %rd10; -; CHECK-NEXT: st.u64 [%r1+16], %rd11; -; CHECK-NEXT: st.u64 [%r1+24], %rd12; +; CHECK-NEXT: st.u64 [%r1], %rd5; +; CHECK-NEXT: st.u64 [%r1+8], %rd6; +; CHECK-NEXT: st.u64 [%r1+16], %rd7; +; CHECK-NEXT: st.u64 [%r1+24], %rd8; ; CHECK-NEXT: ret; %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2 %.fca.0.extract = extractvalue [4 x i64] %call, 0
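
Reviewer note (illustrative, not part of the patches): the motivation for copying byval kernel arguments byte-for-byte is easiest to see with a union parameter. The sketch below is a minimal CUDA example with hypothetical names; it assumes the usual C++ layout in which P has three padding bytes after its first member, mirroring %union.U / %struct.P in the @padding test above.

// Minimal CUDA sketch of the failure mode this series addresses.
// The bytes that are padding when 'u.p' is viewed as a struct are live
// payload when the same storage is viewed through 'u.raw'.
struct P { char c; int i; };      // 3 padding bytes between 'c' and 'i'
union  U { P p; char raw[8]; };   // raw[1..3] overlap P's padding bytes

__global__ void read_raw(U u, char *out) {
  // If the byval copy of 'u' were performed as a field-wise %struct.P
  // load/store, the padding bytes (raw[1..3]) could be dropped before
  // this read observes them.
  if (threadIdx.x < 8)
    out[threadIdx.x] = u.raw[threadIdx.x];
}

Launching such a kernel (e.g. read_raw<<<1, 8>>>(u, out)) only behaves correctly if the lowering preserves every byte of the 8-byte argument, which is what the byte-array copy in PATCH 1 and the memcpy-based copy in PATCH 3 provide.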