Skip to content

Commit 88a395e

Browse files
committed
[NVPTX] Copy kernel arguments via memcpy intrinsic
1 parent 35bcad6 commit 88a395e

File tree

4 files changed

+70
-122
lines changed

4 files changed

+70
-122
lines changed

llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -623,33 +623,13 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
623623
Value *ArgInParam = new AddrSpaceCastInst(
624624
Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM),
625625
Arg->getName(), FirstInst);
626-
// Create an opaque type of same size as StructType but without padding
627-
// holes as this could have been a union.
628-
const auto StructBytes = *AllocA->getAllocationSize(DL);
629-
SmallVector<Type *, 5> ChunkTypes;
630-
if (StructBytes >= 16) {
631-
Type *IntType = Type::getInt64Ty(Func->getContext());
632-
Type *ChunkType = VectorType::get(IntType, 2, false);
633-
Type *OpaqueType = StructBytes < 32 ? ChunkType :
634-
ArrayType::get(ChunkType, StructBytes / 16);
635-
ChunkTypes.push_back(OpaqueType);
636-
}
637-
for (const auto ChunkBytes: {8, 4, 2, 1}) {
638-
if (StructBytes & ChunkBytes) {
639-
Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes);
640-
ChunkTypes.push_back(ChunkType);
641-
}
642-
}
643-
Type * OpaqueType = ChunkTypes.size() == 1 ? ChunkTypes[0] :
644-
StructType::create(ChunkTypes);
645626
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
646627
// addrspacecast preserves alignment. Since params are constant, this load
647628
// is definitely not volatile.
648-
LoadInst *LI =
649-
new LoadInst(OpaqueType, ArgInParam, Arg->getName(),
650-
/*isVolatile=*/false, AllocA->getAlign(), FirstInst);
651-
new StoreInst(LI, AllocA,
652-
/*isVolatile=*/false, AllocA->getAlign(), FirstInst);
629+
const auto ArgSize = *AllocA->getAllocationSize(DL);
630+
IRBuilder<> IRB(&*FirstInst);
631+
IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),
632+
ArgSize);
653633
}
654634
}
655635

llvm/test/CodeGen/NVPTX/lower-args.ll

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,15 @@
77
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
88
target triple = "nvptx64-nvidia-cuda"
99

10-
; IR: [[OPAQUE_OUTER:.*]] = type { <2 x i64>, i64 }
1110
%class.outer = type <{ %class.inner, i32, [4 x i8] }>
1211
%class.inner = type { ptr, ptr }
12+
%class.padded = type { i8, i32 }
1313

1414
; Check that nvptx-lower-args preserves arg alignment
1515
; COMMON-LABEL: load_alignment
1616
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
1717
entry:
18-
; IR: load [[OPAQUE_OUTER]], ptr addrspace(101)
19-
; IR-SAME: align 8
18+
; IR: call void @llvm.memcpy.p0.p101.i64(ptr align 8
2019
; PTX: ld.param.u64
2120
; PTX-NOT: ld.param.u8
2221
%arg.idx.val = load ptr, ptr %arg, align 8
@@ -34,6 +33,36 @@ entry:
3433
ret void
3534
}
3635

36+
; Check that nvptx-lower-args copies padding as the struct may have been a union
37+
; COMMON-LABEL: load_padding
38+
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
39+
; PTX: {
40+
; PTX-NEXT: .local .align 8 .b8 __local_depot1[8];
41+
; PTX-NEXT: .reg .b64 %SP;
42+
; PTX-NEXT: .reg .b64 %SPL;
43+
; PTX-NEXT: .reg .b64 %rd<5>;
44+
; PTX-EMPTY:
45+
; PTX-NEXT: // %bb.0:
46+
; PTX-NEXT: mov.u64 %SPL, __local_depot1;
47+
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
48+
; PTX-NEXT: ld.param.u64 %rd1, [load_padding_param_0];
49+
; PTX-NEXT: st.u64 [%SP+0], %rd1;
50+
; PTX-NEXT: add.u64 %rd2, %SP, 0;
51+
; PTX-NEXT: { // callseq 1, 0
52+
; PTX-NEXT: .param .b64 param0;
53+
; PTX-NEXT: st.param.b64 [param0+0], %rd2;
54+
; PTX-NEXT: .param .b64 retval0;
55+
; PTX-NEXT: call.uni (retval0),
56+
; PTX-NEXT: escape,
57+
; PTX-NEXT: (
58+
; PTX-NEXT: param0
59+
; PTX-NEXT: );
60+
; PTX-NEXT: ld.param.b64 %rd3, [retval0+0];
61+
; PTX-NEXT: } // callseq 1
62+
; PTX-NEXT: ret;
63+
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
64+
ret void
65+
}
3766

3867
; COMMON-LABEL: ptr_generic
3968
define void @ptr_generic(ptr %out, ptr %in) {

llvm/test/CodeGen/NVPTX/lower-byval-args.ll

Lines changed: 16 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ source_filename = "<stdin>"
55
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
66
target triple = "nvptx64-nvidia-cuda"
77

8-
; COMMON: [[OPAQUE_C:.*]] = type { [2 x <2 x i64>], i64, i32, i8 }
98
%struct.S = type { i32, i32 }
109

1110
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
@@ -89,8 +88,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
8988
; COMMON-NEXT: [[ENTRY:.*:]]
9089
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
9190
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
92-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
93-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
91+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
9492
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
9593
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
9694
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -116,8 +114,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
116114
; COMMON-NEXT: [[ENTRY:.*:]]
117115
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
118116
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
119-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
120-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
117+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
121118
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
122119
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
123120
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
@@ -135,8 +132,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
135132
; COMMON-NEXT: [[ENTRY:.*:]]
136133
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
137134
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
138-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
139-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
135+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
140136
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
141137
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
142138
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -156,8 +152,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
156152
; COMMON-NEXT: [[ENTRY:.*:]]
157153
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
158154
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
159-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
160-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
155+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
161156
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
162157
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
163158
; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
@@ -175,8 +170,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
175170
; COMMON-NEXT: [[ENTRY:.*:]]
176171
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
177172
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
178-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
179-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
173+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
180174
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
181175
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
182176
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -196,8 +190,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
196190
; COMMON-NEXT: [[ENTRY:.*:]]
197191
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
198192
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
199-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
200-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
193+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
201194
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
202195
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
203196
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
@@ -233,8 +226,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
233226
; COMMON-NEXT: [[ENTRY:.*:]]
234227
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
235228
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
236-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
237-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
229+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
238230
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
239231
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
240232
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
@@ -252,8 +244,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
252244
; COMMON-NEXT: [[BB:.*:]]
253245
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
254246
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
255-
; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
256-
; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
247+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
257248
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
258249
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
259250
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
@@ -274,12 +265,10 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
274265
; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
275266
; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
276267
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
277-
; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
278-
; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
268+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
279269
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
280270
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
281-
; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
282-
; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
271+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
283272
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
284273
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
285274
; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4
@@ -314,12 +303,10 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
314303
; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
315304
; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
316305
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
317-
; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
318-
; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
306+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
319307
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
320308
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
321-
; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
322-
; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
309+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
323310
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
324311
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
325312
; COMMON-NEXT: ret void
@@ -338,12 +325,10 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
338325
; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr
339326
; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
340327
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
341-
; SM_60-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8
342-
; SM_60-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8
328+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
343329
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
344330
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
345-
; SM_60-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4
346-
; SM_60-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4
331+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
347332
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
348333
; SM_60: [[FIRST]]:
349334
; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
@@ -403,12 +388,10 @@ define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%str
403388
; COMMON-NEXT: [[BB:.*:]]
404389
; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
405390
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
406-
; COMMON-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8
407-
; COMMON-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8
391+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
408392
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
409393
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
410-
; COMMON-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4
411-
; COMMON-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4
394+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
412395
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
413396
; COMMON: [[FIRST]]:
414397
; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
@@ -438,45 +421,6 @@ merge: ; preds = %second, %first
438421
ret void
439422
}
440423

441-
%union.U = type { %struct.P }
442-
%struct.P = type { i8, i32 }
443-
444-
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
445-
define dso_local void @padding(ptr nocapture noundef readnone %out, ptr noundef byval(%union.U) align 4 %s) local_unnamed_addr #0 {
446-
; COMMON-LABEL: define dso_local void @padding(
447-
; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[UNION_U:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
448-
; COMMON-NEXT: [[ENTRY:.*:]]
449-
; COMMON-NEXT: [[S1:%.*]] = alloca [[UNION_U]], align 4
450-
; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
451-
; COMMON-NEXT: [[S3:%.*]] = load i64, ptr addrspace(101) [[S2]], align 4
452-
; COMMON-NEXT: store i64 [[S3]], ptr [[S1]], align 4
453-
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]])
454-
; COMMON-NEXT: ret void
455-
;
456-
entry:
457-
call void @_Z6escapePv(ptr noundef nonnull %s) #0
458-
ret void
459-
}
460-
461-
%struct.C = type { [45 x i8] }
462-
463-
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
464-
define dso_local void @coalescing(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.C) align 4 %s) local_unnamed_addr #0 {
465-
; COMMON-LABEL: define dso_local void @coalescing(
466-
; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_C:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
467-
; COMMON-NEXT: [[ENTRY:.*:]]
468-
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_C]], align 4
469-
; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
470-
; COMMON-NEXT: [[S3:%.*]] = load [[OPAQUE_C]], ptr addrspace(101) [[S2]], align 4
471-
; COMMON-NEXT: store [[OPAQUE_C]] [[S3]], ptr [[S1]], align 4
472-
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]])
473-
; COMMON-NEXT: ret void
474-
;
475-
entry:
476-
call void @_Z6escapePv(ptr noundef nonnull %s) #0
477-
ret void
478-
}
479-
480424
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
481425
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
482426
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }

0 commit comments

Comments
 (0)