@@ -988,6 +988,77 @@ merge: ; preds = %second, %first
988988 ret void
989989}
990990
; Kernel whose only work is to forward its byval i32 pointer argument to
; @device_func, again as a byval argument.
;
; What the assertions below expect:
;  - under the COMMON prefix (IR): an i32 alloca, an addrspacecast of the
;    incoming argument to addrspace(101) (the NVPTX param address space), a
;    4-byte memcpy from that pointer into the alloca, and a call that passes
;    the alloca instead of the original argument;
;  - under the PTX prefix: a 4-byte local depot, a ld.param.u32 of the kernel
;    parameter, a st.local.u32 into the depot, and a st.param.b32 of the same
;    register into the outgoing param0 before the call.uni to device_func.
;
; NOTE(review): the RUN lines that enable each check prefix are not visible in
; this part of the file - confirm which configurations they cover.
991+ define ptx_kernel void @test_forward_byval_arg (ptr byval (i32 ) align 4 %input ) {
992+ ; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
993+ ; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
994+ ; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
995+ ; COMMON-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
996+ ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
997+ ; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
998+ ; COMMON-NEXT: ret void
999+ ;
1000+ ; PTX-LABEL: test_forward_byval_arg(
1001+ ; PTX: {
1002+ ; PTX-NEXT: .local .align 4 .b8 __local_depot17[4];
1003+ ; PTX-NEXT: .reg .b64 %SP;
1004+ ; PTX-NEXT: .reg .b64 %SPL;
1005+ ; PTX-NEXT: .reg .b32 %r<2>;
1006+ ; PTX-NEXT: .reg .b64 %rd<3>;
1007+ ; PTX-EMPTY:
1008+ ; PTX-NEXT: // %bb.0:
1009+ ; PTX-NEXT: mov.b64 %SPL, __local_depot17;
1010+ ; PTX-NEXT: add.u64 %rd2, %SPL, 0;
1011+ ; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0];
1012+ ; PTX-NEXT: st.local.u32 [%rd2], %r1;
1013+ ; PTX-NEXT: { // callseq 2, 0
1014+ ; PTX-NEXT: .param .align 4 .b8 param0[4];
1015+ ; PTX-NEXT: st.param.b32 [param0], %r1;
1016+ ; PTX-NEXT: call.uni
1017+ ; PTX-NEXT: device_func,
1018+ ; PTX-NEXT: (
1019+ ; PTX-NEXT: param0
1020+ ; PTX-NEXT: );
1021+ ; PTX-NEXT: } // callseq 2
1022+ ; PTX-NEXT: ret;
1023+ call void @device_func (ptr byval (i32 ) align 4 %input )
1024+ ret void
1025+ }
1026+
; Non-kernel function (no ptx_kernel calling convention) that takes a byval
; i32 pointer and passes it straight back to itself as a byval argument. It is
; also the callee used by @test_forward_byval_arg.
;
; The self-call is unconditional, so the function never returns if executed;
; presumably it exists only to be compiled and checked - TODO confirm.
;
; What the assertions below expect:
;  - under the LOWER-ARGS prefix (IR): the call keeps using the original
;    argument, with no copy inserted;
;  - under the COPY prefix (IR): an i32 alloca, an addrspacecast of the
;    argument to addrspace(101), a 4-byte memcpy into the alloca, and a call
;    that passes the alloca;
;  - under the PTX prefix: a ld.param.u32 of the parameter followed by a
;    st.param.b32 into the outgoing param0, with no local depot declared.
;
; NOTE(review): the RUN lines that enable each check prefix are not visible in
; this part of the file - confirm which configurations they cover.
1027+ define void @device_func (ptr byval (i32 ) align 4 %input ) {
1028+ ; LOWER-ARGS-LABEL: define void @device_func(
1029+ ; LOWER-ARGS-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
1030+ ; LOWER-ARGS-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT]])
1031+ ; LOWER-ARGS-NEXT: ret void
1032+ ;
1033+ ; COPY-LABEL: define void @device_func(
1034+ ; COPY-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
1035+ ; COPY-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
1036+ ; COPY-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
1037+ ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
1038+ ; COPY-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
1039+ ; COPY-NEXT: ret void
1040+ ;
1041+ ; PTX-LABEL: device_func(
1042+ ; PTX: {
1043+ ; PTX-NEXT: .reg .b32 %r<2>;
1044+ ; PTX-NEXT: .reg .b64 %rd<3>;
1045+ ; PTX-EMPTY:
1046+ ; PTX-NEXT: // %bb.0:
1047+ ; PTX-NEXT: ld.param.u32 %r1, [device_func_param_0];
1048+ ; PTX-NEXT: { // callseq 3, 0
1049+ ; PTX-NEXT: .param .align 4 .b8 param0[4];
1050+ ; PTX-NEXT: st.param.b32 [param0], %r1;
1051+ ; PTX-NEXT: call.uni
1052+ ; PTX-NEXT: device_func,
1053+ ; PTX-NEXT: (
1054+ ; PTX-NEXT: param0
1055+ ; PTX-NEXT: );
1056+ ; PTX-NEXT: } // callseq 3
1057+ ; PTX-NEXT: ret;
1058+ call void @device_func (ptr byval (i32 ) align 4 %input )
1059+ ret void
1060+ }
1061+
; Attribute group definitions #0 through #2.
9911062attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math" ="true" "target-cpu" ="sm_60" "target-features" ="+ptx78,+sm_60" "uniform-work-group-size" ="true" }
9921063attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
9931064attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
0 commit comments