11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
2- ; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
3- ; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
2+ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
3+ ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
44
55target triple = "nvptx64-nvidia-cuda"
66
@@ -9,47 +9,41 @@ target triple = "nvptx64-nvidia-cuda"
99define void @test_b128_input_from_const () {
1010; CHECK-LABEL: test_b128_input_from_const(
1111; CHECK: {
12- ; CHECK-NEXT: .reg .b32 %r<3>;
13- ; CHECK-NEXT: .reg .b64 %rd<4>;
12+ ; CHECK-NEXT: .reg .b64 %rd<5>;
1413; CHECK-NEXT: .reg .b128 %rq<2>;
1514; CHECK-EMPTY:
1615; CHECK-NEXT: // %bb.0:
1716; CHECK-NEXT: mov.u64 %rd2, 0;
1817; CHECK-NEXT: mov.u64 %rd3, 42;
1918; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2};
20- ; CHECK-NEXT: mov.u32 %r1, value;
21- ; CHECK-NEXT: cvta.global.u32 %r2, %r1;
22- ; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
19+ ; CHECK-NEXT: mov.u64 %rd4, value;
20+ ; CHECK-NEXT: cvta.global.u64 %rd1, %rd4;
2321; CHECK-NEXT: // begin inline asm
2422; CHECK-NEXT: { st.b128 [%rd1], %rq1; }
2523; CHECK-NEXT: // end inline asm
2624; CHECK-NEXT: ret;
27-
2825 tail call void asm sideeffect "{ st.b128 [$0], $1; }" , "l,q" (ptr nonnull addrspacecast (ptr addrspace (1 ) @value to ptr ), i128 42 )
2926 ret void
3027}
3128
3229define void @test_b128_input_from_load (ptr nocapture readonly %data ) {
3330; CHECK-LABEL: test_b128_input_from_load(
3431; CHECK: {
35- ; CHECK-NEXT: .reg .b32 %r<5>;
36- ; CHECK-NEXT: .reg .b64 %rd<4>;
32+ ; CHECK-NEXT: .reg .b64 %rd<7>;
3733; CHECK-NEXT: .reg .b128 %rq<2>;
3834; CHECK-EMPTY:
3935; CHECK-NEXT: // %bb.0:
40- ; CHECK-NEXT: ld.param.u32 %r1, [test_b128_input_from_load_param_0];
41- ; CHECK-NEXT: cvta.to.global.u32 %r2, %r1;
42- ; CHECK-NEXT: ld.global.u64 %rd2, [%r2+8];
43- ; CHECK-NEXT: ld.global.u64 %rd3, [%r2];
44- ; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2};
45- ; CHECK-NEXT: mov.u32 %r3, value;
46- ; CHECK-NEXT: cvta.global.u32 %r4, %r3;
47- ; CHECK-NEXT: cvt.u64.u32 %rd1, %r4;
36+ ; CHECK-NEXT: ld.param.u64 %rd2, [test_b128_input_from_load_param_0];
37+ ; CHECK-NEXT: cvta.to.global.u64 %rd3, %rd2;
38+ ; CHECK-NEXT: ld.global.u64 %rd4, [%rd3+8];
39+ ; CHECK-NEXT: ld.global.u64 %rd5, [%rd3];
40+ ; CHECK-NEXT: mov.b128 %rq1, {%rd5, %rd4};
41+ ; CHECK-NEXT: mov.u64 %rd6, value;
42+ ; CHECK-NEXT: cvta.global.u64 %rd1, %rd6;
4843; CHECK-NEXT: // begin inline asm
4944; CHECK-NEXT: { st.b128 [%rd1], %rq1; }
5045; CHECK-NEXT: // end inline asm
5146; CHECK-NEXT: ret;
52-
5347 %1 = addrspacecast ptr %data to ptr addrspace (1 )
5448 %2 = load <2 x i64 >, ptr addrspace (1 ) %1 , align 16
5549 %3 = bitcast <2 x i64 > %2 to i128
@@ -62,26 +56,23 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
6256; CHECK: {
6357; CHECK-NEXT: .reg .pred %p<2>;
6458; CHECK-NEXT: .reg .b16 %rs<2>;
65- ; CHECK-NEXT: .reg .b32 %r<5>;
66- ; CHECK-NEXT: .reg .b64 %rd<4>;
59+ ; CHECK-NEXT: .reg .b64 %rd<7>;
6760; CHECK-NEXT: .reg .b128 %rq<2>;
6861; CHECK-EMPTY:
6962; CHECK-NEXT: // %bb.0:
70- ; CHECK-NEXT: ld.param.u32 %r1 , [test_b128_input_from_select_param_0];
71- ; CHECK-NEXT: cvta.to.global.u32 %r2 , %r1 ;
72- ; CHECK-NEXT: ld.global.u8 %rs1, [%r2 ];
63+ ; CHECK-NEXT: ld.param.u64 %rd2 , [test_b128_input_from_select_param_0];
64+ ; CHECK-NEXT: cvta.to.global.u64 %rd3 , %rd2 ;
65+ ; CHECK-NEXT: ld.global.u8 %rs1, [%rd3 ];
7366; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0;
74- ; CHECK-NEXT: selp.b64 %rd2, 24, 42, %p1;
75- ; CHECK-NEXT: mov.u64 %rd3, 0;
76- ; CHECK-NEXT: mov.b128 %rq1, {%rd2, %rd3};
77- ; CHECK-NEXT: mov.u32 %r3, value;
78- ; CHECK-NEXT: cvta.global.u32 %r4, %r3;
79- ; CHECK-NEXT: cvt.u64.u32 %rd1, %r4;
67+ ; CHECK-NEXT: selp.b64 %rd4, 24, 42, %p1;
68+ ; CHECK-NEXT: mov.u64 %rd5, 0;
69+ ; CHECK-NEXT: mov.b128 %rq1, {%rd4, %rd5};
70+ ; CHECK-NEXT: mov.u64 %rd6, value;
71+ ; CHECK-NEXT: cvta.global.u64 %rd1, %rd6;
8072; CHECK-NEXT: // begin inline asm
8173; CHECK-NEXT: { st.b128 [%rd1], %rq1; }
8274; CHECK-NEXT: // end inline asm
8375; CHECK-NEXT: ret;
84-
8576 %1 = addrspacecast ptr %flag to ptr addrspace (1 )
8677 %2 = load i8 , ptr addrspace (1 ) %1 , align 1
8778 %3 = icmp eq i8 %2 , 0
@@ -106,7 +97,6 @@ define void @test_store_b128_output() {
10697; CHECK-NEXT: st.global.u64 [value+8], %rd4;
10798; CHECK-NEXT: st.global.u64 [value], %rd3;
10899; CHECK-NEXT: ret;
109-
110100 %1 = tail call i128 asm "{ mov.b128 $0, 41; }" , "=q" ()
111101 %add = add nsw i128 %1 , 1
112102 %2 = bitcast i128 %add to <2 x i64 >
@@ -117,26 +107,24 @@ define void @test_store_b128_output() {
117107define void @test_use_of_b128_output (ptr nocapture readonly %data ) {
118108; CHECK-LABEL: test_use_of_b128_output(
119109; CHECK: {
120- ; CHECK-NEXT: .reg .b32 %r<3>;
121- ; CHECK-NEXT: .reg .b64 %rd<7>;
110+ ; CHECK-NEXT: .reg .b64 %rd<9>;
122111; CHECK-NEXT: .reg .b128 %rq<3>;
123112; CHECK-EMPTY:
124113; CHECK-NEXT: // %bb.0:
125- ; CHECK-NEXT: ld.param.u32 %r1 , [test_use_of_b128_output_param_0];
126- ; CHECK-NEXT: cvta.to.global.u32 %r2 , %r1 ;
127- ; CHECK-NEXT: ld.global.u64 %rd1 , [%r2 +8];
128- ; CHECK-NEXT: ld.global.u64 %rd2 , [%r2 ];
129- ; CHECK-NEXT: mov.b128 %rq2, {%rd2 , %rd1 };
114+ ; CHECK-NEXT: ld.param.u64 %rd1 , [test_use_of_b128_output_param_0];
115+ ; CHECK-NEXT: cvta.to.global.u64 %rd2 , %rd1 ;
116+ ; CHECK-NEXT: ld.global.u64 %rd3 , [%rd2 +8];
117+ ; CHECK-NEXT: ld.global.u64 %rd4 , [%rd2 ];
118+ ; CHECK-NEXT: mov.b128 %rq2, {%rd4 , %rd3 };
130119; CHECK-NEXT: // begin inline asm
131120; CHECK-NEXT: { mov.b128 %rq1, %rq2; }
132121; CHECK-NEXT: // end inline asm
133- ; CHECK-NEXT: mov.b128 {%rd3 , %rd4 }, %rq1;
134- ; CHECK-NEXT: add.cc.s64 %rd5 , %rd3 , 1;
135- ; CHECK-NEXT: addc.cc.s64 %rd6 , %rd4 , 0;
136- ; CHECK-NEXT: st.global.u64 [value], %rd5 ;
137- ; CHECK-NEXT: st.global.u64 [value+8], %rd6 ;
122+ ; CHECK-NEXT: mov.b128 {%rd5 , %rd6 }, %rq1;
123+ ; CHECK-NEXT: add.cc.s64 %rd7 , %rd5 , 1;
124+ ; CHECK-NEXT: addc.cc.s64 %rd8 , %rd6 , 0;
125+ ; CHECK-NEXT: st.global.u64 [value], %rd7 ;
126+ ; CHECK-NEXT: st.global.u64 [value+8], %rd8 ;
138127; CHECK-NEXT: ret;
139-
140128 %1 = addrspacecast ptr %data to ptr addrspace (1 )
141129 %2 = load <2 x i64 >, ptr addrspace (1 ) %1 , align 16
142130 %3 = bitcast <2 x i64 > %2 to i128
0 commit comments