1+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
23; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
34
@@ -10,67 +11,95 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
1011; There should be no difference between llvm.ctlz.i32(%a, true) and
1112; llvm.ctlz.i32(%a, false), as ptx's clz(0) is well defined (it returns 32).
1213
13- ; CHECK-LABEL: myctlz(
; i32 ctlz with the i1 flag set to false. The expected PTX is a parameter load,
; a single clz.b32, the b32 return-value store and ret, with nothing in between.
1414define i32 @myctlz (i32 %a ) {
15- ; CHECK: ld.param.
16- ; CHECK-NEXT: clz.b32
17- ; CHECK-NEXT: st.param.
18- ; CHECK-NEXT: ret;
15+ ; CHECK-LABEL: myctlz(
16+ ; CHECK: {
17+ ; CHECK-NEXT: .reg .b32 %r<3>;
18+ ; CHECK-EMPTY:
19+ ; CHECK-NEXT: // %bb.0:
20+ ; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
21+ ; CHECK-NEXT: clz.b32 %r2, %r1;
22+ ; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
23+ ; CHECK-NEXT: ret;
1924 %val = call i32 @llvm.ctlz.i32 (i32 %a , i1 false ) readnone
2025 ret i32 %val
2126}
22- ; CHECK-LABEL: myctlz_2(
; Variant of @myctlz with the i1 flag set to true. The expected instruction
; sequence is the same; only the parameter symbol name differs.
2327define i32 @myctlz_2 (i32 %a ) {
24- ; CHECK: ld.param.
25- ; CHECK-NEXT: clz.b32
26- ; CHECK-NEXT: st.param.
27- ; CHECK-NEXT: ret;
28+ ; CHECK-LABEL: myctlz_2(
29+ ; CHECK: {
30+ ; CHECK-NEXT: .reg .b32 %r<3>;
31+ ; CHECK-EMPTY:
32+ ; CHECK-NEXT: // %bb.0:
33+ ; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
34+ ; CHECK-NEXT: clz.b32 %r2, %r1;
35+ ; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
36+ ; CHECK-NEXT: ret;
2837 %val = call i32 @llvm.ctlz.i32 (i32 %a , i1 true ) readnone
2938 ret i32 %val
3039}
3140
3241; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
3342; value, so here we have to zero-extend it.
34- ; CHECK-LABEL: myctlz64(
; i64 ctlz with the i1 flag set to false. clz.b64 writes its count to a 32-bit
; register (%r1); cvt.u64.u32 then widens it to %rd2 for the b64 return store.
3543define i64 @myctlz64 (i64 %a ) {
36- ; CHECK: ld.param.
37- ; CHECK-NEXT: clz.b64
38- ; CHECK-NEXT: cvt.u64.u32
39- ; CHECK-NEXT: st.param.
40- ; CHECK-NEXT: ret;
44+ ; CHECK-LABEL: myctlz64(
45+ ; CHECK: {
46+ ; CHECK-NEXT: .reg .b32 %r<2>;
47+ ; CHECK-NEXT: .reg .b64 %rd<3>;
48+ ; CHECK-EMPTY:
49+ ; CHECK-NEXT: // %bb.0:
50+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
51+ ; CHECK-NEXT: clz.b64 %r1, %rd1;
52+ ; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
53+ ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
54+ ; CHECK-NEXT: ret;
4155 %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 false ) readnone
4256 ret i64 %val
4357}
44- ; CHECK-LABEL: myctlz64_2(
; Variant of @myctlz64 with the i1 flag set to true. The expected sequence is
; still clz.b64 followed by cvt.u64.u32 and the b64 return store.
4558define i64 @myctlz64_2 (i64 %a ) {
46- ; CHECK: ld.param.
47- ; CHECK-NEXT: clz.b64
48- ; CHECK-NEXT: cvt.u64.u32
49- ; CHECK-NEXT: st.param.
50- ; CHECK-NEXT: ret;
59+ ; CHECK-LABEL: myctlz64_2(
60+ ; CHECK: {
61+ ; CHECK-NEXT: .reg .b32 %r<2>;
62+ ; CHECK-NEXT: .reg .b64 %rd<3>;
63+ ; CHECK-EMPTY:
64+ ; CHECK-NEXT: // %bb.0:
65+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
66+ ; CHECK-NEXT: clz.b64 %r1, %rd1;
67+ ; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
68+ ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
69+ ; CHECK-NEXT: ret;
5170 %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 true ) readnone
5271 ret i64 %val
5372}
5473
5574; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
5675; natural return width of ptx's clz.b64 instruction. No conversions should be
5776; necessary in the PTX.
58- ; CHECK-LABEL: myctlz64_as_32(
; i64 ctlz (flag false) whose result is truncated to i32 before returning. The
; 32-bit clz.b64 destination (%r1) is stored directly as the b32 return value;
; the expected sequence contains no cvt instruction.
5977define i32 @myctlz64_as_32 (i64 %a ) {
60- ; CHECK: ld.param.
61- ; CHECK-NEXT: clz.b64
62- ; CHECK-NEXT: st.param.
63- ; CHECK-NEXT: ret;
78+ ; CHECK-LABEL: myctlz64_as_32(
79+ ; CHECK: {
80+ ; CHECK-NEXT: .reg .b32 %r<2>;
81+ ; CHECK-NEXT: .reg .b64 %rd<2>;
82+ ; CHECK-EMPTY:
83+ ; CHECK-NEXT: // %bb.0:
84+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
85+ ; CHECK-NEXT: clz.b64 %r1, %rd1;
86+ ; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
87+ ; CHECK-NEXT: ret;
6488 %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 false ) readnone
6589 %trunc = trunc i64 %val to i32
6690 ret i32 %trunc
6791}
68- ; CHECK-LABEL: myctlz64_as_32_2(
6992define i32 @myctlz64_as_32_2 (i64 %a ) {
70- ; CHECK: ld.param.
71- ; CHECK-NEXT: clz.b64
72- ; CHECK-NEXT: st.param.
73- ; CHECK-NEXT: ret;
93+ ; CHECK-LABEL: myctlz64_as_32_2(
94+ ; CHECK: {
95+ ; CHECK-NEXT: .reg .b32 %r<2>;
96+ ; CHECK-NEXT: .reg .b64 %rd<2>;
97+ ; CHECK-EMPTY:
98+ ; CHECK-NEXT: // %bb.0:
99+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
100+ ; CHECK-NEXT: clz.b64 %r1, %rd1;
101+ ; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
102+ ; CHECK-NEXT: ret;
74103 %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 false ) readnone
75104 %trunc = trunc i64 %val to i32
76105 ret i32 %trunc
@@ -80,53 +109,77 @@ define i32 @myctlz64_as_32_2(i64 %a) {
80109; and then truncating the result back down to i16. But the NVPTX ABI
81110; zero-extends i16 return values to i32, so the final truncation doesn't appear
82111; in this function.
83- ; CHECK-LABEL: myctlz_ret16(
; i16 ctlz (flag false) returned as i16. The expected PTX widens the input with
; cvt.u32.u16, counts with clz.b32, subtracts 16 (sub.s32) and stores the 32-bit
; result as the return value; no narrowing cvt appears before the store.
84112define i16 @myctlz_ret16 (i16 %a ) {
85- ; CHECK: ld.param.
86- ; CHECK-NEXT: cvt.u32.u16
87- ; CHECK-NEXT: clz.b32
88- ; CHECK-NEXT: sub.
89- ; CHECK-NEXT: st.param.
90- ; CHECK-NEXT: ret;
113+ ; CHECK-LABEL: myctlz_ret16(
114+ ; CHECK: {
115+ ; CHECK-NEXT: .reg .b16 %rs<2>;
116+ ; CHECK-NEXT: .reg .b32 %r<4>;
117+ ; CHECK-EMPTY:
118+ ; CHECK-NEXT: // %bb.0:
119+ ; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_ret16_param_0];
120+ ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
121+ ; CHECK-NEXT: clz.b32 %r2, %r1;
122+ ; CHECK-NEXT: sub.s32 %r3, %r2, 16;
123+ ; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
124+ ; CHECK-NEXT: ret;
91125 %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 false ) readnone
92126 ret i16 %val
93127}
94- ; CHECK-LABEL: myctlz_ret16_2(
; Variant of @myctlz_ret16 with the i1 flag set to true. The expected sequence
; is unchanged: cvt.u32.u16, clz.b32, sub.s32 by 16, then the b32 return store.
95128define i16 @myctlz_ret16_2 (i16 %a ) {
96- ; CHECK: ld.param.
97- ; CHECK-NEXT: cvt.u32.u16
98- ; CHECK-NEXT: clz.b32
99- ; CHECK-NEXT: sub.
100- ; CHECK-NEXT: st.param.
101- ; CHECK-NEXT: ret;
129+ ; CHECK-LABEL: myctlz_ret16_2(
130+ ; CHECK: {
131+ ; CHECK-NEXT: .reg .b16 %rs<2>;
132+ ; CHECK-NEXT: .reg .b32 %r<4>;
133+ ; CHECK-EMPTY:
134+ ; CHECK-NEXT: // %bb.0:
135+ ; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_ret16_2_param_0];
136+ ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
137+ ; CHECK-NEXT: clz.b32 %r2, %r1;
138+ ; CHECK-NEXT: sub.s32 %r3, %r2, 16;
139+ ; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
140+ ; CHECK-NEXT: ret;
102141 %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 true ) readnone
103142 ret i16 %val
104143}
105144
106145; Here we store the result of llvm.ctlz.i16 through a pointer as an i16, so the
107146; trunc should remain.
108- ; CHECK-LABEL: myctlz_store16(
; i16 ctlz (flag false) stored through %b. Here the clz.b32 result is narrowed
; with cvt.u16.u32 before the subtraction, which is done in 16 bits (sub.s16);
; the pointer parameter is loaded last and st.u16 writes the 16-bit value.
109147define void @myctlz_store16 (i16 %a , ptr %b ) {
110- ; CHECK: ld.param.
111- ; CHECK-NEXT: cvt.u32.u16
112- ; CHECK-NEXT: clz.b32
113- ; CHECK-DAG: cvt.u16.u32
114- ; CHECK-DAG: sub.
115- ; CHECK: st.{{[a-z]}}16
116- ; CHECK: ret;
148+ ; CHECK-LABEL: myctlz_store16(
149+ ; CHECK: {
150+ ; CHECK-NEXT: .reg .b16 %rs<4>;
151+ ; CHECK-NEXT: .reg .b32 %r<3>;
152+ ; CHECK-NEXT: .reg .b64 %rd<2>;
153+ ; CHECK-EMPTY:
154+ ; CHECK-NEXT: // %bb.0:
155+ ; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_store16_param_0];
156+ ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
157+ ; CHECK-NEXT: clz.b32 %r2, %r1;
158+ ; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
159+ ; CHECK-NEXT: sub.s16 %rs3, %rs2, 16;
160+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
161+ ; CHECK-NEXT: st.u16 [%rd1], %rs3;
162+ ; CHECK-NEXT: ret;
117163 %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 false ) readnone
118164 store i16 %val , ptr %b
119165 ret void
120166}
121- ; CHECK-LABEL: myctlz_store16_2(
122167define void @myctlz_store16_2 (i16 %a , ptr %b ) {
123- ; CHECK: ld.param.
124- ; CHECK-NEXT: cvt.u32.u16
125- ; CHECK-NEXT: clz.b32
126- ; CHECK-DAG: cvt.u16.u32
127- ; CHECK-DAG: sub.
128- ; CHECK: st.{{[a-z]}}16
129- ; CHECK: ret;
168+ ; CHECK-LABEL: myctlz_store16_2(
169+ ; CHECK: {
170+ ; CHECK-NEXT: .reg .b16 %rs<4>;
171+ ; CHECK-NEXT: .reg .b32 %r<3>;
172+ ; CHECK-NEXT: .reg .b64 %rd<2>;
173+ ; CHECK-EMPTY:
174+ ; CHECK-NEXT: // %bb.0:
175+ ; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_store16_2_param_0];
176+ ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
177+ ; CHECK-NEXT: clz.b32 %r2, %r1;
178+ ; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
179+ ; CHECK-NEXT: sub.s16 %rs3, %rs2, 16;
180+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
181+ ; CHECK-NEXT: st.u16 [%rd1], %rs3;
182+ ; CHECK-NEXT: ret;
130183 %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 false ) readnone
131184 store i16 %val , ptr %b
132185 ret void
0 commit comments