Skip to content

Commit 948c749

Browse files
committed
Add new SLSR tests
1 parent 5588a65 commit 948c749

File tree

4 files changed

+602
-0
lines changed

4 files changed

+602
-0
lines changed
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt < %s -passes=slsr -S | FileCheck %s
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; Every address below is formed as (%in + %add) plus a constant byte offset
; (32, 64, 96), and the (%in + %add) GEP is repeated for each load.
; The opt-side checks expect one shared base GEP feeding three constant-offset
; GEPs; the llc-side checks expect a single add.s64 followed by three byte
; loads at +32, +64 and +96. The three loaded bytes are summed and stored to
; %out.
; CHECK-LABEL: slsr_i8_zero_delta(
8+
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
9+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
10+
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
11+
; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
12+
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 64
13+
; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1]]
14+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 96
15+
; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2]]
16+
define void @slsr_i8_zero_delta(ptr %in, ptr %out, i64 %add) {
17+
; PTX-LABEL: slsr_i8_zero_delta(
18+
; PTX: {
19+
; PTX-NEXT: .reg .b16 %rs<6>;
20+
; PTX-NEXT: .reg .b64 %rd<5>;
21+
; PTX-EMPTY:
22+
; PTX-NEXT: // %bb.0:
23+
; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_zero_delta_param_0];
24+
; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_zero_delta_param_2];
25+
; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
26+
; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_zero_delta_param_1];
27+
; PTX-NEXT: ld.b8 %rs1, [%rd3+32];
28+
; PTX-NEXT: ld.b8 %rs2, [%rd3+64];
29+
; PTX-NEXT: ld.b8 %rs3, [%rd3+96];
30+
; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
31+
; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
32+
; PTX-NEXT: st.b8 [%rd4], %rs5;
33+
; PTX-NEXT: ret;
34+
%getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
35+
%getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
36+
%load0 = load i8, ptr %getElem0.1
37+
38+
%getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
39+
%getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
40+
%load1 = load i8, ptr %getElem1.1
41+
42+
%getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
43+
%getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
44+
%load2 = load i8, ptr %getElem2.1
45+
46+
%out0 = add i8 %load0, %load1
47+
%out1 = add i8 %out0, %load2
48+
store i8 %out1, ptr %out
49+
50+
ret void
51+
}
52+
53+
; Variant in which the first load reads (%in + %add) directly through an
; inbounds GEP, while the other two addresses rebuild the same base with a
; non-inbounds GEP before adding the constants 32 and 64.
; The opt-side checks expect the first (inbounds) GEP to serve as the base of
; both constant-offset GEPs; the llc-side checks expect a single add.s64 and
; byte loads at +0, +32 and +64.
; CHECK-LABEL: slsr_i8_zero_delta_2(
54+
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
55+
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
56+
; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
57+
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[GEP0]], i64 32
58+
; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1]]
59+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP0]], i64 64
60+
; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2]]
61+
define void @slsr_i8_zero_delta_2(ptr %in, ptr %out, i64 %add) {
62+
; PTX-LABEL: slsr_i8_zero_delta_2(
63+
; PTX: {
64+
; PTX-NEXT: .reg .b16 %rs<6>;
65+
; PTX-NEXT: .reg .b64 %rd<5>;
66+
; PTX-EMPTY:
67+
; PTX-NEXT: // %bb.0:
68+
; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_zero_delta_2_param_0];
69+
; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_zero_delta_2_param_2];
70+
; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
71+
; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_zero_delta_2_param_1];
72+
; PTX-NEXT: ld.b8 %rs1, [%rd3];
73+
; PTX-NEXT: ld.b8 %rs2, [%rd3+32];
74+
; PTX-NEXT: ld.b8 %rs3, [%rd3+64];
75+
; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
76+
; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
77+
; PTX-NEXT: st.b8 [%rd4], %rs5;
78+
; PTX-NEXT: ret;
79+
%getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
80+
%load0 = load i8, ptr %getElem0.0
81+
82+
%getElem1.0 = getelementptr i8, ptr %in, i64 %add
83+
%getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 32
84+
%load1 = load i8, ptr %getElem1.1
85+
86+
%getElem2.0 = getelementptr i8, ptr %in, i64 %add
87+
%getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 64
88+
%load2 = load i8, ptr %getElem2.1
89+
90+
%out0 = add i8 %load0, %load1
91+
%out1 = add i8 %out0, %load2
92+
store i8 %out1, ptr %out
93+
94+
ret void
95+
}
96+
97+
; The first address chain is (%in + %add) + 32. The second and third chains
; start from (%in + 1) and (%in + 2) before adding %add and then the constants
; 64 and 96, so their bases differ from the first chain by a constant.
; The opt-side checks expect both to be rewritten relative to the first
; chain's (%in + %add) GEP using byte offsets 1 and 2; the llc-side checks
; expect one add.s64 with byte loads at +32, +65 and +98.
; CHECK-LABEL: slsr_i8_base_delta(
98+
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
99+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
100+
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
101+
; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
102+
; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 1
103+
; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64
104+
; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]]
105+
; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 2
106+
; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds i8, ptr [[GEP2_0]], i64 96
107+
; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2_1]]
108+
define void @slsr_i8_base_delta(ptr %in, ptr %out, i64 %add) {
109+
; PTX-LABEL: slsr_i8_base_delta(
110+
; PTX: {
111+
; PTX-NEXT: .reg .b16 %rs<6>;
112+
; PTX-NEXT: .reg .b64 %rd<5>;
113+
; PTX-EMPTY:
114+
; PTX-NEXT: // %bb.0:
115+
; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_base_delta_param_0];
116+
; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_base_delta_param_2];
117+
; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
118+
; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_base_delta_param_1];
119+
; PTX-NEXT: ld.b8 %rs1, [%rd3+32];
120+
; PTX-NEXT: ld.b8 %rs2, [%rd3+65];
121+
; PTX-NEXT: ld.b8 %rs3, [%rd3+98];
122+
; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
123+
; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
124+
; PTX-NEXT: st.b8 [%rd4], %rs5;
125+
; PTX-NEXT: ret;
126+
%getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
127+
%getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
128+
%load0 = load i8, ptr %getElem0.1
129+
130+
%getElem1.0 = getelementptr inbounds i8, ptr %in, i64 1
131+
%getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 %add
132+
%getElem1.2 = getelementptr inbounds i8, ptr %getElem1.1, i64 64
133+
%load1 = load i8, ptr %getElem1.2
134+
135+
%getElem2.0 = getelementptr inbounds i8, ptr %in, i64 2
136+
%getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 %add
137+
%getElem2.2 = getelementptr inbounds i8, ptr %getElem2.1, i64 96
138+
%load2 = load i8, ptr %getElem2.2
139+
140+
%out0 = add i8 %load0, %load1
141+
%out1 = add i8 %out0, %load2
142+
store i8 %out1, ptr %out
143+
144+
ret void
145+
}
146+
147+
; The first address chain indexes %in by %add as double elements, while the
; second and third index %in by %add as i8 elements, so the same index is
; applied with two different element sizes.
; The opt-side checks expect the double GEP to stay separate and a single i8
; GEP to be shared by the second and third chains; the llc-side checks expect
; the index shifted left by 3 for the first load and two add.s64 in total.
; CHECK-LABEL: slsr_i8_index_delta(
148+
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
149+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[ADD]]
150+
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
151+
; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
152+
; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
153+
; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64
154+
; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]]
155+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 96
156+
; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2]]
157+
define void @slsr_i8_index_delta(ptr %in, ptr %out, i64 %add) {
158+
; PTX-LABEL: slsr_i8_index_delta(
159+
; PTX: {
160+
; PTX-NEXT: .reg .b16 %rs<6>;
161+
; PTX-NEXT: .reg .b64 %rd<7>;
162+
; PTX-EMPTY:
163+
; PTX-NEXT: // %bb.0:
164+
; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_index_delta_param_0];
165+
; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_index_delta_param_2];
166+
; PTX-NEXT: shl.b64 %rd3, %rd2, 3;
167+
; PTX-NEXT: add.s64 %rd4, %rd1, %rd3;
168+
; PTX-NEXT: ld.param.b64 %rd5, [slsr_i8_index_delta_param_1];
169+
; PTX-NEXT: ld.b8 %rs1, [%rd4+32];
170+
; PTX-NEXT: add.s64 %rd6, %rd1, %rd2;
171+
; PTX-NEXT: ld.b8 %rs2, [%rd6+64];
172+
; PTX-NEXT: ld.b8 %rs3, [%rd6+96];
173+
; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
174+
; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
175+
; PTX-NEXT: st.b8 [%rd5], %rs5;
176+
; PTX-NEXT: ret;
177+
%getElem0.0 = getelementptr inbounds double, ptr %in, i64 %add
178+
%getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
179+
%load0 = load i8, ptr %getElem0.1
180+
181+
%getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
182+
%getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
183+
%load1 = load i8, ptr %getElem1.1
184+
185+
%getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
186+
%getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
187+
%load2 = load i8, ptr %getElem2.1
188+
189+
%out0 = add i8 %load0, %load1
190+
%out1 = add i8 %out0, %load2
191+
store i8 %out1, ptr %out
192+
193+
ret void
194+
}
195+
196+
; The GEP index into %in is %add for the first chain, %add + 1 for the second
; and %add + %offset for the third; each chain then adds a constant byte
; offset (32, 64, 96).
; The opt-side checks expect the second and third chains to be expressed as
; GEPs off the first chain's (%in + %add) base, by 1 and by %offset; the
; llc-side checks expect loads at +32 and +65 from that base and one extra
; add.s64 of %offset before the load at +96.
; CHECK-LABEL: slsr_i8_stride_delta(
197+
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]], i64 [[OFFSET:%.*]])
198+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
199+
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
200+
; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
201+
; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 1
202+
; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64
203+
; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]]
204+
; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 [[OFFSET]]
205+
; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds i8, ptr [[GEP2_0]], i64 96
206+
; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2_1]]
207+
define void @slsr_i8_stride_delta(ptr %in, ptr %out, i64 %add, i64 %offset) {
208+
; PTX-LABEL: slsr_i8_stride_delta(
209+
; PTX: {
210+
; PTX-NEXT: .reg .b16 %rs<6>;
211+
; PTX-NEXT: .reg .b64 %rd<7>;
212+
; PTX-EMPTY:
213+
; PTX-NEXT: // %bb.0:
214+
; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_stride_delta_param_0];
215+
; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_stride_delta_param_2];
216+
; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
217+
; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_stride_delta_param_1];
218+
; PTX-NEXT: ld.b8 %rs1, [%rd3+32];
219+
; PTX-NEXT: ld.param.b64 %rd5, [slsr_i8_stride_delta_param_3];
220+
; PTX-NEXT: ld.b8 %rs2, [%rd3+65];
221+
; PTX-NEXT: add.s64 %rd6, %rd3, %rd5;
222+
; PTX-NEXT: ld.b8 %rs3, [%rd6+96];
223+
; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
224+
; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
225+
; PTX-NEXT: st.b8 [%rd4], %rs5;
226+
; PTX-NEXT: ret;
227+
%getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
228+
%getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
229+
%load0 = load i8, ptr %getElem0.1
230+
231+
%add1 = add i64 %add, 1
232+
%getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add1
233+
%getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
234+
%load1 = load i8, ptr %getElem1.1
235+
236+
%add2 = add i64 %add, %offset
237+
%getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add2
238+
%getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
239+
%load2 = load i8, ptr %getElem2.1
240+
241+
%out0 = add i8 %load0, %load1
242+
%out1 = add i8 %out0, %load2
243+
store i8 %out1, ptr %out
244+
245+
ret void
246+
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt < %s -passes=slsr -S | FileCheck %s
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; Test that SLSR can reuse a computation through a complex variable delta.
8+
; The original program needs 4 mul.wide.s32; after SLSR with
9+
; variable-delta support, it can be reduced to 1 mul.wide.s32.
; NOTE(review): the llc-side assertions below currently list five
; mul.wide.s32 instructions (one per store), which matches neither count
; above -- confirm the intended numbers and whether those assertions are
; meant to be regenerated.
;
; The function loads an i32 from %a, adds %j to form the index %i, and then
; stores the constants 0..4 into %b at float-element indices i, 2i, 2i+1,
; 3i+1 and 4i+1. The opt-side checks expect every address after the first to
; be derived from the previous one, either by a constant 4 bytes or by %i
; sign-extended and shifted left by 2.
10+
define void @foo(ptr %a, ptr %b, i32 %j) {
11+
; PTX-LABEL: foo(
12+
; PTX: {
13+
; PTX-NEXT: .reg .b32 %r<8>;
14+
; PTX-NEXT: .reg .b64 %rd<13>;
15+
; PTX-EMPTY:
16+
; PTX-NEXT: // %bb.0:
17+
; PTX-NEXT: ld.param.b64 %rd1, [foo_param_0];
18+
; PTX-NEXT: ld.b32 %r1, [%rd1];
19+
; PTX-NEXT: ld.param.b64 %rd2, [foo_param_1];
20+
; PTX-NEXT: ld.param.b32 %r2, [foo_param_2];
21+
; PTX-NEXT: add.s32 %r3, %r1, %r2;
22+
; PTX-NEXT: mul.wide.s32 %rd3, %r3, 4;
23+
; PTX-NEXT: add.s64 %rd4, %rd2, %rd3;
24+
; PTX-NEXT: st.b32 [%rd4], 0;
25+
; PTX-NEXT: add.s32 %r4, %r3, %r3;
26+
; PTX-NEXT: mul.wide.s32 %rd5, %r4, 4;
27+
; PTX-NEXT: add.s64 %rd6, %rd2, %rd5;
28+
; PTX-NEXT: st.b32 [%rd6], 1;
29+
; PTX-NEXT: add.s32 %r5, %r4, 1;
30+
; PTX-NEXT: mul.wide.s32 %rd7, %r5, 4;
31+
; PTX-NEXT: add.s64 %rd8, %rd2, %rd7;
32+
; PTX-NEXT: st.b32 [%rd8], 2;
33+
; PTX-NEXT: add.s32 %r6, %r5, %r3;
34+
; PTX-NEXT: mul.wide.s32 %rd9, %r6, 4;
35+
; PTX-NEXT: add.s64 %rd10, %rd2, %rd9;
36+
; PTX-NEXT: st.b32 [%rd10], 3;
37+
; PTX-NEXT: add.s32 %r7, %r6, %r3;
38+
; PTX-NEXT: mul.wide.s32 %rd11, %r7, 4;
39+
; PTX-NEXT: add.s64 %rd12, %rd2, %rd11;
40+
; PTX-NEXT: st.b32 [%rd12], 4;
41+
; PTX-NEXT: ret;
42+
%i.0 = load i32, ptr %a, align 8
43+
%i = add i32 %i.0, %j
44+
; CHECK: [[L:%.*]] = load i32, ptr %a, align 8
45+
; CHECK: [[I:%.*]] = add i32 [[L]], %j
46+
%gep.24 = getelementptr float, ptr %b, i32 %i
47+
; CHECK: [[GEP0:%.*]] = getelementptr float, ptr %b, i32 [[I]]
48+
; CHECK: store i32 0, ptr [[GEP0]]
49+
store i32 0, ptr %gep.24
50+
%gep.24.sum1 = add i32 %i, %i
51+
%gep.25 = getelementptr float, ptr %b, i32 %gep.24.sum1
52+
; CHECK: [[EXT1:%.*]] = sext i32 [[I]] to i64
53+
; CHECK: [[MUL1:%.*]] = shl i64 [[EXT1]], 2
54+
; CHECK: [[GEP1:%.*]] = getelementptr i8, ptr [[GEP0]], i64 [[MUL1]]
55+
; CHECK: store i32 1, ptr [[GEP1]]
56+
store i32 1, ptr %gep.25
57+
%gep.26.sum3 = add i32 1, %i
58+
%gep.27.sum = add i32 %gep.26.sum3, %i
59+
%gep.28 = getelementptr float, ptr %b, i32 %gep.27.sum
60+
; CHECK: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 4
61+
; CHECK: store i32 2, ptr [[GEP2]]
62+
store i32 2, ptr %gep.28
63+
%gep.28.sum = add i32 %gep.27.sum, %i
64+
%gep.29 = getelementptr float, ptr %b, i32 %gep.28.sum
65+
; CHECK: [[EXT2:%.*]] = sext i32 [[I]] to i64
66+
; CHECK: [[MUL2:%.*]] = shl i64 [[EXT2]], 2
67+
; CHECK: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[MUL2]]
68+
; CHECK: store i32 3, ptr [[GEP3]]
69+
store i32 3, ptr %gep.29
70+
%gep.29.sum = add i32 %gep.28.sum, %i
71+
%gep.30 = getelementptr float, ptr %b, i32 %gep.29.sum
72+
; CHECK: [[EXT3:%.*]] = sext i32 [[I]] to i64
73+
; CHECK: [[MUL3:%.*]] = shl i64 [[EXT3]], 2
74+
; CHECK: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[MUL3]]
75+
; CHECK: store i32 4, ptr [[GEP4]]
76+
store i32 4, ptr %gep.30
77+
ret void
78+
}

0 commit comments

Comments
 (0)