Skip to content

Commit 626edaf

Browse files
committed
Add new SLSR tests
1 parent 5588a65 commit 626edaf

File tree

4 files changed

+641
-0
lines changed

4 files changed

+641
-0
lines changed
Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt < %s -passes=slsr -S | FileCheck %s
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; Three i8 loads at %in + %add + {32, 64, 96}. Every load rebuilds the same
; `%in + %add` base with an identical inbounds i8 GEP; only the trailing
; constant offset differs. The sum of the three bytes is stored to %out.
;
; The autogenerated label inside the body is the only opt label for this
; function: FileCheck matches labels sequentially, so an extra hand-written
; label for the same define line would leave the generated one unmatched.
define void @slsr_i8_zero_delta(ptr %in, ptr %out, i64 %add) {
; PTX-LABEL: slsr_i8_zero_delta(
; PTX:       {
; PTX-NEXT:    .reg .b16 %rs<6>;
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [slsr_i8_zero_delta_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [slsr_i8_zero_delta_param_2];
; PTX-NEXT:    add.s64 %rd3, %rd1, %rd2;
; PTX-NEXT:    ld.param.b64 %rd4, [slsr_i8_zero_delta_param_1];
; PTX-NEXT:    ld.b8 %rs1, [%rd3+32];
; PTX-NEXT:    ld.b8 %rs2, [%rd3+64];
; PTX-NEXT:    ld.b8 %rs3, [%rd3+96];
; PTX-NEXT:    add.s16 %rs4, %rs1, %rs2;
; PTX-NEXT:    add.s16 %rs5, %rs4, %rs3;
; PTX-NEXT:    st.b8 [%rd4], %rs5;
; PTX-NEXT:    ret;
; CHECK-LABEL: define void @slsr_i8_zero_delta(
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
; CHECK-NEXT:    [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
; CHECK-NEXT:    [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 64
; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
; CHECK-NEXT:    [[GETELEM2_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_0]], i64 96
; CHECK-NEXT:    [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
; CHECK-NEXT:    [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
; CHECK-NEXT:    [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
; CHECK-NEXT:    store i8 [[OUT1]], ptr [[OUT]], align 1
; CHECK-NEXT:    ret void
;
  ; load0 = *(%in + %add + 32)
  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
  %load0 = load i8, ptr %getElem0.1

  ; load1 = *(%in + %add + 64)
  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
  %load1 = load i8, ptr %getElem1.1

  ; load2 = *(%in + %add + 96)
  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
  %load2 = load i8, ptr %getElem2.1

  ; *%out = load0 + load1 + load2
  %out0 = add i8 %load0, %load1
  %out1 = add i8 %out0, %load2
  store i8 %out1, ptr %out

  ret void
}
60+
61+
; Variant of the identical-base case with mixed GEP flags: the first load reads
; directly from an inbounds `%in + %add` GEP, while the second and third loads
; rebuild `%in + %add` without `inbounds` and then add 32 and 64 respectively.
; The sum of the three bytes is stored to %out.
;
; The autogenerated label inside the body is the only opt label for this
; function: FileCheck matches labels sequentially, so an extra hand-written
; label for the same define line would leave the generated one unmatched.
define void @slsr_i8_zero_delta_2(ptr %in, ptr %out, i64 %add) {
; PTX-LABEL: slsr_i8_zero_delta_2(
; PTX:       {
; PTX-NEXT:    .reg .b16 %rs<6>;
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [slsr_i8_zero_delta_2_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [slsr_i8_zero_delta_2_param_2];
; PTX-NEXT:    add.s64 %rd3, %rd1, %rd2;
; PTX-NEXT:    ld.param.b64 %rd4, [slsr_i8_zero_delta_2_param_1];
; PTX-NEXT:    ld.b8 %rs1, [%rd3];
; PTX-NEXT:    ld.b8 %rs2, [%rd3+32];
; PTX-NEXT:    ld.b8 %rs3, [%rd3+64];
; PTX-NEXT:    add.s16 %rs4, %rs1, %rs2;
; PTX-NEXT:    add.s16 %rs5, %rs4, %rs3;
; PTX-NEXT:    st.b8 [%rd4], %rs5;
; PTX-NEXT:    ret;
; CHECK-LABEL: define void @slsr_i8_zero_delta_2(
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
; CHECK-NEXT:    [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_0]], align 1
; CHECK-NEXT:    [[GETELEM1_0:%.*]] = getelementptr i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 32
; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
; CHECK-NEXT:    [[GETELEM2_0:%.*]] = getelementptr i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_0]], i64 64
; CHECK-NEXT:    [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
; CHECK-NEXT:    [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
; CHECK-NEXT:    [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
; CHECK-NEXT:    store i8 [[OUT1]], ptr [[OUT]], align 1
; CHECK-NEXT:    ret void
;
  ; load0 = *(%in + %add), base GEP is inbounds
  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %load0 = load i8, ptr %getElem0.0

  ; load1 = *(%in + %add + 32), base GEP is not inbounds
  %getElem1.0 = getelementptr i8, ptr %in, i64 %add
  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 32
  %load1 = load i8, ptr %getElem1.1

  ; load2 = *(%in + %add + 64), base GEP is not inbounds
  %getElem2.0 = getelementptr i8, ptr %in, i64 %add
  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 64
  %load2 = load i8, ptr %getElem2.1

  ; *%out = load0 + load1 + load2
  %out0 = add i8 %load0, %load1
  %out1 = add i8 %out0, %load2
  store i8 %out1, ptr %out

  ret void
}
112+
113+
; The three address chains start from different constant displacements of %in
; (0, 1 and 2 bytes) before adding %add and a trailing constant (32, 64, 96):
;   load0 = *((%in     + %add) + 32)
;   load1 = *((%in + 1 + %add) + 64)
;   load2 = *((%in + 2 + %add) + 96)
; The sum of the three bytes is stored to %out.
;
; The autogenerated label inside the body is the only opt label for this
; function: FileCheck matches labels sequentially, so an extra hand-written
; label for the same define line would leave the generated one unmatched.
define void @slsr_i8_base_delta(ptr %in, ptr %out, i64 %add) {
; PTX-LABEL: slsr_i8_base_delta(
; PTX:       {
; PTX-NEXT:    .reg .b16 %rs<6>;
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [slsr_i8_base_delta_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [slsr_i8_base_delta_param_2];
; PTX-NEXT:    add.s64 %rd3, %rd1, %rd2;
; PTX-NEXT:    ld.param.b64 %rd4, [slsr_i8_base_delta_param_1];
; PTX-NEXT:    ld.b8 %rs1, [%rd3+32];
; PTX-NEXT:    ld.b8 %rs2, [%rd3+65];
; PTX-NEXT:    ld.b8 %rs3, [%rd3+98];
; PTX-NEXT:    add.s16 %rs4, %rs1, %rs2;
; PTX-NEXT:    add.s16 %rs5, %rs4, %rs3;
; PTX-NEXT:    st.b8 [%rd4], %rs5;
; PTX-NEXT:    ret;
; CHECK-LABEL: define void @slsr_i8_base_delta(
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
; CHECK-NEXT:    [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
; CHECK-NEXT:    [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 1
; CHECK-NEXT:    [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM1_2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_1]], i64 64
; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_2]], align 1
; CHECK-NEXT:    [[GETELEM2_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 2
; CHECK-NEXT:    [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_0]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM2_2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_1]], i64 96
; CHECK-NEXT:    [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_2]], align 1
; CHECK-NEXT:    [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
; CHECK-NEXT:    [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
; CHECK-NEXT:    store i8 [[OUT1]], ptr [[OUT]], align 1
; CHECK-NEXT:    ret void
;
  ; load0: base is %in itself
  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
  %load0 = load i8, ptr %getElem0.1

  ; load1: base is %in + 1
  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 1
  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 %add
  %getElem1.2 = getelementptr inbounds i8, ptr %getElem1.1, i64 64
  %load1 = load i8, ptr %getElem1.2

  ; load2: base is %in + 2
  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 2
  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 %add
  %getElem2.2 = getelementptr inbounds i8, ptr %getElem2.1, i64 96
  %load2 = load i8, ptr %getElem2.2

  ; *%out = load0 + load1 + load2
  %out0 = add i8 %load0, %load1
  %out1 = add i8 %out0, %load2
  store i8 %out1, ptr %out

  ret void
}
170+
171+
; The three address chains index %in by the same %add value but with different
; element types: the first GEP steps over `double` elements (the llc output
; scales %add with a shift by 3), while the second and third step over i8.
; Trailing constant byte offsets are 32, 64 and 96. The sum of the three loaded
; bytes is stored to %out.
;
; The autogenerated label inside the body is the only opt label for this
; function: FileCheck matches labels sequentially, so an extra hand-written
; label for the same define line would leave the generated one unmatched.
define void @slsr_i8_index_delta(ptr %in, ptr %out, i64 %add) {
; PTX-LABEL: slsr_i8_index_delta(
; PTX:       {
; PTX-NEXT:    .reg .b16 %rs<6>;
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [slsr_i8_index_delta_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [slsr_i8_index_delta_param_2];
; PTX-NEXT:    shl.b64 %rd3, %rd2, 3;
; PTX-NEXT:    add.s64 %rd4, %rd1, %rd3;
; PTX-NEXT:    ld.param.b64 %rd5, [slsr_i8_index_delta_param_1];
; PTX-NEXT:    ld.b8 %rs1, [%rd4+32];
; PTX-NEXT:    add.s64 %rd6, %rd1, %rd2;
; PTX-NEXT:    ld.b8 %rs2, [%rd6+64];
; PTX-NEXT:    ld.b8 %rs3, [%rd6+96];
; PTX-NEXT:    add.s16 %rs4, %rs1, %rs2;
; PTX-NEXT:    add.s16 %rs5, %rs4, %rs3;
; PTX-NEXT:    st.b8 [%rd5], %rs5;
; PTX-NEXT:    ret;
; CHECK-LABEL: define void @slsr_i8_index_delta(
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
; CHECK-NEXT:    [[GETELEM0_0:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
; CHECK-NEXT:    [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 64
; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
; CHECK-NEXT:    [[GETELEM2_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_0]], i64 96
; CHECK-NEXT:    [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
; CHECK-NEXT:    [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
; CHECK-NEXT:    [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
; CHECK-NEXT:    store i8 [[OUT1]], ptr [[OUT]], align 1
; CHECK-NEXT:    ret void
;
  ; load0: %add indexes double elements, then +32 bytes
  %getElem0.0 = getelementptr inbounds double, ptr %in, i64 %add
  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
  %load0 = load i8, ptr %getElem0.1

  ; load1: %add indexes i8 elements, then +64 bytes
  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
  %load1 = load i8, ptr %getElem1.1

  ; load2: %add indexes i8 elements, then +96 bytes
  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
  %load2 = load i8, ptr %getElem2.1

  ; *%out = load0 + load1 + load2
  %out0 = add i8 %load0, %load1
  %out1 = add i8 %out0, %load2
  store i8 %out1, ptr %out

  ret void
}
226+
227+
; The three address chains index %in (as i8) by different values derived from
; %add, followed by trailing constant byte offsets 32, 64 and 96:
;   load0 = *(%in + %add             + 32)
;   load1 = *(%in + (%add + 1)       + 64)
;   load2 = *(%in + (%add + %offset) + 96)
; The sum of the three bytes is stored to %out.
;
; The autogenerated label inside the body is the only opt label for this
; function: FileCheck matches labels sequentially, so an extra hand-written
; label for the same define line would leave the generated one unmatched.
define void @slsr_i8_stride_delta(ptr %in, ptr %out, i64 %add, i64 %offset) {
; PTX-LABEL: slsr_i8_stride_delta(
; PTX:       {
; PTX-NEXT:    .reg .b16 %rs<6>;
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [slsr_i8_stride_delta_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [slsr_i8_stride_delta_param_2];
; PTX-NEXT:    add.s64 %rd3, %rd1, %rd2;
; PTX-NEXT:    ld.param.b64 %rd4, [slsr_i8_stride_delta_param_1];
; PTX-NEXT:    ld.b8 %rs1, [%rd3+32];
; PTX-NEXT:    ld.param.b64 %rd5, [slsr_i8_stride_delta_param_3];
; PTX-NEXT:    ld.b8 %rs2, [%rd3+65];
; PTX-NEXT:    add.s64 %rd6, %rd3, %rd5;
; PTX-NEXT:    ld.b8 %rs3, [%rd6+96];
; PTX-NEXT:    add.s16 %rs4, %rs1, %rs2;
; PTX-NEXT:    add.s16 %rs5, %rs4, %rs3;
; PTX-NEXT:    st.b8 [%rd4], %rs5;
; PTX-NEXT:    ret;
; CHECK-LABEL: define void @slsr_i8_stride_delta(
; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]], i64 [[OFFSET:%.*]]) {
; CHECK-NEXT:    [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
; CHECK-NEXT:    [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[ADD]], 1
; CHECK-NEXT:    [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD1]]
; CHECK-NEXT:    [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 64
; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[ADD]], [[OFFSET]]
; CHECK-NEXT:    [[GETELEM2_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD2]]
; CHECK-NEXT:    [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_0]], i64 96
; CHECK-NEXT:    [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
; CHECK-NEXT:    [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
; CHECK-NEXT:    [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
; CHECK-NEXT:    store i8 [[OUT1]], ptr [[OUT]], align 1
; CHECK-NEXT:    ret void
;
  ; load0: index is %add
  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
  %load0 = load i8, ptr %getElem0.1

  ; load1: index is %add + 1 (constant difference from load0's index)
  %add1 = add i64 %add, 1
  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add1
  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
  %load1 = load i8, ptr %getElem1.1

  ; load2: index is %add + %offset (variable difference from load0's index)
  %add2 = add i64 %add, %offset
  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add2
  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
  %load2 = load i8, ptr %getElem2.1

  ; *%out = load0 + load1 + load2
  %out0 = add i8 %load0, %load1
  %out1 = add i8 %out0, %load2
  store i8 %out1, ptr %out

  ret void
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt < %s -passes=slsr -S | FileCheck %s
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; Test SLSR can reuse the computation by complex variable delta.
8+
; The original program needs 4 mul.wide.s32, after SLSR with
9+
; variable-delta, it can reduce to 1 mul.wide.s32.
;
; Shape of the test: with i = load(%a) + %j, five i32 stores (values 0..4) go
; to %b indexed as float by i, 2*i, 2*i+1, 3*i+1 and 4*i+1 respectively. The
; hand-written opt assertions interleaved with the IR expect each address after
; the first to be rebuilt as a byte GEP off the previous address.
; NOTE(review): the llc assertions below list five mul.wide.s32 instructions,
; which does not match the "1 mul.wide.s32" statement above -- confirm whether
; they are meant to record the current baseline.
10+
define void @foo(ptr %a, ptr %b, i32 %j) {
11+
; PTX-LABEL: foo(
12+
; PTX: {
13+
; PTX-NEXT: .reg .b32 %r<8>;
14+
; PTX-NEXT: .reg .b64 %rd<13>;
15+
; PTX-EMPTY:
16+
; PTX-NEXT: // %bb.0:
17+
; PTX-NEXT: ld.param.b64 %rd1, [foo_param_0];
18+
; PTX-NEXT: ld.b32 %r1, [%rd1];
19+
; PTX-NEXT: ld.param.b64 %rd2, [foo_param_1];
20+
; PTX-NEXT: ld.param.b32 %r2, [foo_param_2];
21+
; PTX-NEXT: add.s32 %r3, %r1, %r2;
22+
; PTX-NEXT: mul.wide.s32 %rd3, %r3, 4;
23+
; PTX-NEXT: add.s64 %rd4, %rd2, %rd3;
24+
; PTX-NEXT: st.b32 [%rd4], 0;
25+
; PTX-NEXT: add.s32 %r4, %r3, %r3;
26+
; PTX-NEXT: mul.wide.s32 %rd5, %r4, 4;
27+
; PTX-NEXT: add.s64 %rd6, %rd2, %rd5;
28+
; PTX-NEXT: st.b32 [%rd6], 1;
29+
; PTX-NEXT: add.s32 %r5, %r4, 1;
30+
; PTX-NEXT: mul.wide.s32 %rd7, %r5, 4;
31+
; PTX-NEXT: add.s64 %rd8, %rd2, %rd7;
32+
; PTX-NEXT: st.b32 [%rd8], 2;
33+
; PTX-NEXT: add.s32 %r6, %r5, %r3;
34+
; PTX-NEXT: mul.wide.s32 %rd9, %r6, 4;
35+
; PTX-NEXT: add.s64 %rd10, %rd2, %rd9;
36+
; PTX-NEXT: st.b32 [%rd10], 3;
37+
; PTX-NEXT: add.s32 %r7, %r6, %r3;
38+
; PTX-NEXT: mul.wide.s32 %rd11, %r7, 4;
39+
; PTX-NEXT: add.s64 %rd12, %rd2, %rd11;
40+
; PTX-NEXT: st.b32 [%rd12], 4;
41+
; PTX-NEXT: ret;
42+
; i = load(%a) + %j is the common index term for every address below.
%i.0 = load i32, ptr %a, align 8
43+
%i = add i32 %i.0, %j
44+
; CHECK: [[L:%.*]] = load i32, ptr %a, align 8
45+
; CHECK: [[I:%.*]] = add i32 [[L]], %j
46+
; Store 0 at float index i.
%gep.24 = getelementptr float, ptr %b, i32 %i
47+
; CHECK: [[GEP0:%.*]] = getelementptr float, ptr %b, i32 [[I]]
48+
; CHECK: store i32 0, ptr [[GEP0]]
49+
store i32 0, ptr %gep.24
50+
; Store 1 at float index i + i; expected to become the previous address plus
; (sext i) << 2 bytes.
%gep.24.sum1 = add i32 %i, %i
51+
%gep.25 = getelementptr float, ptr %b, i32 %gep.24.sum1
52+
; CHECK: [[EXT1:%.*]] = sext i32 [[I]] to i64
53+
; CHECK: [[MUL1:%.*]] = shl i64 [[EXT1]], 2
54+
; CHECK: [[GEP1:%.*]] = getelementptr i8, ptr [[GEP0]], i64 [[MUL1]]
55+
; CHECK: store i32 1, ptr [[GEP1]]
56+
store i32 1, ptr %gep.25
57+
; Store 2 at float index (1 + i) + i; expected to become the previous address
; plus 4 bytes.
%gep.26.sum3 = add i32 1, %i
58+
%gep.27.sum = add i32 %gep.26.sum3, %i
59+
%gep.28 = getelementptr float, ptr %b, i32 %gep.27.sum
60+
; CHECK: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 4
61+
; CHECK: store i32 2, ptr [[GEP2]]
62+
store i32 2, ptr %gep.28
63+
; Store 3 at the previous index plus i; expected to become the previous address
; plus (sext i) << 2 bytes.
%gep.28.sum = add i32 %gep.27.sum, %i
64+
%gep.29 = getelementptr float, ptr %b, i32 %gep.28.sum
65+
; CHECK: [[EXT2:%.*]] = sext i32 [[I]] to i64
66+
; CHECK: [[MUL2:%.*]] = shl i64 [[EXT2]], 2
67+
; CHECK: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[MUL2]]
68+
; CHECK: store i32 3, ptr [[GEP3]]
69+
store i32 3, ptr %gep.29
70+
; Store 4 at the previous index plus i; same expected rewrite as the store of 3.
%gep.29.sum = add i32 %gep.28.sum, %i
71+
%gep.30 = getelementptr float, ptr %b, i32 %gep.29.sum
72+
; CHECK: [[EXT3:%.*]] = sext i32 [[I]] to i64
73+
; CHECK: [[MUL3:%.*]] = shl i64 [[EXT3]], 2
74+
; CHECK: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[MUL3]]
75+
; CHECK: store i32 4, ptr [[GEP4]]
76+
store i32 4, ptr %gep.30
77+
ret void
78+
}

0 commit comments

Comments
 (0)