Commit de53b1a
[LV] Simplify IR for gather-cost.ll, auto-generate checks. (NFC)
Simplify tests and auto-generate checks in preparation for further updates.
1 parent 2ac832b commit de53b1a
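
The CHECK lines below were produced by utils/update_test_checks.py rather than written by hand, replacing the old handwritten CHECK-NOT assertions with a full match of the scalar output. A minimal sketch of the regeneration command (the test path is an assumption based on the commit title; opt is assumed to come from a current in-tree build):

    # update_test_checks.py reads each test's RUN line to decide how to
    # invoke opt, then rewrites the CHECK lines in place. --version 6
    # matches the UTC_ARGS recorded in the NOTE line of the tests below.
    llvm/utils/update_test_checks.py --version 6 \
        llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll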

3 files changed, +243 -212 lines changed
Lines changed: 86 additions & 53 deletions
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 
@@ -6,80 +7,112 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 @kernel3 = global [512 x float] zeroinitializer, align 16
 @kernel4 = global [512 x float] zeroinitializer, align 16
 @src_data = global [1536 x float] zeroinitializer, align 16
-@r_ = global i8 0, align 1
-@g_ = global i8 0, align 1
-@b_ = global i8 0, align 1
 
 ; We don't want to vectorize most loops containing gathers because they are
 ; expensive.
 ; Make sure we don't vectorize it.
-; CHECK-NOT: x float>
 
-define void @_Z4testmm(i64 %size, i64 %offset) {
+define float @_Z4testmm(i64 %size, i64 %offset) {
+; CHECK-LABEL: define float @_Z4testmm(
+; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_2:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV]], [[OFFSET]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3
+; CHECK-NEXT: [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP_SRC_DATA]], align 4
+; CHECK-NEXT: [[GEP_KERNEL:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[GEP_KERNEL]], align 4
+; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[GEP_KERNEL2:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[GEP_KERNEL2]], align 4
+; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
+; CHECK-NEXT: [[GEP_KERNEL3:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP_KERNEL3]], align 4
+; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
+; CHECK-NEXT: [[GEP_KERNEL4:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP_KERNEL4]], align 4
+; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
+; CHECK-NEXT: [[RDX_0_NEXT]] = fadd fast float [[RDX_0]], [[MUL9]]
+; CHECK-NEXT: [[GEP_SRC_DATA_SUM:%.*]] = add i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
+; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
+; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
+; CHECK-NEXT: [[RDX_1_NEXT]] = fadd fast float [[RDX_1]], [[MUL19]]
+; CHECK-NEXT: [[GEP_SRC_DATA_SUM52:%.*]] = add i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM52]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX21]], align 4
+; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
+; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
+; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
+; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
+; CHECK-NEXT: [[RDX_2_NEXT]] = fadd fast float [[RED_2]], [[MUL29]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]]
+; CHECK-NEXT: [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]]
+; CHECK-NEXT: ret float [[RES_1]]
+;
 entry:
-  %cmp53 = icmp eq i64 %size, 0
-  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+  br label %loop
 
-for.body.lr.ph:
-  br label %for.body
-
-for.body:
-  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
-  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
-  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
-  %add = add i64 %v.055, %offset
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx.0 = phi float [ 0.000000e+00, %entry ], [ %rdx.0.next, %loop ]
+  %rdx.1 = phi float [ 0.000000e+00, %entry ], [ %rdx.1.next, %loop ]
+  %red.2 = phi float [ 0.000000e+00, %entry ], [ %rdx.2.next, %loop ]
+  %add = add i64 %iv, %offset
   %mul = mul i64 %add, 3
-  %arrayidx = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul
-  %0 = load float, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %v.055
-  %1 = load float, ptr %arrayidx2, align 4
+  %gep.src_data = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul
+  %0 = load float, ptr %gep.src_data, align 4
+  %gep.kernel = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %iv
+  %1 = load float, ptr %gep.kernel, align 4
   %mul3 = fmul fast float %0, %1
-  %arrayidx4 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %v.055
-  %2 = load float, ptr %arrayidx4, align 4
+  %gep.kernel2 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %iv
+  %2 = load float, ptr %gep.kernel2, align 4
   %mul5 = fmul fast float %mul3, %2
-  %arrayidx6 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %v.055
-  %3 = load float, ptr %arrayidx6, align 4
+  %gep.kernel3 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %iv
+  %3 = load float, ptr %gep.kernel3, align 4
   %mul7 = fmul fast float %mul5, %3
-  %arrayidx8 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %v.055
-  %4 = load float, ptr %arrayidx8, align 4
+  %gep.kernel4 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %iv
+  %4 = load float, ptr %gep.kernel4, align 4
   %mul9 = fmul fast float %mul7, %4
-  %add10 = fadd fast float %r.057, %mul9
-  %arrayidx.sum = add i64 %mul, 1
-  %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %arrayidx.sum
+  %rdx.0.next = fadd fast float %rdx.0, %mul9
+  %gep.src_data.sum = add i64 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum
   %5 = load float, ptr %arrayidx11, align 4
   %mul13 = fmul fast float %1, %5
   %mul15 = fmul fast float %2, %mul13
   %mul17 = fmul fast float %3, %mul15
   %mul19 = fmul fast float %4, %mul17
-  %add20 = fadd fast float %g.056, %mul19
-  %arrayidx.sum52 = add i64 %mul, 2
-  %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %arrayidx.sum52
+  %rdx.1.next = fadd fast float %rdx.1, %mul19
+  %gep.src_data.sum52 = add i64 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum52
   %6 = load float, ptr %arrayidx21, align 4
   %mul23 = fmul fast float %1, %6
   %mul25 = fmul fast float %2, %mul23
   %mul27 = fmul fast float %3, %mul25
   %mul29 = fmul fast float %4, %mul27
-  %add30 = fadd fast float %b.054, %mul29
-  %inc = add i64 %v.055, 1
-  %exitcond = icmp ne i64 %inc, %size
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:
-  %add30.lcssa = phi float [ %add30, %for.body ]
-  %add20.lcssa = phi float [ %add20, %for.body ]
-  %add10.lcssa = phi float [ %add10, %for.body ]
-  %phitmp = fptoui float %add10.lcssa to i8
-  %phitmp60 = fptoui float %add20.lcssa to i8
-  %phitmp61 = fptoui float %add30.lcssa to i8
-  br label %for.end
+  %rdx.2.next = fadd fast float %red.2, %mul29
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, %size
+  br i1 %exitcond, label %loop, label %exit
 
-for.end:
-  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  store i8 %r.0.lcssa, ptr @r_, align 1
-  store i8 %g.0.lcssa, ptr @g_, align 1
-  store i8 %b.0.lcssa, ptr @b_, align 1
-  ret void
+exit:
+  %res.0 = fadd float %rdx.0.next, %rdx.1.next
+  %res.1 = fadd float %res.0, %rdx.2.next
+  ret float %res.1
 }

Lines changed: 89 additions & 58 deletions
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -passes=loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
@@ -7,82 +8,112 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 @kernel3 = global [512 x float] zeroinitializer, align 4
 @kernel4 = global [512 x float] zeroinitializer, align 4
 @src_data = global [1536 x float] zeroinitializer, align 4
-@r_ = global i8 0, align 4
-@g_ = global i8 0, align 4
-@b_ = global i8 0, align 4
 
 ; We don't want to vectorize most loops containing gathers because they are
-; expensive. This function represents a point where vectorization starts to
-; become beneficial.
-; Make sure we are conservative and don't vectorize it.
-; CHECK-NOT: <2 x float>
-; CHECK-NOT: <4 x float>
+; expensive.
+; Make sure we don't vectorize it.
 
-define void @_Z4testmm(i32 %size, i32 %offset) {
+define float @_Z4testmm(i64 %size, i64 %offset) {
+; CHECK-LABEL: define float @_Z4testmm(
+; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_2:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV]], [[OFFSET]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3
+; CHECK-NEXT: [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP_SRC_DATA]], align 4
+; CHECK-NEXT: [[GEP_KERNEL:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[GEP_KERNEL]], align 4
+; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[GEP_KERNEL2:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[GEP_KERNEL2]], align 4
+; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
+; CHECK-NEXT: [[GEP_KERNEL3:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP_KERNEL3]], align 4
+; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
+; CHECK-NEXT: [[GEP_KERNEL4:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP_KERNEL4]], align 4
+; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
+; CHECK-NEXT: [[RDX_0_NEXT]] = fadd fast float [[RDX_0]], [[MUL9]]
+; CHECK-NEXT: [[GEP_SRC_DATA_SUM:%.*]] = add i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
+; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
+; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
+; CHECK-NEXT: [[RDX_1_NEXT]] = fadd fast float [[RDX_1]], [[MUL19]]
+; CHECK-NEXT: [[GEP_SRC_DATA_SUM52:%.*]] = add i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM52]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX21]], align 4
+; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
+; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
+; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
+; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
+; CHECK-NEXT: [[RDX_2_NEXT]] = fadd fast float [[RED_2]], [[MUL29]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]]
+; CHECK-NEXT: [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]]
+; CHECK-NEXT: ret float [[RES_1]]
+;
 entry:
-  %cmp53 = icmp eq i32 %size, 0
-  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+  br label %loop
 
-for.body.lr.ph:
-  br label %for.body
-
-for.body:
-  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
-  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
-  %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
-  %add = add i32 %v.055, %offset
-  %mul = mul i32 %add, 3
-  %arrayidx = getelementptr inbounds [1536 x float], ptr @src_data, i32 0, i32 %mul
-  %0 = load float, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds [512 x float], ptr @kernel, i32 0, i32 %v.055
-  %1 = load float, ptr %arrayidx2, align 4
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx.0 = phi float [ 0.000000e+00, %entry ], [ %rdx.0.next, %loop ]
+  %rdx.1 = phi float [ 0.000000e+00, %entry ], [ %rdx.1.next, %loop ]
+  %red.2 = phi float [ 0.000000e+00, %entry ], [ %rdx.2.next, %loop ]
+  %add = add i64 %iv, %offset
+  %mul = mul i64 %add, 3
+  %gep.src_data = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul
+  %0 = load float, ptr %gep.src_data, align 4
+  %gep.kernel = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %iv
+  %1 = load float, ptr %gep.kernel, align 4
   %mul3 = fmul fast float %0, %1
-  %arrayidx4 = getelementptr inbounds [512 x float], ptr @kernel2, i32 0, i32 %v.055
-  %2 = load float, ptr %arrayidx4, align 4
+  %gep.kernel2 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %iv
+  %2 = load float, ptr %gep.kernel2, align 4
   %mul5 = fmul fast float %mul3, %2
-  %arrayidx6 = getelementptr inbounds [512 x float], ptr @kernel3, i32 0, i32 %v.055
-  %3 = load float, ptr %arrayidx6, align 4
+  %gep.kernel3 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %iv
+  %3 = load float, ptr %gep.kernel3, align 4
   %mul7 = fmul fast float %mul5, %3
-  %arrayidx8 = getelementptr inbounds [512 x float], ptr @kernel4, i32 0, i32 %v.055
-  %4 = load float, ptr %arrayidx8, align 4
+  %gep.kernel4 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %iv
+  %4 = load float, ptr %gep.kernel4, align 4
   %mul9 = fmul fast float %mul7, %4
-  %add10 = fadd fast float %r.057, %mul9
-  %arrayidx.sum = add i32 %mul, 1
-  %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i32 0, i32 %arrayidx.sum
+  %rdx.0.next = fadd fast float %rdx.0, %mul9
+  %gep.src_data.sum = add i64 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum
   %5 = load float, ptr %arrayidx11, align 4
   %mul13 = fmul fast float %1, %5
   %mul15 = fmul fast float %2, %mul13
   %mul17 = fmul fast float %3, %mul15
   %mul19 = fmul fast float %4, %mul17
-  %add20 = fadd fast float %g.056, %mul19
-  %arrayidx.sum52 = add i32 %mul, 2
-  %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i32 0, i32 %arrayidx.sum52
+  %rdx.1.next = fadd fast float %rdx.1, %mul19
+  %gep.src_data.sum52 = add i64 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum52
   %6 = load float, ptr %arrayidx21, align 4
   %mul23 = fmul fast float %1, %6
   %mul25 = fmul fast float %2, %mul23
   %mul27 = fmul fast float %3, %mul25
   %mul29 = fmul fast float %4, %mul27
-  %add30 = fadd fast float %b.054, %mul29
-  %inc = add i32 %v.055, 1
-  %exitcond = icmp ne i32 %inc, %size
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:
-  %add30.lcssa = phi float [ %add30, %for.body ]
-  %add20.lcssa = phi float [ %add20, %for.body ]
-  %add10.lcssa = phi float [ %add10, %for.body ]
-  %phitmp = fptoui float %add10.lcssa to i8
-  %phitmp60 = fptoui float %add20.lcssa to i8
-  %phitmp61 = fptoui float %add30.lcssa to i8
-  br label %for.end
+  %rdx.2.next = fadd fast float %red.2, %mul29
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, %size
+  br i1 %exitcond, label %loop, label %exit
 
-for.end:
-  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  store i8 %r.0.lcssa, ptr @r_, align 4
-  store i8 %g.0.lcssa, ptr @g_, align 4
-  store i8 %b.0.lcssa, ptr @b_, align 4
-  ret void
+exit:
+  %res.0 = fadd float %rdx.0.next, %rdx.1.next
+  %res.1 = fadd float %res.0, %rdx.2.next
+  ret float %res.1
 }
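
After regeneration, either test can be checked in isolation through lit; a sketch, assuming a build directory named build and the same hypothetical AArch64 test path as above:

    # llvm-lit executes the test's RUN line (opt ... | FileCheck %s) and
    # reports PASS/FAIL; -v prints FileCheck's diagnostics on a mismatch.
    build/bin/llvm-lit -v llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll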
