Skip to content

Commit e518dde

Browse files
[ARM] Test unroll behaviour on machines with low overhead branching
Currently, the single loop containing the fmul is runtime-unrolled by a count of 4, with the loop remainder unrolled into the three for.body9.us.prol sections. This adds a lot of compare-and-branch instructions, negating the benefits of the low-overhead-loop mechanism.
1 parent dd7a3d4 commit e518dde

File tree

1 file changed

+63
-0
lines changed

1 file changed

+63
-0
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB
2+
3+
; This test checks the behaviour of loop unrolling on processors with low overhead branching available.
;
; Expected shape: the innermost loop (the one holding the fmul) is runtime-unrolled by 4, and
; its remainder is peeled into three prologue blocks, each holding a single fmul.
;
; Every block anchor below ends in ':' so that it matches only the definition line of a
; block, never a branch operand or a "preds" comment that mentions the same block name.
; The final anchor requires the name to end in ".us" so it cannot match a prologue block.
4+
5+
; LOB-LABEL: for.body{{.*}}.prol:
6+
; LOB-COUNT-1: fmul fast float
7+
; LOB-LABEL: for.body{{.*}}.prol.1:
8+
; LOB-COUNT-1: fmul fast float
9+
; LOB-LABEL: for.body{{.*}}.prol.2:
10+
; LOB-COUNT-1: fmul fast float
11+
; LOB-LABEL: for.body{{.*}}.us:
12+
; LOB-COUNT-4: fmul fast float
13+
; LOB-NOT: fmul fast float
14+
15+
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
;
; void test(i32 n, float *pA) -- a triple loop nest over the float array pA.
; Read straight off the IR below, it is equivalent to:
;
;   for (k = 0; k < n; k++)
;     for (w = k + 1; w < n; w++)
;       for (x = k + 1; x < n; x++)
;         pA[w*n + x] = pA[x*n + k] * pA[w*n + k];
;
; The innermost loop is the single-block loop for.body9.us; it contains the only
; fmul in the function and is the loop the checks at the top of the file count.
16+
define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 {
17+
entry:
18+
; Skip the whole nest unless n > 0.
%cmp46 = icmp sgt i32 %n, 0
19+
br i1 %cmp46, label %for.body, label %for.cond.cleanup
20+
21+
; Outer-loop latch: leave once k + 1 == n, otherwise start the next k.
for.cond.loopexit: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body
22+
%exitcond49.not = icmp eq i32 %add, %n
23+
br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body
24+
25+
for.cond.cleanup: ; preds = %for.cond.loopexit, %entry
26+
ret void
27+
28+
; Outer-loop header: %k.047 is k (starts at 0), %add is k + 1.
for.body: ; preds = %entry, %for.cond.loopexit
29+
%k.047 = phi i32 [ %add, %for.cond.loopexit ], [ 0, %entry ]
30+
%add = add nuw nsw i32 %k.047, 1
31+
; The two inner loops both start at k + 1, so they are entered only when k + 1 < n.
%cmp244 = icmp slt i32 %add, %n
32+
br i1 %cmp244, label %for.cond6.preheader.lr.ph, label %for.cond.loopexit
33+
34+
; Preheader of the middle loop: &pA[k] is computed once per k.
for.cond6.preheader.lr.ph: ; preds = %for.body
35+
%invariant.gep = getelementptr float, ptr %pA, i32 %k.047
36+
br label %for.cond6.preheader.us
37+
38+
; Middle-loop header: %w.045.us is w, starting at k + 1.
for.cond6.preheader.us: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.cond6.preheader.lr.ph
39+
%w.045.us = phi i32 [ %add, %for.cond6.preheader.lr.ph ], [ %inc19.us, %for.cond6.for.cond.cleanup8_crit_edge.us ]
40+
%mul.us = mul nuw nsw i32 %w.045.us, %n
41+
; %0 = &pA[w*n], the start of row w.
%0 = getelementptr float, ptr %pA, i32 %mul.us
42+
; %arrayidx.us = &pA[w*n + k]
%arrayidx.us = getelementptr float, ptr %0, i32 %k.047
43+
br label %for.body9.us
44+
45+
; Innermost loop (single block): %x.043.us is x, starting at k + 1.
for.body9.us: ; preds = %for.cond6.preheader.us, %for.body9.us
46+
%x.043.us = phi i32 [ %add, %for.cond6.preheader.us ], [ %inc.us, %for.body9.us ]
47+
; pA[w*n + k] is loaded on every iteration of this loop.
%1 = load float, ptr %arrayidx.us, align 4
48+
%mul11.us = mul nuw nsw i32 %x.043.us, %n
49+
; %gep.us = &pA[x*n + k]
%gep.us = getelementptr float, ptr %invariant.gep, i32 %mul11.us
50+
%2 = load float, ptr %gep.us, align 4
51+
%mul14.us = fmul fast float %2, %1
52+
; Store the product to pA[w*n + x].
%arrayidx17.us = getelementptr float, ptr %0, i32 %x.043.us
53+
store float %mul14.us, ptr %arrayidx17.us, align 4
54+
%inc.us = add nuw nsw i32 %x.043.us, 1
55+
; Leave the innermost loop once x + 1 == n.
%exitcond.not = icmp eq i32 %inc.us, %n
56+
br i1 %exitcond.not, label %for.cond6.for.cond.cleanup8_crit_edge.us, label %for.body9.us
57+
58+
; Middle-loop latch: advance w and leave once w + 1 == n.
for.cond6.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us
59+
%inc19.us = add nuw nsw i32 %w.045.us, 1
60+
%exitcond48.not = icmp eq i32 %inc19.us, %n
61+
br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us
62+
}
63+

0 commit comments

Comments
 (0)