@@ -5,7 +5,7 @@ target triple = "aarch64-linux-gnu"
55
66%pair = type { i8 , i8 }
77
8- ; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
8+ ; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
99; it should conservatively choose IC 1 so that the vector loop runs at least twice
1010; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
1111define void @loop_with_profile_tc_32 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -29,7 +29,7 @@ for.end:
2929 ret void
3030}
3131
32- ; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
32+ ; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
3333; it should conservatively choose IC 1 so that the vector loop runs at least twice
3434; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
3535define void @loop_with_profile_tc_33 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -53,7 +53,7 @@ for.end:
5353 ret void
5454}
5555
56- ; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
56+ ; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
5757; it should conservatively choose IC 1 so that the vector loop runs at least twice
5858; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
5959define void @loop_with_profile_tc_48 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -77,7 +77,7 @@ for.end:
7777 ret void
7878}
7979
80- ; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
80+ ; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
8181; it should conservatively choose IC 1 so that the vector loop runs at least twice
8282; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
8383define void @loop_with_profile_tc_63 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -101,7 +101,7 @@ for.end:
101101 ret void
102102}
103103
104- ; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
104+ ; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
105105; it should conservatively choose IC 2 so that the vector loop runs at least twice
106106; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
107107define void @loop_with_profile_tc_64 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -125,10 +125,10 @@ for.end:
125125 ret void
126126}
127127
128- ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
129- ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
128+ ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
129+ ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
130130; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
131- ; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
131+ ; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
132132; remainder than IC 2
133133; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
134134define void @loop_with_profile_tc_64_scalar_epilogue_reqd (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -149,7 +149,7 @@ for.end:
149149 ret void
150150}
151151
152- ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
152+ ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
153153; it should conservatively choose IC 2 so that the vector loop runs at least twice
154154; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
155155define void @loop_with_profile_tc_100 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -173,7 +173,7 @@ for.end:
173173 ret void
174174}
175175
176- ; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
176+ ; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
177177; it should conservatively choose IC 4 so that the vector loop runs at least twice
178178; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
179179define void @loop_with_profile_tc_128 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -206,11 +206,11 @@ for.end:
206206 ret void
207207}
208208
209- ; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
210- ; the resulting interleaved group in this case may access memory out-of-bounds, it requires
211- ; a scalar epilogue iteration for correctness, making at most 127 iterations available for
209+ ; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
210+ ; the resulting interleaved group in this case may access memory out-of-bounds, it requires
211+ ; a scalar epilogue iteration for correctness, making at most 127 iterations available for
212212; interleaving.
213- ; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
213+ ; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
214214; remainder than IC 4
215215; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
216216define void @loop_with_profile_tc_128_scalar_epilogue_reqd (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -240,7 +240,7 @@ for.end:
240240 ret void
241241}
242242
243- ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
243+ ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
244244; it should conservatively choose IC 4 so that the vector loop runs at least twice
245245; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
246246define void @loop_with_profile_tc_129 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -264,7 +264,7 @@ for.end:
264264 ret void
265265}
266266
267- ; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
267+ ; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
268268; it should conservatively choose IC 4 so that the vector loop runs at least twice
269269; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
270270define void @loop_with_profile_tc_180 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -288,7 +288,7 @@ for.end:
288288 ret void
289289}
290290
291- ; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
291+ ; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
292292; it should conservatively choose IC 4 so that the vector loop runs at least twice
293293; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
294294define void @loop_with_profile_tc_193 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -312,7 +312,7 @@ for.end:
312312 ret void
313313}
314314
315- ; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
315+ ; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
316316; the IC will be capped by the target-specific maximum interleave count
317317; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
318318define void @loop_with_profile_tc_1000 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -336,6 +336,30 @@ for.end:
336336 ret void
337337}
338338
339+ ; When the backedge branch weight is UINT_MAX (i32 -1) and the exit weight is 1,
340+ ; the estimated trip count computation could wrap.
341+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
342+ define void @loop_with_profile_wrap (ptr noalias %p , ptr noalias %q , i64 %n ) { ; each iteration stores the i8 sum of both fields of %pair element %i of %p into byte %i of %q
343+ entry:
344+ br label %for.body
345+
346+ for.body: ; single-block loop: the body runs before the exit test
347+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ] ; induction variable: 0 on entry, %i.next on the backedge
348+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0 ; address of field 0 of pair %i
349+ %tmp1 = load i8 , ptr %tmp0 , align 1
350+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1 ; address of field 1 of the same pair
351+ %tmp3 = load i8 , ptr %tmp2 , align 1
352+ %add = add i8 %tmp1 , %tmp3 ; sum of the two fields
353+ %qi = getelementptr i8 , ptr %q , i64 %i ; address of byte %i of %q
354+ store i8 %add , ptr %qi , align 1
355+ %i.next = add nuw nsw i64 %i , 1
356+ %cond = icmp eq i64 %i.next , %n ; true once the incremented index equals %n
357+ br i1 %cond , label %for.end , label %for.body , !prof !11 ; exit when %cond is true, otherwise take the backedge; profile weights come from !11
358+
359+ for.end:
360+ ret void
361+ }
362+
339363!0 = !{!"branch_weights" , i32 1 , i32 31 }
340364!1 = !{!"branch_weights" , i32 1 , i32 32 }
341365!2 = !{!"branch_weights" , i32 1 , i32 47 }
@@ -347,3 +371,4 @@ for.end:
347371!8 = !{!"branch_weights" , i32 1 , i32 179 }
348372!9 = !{!"branch_weights" , i32 1 , i32 192 }
349373!10 = !{!"branch_weights" , i32 1 , i32 999 }
374+ !11 = !{!"branch_weights" , i32 1 , i32 -1 } ; attached to the loop branch in @loop_with_profile_wrap: weight 1 toward %for.end, weight -1 (all bits set) toward %for.body
0 commit comments