@@ -144,6 +144,90 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
+ ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
+ ; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+ ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
+ ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<for.body.preheader>:
+ ; CHECK-NEXT: IR %0 = zext i32 %n to i64
+ ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+ ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<vector.scevcheck>:
+ ; CHECK-NEXT: IR %3 = add nsw i64 %0, -1
+ ; CHECK-NEXT: IR %4 = add i32 %n, -1
+ ; CHECK-NEXT: IR %5 = trunc i64 %3 to i32
+ ; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+ ; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0
+ ; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
+ ; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result
+ ; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4
+ ; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
+ ; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
+ ; CHECK-NEXT: IR %10 = or i1 %8, %9
+ ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<vector.memcheck>:
+ ; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: IR %12 = mul i64 %11, 4
+ ; CHECK-NEXT: IR %13 = mul i64 %12, 4
+ ; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
+ ; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
+ ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<vector.ph>:
+ ; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: IR %16 = mul i64 %15, 4
+ ; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16
+ ; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
+ ; CHECK-NEXT: IR %ind.end = sub i64 %0, %n.vec
+ ; CHECK-NEXT: IR %.cast = trunc i64 %n.vec to i32
+ ; CHECK-NEXT: IR %ind.end3 = sub i32 %n, %.cast
+ ; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: IR %18 = mul i64 %17, 4
+ ; CHECK-NEXT: Successor(s): vector loop
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: <x1> vector loop: {
+ ; CHECK-NEXT: vector.body:
+ ; CHECK-NEXT: SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+ ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+ ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
+ ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+ ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+ ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+ ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
+ ; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
+ ; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
+ ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+ ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
+ ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
+ ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+ ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+ ; CHECK-NEXT: No successors
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: Successor(s): ir-bb<middle.block>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<middle.block>:
+ ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]>
+ ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
+ ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+ ; CHECK-NEXT: No successors
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<scalar.ph>:
+ ; CHECK-NEXT: IR [[RESUME_1:%.+]] = phi i64
+ ; CHECK-NEXT: IR [[RESUME_2:%.+]] = phi i32
+ ; CHECK-NEXT: Successor(s): ir-bb<for.body>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<for.body>:
+ ; CHECK-NEXT: IR %indvars.iv = phi i64 [ [[RESUME_1]], %scalar.ph ], [ %indvars.iv.next, %for.body ]
+ ; CHECK-NEXT: IR %i.0.in8 = phi i32 [ [[RESUME_2]], %scalar.ph ], [ %i.0, %for.body ]
+ ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ ; CHECK-NEXT: No successors
+ ; CHECK-NEXT: }
+ ; CHECK: LV: Loop does not require scalar epilogue
;
entry:
%cmp7 = icmp sgt i32 %n , 0
@@ -306,6 +390,91 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
+ ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
+ ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
+ ; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+ ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
+ ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<for.body.preheader>:
+ ; CHECK-NEXT: IR %0 = zext i32 %n to i64
+ ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+ ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<vector.scevcheck>:
+ ; CHECK-NEXT: IR %3 = add nsw i64 %0, -1
+ ; CHECK-NEXT: IR %4 = add i32 %n, -1
+ ; CHECK-NEXT: IR %5 = trunc i64 %3 to i32
+ ; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+ ; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0
+ ; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
+ ; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result
+ ; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4
+ ; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
+ ; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
+ ; CHECK-NEXT: IR %10 = or i1 %8, %9
+ ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<vector.memcheck>:
+ ; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: IR %12 = mul i64 %11, 4
+ ; CHECK-NEXT: IR %13 = mul i64 %12, 4
+ ; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
+ ; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
+ ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<vector.ph>:
+ ; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: IR %16 = mul i64 %15, 4
+ ; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16
+ ; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
+ ; CHECK-NEXT: IR %ind.end = sub i64 %0, %n.vec
+ ; CHECK-NEXT: IR %.cast = trunc i64 %n.vec to i32
+ ; CHECK-NEXT: IR %ind.end3 = sub i32 %n, %.cast
+ ; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: IR %18 = mul i64 %17, 4
+ ; CHECK-NEXT: Successor(s): vector loop
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: <x1> vector loop: {
+ ; CHECK-NEXT: vector.body:
+ ; CHECK-NEXT: SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:.+]]>
+ ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+ ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
+ ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+ ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+ ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+ ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
+ ; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
+ ; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
+ ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+ ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
+ ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
+ ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+ ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+ ; CHECK-NEXT: No successors
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: Successor(s): ir-bb<middle.block>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<middle.block>:
+ ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]>
+ ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
+ ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+ ; CHECK-NEXT: No successors
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<scalar.ph>:
+ ; CHECK-NEXT: IR [[RESUME_1:%.+]] = phi i64
+ ; CHECK-NEXT: IR [[RESUME_2:%.+]] = phi i32
+ ; CHECK-NEXT: Successor(s): ir-bb<for.body>
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: ir-bb<for.body>:
+ ; CHECK-NEXT: IR %indvars.iv = phi i64 [ [[RESUME_1]], %scalar.ph ], [ %indvars.iv.next, %for.body ]
+ ; CHECK-NEXT: IR %i.0.in8 = phi i32 [ [[RESUME_2]], %scalar.ph ], [ %i.0, %for.body ]
+ ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ ; CHECK-NEXT: No successors
+ ; CHECK-NEXT: }
+ ; CHECK: LV: Loop does not require scalar epilogue
;
entry:
%cmp7 = icmp sgt i32 %n , 0