Skip to content

Commit f29281b

Browse files
[AArch64] Allow unrolling of scalar epilogue loops
PR#147420 changed the unrolling preferences to permit unrolling of non-auto-vectorized loops by checking for the isvectorized attribute. However, when a loop is vectorized this attribute is put on both the vector loop and the scalar epilogue, so that change also prevented the scalar epilogue from being unrolled. Restore the previous behaviour of unrolling the scalar epilogue by checking for both the isvectorized attribute and the presence of vector instructions in the loop.
1 parent 586cacd commit f29281b

File tree

2 files changed

+285
-5
lines changed

2 files changed

+285
-5
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4905,14 +4905,17 @@ void AArch64TTIImpl::getUnrollingPreferences(
49054905
// Disable partial & runtime unrolling on -Os.
49064906
UP.PartialOptSizeThreshold = 0;
49074907

4908-
// No need to unroll auto-vectorized loops
4909-
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4910-
return;
4911-
49124908
// Scan the loop: don't unroll loops with calls as this could prevent
4913-
// inlining.
4909+
// inlining. Don't unroll auto-vectorized loops either, though do allow
4910+
// unrolling of the scalar remainder.
4911+
bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
49144912
for (auto *BB : L->getBlocks()) {
49154913
for (auto &I : *BB) {
4914+
// Both auto-vectorized loops and the scalar remainder have the
4915+
// isvectorized attribute, so differentiate between them by the presence
4916+
// of vector instructions.
4917+
if (IsVectorized && I.getType()->isVectorTy())
4918+
return;
49164919
if (isa<CallBase>(I)) {
49174920
if (isa<CallInst>(I) || isa<InvokeInst>(I))
49184921
if (const Function *F = cast<CallBase>(I).getCalledFunction())

llvm/test/Transforms/LoopUnroll/AArch64/vector.ll

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,12 +485,289 @@ exit: ; preds = %vector.body
485485
!0 = !{!0, !1}
486486
!1 = !{!"llvm.loop.isvectorized", i32 1}
487487

488+
; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
489+
; vector loop or vector epilogue loop.
490+
define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
491+
; APPLE-LABEL: define void @scalar_epilogue(
492+
; APPLE-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
493+
; APPLE-NEXT: [[ENTRY:.*]]:
494+
; APPLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
495+
; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
496+
; APPLE: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
497+
; APPLE-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
498+
; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
499+
; APPLE: [[VECTOR_PH]]:
500+
; APPLE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
501+
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
502+
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
503+
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
504+
; APPLE: [[VECTOR_BODY]]:
505+
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
506+
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
507+
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
508+
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
509+
; APPLE-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
510+
; APPLE-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
511+
; APPLE-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
512+
; APPLE-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
513+
; APPLE-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
514+
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
515+
; APPLE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
516+
; APPLE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
517+
; APPLE: [[MIDDLE_BLOCK]]:
518+
; APPLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
519+
; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
520+
; APPLE: [[VEC_EPILOG_ITER_CHECK]]:
521+
; APPLE-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
522+
; APPLE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
523+
; APPLE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
524+
; APPLE: [[VEC_EPILOG_PH]]:
525+
; APPLE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
526+
; APPLE-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
527+
; APPLE-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
528+
; APPLE-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
529+
; APPLE-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
530+
; APPLE: [[VEC_EPILOG_VECTOR_BODY]]:
531+
; APPLE-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
532+
; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
533+
; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
534+
; APPLE-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
535+
; APPLE-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
536+
; APPLE-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
537+
; APPLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
538+
; APPLE-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
539+
; APPLE: [[VEC_EPILOG_MIDDLE_BLOCK]]:
540+
; APPLE-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
541+
; APPLE-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
542+
; APPLE: [[FOR_BODY_PREHEADER]]:
543+
; APPLE-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
544+
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
545+
; APPLE: [[FOR_BODY]]:
546+
; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ]
547+
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
548+
; APPLE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
549+
; APPLE-NEXT: [[ADD:%.*]] = add i8 [[TMP8]], [[VAL]]
550+
; APPLE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
551+
; APPLE-NEXT: [[INC]] = add nuw i64 [[I_06]], 1
552+
; APPLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
553+
; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
554+
; APPLE: [[EXIT_LOOPEXIT]]:
555+
; APPLE-NEXT: br label %[[EXIT]]
556+
; APPLE: [[EXIT]]:
557+
; APPLE-NEXT: ret void
558+
;
559+
; CORTEXA55-LABEL: define void @scalar_epilogue(
560+
; CORTEXA55-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
561+
; CORTEXA55-NEXT: [[ENTRY:.*]]:
562+
; CORTEXA55-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
563+
; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
564+
; CORTEXA55: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
565+
; CORTEXA55-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
566+
; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
567+
; CORTEXA55: [[VECTOR_PH]]:
568+
; CORTEXA55-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
569+
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
570+
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
571+
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
572+
; CORTEXA55: [[VECTOR_BODY]]:
573+
; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
574+
; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
575+
; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
576+
; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
577+
; CORTEXA55-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
578+
; CORTEXA55-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
579+
; CORTEXA55-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
580+
; CORTEXA55-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
581+
; CORTEXA55-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
582+
; CORTEXA55-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
583+
; CORTEXA55-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
584+
; CORTEXA55-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
585+
; CORTEXA55: [[MIDDLE_BLOCK]]:
586+
; CORTEXA55-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
587+
; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
588+
; CORTEXA55: [[VEC_EPILOG_ITER_CHECK]]:
589+
; CORTEXA55-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
590+
; CORTEXA55-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
591+
; CORTEXA55-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
592+
; CORTEXA55: [[VEC_EPILOG_PH]]:
593+
; CORTEXA55-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
594+
; CORTEXA55-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
595+
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
596+
; CORTEXA55-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
597+
; CORTEXA55-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
598+
; CORTEXA55: [[VEC_EPILOG_VECTOR_BODY]]:
599+
; CORTEXA55-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
600+
; CORTEXA55-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
601+
; CORTEXA55-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
602+
; CORTEXA55-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
603+
; CORTEXA55-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
604+
; CORTEXA55-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
605+
; CORTEXA55-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
606+
; CORTEXA55-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
607+
; CORTEXA55: [[VEC_EPILOG_MIDDLE_BLOCK]]:
608+
; CORTEXA55-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
609+
; CORTEXA55-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
610+
; CORTEXA55: [[FOR_BODY_PREHEADER]]:
611+
; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
612+
; CORTEXA55-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
613+
; CORTEXA55-NEXT: [[TMP9:%.*]] = add i64 [[N]], -1
614+
; CORTEXA55-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
615+
; CORTEXA55-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP8]], 3
616+
; CORTEXA55-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
617+
; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_PROL_PREHEADER:.*]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]]
618+
; CORTEXA55: [[FOR_BODY_PROL_PREHEADER]]:
619+
; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL:.*]]
620+
; CORTEXA55: [[FOR_BODY_PROL]]:
621+
; CORTEXA55-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06_PH]]
622+
; CORTEXA55-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_PROL]], align 1
623+
; CORTEXA55-NEXT: [[ADD_PROL:%.*]] = add i8 [[TMP11]], [[VAL]]
624+
; CORTEXA55-NEXT: store i8 [[ADD_PROL]], ptr [[ARRAYIDX_PROL]], align 1
625+
; CORTEXA55-NEXT: [[INC_PROL:%.*]] = add nuw i64 [[I_06_PH]], 1
626+
; CORTEXA55-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
627+
; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP]], label %[[FOR_BODY_PROL_1:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA:.*]]
628+
; CORTEXA55: [[FOR_BODY_PROL_1]]:
629+
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL]]
630+
; CORTEXA55-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
631+
; CORTEXA55-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[TMP12]], [[VAL]]
632+
; CORTEXA55-NEXT: store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
633+
; CORTEXA55-NEXT: [[INC_PROL_1:%.*]] = add nuw i64 [[I_06_PH]], 2
634+
; CORTEXA55-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
635+
; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[FOR_BODY_PROL_2:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]
636+
; CORTEXA55: [[FOR_BODY_PROL_2]]:
637+
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
638+
; CORTEXA55-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
639+
; CORTEXA55-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[TMP13]], [[VAL]]
640+
; CORTEXA55-NEXT: store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
641+
; CORTEXA55-NEXT: [[INC_PROL_2:%.*]] = add nuw i64 [[I_06_PH]], 3
642+
; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]
643+
; CORTEXA55: [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]:
644+
; CORTEXA55-NEXT: [[I_06_UNR_PH:%.*]] = phi i64 [ [[INC_PROL]], %[[FOR_BODY_PROL]] ], [ [[INC_PROL_1]], %[[FOR_BODY_PROL_1]] ], [ [[INC_PROL_2]], %[[FOR_BODY_PROL_2]] ]
645+
; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT]]
646+
; CORTEXA55: [[FOR_BODY_PROL_LOOPEXIT]]:
647+
; CORTEXA55-NEXT: [[I_06_UNR:%.*]] = phi i64 [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ], [ [[I_06_UNR_PH]], %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ]
648+
; CORTEXA55-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP10]], 3
649+
; CORTEXA55-NEXT: br i1 [[TMP14]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
650+
; CORTEXA55: [[FOR_BODY_PREHEADER_NEW]]:
651+
; CORTEXA55-NEXT: br label %[[FOR_BODY:.*]]
652+
; CORTEXA55: [[FOR_BODY]]:
653+
; CORTEXA55-NEXT: [[I_06:%.*]] = phi i64 [ [[I_06_UNR]], %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[FOR_BODY]] ]
654+
; CORTEXA55-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
655+
; CORTEXA55-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
656+
; CORTEXA55-NEXT: [[ADD:%.*]] = add i8 [[TMP15]], [[VAL]]
657+
; CORTEXA55-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
658+
; CORTEXA55-NEXT: [[INC:%.*]] = add nuw i64 [[I_06]], 1
659+
; CORTEXA55-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
660+
; CORTEXA55-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
661+
; CORTEXA55-NEXT: [[ADD_1:%.*]] = add i8 [[TMP16]], [[VAL]]
662+
; CORTEXA55-NEXT: store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
663+
; CORTEXA55-NEXT: [[INC_1:%.*]] = add nuw i64 [[I_06]], 2
664+
; CORTEXA55-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
665+
; CORTEXA55-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
666+
; CORTEXA55-NEXT: [[ADD_2:%.*]] = add i8 [[TMP17]], [[VAL]]
667+
; CORTEXA55-NEXT: store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
668+
; CORTEXA55-NEXT: [[INC_2:%.*]] = add nuw i64 [[I_06]], 3
669+
; CORTEXA55-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
670+
; CORTEXA55-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
671+
; CORTEXA55-NEXT: [[ADD_3:%.*]] = add i8 [[TMP18]], [[VAL]]
672+
; CORTEXA55-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
673+
; CORTEXA55-NEXT: [[INC_3]] = add nuw i64 [[I_06]], 4
674+
; CORTEXA55-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
675+
; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
676+
; CORTEXA55: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
677+
; CORTEXA55-NEXT: br label %[[EXIT_LOOPEXIT]]
678+
; CORTEXA55: [[EXIT_LOOPEXIT]]:
679+
; CORTEXA55-NEXT: br label %[[EXIT]]
680+
; CORTEXA55: [[EXIT]]:
681+
; CORTEXA55-NEXT: ret void
682+
;
683+
entry:
684+
%min.iters.check = icmp ult i64 %N, 8
685+
br i1 %min.iters.check, label %for.body, label %vector.main.loop.iter.check
686+
687+
vector.main.loop.iter.check:
688+
%min.iters.check7 = icmp ult i64 %N, 32
689+
br i1 %min.iters.check7, label %vec.epilog.ph, label %vector.ph
690+
691+
vector.ph:
692+
%n.vec = and i64 %N, -32
693+
%broadcast.splatinsert = insertelement <16 x i8> poison, i8 %val, i64 0
694+
%broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
695+
br label %vector.body
696+
697+
vector.body:
698+
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
699+
%0 = getelementptr inbounds nuw i8, ptr %p, i64 %index
700+
%1 = getelementptr inbounds nuw i8, ptr %0, i64 16
701+
%wide.load = load <16 x i8>, ptr %0, align 1
702+
%wide.load8 = load <16 x i8>, ptr %1, align 1
703+
%2 = add <16 x i8> %wide.load, %broadcast.splat
704+
%3 = add <16 x i8> %wide.load8, %broadcast.splat
705+
store <16 x i8> %2, ptr %0, align 1
706+
store <16 x i8> %3, ptr %1, align 1
707+
%index.next = add nuw i64 %index, 32
708+
%4 = icmp eq i64 %index.next, %n.vec
709+
br i1 %4, label %middle.block, label %vector.body, !llvm.loop !2
710+
711+
middle.block:
712+
%cmp.n = icmp eq i64 %N, %n.vec
713+
br i1 %cmp.n, label %exit, label %vec.epilog.iter.check
714+
715+
vec.epilog.iter.check:
716+
%n.vec.remaining = and i64 %N, 24
717+
%min.epilog.iters.check = icmp eq i64 %n.vec.remaining, 0
718+
br i1 %min.epilog.iters.check, label %for.body, label %vec.epilog.ph
719+
720+
vec.epilog.ph:
721+
%vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
722+
%n.vec10 = and i64 %N, -8
723+
%broadcast.splatinsert11 = insertelement <8 x i8> poison, i8 %val, i64 0
724+
%broadcast.splat12 = shufflevector <8 x i8> %broadcast.splatinsert11, <8 x i8> poison, <8 x i32> zeroinitializer
725+
br label %vec.epilog.vector.body
726+
727+
vec.epilog.vector.body:
728+
%index13 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next15, %vec.epilog.vector.body ]
729+
%5 = getelementptr inbounds nuw i8, ptr %p, i64 %index13
730+
%wide.load14 = load <8 x i8>, ptr %5, align 1
731+
%6 = add <8 x i8> %wide.load14, %broadcast.splat12
732+
store <8 x i8> %6, ptr %5, align 1
733+
%index.next15 = add nuw i64 %index13, 8
734+
%7 = icmp eq i64 %index.next15, %n.vec10
735+
br i1 %7, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !llvm.loop !3
736+
737+
vec.epilog.middle.block:
738+
%cmp.n16 = icmp eq i64 %N, %n.vec10
739+
br i1 %cmp.n16, label %exit, label %for.body
740+
741+
for.body:
742+
%i.06 = phi i64 [ %inc, %for.body ], [ %n.vec10, %vec.epilog.middle.block ], [ %n.vec, %vec.epilog.iter.check ], [ 0, %entry ]
743+
%arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %i.06
744+
%8 = load i8, ptr %arrayidx, align 1
745+
%add = add i8 %8, %val
746+
store i8 %add, ptr %arrayidx, align 1
747+
%inc = add nuw i64 %i.06, 1
748+
%exitcond.not = icmp eq i64 %inc, %N
749+
br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !4
750+
751+
exit:
752+
ret void
753+
}
754+
755+
!2 = distinct !{!2, !1}
756+
!3 = distinct !{!3, !1}
757+
!4 = distinct !{!4, !1}
758+
488759
;.
489760
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
490761
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
491762
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
492763
; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
764+
; APPLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]]}
765+
; APPLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META3]]}
766+
; APPLE: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]]}
493767
;.
494768
; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
495769
; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
770+
; CORTEXA55: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
771+
; CORTEXA55: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
772+
; CORTEXA55: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
496773
;.

0 commit comments

Comments
 (0)