@@ -486,16 +486,13 @@ exit: ; preds = %vector.body
486486!1 = !{!"llvm.loop.isvectorized" , i32 1 }
487487
488488; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
489- ; vector loop or vector epilogue loop.
489+ ; vector loop.
490490define void @scalar_epilogue (i64 %N , ptr %p , i8 %val ) {
491491; APPLE-LABEL: define void @scalar_epilogue(
492492; APPLE-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
493493; APPLE-NEXT: [[ENTRY:.*]]:
494- ; APPLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
495- ; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
496- ; APPLE: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
497494; APPLE-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
498- ; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH :.*]], label %[[VECTOR_PH:.*]]
495+ ; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[FOR_BODY_PREHEADER :.*]], label %[[VECTOR_PH:.*]]
499496; APPLE: [[VECTOR_PH]]:
500497; APPLE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
501498; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
@@ -516,31 +513,9 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
516513; APPLE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
517514; APPLE: [[MIDDLE_BLOCK]]:
518515; APPLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
519- ; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
520- ; APPLE: [[VEC_EPILOG_ITER_CHECK]]:
521- ; APPLE-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
522- ; APPLE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
523- ; APPLE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
524- ; APPLE: [[VEC_EPILOG_PH]]:
525- ; APPLE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
526- ; APPLE-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
527- ; APPLE-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
528- ; APPLE-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
529- ; APPLE-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
530- ; APPLE: [[VEC_EPILOG_VECTOR_BODY]]:
531- ; APPLE-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
532- ; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
533- ; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
534- ; APPLE-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
535- ; APPLE-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
536- ; APPLE-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
537- ; APPLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
538- ; APPLE-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
539- ; APPLE: [[VEC_EPILOG_MIDDLE_BLOCK]]:
540- ; APPLE-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
541- ; APPLE-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
516+ ; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER]]
542517; APPLE: [[FOR_BODY_PREHEADER]]:
543- ; APPLE-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK ]] ]
518+ ; APPLE-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK ]] ]
544519; APPLE-NEXT: br label %[[FOR_BODY:.*]]
545520; APPLE: [[FOR_BODY]]:
546521; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ]
@@ -550,7 +525,7 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
550525; APPLE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
551526; APPLE-NEXT: [[INC]] = add nuw i64 [[I_06]], 1
552527; APPLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
553- ; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP6 :![0-9]+]]
528+ ; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP5 :![0-9]+]]
554529; APPLE: [[EXIT_LOOPEXIT]]:
555530; APPLE-NEXT: br label %[[EXIT]]
556531; APPLE: [[EXIT]]:
@@ -559,11 +534,8 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
559534; CORTEXA55-LABEL: define void @scalar_epilogue(
560535; CORTEXA55-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
561536; CORTEXA55-NEXT: [[ENTRY:.*]]:
562- ; CORTEXA55-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
563- ; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
564- ; CORTEXA55: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
565537; CORTEXA55-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
566- ; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH :.*]], label %[[VECTOR_PH:.*]]
538+ ; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[FOR_BODY_PREHEADER :.*]], label %[[VECTOR_PH:.*]]
567539; CORTEXA55: [[VECTOR_PH]]:
568540; CORTEXA55-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
569541; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
@@ -584,31 +556,9 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
584556; CORTEXA55-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
585557; CORTEXA55: [[MIDDLE_BLOCK]]:
586558; CORTEXA55-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
587- ; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
588- ; CORTEXA55: [[VEC_EPILOG_ITER_CHECK]]:
589- ; CORTEXA55-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
590- ; CORTEXA55-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
591- ; CORTEXA55-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
592- ; CORTEXA55: [[VEC_EPILOG_PH]]:
593- ; CORTEXA55-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
594- ; CORTEXA55-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
595- ; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
596- ; CORTEXA55-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
597- ; CORTEXA55-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
598- ; CORTEXA55: [[VEC_EPILOG_VECTOR_BODY]]:
599- ; CORTEXA55-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
600- ; CORTEXA55-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
601- ; CORTEXA55-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
602- ; CORTEXA55-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
603- ; CORTEXA55-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
604- ; CORTEXA55-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
605- ; CORTEXA55-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
606- ; CORTEXA55-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
607- ; CORTEXA55: [[VEC_EPILOG_MIDDLE_BLOCK]]:
608- ; CORTEXA55-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
609- ; CORTEXA55-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
559+ ; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER]]
610560; CORTEXA55: [[FOR_BODY_PREHEADER]]:
611- ; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK ]] ]
561+ ; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK ]] ]
612562; CORTEXA55-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
613563; CORTEXA55-NEXT: [[TMP9:%.*]] = add i64 [[N]], -1
614564; CORTEXA55-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
@@ -672,7 +622,7 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
672622; CORTEXA55-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
673623; CORTEXA55-NEXT: [[INC_3]] = add nuw i64 [[I_06]], 4
674624; CORTEXA55-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
675- ; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP4 :![0-9]+]]
625+ ; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP3 :![0-9]+]]
676626; CORTEXA55: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
677627; CORTEXA55-NEXT: br label %[[EXIT_LOOPEXIT]]
678628; CORTEXA55: [[EXIT_LOOPEXIT]]:
@@ -681,12 +631,8 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
681631; CORTEXA55-NEXT: ret void
682632;
683633entry:
684- %min.iters.check = icmp ult i64 %N , 8
685- br i1 %min.iters.check , label %for.body , label %vector.main.loop.iter.check
686-
687- vector.main.loop.iter.check:
688- %min.iters.check7 = icmp ult i64 %N , 32
689- br i1 %min.iters.check7 , label %vec.epilog.ph , label %vector.ph
634+ %min.iters.check = icmp ult i64 %N , 32
635+ br i1 %min.iters.check , label %for.body , label %vector.ph
690636
691637vector.ph:
692638 %n.vec = and i64 %N , -32
@@ -710,51 +656,24 @@ vector.body:
710656
711657middle.block:
712658 %cmp.n = icmp eq i64 %N , %n.vec
713- br i1 %cmp.n , label %exit , label %vec.epilog.iter.check
714-
715- vec.epilog.iter.check:
716- %n.vec.remaining = and i64 %N , 24
717- %min.epilog.iters.check = icmp eq i64 %n.vec.remaining , 0
718- br i1 %min.epilog.iters.check , label %for.body , label %vec.epilog.ph
719-
720- vec.epilog.ph:
721- %vec.epilog.resume.val = phi i64 [ %n.vec , %vec.epilog.iter.check ], [ 0 , %vector.main.loop.iter.check ]
722- %n.vec10 = and i64 %N , -8
723- %broadcast.splatinsert11 = insertelement <8 x i8 > poison, i8 %val , i64 0
724- %broadcast.splat12 = shufflevector <8 x i8 > %broadcast.splatinsert11 , <8 x i8 > poison, <8 x i32 > zeroinitializer
725- br label %vec.epilog.vector.body
726-
727- vec.epilog.vector.body:
728- %index13 = phi i64 [ %vec.epilog.resume.val , %vec.epilog.ph ], [ %index.next15 , %vec.epilog.vector.body ]
729- %5 = getelementptr inbounds nuw i8 , ptr %p , i64 %index13
730- %wide.load14 = load <8 x i8 >, ptr %5 , align 1
731- %6 = add <8 x i8 > %wide.load14 , %broadcast.splat12
732- store <8 x i8 > %6 , ptr %5 , align 1
733- %index.next15 = add nuw i64 %index13 , 8
734- %7 = icmp eq i64 %index.next15 , %n.vec10
735- br i1 %7 , label %vec.epilog.middle.block , label %vec.epilog.vector.body , !llvm.loop !3
736-
737- vec.epilog.middle.block:
738- %cmp.n16 = icmp eq i64 %N , %n.vec10
739- br i1 %cmp.n16 , label %exit , label %for.body
659+ br i1 %cmp.n , label %exit , label %for.body
740660
741661for.body:
742- %i.06 = phi i64 [ %inc , %for.body ], [ %n.vec10 , %vec.epilog. middle.block ], [ %n.vec , %vec.epilog.iter.check ], [ 0 , %entry ]
662+ %i.06 = phi i64 [ %inc , %for.body ], [ %n.vec , %middle.block ], [ 0 , %entry ]
743663 %arrayidx = getelementptr inbounds nuw i8 , ptr %p , i64 %i.06
744664 %8 = load i8 , ptr %arrayidx , align 1
745665 %add = add i8 %8 , %val
746666 store i8 %add , ptr %arrayidx , align 1
747667 %inc = add nuw i64 %i.06 , 1
748668 %exitcond.not = icmp eq i64 %inc , %N
749- br i1 %exitcond.not , label %exit , label %for.body , !llvm.loop !4
669+ br i1 %exitcond.not , label %exit , label %for.body , !llvm.loop !3
750670
751671exit:
752672 ret void
753673}
754674
755675!2 = distinct !{!2 , !1 }
756676!3 = distinct !{!3 , !1 }
757- !4 = distinct !{!4 , !1 }
758677
759678;.
760679; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
@@ -763,11 +682,9 @@ exit:
763682; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
764683; APPLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]]}
765684; APPLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META3]]}
766- ; APPLE: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]]}
767685;.
768686; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
769687; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
770688; CORTEXA55: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
771689; CORTEXA55: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
772- ; CORTEXA55: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
773690;.
0 commit comments