diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1d9e4f5a19f5c..c9931625d1bfd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4731,7 +4731,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
 
   unsigned Multiplier = IC;
   if (MainLoopVF.isScalable())
-    Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);
+    Multiplier *= getVScaleForTuning(OrigLoop, TTI).value_or(1);
 
   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 1eab166b2e553..3f1ddc88ba2c5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -302,10 +302,15 @@ exit:
 define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; DEFAULT-LABEL: define i16 @reduce_udiv(
 ; DEFAULT-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: iter.check:
 ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
 ; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP1]], 2
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], [[TMP8]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; DEFAULT: vector.main.loop.iter.check:
+; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP9]], 8
 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT: vector.ph:
@@ -336,28 +341,63 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; DEFAULT-NEXT: [[TMP22]] = or [[TMP20]], [[VEC_PHI1]]
 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; DEFAULT-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; DEFAULT: middle.block:
 ; DEFAULT-NEXT: [[BIN_RDX:%.*]] = or [[TMP22]], [[TMP21]]
 ; DEFAULT-NEXT: [[TMP24:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16( [[BIN_RDX]])
 ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; DEFAULT: scalar.ph:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; DEFAULT: vec.epilog.iter.check:
+; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 2
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP36]]
+; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
+; DEFAULT: vec.epilog.ph:
 ; DEFAULT-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 2
+; DEFAULT-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], [[TMP38]]
+; DEFAULT-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
+; DEFAULT-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 2
+; DEFAULT-NEXT: [[TMP27:%.*]] = insertelement zeroinitializer, i16 [[BC_MERGE_RDX]], i32 0
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i16 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer
 ; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: vec.epilog.vector.body:
+; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT11:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[VEC_PHI7:%.*]] = phi [ [[TMP27]], [[SCALAR_PH]] ], [ [[TMP32:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[TMP28:%.*]] = add i64 [[INDEX6]], 0
+; DEFAULT-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP28]]
+; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[TMP29]], i32 0
+; DEFAULT-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP30]], align 2
+; DEFAULT-NEXT: [[TMP31:%.*]] = udiv [[WIDE_LOAD8]], [[BROADCAST_SPLAT10]]
+; DEFAULT-NEXT: [[TMP32]] = or [[TMP31]], [[VEC_PHI7]]
+; DEFAULT-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX6]], [[TMP26]]
+; DEFAULT-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
+; DEFAULT-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT: vec.epilog.middle.block:
+; DEFAULT-NEXT: [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.nxv2i16( [[TMP32]])
+; DEFAULT-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
+; DEFAULT-NEXT: br i1 [[CMP_N12]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT: vec.epilog.scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i16 [ [[TMP34]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: br label [[LOOP1:%.*]]
 ; DEFAULT: loop:
-; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; DEFAULT-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ]
+; DEFAULT-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP1]] ]
 ; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; DEFAULT-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
 ; DEFAULT-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]]
 ; DEFAULT-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]]
 ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP6:![0-9]+]]
 ; DEFAULT: exit:
-; DEFAULT-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP1]] ], [ [[TMP24]], [[MIDDLE_BLOCK1]] ], [ [[TMP34]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; DEFAULT-NEXT: ret i16 [[RED_NEXT_LCSSA]]
 ;
 ; PRED-LABEL: define i16 @reduce_udiv(
@@ -445,7 +485,8 @@ attributes #0 = { "target-features"="+sve" }
 ; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
 ;.
 ; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
index 592b118f53207..f2c8261469160 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
@@ -281,10 +281,15 @@ for.cond.cleanup: ; preds = %for.inc, %entry
 
 define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
 ; CHECK-LABEL: @gather_nxv4i32_ind64_stride2(
-; CHECK-NEXT: entry:
+; CHECK-NEXT: iter.check:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[N:%.*]], [[TMP8]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[ENTRY:%.*]], label [[VEC_EPILOG_SCALAR_PH:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP17]], 3
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[N]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -320,14 +325,51 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias
 ; CHECK-NEXT: store [[WIDE_MASKED_GATHER2]], ptr [[TMP14]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC1]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP21]], 1
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[TMP6]], [[TMP22]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT]], label [[SCALAR_PH]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[TMP23]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = add nsw i64 [[TMP24]], -1
+; CHECK-NEXT: [[N_MOD_VF4:%.*]] = and i64 [[N]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[N_MOD_VF4]], 0
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP24]], i64 [[N_MOD_VF4]]
+; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP29:%.*]] = shl nuw nsw i64 [[TMP28]], 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP30]]
+; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement poison, i64 [[TMP29]], i64 0
+; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector [[DOTSPLATINSERT8]], poison, zeroinitializer
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT10:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[B]], [[TMP31]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2f32.nxv2p0( [[TMP32]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison)
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX7]]
+; CHECK-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP33]], align 4
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], [[TMP29]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT9]]
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ], [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT: [[ARRAYIDX_IDX:%.*]] = shl i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[ARRAYIDX_IDX]]
 ; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4
@@ -335,7 +377,7 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias
 ; CHECK-NEXT: store float [[TMP16]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 0c41477f285d0..76733a34e108f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -466,9 +466,14 @@ for.exit:
 define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
 ; CHECK-LABEL: define void @simple_histogram_user_interleave(
 ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
+; CHECK-NEXT: iter.check:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP3]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP6]], 3
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
@@ -494,15 +499,42 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly
 ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP21]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP24]], 1
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP16]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[DOTNEG9:%.*]] = mul nsw i64 [[TMP25]], -2
+; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[N]], [[DOTNEG9]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP18]], 1
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT7:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX5]]
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = zext [[WIDE_LOAD6]] to
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP27]]
+; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i32( [[TMP22]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
+; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], [[TMP26]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[N]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
@@ -512,7 +544,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly
 ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: ret void
 ;
@@ -564,7 +596,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 {
 ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP11]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -582,7 +614,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 {
 ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: ret void
 ;
@@ -634,7 +666,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado
 ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP7]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -652,7 +684,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado
 ; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
 ; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: ret void
 ;
@@ -701,13 +733,13 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: ret void
 ;
@@ -774,7 +806,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr %
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -795,7 +827,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr %
 ; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: ret void
 ;
@@ -887,7 +919,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i
 ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64( [[TMP6]], i64 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -904,7 +936,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i
 ; CHECK-NEXT: store i64 [[INC]], ptr [[GEP_BUCKET]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: ret void
 ;
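Note on the functional change: only the LoopVectorize.cpp hunk alters behaviour; every .ll hunk is a regenerated FileCheck expectation. When the main loop uses a scalable VF, the epilogue-profitability multiplier now scales the interleave count (IC) by the vscale tuning estimate instead of overwriting it. Below is a minimal standalone C++ sketch of the old versus new computation; the IC and vscale values are made-up examples, and getVScaleForTuningExample() is a stand-in for the real getVScaleForTuning(OrigLoop, TTI) query rather than code from this patch.

#include <cstdio>
#include <optional>

// Stand-in for getVScaleForTuning(OrigLoop, TTI); the real function asks the
// target for the vscale value it expects at run time.
static std::optional<unsigned> getVScaleForTuningExample() { return 4; }

int main() {
  unsigned IC = 2;                  // interleave count chosen for the main loop
  bool MainLoopVFIsScalable = true; // main loop uses a scalable VF

  // Old behaviour: the vscale estimate replaced the interleave count.
  unsigned OldMultiplier = IC;
  if (MainLoopVFIsScalable)
    OldMultiplier = getVScaleForTuningExample().value_or(1);

  // New behaviour: the vscale estimate scales the interleave count, so the
  // profitability check sees the full number of elements handled per
  // main-loop iteration (known-min VF * vscale * IC).
  unsigned NewMultiplier = IC;
  if (MainLoopVFIsScalable)
    NewMultiplier *= getVScaleForTuningExample().value_or(1);

  // Prints "old multiplier: 4, new multiplier: 8".
  std::printf("old multiplier: %u, new multiplier: %u\n", OldMultiplier,
              NewMultiplier);
  return 0;
}

With IC = 2 and an estimated vscale of 4, the old check passed 4 to isEpilogueVectorizationProfitable while the new check passes 8, which is why the interleaved SVE loops in the updated tests now also receive a vectorized epilogue.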