diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 02b79f2053d59..bbbc46d8550f3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -989,9 +989,10 @@ class LoopVectorizationCostModel {
                                InterleavedAccessInfo &IAI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
+        Hints(Hints), InterleaveInfo(IAI) {
     if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
       initializeVScaleForTuning();
+    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
   }
 
   /// \return An upper bound for the vectorization factors (both fixed and
@@ -3384,7 +3385,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
     // likely.
-    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
+    ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
   }
 
   InstructionCost SafeDivisorCost = 0;
@@ -4300,6 +4301,13 @@ bool LoopVectorizationPlanner::isMoreProfitable(
       EstimatedWidthB *= *VScale;
   }
 
+  // When optimizing for size, pick whichever candidate has the smallest cost
+  // for the whole loop. On a tie, pick the larger vector width, on the
+  // assumption that throughput will be greater.
+  if (CM.CostKind == TTI::TCK_CodeSize)
+    return CostA < CostB ||
+           (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
+
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
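To make the size-based selection concrete, here is a minimal standalone sketch of the comparison the hunk above adds. It is a toy model, not the patch's code: the Candidate struct and function name are invented for illustration, and the real code compares InstructionCost values for whole VPlan candidates.

    #include <cstdint>
    #include <iostream>

    // Illustrative stand-in for a (whole-loop cost, width) candidate. In the
    // real code the cost is an InstructionCost and the width is derived from
    // an ElementCount, possibly scaled by vscale.
    struct Candidate {
      uint64_t Cost;           // cost of the whole loop at this VF
      unsigned EstimatedWidth; // estimated number of lanes
    };

    // Mirrors the TCK_CodeSize branch: the smaller whole-loop cost wins, and
    // a tie goes to the wider factor on the assumption that throughput will
    // be greater.
    static bool isMoreProfitableForSize(const Candidate &A,
                                        const Candidate &B) {
      return A.Cost < B.Cost ||
             (A.Cost == B.Cost && A.EstimatedWidth > B.EstimatedWidth);
    }

    int main() {
      Candidate VF8 = {/*Cost=*/10, /*EstimatedWidth=*/8};
      Candidate VF4 = {/*Cost=*/10, /*EstimatedWidth=*/4};
      // Equal size cost, so the wider factor is preferred.
      std::cout << isMoreProfitableForSize(VF8, VF4) << '\n'; // prints 1
    }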
@@ -5530,7 +5538,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
   }
 
   // Scale the total scalar cost by block probability.
-  ScalarCost /= getReciprocalPredBlockProb();
+  ScalarCost /= getPredBlockCostDivisor(CostKind);
 
   // Compute the discount. A non-negative discount means the vector version
   // of the instruction costs more, and scalarizing would be beneficial.
@@ -5583,7 +5591,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
     // cost by the probability of executing it. blockNeedsPredication from
     // Legal is used so as to not include all blocks in tail folded loops.
     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
-      BlockCost /= getReciprocalPredBlockProb();
+      BlockCost /= getPredBlockCostDivisor(CostKind);
 
     Cost += BlockCost;
   }
@@ -5661,7 +5669,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
   if (isPredicatedInst(I)) {
-    Cost /= getReciprocalPredBlockProb();
+    Cost /= getPredBlockCostDivisor(CostKind);
 
     // Add the cost of an i1 extract and a branch
     auto *VecI1Ty =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5a88ebeffb18b..d01bef82cff2e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -808,7 +808,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
     // For the scalar case, we may not always execute the original predicated
     // block. Thus, scale the block's cost by the probability of executing it.
     if (VF.isScalar())
-      return ThenCost / getReciprocalPredBlockProb();
+      return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
 
     return ThenCost;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 74713daf904f0..457c4f7cffcf8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -48,13 +48,20 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                        int64_t Step);
 
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
+/// A helper function that returns the divisor to apply to the cost of a
+/// predicated block. Typically this is the reciprocal of the block
+/// probability: if we return X, we are assuming the predicated block executes
+/// once for every X iterations of the loop header, so the block should only
+/// contribute 1/X of its cost to the total cost calculation. When optimizing
+/// for code size the divisor is just 1, as code size costs don't depend on
+/// execution probabilities.
 ///
 /// TODO: We should use actual block probability here, if available. Currently,
 /// we always assume predicated blocks have a 50% chance of executing.
-inline unsigned getReciprocalPredBlockProb() { return 2; }
+inline unsigned
+getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
+  return CostKind == TTI::TCK_CodeSize ? 1 : 2;
+}
 
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
new file mode 100644
index 0000000000000..457e1ca5ea762
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -0,0 +1,1298 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; The tests here check for differences in behaviour between the default,
+; optsize, and minsize.
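The tests below exercise the effect of that divisor. As a rough sketch of what it changes (illustrative code under assumed names, not the patch's implementation): a predicated block contributes half its cost under the default throughput model, reflecting the assumed 50% execution probability, but its full cost when optimizing for size, since its instructions occupy space whether or not they execute.

    #include <cassert>

    enum class CostKind { RecipThroughput, CodeSize };

    // Same constants as getPredBlockCostDivisor above: divide by 2 for
    // throughput (the block runs on ~half the iterations), by 1 for code size.
    static unsigned predBlockCostDivisor(CostKind Kind) {
      return Kind == CostKind::CodeSize ? 1 : 2;
    }

    int main() {
      unsigned BlockCost = 8; // summed cost of the block's instructions
      // Throughput: the block contributes only half its cost.
      assert(BlockCost / predBlockCostDivisor(CostKind::RecipThroughput) == 4);
      // Code size: the block contributes its full cost.
      assert(BlockCost / predBlockCostDivisor(CostKind::CodeSize) == 8);
      return 0;
    }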
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s --check-prefix=DEFAULT +; RUN: opt -passes=forceattrs,loop-vectorize -force-attribute=optsize -S < %s | FileCheck %s --check-prefix=OPTSIZE +; RUN: opt -passes=forceattrs,loop-vectorize -force-attribute=minsize -S < %s | FileCheck %s --check-prefix=MINSIZE + +target triple = "aarch64-unknown-linux-gnu" + +@A = global [1000 x i16] zeroinitializer, align 2 +@B = global [1000 x i32] zeroinitializer, align 4 +@C = global [1000 x i32] zeroinitializer, align 4 + +; This should always vectorize, as using vector instructions eliminates the loop +; which is both faster and smaller (a scalar version is emitted, but the branch +; to it is false and it's later removed). +define void @always_vectorize(ptr %p, i32 %x) { +; DEFAULT-LABEL: define void @always_vectorize( +; DEFAULT-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 0 +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4 +; DEFAULT-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] +; DEFAULT-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @always_vectorize( +; OPTSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 0 +; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; OPTSIZE-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; OPTSIZE-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; OPTSIZE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4 +; OPTSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] +; OPTSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @always_vectorize( +; MINSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MINSIZE: [[VECTOR_PH]]: +; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; MINSIZE: [[VECTOR_BODY]]: +; MINSIZE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 0 +; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; MINSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; MINSIZE-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; MINSIZE-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; MINSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; MINSIZE: [[MIDDLE_BLOCK]]: +; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE: [[SCALAR_PH]]: +; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] +; MINSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void 
+; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %x + store i32 %add, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; This should vectorize only without optsize, as it needs a scalar version +; which increases code size. +define void @vectorize_without_optsize(ptr %p, i32 %x, i64 %n) { +; DEFAULT-LABEL: define void @vectorize_without_optsize( +; DEFAULT-SAME: ptr [[P:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4 +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] +; DEFAULT-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4 +; DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[X]] +; DEFAULT-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT: 
[[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @vectorize_without_optsize( +; OPTSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[X]] +; OPTSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @vectorize_without_optsize( +; MINSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[X]] +; MINSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %x + store i32 %add, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; This should be vectorized and tail predicated without optsize, as that's +; faster, but not with optsize, as it's much larger. 
+; FIXME: Currently we avoid tail predication only with minsize +define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { +; DEFAULT-LABEL: define void @tail_predicate_without_optsize( +; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ] +; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> [[VEC_IND]], splat (i64 14) +; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] +; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) +; DEFAULT-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] +; DEFAULT-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] +; DEFAULT-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) +; DEFAULT-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] +; DEFAULT-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 +; DEFAULT-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; DEFAULT: [[PRED_STORE_IF]]: +; DEFAULT-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] +; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 +; DEFAULT-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] +; DEFAULT: [[PRED_STORE_CONTINUE]]: +; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 +; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; DEFAULT: [[PRED_STORE_IF7]]: +; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] +; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 +; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; DEFAULT: [[PRED_STORE_CONTINUE8]]: +; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; DEFAULT: [[PRED_STORE_IF9]]: +; DEFAULT-NEXT: [[TMP17:%.*]] = 
add i64 [[INDEX]], 2 +; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] +; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2 +; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; DEFAULT: [[PRED_STORE_CONTINUE10]]: +; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 +; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; DEFAULT: [[PRED_STORE_IF11]]: +; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 +; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] +; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 +; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; DEFAULT: [[PRED_STORE_CONTINUE12]]: +; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 +; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; DEFAULT: [[PRED_STORE_IF13]]: +; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] +; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4 +; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; DEFAULT: [[PRED_STORE_CONTINUE14]]: +; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 +; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; DEFAULT: [[PRED_STORE_IF15]]: +; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 +; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] +; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 +; DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; DEFAULT: [[PRED_STORE_CONTINUE16]]: +; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 +; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; DEFAULT: [[PRED_STORE_IF17]]: +; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 +; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] +; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 +; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; DEFAULT: [[PRED_STORE_CONTINUE18]]: +; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 +; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; DEFAULT: [[PRED_STORE_IF19]]: +; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 +; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] +; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 +; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; DEFAULT: [[PRED_STORE_CONTINUE20]]: +; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 +; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; DEFAULT: [[PRED_STORE_IF21]]: +; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] +; 
DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 +; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; DEFAULT: [[PRED_STORE_CONTINUE22]]: +; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 +; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; DEFAULT: [[PRED_STORE_IF23]]: +; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 +; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] +; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9 +; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; DEFAULT: [[PRED_STORE_CONTINUE24]]: +; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 +; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; DEFAULT: [[PRED_STORE_IF25]]: +; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 +; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] +; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 +; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; DEFAULT: [[PRED_STORE_CONTINUE26]]: +; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 +; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; DEFAULT: [[PRED_STORE_IF27]]: +; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 +; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] +; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 +; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; DEFAULT: [[PRED_STORE_CONTINUE28]]: +; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 +; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; DEFAULT: [[PRED_STORE_IF29]]: +; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 +; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] +; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 +; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; DEFAULT: [[PRED_STORE_CONTINUE30]]: +; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 +; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] +; DEFAULT: [[PRED_STORE_IF31]]: +; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 +; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] +; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 +; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE32]] +; DEFAULT: [[PRED_STORE_CONTINUE32]]: +; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 +; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] +; DEFAULT: [[PRED_STORE_IF33]]: +; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 +; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] +; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 +; DEFAULT-NEXT: store i8 
[[TMP67]], ptr [[TMP66]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE34]] +; DEFAULT: [[PRED_STORE_CONTINUE34]]: +; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 +; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]] +; DEFAULT: [[PRED_STORE_IF35]]: +; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 +; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] +; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 +; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]] +; DEFAULT: [[PRED_STORE_CONTINUE36]]: +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[TMP72:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 +; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP72]] +; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP72]], 1 +; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP72]], 2 +; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; DEFAULT-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] +; DEFAULT-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @tail_predicate_without_optsize( +; OPTSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], 
[ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ] +; OPTSIZE-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; OPTSIZE-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; OPTSIZE-NEXT: [[TMP72:%.*]] = icmp ule <16 x i64> [[VEC_IND]], splat (i64 14) +; OPTSIZE-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] +; OPTSIZE-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) +; OPTSIZE-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] +; OPTSIZE-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) +; OPTSIZE-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] +; OPTSIZE-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] +; OPTSIZE-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP72]], i32 0 +; OPTSIZE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; OPTSIZE: [[PRED_STORE_IF]]: +; OPTSIZE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; OPTSIZE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] +; OPTSIZE-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 +; OPTSIZE-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE]] +; OPTSIZE: [[PRED_STORE_CONTINUE]]: +; OPTSIZE-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP72]], i32 1 +; OPTSIZE-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; OPTSIZE: [[PRED_STORE_IF7]]: +; OPTSIZE-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; OPTSIZE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] +; OPTSIZE-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 +; OPTSIZE-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; OPTSIZE: [[PRED_STORE_CONTINUE8]]: +; OPTSIZE-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP72]], i32 2 +; OPTSIZE-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; OPTSIZE: [[PRED_STORE_IF9]]: +; OPTSIZE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; OPTSIZE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] +; OPTSIZE-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2 +; OPTSIZE-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; OPTSIZE: [[PRED_STORE_CONTINUE10]]: +; OPTSIZE-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP72]], i32 3 +; OPTSIZE-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; OPTSIZE: [[PRED_STORE_IF11]]: +; OPTSIZE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 +; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] +; OPTSIZE-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 +; OPTSIZE-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; OPTSIZE: [[PRED_STORE_CONTINUE12]]: +; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP72]], i32 4 +; OPTSIZE-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; OPTSIZE: [[PRED_STORE_IF13]]: +; OPTSIZE-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 +; OPTSIZE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] +; OPTSIZE-NEXT: [[TMP27:%.*]] = 
extractelement <16 x i8> [[TMP7]], i32 4 +; OPTSIZE-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; OPTSIZE: [[PRED_STORE_CONTINUE14]]: +; OPTSIZE-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP72]], i32 5 +; OPTSIZE-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; OPTSIZE: [[PRED_STORE_IF15]]: +; OPTSIZE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 +; OPTSIZE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] +; OPTSIZE-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 +; OPTSIZE-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; OPTSIZE: [[PRED_STORE_CONTINUE16]]: +; OPTSIZE-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP72]], i32 6 +; OPTSIZE-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; OPTSIZE: [[PRED_STORE_IF17]]: +; OPTSIZE-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 +; OPTSIZE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] +; OPTSIZE-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 +; OPTSIZE-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; OPTSIZE: [[PRED_STORE_CONTINUE18]]: +; OPTSIZE-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP72]], i32 7 +; OPTSIZE-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; OPTSIZE: [[PRED_STORE_IF19]]: +; OPTSIZE-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 +; OPTSIZE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] +; OPTSIZE-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 +; OPTSIZE-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; OPTSIZE: [[PRED_STORE_CONTINUE20]]: +; OPTSIZE-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP72]], i32 8 +; OPTSIZE-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; OPTSIZE: [[PRED_STORE_IF21]]: +; OPTSIZE-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 +; OPTSIZE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] +; OPTSIZE-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 +; OPTSIZE-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; OPTSIZE: [[PRED_STORE_CONTINUE22]]: +; OPTSIZE-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP72]], i32 9 +; OPTSIZE-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; OPTSIZE: [[PRED_STORE_IF23]]: +; OPTSIZE-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 +; OPTSIZE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] +; OPTSIZE-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9 +; OPTSIZE-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; OPTSIZE: [[PRED_STORE_CONTINUE24]]: +; OPTSIZE-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP72]], i32 10 +; OPTSIZE-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; OPTSIZE: [[PRED_STORE_IF25]]: +; OPTSIZE-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 +; OPTSIZE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] +; OPTSIZE-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 +; OPTSIZE-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 +; 
OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; OPTSIZE: [[PRED_STORE_CONTINUE26]]: +; OPTSIZE-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP72]], i32 11 +; OPTSIZE-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; OPTSIZE: [[PRED_STORE_IF27]]: +; OPTSIZE-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 +; OPTSIZE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] +; OPTSIZE-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 +; OPTSIZE-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; OPTSIZE: [[PRED_STORE_CONTINUE28]]: +; OPTSIZE-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP72]], i32 12 +; OPTSIZE-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; OPTSIZE: [[PRED_STORE_IF29]]: +; OPTSIZE-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 +; OPTSIZE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] +; OPTSIZE-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 +; OPTSIZE-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; OPTSIZE: [[PRED_STORE_CONTINUE30]]: +; OPTSIZE-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP72]], i32 13 +; OPTSIZE-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] +; OPTSIZE: [[PRED_STORE_IF31]]: +; OPTSIZE-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 +; OPTSIZE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] +; OPTSIZE-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 +; OPTSIZE-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE32]] +; OPTSIZE: [[PRED_STORE_CONTINUE32]]: +; OPTSIZE-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP72]], i32 14 +; OPTSIZE-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] +; OPTSIZE: [[PRED_STORE_IF33]]: +; OPTSIZE-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 +; OPTSIZE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] +; OPTSIZE-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 +; OPTSIZE-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE34]] +; OPTSIZE: [[PRED_STORE_CONTINUE34]]: +; OPTSIZE-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP72]], i32 15 +; OPTSIZE-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]] +; OPTSIZE: [[PRED_STORE_IF35]]: +; OPTSIZE-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 +; OPTSIZE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] +; OPTSIZE-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 +; OPTSIZE-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE36]] +; OPTSIZE: [[PRED_STORE_CONTINUE36]]: +; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; OPTSIZE-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; OPTSIZE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br 
label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 +; OPTSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]] +; OPTSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1 +; OPTSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; OPTSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; OPTSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP0]], 2 +; OPTSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; OPTSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @tail_predicate_without_optsize( +; MINSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 +; MINSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]] +; MINSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1 +; MINSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; MINSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; MINSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP0]], 2 +; MINSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; MINSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] +; MINSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %0 = trunc nuw nsw i64 %iv to i8 + %mul = mul i8 %a, %0 + %shr = lshr i8 %0, 1 + %mul5 = mul i8 %shr, %b + %add = add i8 %mul5, %mul + %shr7 = lshr i8 %0, 2 + %mul9 = mul i8 %shr7, %c + %add10 = add i8 %add, %mul9 + %arrayidx = getelementptr inbounds i8, ptr %p, i64 %iv + store i8 %add10, ptr %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 15 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; This is the same as the previous test, but with sve enabled which means we can +; tail predicate without scalarizing the store. This means we should vectorize +; with optsize, but we still shouldn't vectorize with minsize as doing so +; would result in larger code. 
+; FIXME: We currently vectorize with minsize as we don't account for the cost of +; the lane mask instructions +define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) #0 { +; DEFAULT-LABEL: define void @sve_tail_predicate_without_minsize( +; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; DEFAULT-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; DEFAULT-NEXT: [[N_RND_UP:%.*]] = add i64 15, [[TMP2]] +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 +; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; DEFAULT-NEXT: [[TMP7:%.*]] = sub i64 15, [[TMP6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]] +; DEFAULT-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15) +; DEFAULT-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() +; DEFAULT-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) +; DEFAULT-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; DEFAULT-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8 +; DEFAULT-NEXT: [[TMP13:%.*]] = mul i8 1, [[TMP12]] +; DEFAULT-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP13]], i64 0 +; DEFAULT-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[A]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[B]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP15:%.*]] = mul [[BROADCAST_SPLAT]], [[VEC_IND]] +; DEFAULT-NEXT: [[TMP16:%.*]] = lshr [[VEC_IND]], splat (i8 1) +; DEFAULT-NEXT: [[TMP17:%.*]] = mul [[TMP16]], [[BROADCAST_SPLAT2]] +; DEFAULT-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[TMP15]] +; DEFAULT-NEXT: [[TMP19:%.*]] = lshr [[VEC_IND]], splat (i8 2) +; DEFAULT-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[BROADCAST_SPLAT4]] +; DEFAULT-NEXT: [[TMP21:%.*]] = add [[TMP18]], [[TMP20]] +; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP14]] +; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr 
inbounds i8, ptr [[TMP22]], i32 0 +; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP21]], ptr [[TMP23]], i32 1, [[ACTIVE_LANE_MASK]]) +; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; DEFAULT-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; DEFAULT-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 +; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[TMP26:%.*]] = trunc nuw nsw i64 [[IV]] to i8 +; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP26]] +; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP26]], 1 +; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP26]], 2 +; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; DEFAULT-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IV]] +; DEFAULT-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @sve_tail_predicate_without_minsize( +; OPTSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; OPTSIZE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; OPTSIZE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; OPTSIZE-NEXT: [[N_RND_UP:%.*]] = add i64 15, [[TMP2]] +; OPTSIZE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; OPTSIZE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; OPTSIZE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; OPTSIZE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 +; OPTSIZE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; OPTSIZE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; OPTSIZE-NEXT: [[TMP7:%.*]] = sub i64 15, [[TMP6]] +; OPTSIZE-NEXT: [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]] +; OPTSIZE-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15) +; OPTSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() +; OPTSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) +; OPTSIZE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; OPTSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8 +; OPTSIZE-NEXT: [[TMP13:%.*]] = mul i8 1, [[TMP12]] +; OPTSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP13]], i64 0 
+; OPTSIZE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[A]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[B]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; OPTSIZE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0 +; OPTSIZE-NEXT: [[TMP15:%.*]] = mul [[BROADCAST_SPLAT]], [[VEC_IND]] +; OPTSIZE-NEXT: [[TMP16:%.*]] = lshr [[VEC_IND]], splat (i8 1) +; OPTSIZE-NEXT: [[TMP17:%.*]] = mul [[TMP16]], [[BROADCAST_SPLAT2]] +; OPTSIZE-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[TMP15]] +; OPTSIZE-NEXT: [[TMP19:%.*]] = lshr [[VEC_IND]], splat (i8 2) +; OPTSIZE-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[BROADCAST_SPLAT4]] +; OPTSIZE-NEXT: [[TMP21:%.*]] = add [[TMP18]], [[TMP20]] +; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP14]] +; OPTSIZE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0 +; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP21]], ptr [[TMP23]], i32 1, [[ACTIVE_LANE_MASK]]) +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; OPTSIZE-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; OPTSIZE-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 +; OPTSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP26:%.*]] = trunc nuw nsw i64 [[IV]] to i8 +; OPTSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP26]] +; OPTSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP26]], 1 +; OPTSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; OPTSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; OPTSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP26]], 2 +; OPTSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; OPTSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IV]] +; OPTSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; OPTSIZE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 
[[IV_NEXT]], 15 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @sve_tail_predicate_without_minsize( +; MINSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MINSIZE: [[VECTOR_PH]]: +; MINSIZE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; MINSIZE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; MINSIZE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; MINSIZE-NEXT: [[N_RND_UP:%.*]] = add i64 15, [[TMP2]] +; MINSIZE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; MINSIZE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; MINSIZE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; MINSIZE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 +; MINSIZE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; MINSIZE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; MINSIZE-NEXT: [[TMP7:%.*]] = sub i64 15, [[TMP6]] +; MINSIZE-NEXT: [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]] +; MINSIZE-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15) +; MINSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() +; MINSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) +; MINSIZE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; MINSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8 +; MINSIZE-NEXT: [[TMP13:%.*]] = mul i8 1, [[TMP12]] +; MINSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP13]], i64 0 +; MINSIZE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[A]], i64 0 +; MINSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[B]], i64 0 +; MINSIZE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 +; MINSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; MINSIZE: [[VECTOR_BODY]]: +; MINSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MINSIZE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MINSIZE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MINSIZE-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0 +; MINSIZE-NEXT: [[TMP15:%.*]] = mul [[BROADCAST_SPLAT]], [[VEC_IND]] +; MINSIZE-NEXT: [[TMP16:%.*]] = lshr [[VEC_IND]], splat (i8 1) +; MINSIZE-NEXT: [[TMP17:%.*]] = mul [[TMP16]], [[BROADCAST_SPLAT2]] +; MINSIZE-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[TMP15]] +; MINSIZE-NEXT: [[TMP19:%.*]] = lshr [[VEC_IND]], splat (i8 2) +; MINSIZE-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[BROADCAST_SPLAT4]] +; MINSIZE-NEXT: [[TMP21:%.*]] = add [[TMP18]], [[TMP20]] +; MINSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP14]] +; MINSIZE-NEXT: [[TMP23:%.*]] = getelementptr inbounds 
i8, ptr [[TMP22]], i32 0 +; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP21]], ptr [[TMP23]], i32 1, [[ACTIVE_LANE_MASK]]) +; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; MINSIZE-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; MINSIZE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; MINSIZE-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 +; MINSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; MINSIZE: [[MIDDLE_BLOCK]]: +; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE: [[SCALAR_PH]]: +; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[TMP26:%.*]] = trunc nuw nsw i64 [[IV]] to i8 +; MINSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP26]] +; MINSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP26]], 1 +; MINSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; MINSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; MINSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP26]], 2 +; MINSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; MINSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IV]] +; MINSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; MINSIZE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %0 = trunc nuw nsw i64 %iv to i8 + %mul = mul i8 %a, %0 + %shr = lshr i8 %0, 1 + %mul5 = mul i8 %shr, %b + %add = add i8 %mul5, %mul + %shr7 = lshr i8 %0, 2 + %mul9 = mul i8 %shr7, %c + %add10 = add i8 %add, %mul9 + %arrayidx = getelementptr inbounds i8, ptr %p, i64 %iv + store i8 %add10, ptr %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 15 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; Vector width 8 is fastest, default behaviour is to interleave by 2 but we +; shouldn't do this with optsize. We shouldn't vectorize with minsize as it +; needs an extra instruction to do the trunc which the scalar version doesn't +; need. 
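+; As a rough sketch of that comparison (illustrative unit costs, not the
+; exact TTI numbers): the scalar body is load, load, mul, load, add, store,
+; where the i32 -> i16 trunc is free because the half-word store only keeps
+; the low 16 bits, while the vector body performs the same six operations on
+; <8 x i32> / <8 x i16> values plus an explicit trunc instruction, so with
+; minsize the scalar loop should come out smaller.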
+; FIXME: We currently vectorize with minsize as the trunc cost is incorrect +define void @dont_vectorize_with_minsize() { +; DEFAULT-LABEL: define void @dont_vectorize_with_minsize() { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 8 +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 +; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 8 +; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 +; DEFAULT-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 +; DEFAULT-NEXT: [[TMP7:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; DEFAULT-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD3]] +; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0 +; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 8 +; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 +; DEFAULT-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 +; DEFAULT-NEXT: [[TMP12:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16> +; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16> +; DEFAULT-NEXT: [[TMP14:%.*]] = add <8 x i16> [[TMP12]], [[WIDE_LOAD4]] +; DEFAULT-NEXT: [[TMP15:%.*]] = add <8 x i16> [[TMP13]], [[WIDE_LOAD5]] +; DEFAULT-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP10]], align 2 +; DEFAULT-NEXT: store <8 x i16> [[TMP15]], ptr [[TMP11]], align 2 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr 
inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; DEFAULT-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; DEFAULT-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @dont_vectorize_with_minsize( +; OPTSIZE-SAME: ) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 +; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 +; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP5]] to <8 x i16> +; OPTSIZE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP8]], [[WIDE_LOAD2]] +; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; OPTSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; OPTSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; OPTSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], 
[[AVAL]] +; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @dont_vectorize_with_minsize( +; MINSIZE-SAME: ) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MINSIZE: [[VECTOR_PH]]: +; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; MINSIZE: [[VECTOR_BODY]]: +; MINSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MINSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]] +; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; MINSIZE: [[MIDDLE_BLOCK]]: +; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE: [[SCALAR_PH]]: +; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; MINSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; MINSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; MINSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; MINSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; MINSIZE-NEXT: br 
i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 %iv + %bval = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 %iv + %cval = load i32, ptr %arrayidx2, align 4 + %mul = mul nsw i32 %bval, %cval + %arrayidx4 = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 %iv + %aval = load i16, ptr %arrayidx4, align 2 + %trunc = trunc i32 %mul to i16 + %add = add i16 %trunc, %aval + store i16 %add, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + + +; If vectorization is forced then with minsize vector width 4 is the smallest: +; width 8 means the 8 x i32 mul needs two instructions, width 2 means we have a +; 2 x i16 load and store which each become two scalar instructions. +; FIXME: We currently use width 2 as the load/store cost is incorrect. +define void @vectorization_forced_minsize_reduce_width() { +; DEFAULT-LABEL: define void @vectorization_forced_minsize_reduce_width() { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 8 +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 +; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 8 +; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 +; DEFAULT-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 +; DEFAULT-NEXT: [[TMP7:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; DEFAULT-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD3]] +; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0 +; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 8 +; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 +; DEFAULT-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 +; DEFAULT-NEXT: [[TMP12:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16> +; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16> +; DEFAULT-NEXT: [[TMP14:%.*]] = add <8 x i16> [[TMP12]], [[WIDE_LOAD4]] +; DEFAULT-NEXT: [[TMP15:%.*]] = add <8 x i16> [[TMP13]], [[WIDE_LOAD5]] +; DEFAULT-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP10]], align 2 +; 
DEFAULT-NEXT: store <8 x i16> [[TMP15]], ptr [[TMP11]], align 2 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; DEFAULT-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; DEFAULT-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @vectorization_forced_minsize_reduce_width( +; OPTSIZE-SAME: ) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 +; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 +; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP5]] to <8 x i16> +; OPTSIZE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP8]], [[WIDE_LOAD2]] +; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[TMP10]], 
label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; OPTSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; OPTSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; OPTSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @vectorization_forced_minsize_reduce_width( +; MINSIZE-SAME: ) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MINSIZE: [[VECTOR_PH]]: +; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; MINSIZE: [[VECTOR_BODY]]: +; MINSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MINSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]] +; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; MINSIZE: [[MIDDLE_BLOCK]]: +; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE: [[SCALAR_PH]]: +; 
MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; MINSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; MINSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; MINSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; MINSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 %iv + %bval = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 %iv + %cval = load i32, ptr %arrayidx2, align 4 + %mul = mul nsw i32 %bval, %cval + %arrayidx4 = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 %iv + %aval = load i16, ptr %arrayidx4, align 2 + %trunc = trunc i32 %mul to i16 + %add = add i16 %trunc, %aval + store i16 %add, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 + +for.cond.cleanup: + ret void +} + +attributes #0 = { "target-features"="+sve" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} + diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll new file mode 100644 index 0000000000000..c9ddf19b1b9d9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -0,0 +1,1012 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; The tests here check for differences in behaviour between the default, +; optsize, and minsize. +; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s --check-prefix=DEFAULT +; RUN: opt -passes=forceattrs,loop-vectorize -force-attribute=optsize -S < %s | FileCheck %s --check-prefix=OPTSIZE +; RUN: opt -passes=forceattrs,loop-vectorize -force-attribute=minsize -S < %s | FileCheck %s --check-prefix=MINSIZE + +target triple = "armv7a-none-eabi" + +@A = global [1000 x i16] zeroinitializer, align 2 +@B = global [1000 x i32] zeroinitializer, align 4 +@C = global [1000 x i32] zeroinitializer, align 4 + +; This should always vectorize, as using vector instructions eliminates the loop +; which is both faster and smaller (a scalar version is emitted, but the branch +; to it is false and it's later removed). 
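+; Sketch of why the scalar loop disappears: the constant trip count of 4
+; matches the VF, so the vector body runs exactly once, the guard into the
+; scalar path is the constant br i1 false, and the middle block branches
+; br i1 true straight to the exit; later cleanup (e.g. SimplifyCFG) can then
+; delete the dead scalar loop.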
+define void @always_vectorize(ptr %p, i32 %x) { +; DEFAULT-LABEL: define void @always_vectorize( +; DEFAULT-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 0 +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4 +; DEFAULT-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] +; DEFAULT-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @always_vectorize( +; OPTSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 0 +; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; OPTSIZE-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; OPTSIZE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4 +; OPTSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] +; OPTSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @always_vectorize( +; MINSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MINSIZE: [[VECTOR_PH]]: +; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; MINSIZE: [[VECTOR_BODY]]: +; MINSIZE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 0 +; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; MINSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; MINSIZE-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; MINSIZE-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; MINSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; MINSIZE: [[MIDDLE_BLOCK]]: +; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE: [[SCALAR_PH]]: +; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] +; MINSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %x + store i32 %add, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; This should vectorize only without optsize, as it needs a scalar version +; which increases code size. 
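+; As the DEFAULT output below shows, the unknown trip count %n requires a
+; runtime minimum-iteration check and a scalar remainder loop; both are pure
+; code-size overhead, so with optsize and minsize the loop stays scalar.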
+define void @vectorize_without_optsize(ptr %p, i32 %x, i64 %n) { +; DEFAULT-LABEL: define void @vectorize_without_optsize( +; DEFAULT-SAME: ptr [[P:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] +; DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[X]] +; DEFAULT-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @vectorize_without_optsize( +; OPTSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[X]] +; OPTSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; OPTSIZE-NEXT: br i1 
[[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @vectorize_without_optsize( +; MINSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[X]] +; MINSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %x + store i32 %add, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; This should be vectorized and tail predicated without optsize, as that's +; faster, but not with optsize, as it's much larger. +; FIXME: Currently we avoid tail predication only with minsize +define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { +; DEFAULT-LABEL: define void @tail_predicate_without_optsize( +; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ] +; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> [[VEC_IND]], splat (i64 14) +; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] +; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) +; DEFAULT-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] +; 
DEFAULT-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] +; DEFAULT-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) +; DEFAULT-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] +; DEFAULT-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 +; DEFAULT-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; DEFAULT: [[PRED_STORE_IF]]: +; DEFAULT-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] +; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 +; DEFAULT-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] +; DEFAULT: [[PRED_STORE_CONTINUE]]: +; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 +; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; DEFAULT: [[PRED_STORE_IF7]]: +; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] +; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 +; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; DEFAULT: [[PRED_STORE_CONTINUE8]]: +; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; DEFAULT: [[PRED_STORE_IF9]]: +; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] +; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2 +; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; DEFAULT: [[PRED_STORE_CONTINUE10]]: +; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 +; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; DEFAULT: [[PRED_STORE_IF11]]: +; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 +; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] +; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 +; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; DEFAULT: [[PRED_STORE_CONTINUE12]]: +; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 +; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; DEFAULT: [[PRED_STORE_IF13]]: +; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] +; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4 +; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; DEFAULT: [[PRED_STORE_CONTINUE14]]: +; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 +; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; DEFAULT: [[PRED_STORE_IF15]]: +; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 +; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] +; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 +; 
DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; DEFAULT: [[PRED_STORE_CONTINUE16]]: +; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 +; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; DEFAULT: [[PRED_STORE_IF17]]: +; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 +; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] +; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 +; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; DEFAULT: [[PRED_STORE_CONTINUE18]]: +; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 +; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; DEFAULT: [[PRED_STORE_IF19]]: +; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 +; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] +; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 +; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; DEFAULT: [[PRED_STORE_CONTINUE20]]: +; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 +; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; DEFAULT: [[PRED_STORE_IF21]]: +; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] +; DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 +; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; DEFAULT: [[PRED_STORE_CONTINUE22]]: +; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 +; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; DEFAULT: [[PRED_STORE_IF23]]: +; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 +; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] +; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9 +; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; DEFAULT: [[PRED_STORE_CONTINUE24]]: +; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 +; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; DEFAULT: [[PRED_STORE_IF25]]: +; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 +; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] +; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 +; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; DEFAULT: [[PRED_STORE_CONTINUE26]]: +; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 +; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; DEFAULT: [[PRED_STORE_IF27]]: +; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 +; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] +; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 +; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE28]] 
+; DEFAULT: [[PRED_STORE_CONTINUE28]]: +; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 +; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; DEFAULT: [[PRED_STORE_IF29]]: +; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 +; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] +; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 +; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; DEFAULT: [[PRED_STORE_CONTINUE30]]: +; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 +; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] +; DEFAULT: [[PRED_STORE_IF31]]: +; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 +; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] +; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 +; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE32]] +; DEFAULT: [[PRED_STORE_CONTINUE32]]: +; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 +; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] +; DEFAULT: [[PRED_STORE_IF33]]: +; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 +; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] +; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 +; DEFAULT-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE34]] +; DEFAULT: [[PRED_STORE_CONTINUE34]]: +; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 +; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]] +; DEFAULT: [[PRED_STORE_IF35]]: +; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 +; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] +; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 +; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]] +; DEFAULT: [[PRED_STORE_CONTINUE36]]: +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[TMP72:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 +; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP72]] +; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP72]], 1 +; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP72]], 2 +; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; DEFAULT-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] 
+; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] +; DEFAULT-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @tail_predicate_without_optsize( +; OPTSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer +; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; OPTSIZE-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ] +; OPTSIZE-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; OPTSIZE-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ] +; OPTSIZE-NEXT: [[TMP72:%.*]] = icmp ule <16 x i64> [[VEC_IND]], splat (i64 14) +; OPTSIZE-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] +; OPTSIZE-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) +; OPTSIZE-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] +; OPTSIZE-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) +; OPTSIZE-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] +; OPTSIZE-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] +; OPTSIZE-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP72]], i32 0 +; OPTSIZE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; OPTSIZE: [[PRED_STORE_IF]]: +; OPTSIZE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; OPTSIZE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] +; OPTSIZE-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 +; OPTSIZE-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE]] +; OPTSIZE: [[PRED_STORE_CONTINUE]]: +; OPTSIZE-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP72]], i32 1 +; OPTSIZE-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; OPTSIZE: [[PRED_STORE_IF7]]: +; OPTSIZE-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; OPTSIZE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] +; OPTSIZE-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 +; OPTSIZE-NEXT: store i8 
[[TMP15]], ptr [[TMP14]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; OPTSIZE: [[PRED_STORE_CONTINUE8]]: +; OPTSIZE-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP72]], i32 2 +; OPTSIZE-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; OPTSIZE: [[PRED_STORE_IF9]]: +; OPTSIZE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; OPTSIZE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] +; OPTSIZE-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2 +; OPTSIZE-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; OPTSIZE: [[PRED_STORE_CONTINUE10]]: +; OPTSIZE-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP72]], i32 3 +; OPTSIZE-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; OPTSIZE: [[PRED_STORE_IF11]]: +; OPTSIZE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 +; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] +; OPTSIZE-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 +; OPTSIZE-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; OPTSIZE: [[PRED_STORE_CONTINUE12]]: +; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP72]], i32 4 +; OPTSIZE-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; OPTSIZE: [[PRED_STORE_IF13]]: +; OPTSIZE-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 +; OPTSIZE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] +; OPTSIZE-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4 +; OPTSIZE-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; OPTSIZE: [[PRED_STORE_CONTINUE14]]: +; OPTSIZE-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP72]], i32 5 +; OPTSIZE-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; OPTSIZE: [[PRED_STORE_IF15]]: +; OPTSIZE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 +; OPTSIZE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] +; OPTSIZE-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 +; OPTSIZE-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; OPTSIZE: [[PRED_STORE_CONTINUE16]]: +; OPTSIZE-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP72]], i32 6 +; OPTSIZE-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; OPTSIZE: [[PRED_STORE_IF17]]: +; OPTSIZE-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 +; OPTSIZE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] +; OPTSIZE-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 +; OPTSIZE-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; OPTSIZE: [[PRED_STORE_CONTINUE18]]: +; OPTSIZE-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP72]], i32 7 +; OPTSIZE-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; OPTSIZE: [[PRED_STORE_IF19]]: +; OPTSIZE-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 +; OPTSIZE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] +; OPTSIZE-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 +; OPTSIZE-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; OPTSIZE: 
[[PRED_STORE_CONTINUE20]]: +; OPTSIZE-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP72]], i32 8 +; OPTSIZE-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; OPTSIZE: [[PRED_STORE_IF21]]: +; OPTSIZE-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 +; OPTSIZE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] +; OPTSIZE-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 +; OPTSIZE-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; OPTSIZE: [[PRED_STORE_CONTINUE22]]: +; OPTSIZE-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP72]], i32 9 +; OPTSIZE-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; OPTSIZE: [[PRED_STORE_IF23]]: +; OPTSIZE-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 +; OPTSIZE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] +; OPTSIZE-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9 +; OPTSIZE-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; OPTSIZE: [[PRED_STORE_CONTINUE24]]: +; OPTSIZE-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP72]], i32 10 +; OPTSIZE-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; OPTSIZE: [[PRED_STORE_IF25]]: +; OPTSIZE-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 +; OPTSIZE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] +; OPTSIZE-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 +; OPTSIZE-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; OPTSIZE: [[PRED_STORE_CONTINUE26]]: +; OPTSIZE-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP72]], i32 11 +; OPTSIZE-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; OPTSIZE: [[PRED_STORE_IF27]]: +; OPTSIZE-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 +; OPTSIZE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] +; OPTSIZE-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 +; OPTSIZE-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; OPTSIZE: [[PRED_STORE_CONTINUE28]]: +; OPTSIZE-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP72]], i32 12 +; OPTSIZE-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; OPTSIZE: [[PRED_STORE_IF29]]: +; OPTSIZE-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 +; OPTSIZE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] +; OPTSIZE-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 +; OPTSIZE-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; OPTSIZE: [[PRED_STORE_CONTINUE30]]: +; OPTSIZE-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP72]], i32 13 +; OPTSIZE-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] +; OPTSIZE: [[PRED_STORE_IF31]]: +; OPTSIZE-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 +; OPTSIZE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] +; OPTSIZE-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 +; OPTSIZE-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE32]] +; OPTSIZE: [[PRED_STORE_CONTINUE32]]: +; OPTSIZE-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP72]], 
i32 14 +; OPTSIZE-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] +; OPTSIZE: [[PRED_STORE_IF33]]: +; OPTSIZE-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 +; OPTSIZE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] +; OPTSIZE-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 +; OPTSIZE-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE34]] +; OPTSIZE: [[PRED_STORE_CONTINUE34]]: +; OPTSIZE-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP72]], i32 15 +; OPTSIZE-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]] +; OPTSIZE: [[PRED_STORE_IF35]]: +; OPTSIZE-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 +; OPTSIZE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] +; OPTSIZE-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 +; OPTSIZE-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 +; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE36]] +; OPTSIZE: [[PRED_STORE_CONTINUE36]]: +; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; OPTSIZE-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; OPTSIZE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 +; OPTSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]] +; OPTSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1 +; OPTSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; OPTSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; OPTSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP0]], 2 +; OPTSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; OPTSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @tail_predicate_without_optsize( +; MINSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 +; MINSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]] +; MINSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1 +; MINSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; MINSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; MINSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP0]], 2 +; MINSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; 
MINSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] +; MINSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %0 = trunc nuw nsw i64 %iv to i8 + %mul = mul i8 %a, %0 + %shr = lshr i8 %0, 1 + %mul5 = mul i8 %shr, %b + %add = add i8 %mul5, %mul + %shr7 = lshr i8 %0, 2 + %mul9 = mul i8 %shr7, %c + %add10 = add i8 %add, %mul9 + %arrayidx = getelementptr inbounds i8, ptr %p, i64 %iv + store i8 %add10, ptr %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 15 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + + +; Vector width 4 is fastest. We shouldn't vectorize with minsize as it needs an +; extra instruction to do the trunc which the scalar version doesn't need, and +; we can't combine the getelementptr into the load/store. +; FIXME: We currently vectorize with minsize as the trunc cost is incorrect and +; we don't account for the addressing mode difference. +define void @dont_vectorize_with_minsize() { +; DEFAULT-LABEL: define void @dont_vectorize_with_minsize() { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP10]], align 2 +; DEFAULT-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16> +; DEFAULT-NEXT: [[TMP11:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]] +; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP10]], align 2 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: 
[[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; DEFAULT-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; DEFAULT-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @dont_vectorize_with_minsize( +; OPTSIZE-SAME: ) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16> +; OPTSIZE-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]] +; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 
[[INDVARS_IV]] +; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; OPTSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; OPTSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; OPTSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @dont_vectorize_with_minsize( +; MINSIZE-SAME: ) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MINSIZE: [[VECTOR_PH]]: +; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; MINSIZE: [[VECTOR_BODY]]: +; MINSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MINSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]] +; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; MINSIZE: [[MIDDLE_BLOCK]]: +; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE: [[SCALAR_PH]]: +; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr 
[[ARRAYIDX2]], align 4 +; MINSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; MINSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; MINSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; MINSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 %iv + %bval = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 %iv + %cval = load i32, ptr %arrayidx2, align 4 + %mul = mul nsw i32 %bval, %cval + %arrayidx4 = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 %iv + %aval = load i16, ptr %arrayidx4, align 2 + %trunc = trunc i32 %mul to i16 + %add = add i16 %trunc, %aval + store i16 %add, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + + +; If vectorization is forced then with minsize vector width 4 is the smallest: +; width 8 means each 8 x i32 load needs two instructions, width 2 means we have +; to use extra instructions to handle the 2 x i16 load and store. +; FIXME: We currently use width 2 as the load/store cost is incorrect. 
+define void @vectorization_forced() { +; DEFAULT-LABEL: define void @vectorization_forced() { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0 +; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP10]], align 2 +; DEFAULT-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16> +; DEFAULT-NEXT: [[TMP11:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]] +; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP10]], align 2 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; DEFAULT-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; DEFAULT-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; DEFAULT-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; DEFAULT: [[FOR_COND_CLEANUP]]: +; DEFAULT-NEXT: ret void +; +; OPTSIZE-LABEL: define void @vectorization_forced( +; OPTSIZE-SAME: ) #[[ATTR0]] { +; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; OPTSIZE: [[VECTOR_PH]]: +; 
OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; OPTSIZE: [[VECTOR_BODY]]: +; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; OPTSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16> +; OPTSIZE-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]] +; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; OPTSIZE: [[MIDDLE_BLOCK]]: +; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE: [[SCALAR_PH]]: +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] +; OPTSIZE: [[FOR_BODY]]: +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; OPTSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; OPTSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; OPTSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; OPTSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; OPTSIZE: [[FOR_COND_CLEANUP]]: +; OPTSIZE-NEXT: ret void +; +; MINSIZE-LABEL: define void @vectorization_forced( +; MINSIZE-SAME: ) #[[ATTR0]] { +; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MINSIZE: [[VECTOR_PH]]: +; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] +; MINSIZE: [[VECTOR_BODY]]: +; MINSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MINSIZE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MINSIZE-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]] +; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0 +; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]] +; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2 +; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; MINSIZE: [[MIDDLE_BLOCK]]: +; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE: [[SCALAR_PH]]: +; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] +; MINSIZE: [[FOR_BODY]]: +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; MINSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] +; MINSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] +; MINSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 +; MINSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] +; MINSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 +; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; MINSIZE: [[FOR_COND_CLEANUP]]: +; MINSIZE-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 %iv + %bval = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 %iv + %cval = load i32, ptr %arrayidx2, align 4 + %mul = mul nsw i32 %bval, %cval + %arrayidx4 = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 %iv + %aval = load i16, ptr %arrayidx4, align 2 + %trunc = trunc i32 %mul to i16 + %add = add i16 %trunc, %aval + store i16 %add, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label 
%for.cond.cleanup, label %for.body, !llvm.loop !0 + +for.cond.cleanup: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +
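
; For reference, the mechanism @vectorization_forced relies on is the !llvm.loop
; metadata attached to the loop's backedge branch: the !0/!1 nodes above set
; "llvm.loop.vectorize.enable", which makes the vectorizer choose among vector
; widths rather than reject the loop as unprofitable under minsize. Below is a
; minimal standalone sketch (not part of the patch) of the same hint; the
; function name @force_vf_example and the extra "llvm.loop.vectorize.width"
; entry pinning VF to 4 are illustrative assumptions (the test above only sets
; the enable flag), though both metadata keys are real LLVM loop hints.

define void @force_vf_example(ptr %p) {
entry:
  br label %loop

loop:
  ; Simple i16 increment loop; the !llvm.loop hint on the backedge below asks
  ; the vectorizer to vectorize it regardless of the cost model's verdict.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %addr = getelementptr inbounds i16, ptr %p, i64 %iv
  %val = load i16, ptr %addr, align 2
  %inc = add i16 %val, 1
  store i16 %inc, ptr %addr, align 2
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, 64
  br i1 %done, label %exit, label %loop, !llvm.loop !2

exit:
  ret void
}

; The first operand of a loop-metadata node must be the node itself; the
; "enable" entry forces the decision, and the "width" entry (an assumption
; here, not used by the test above) would additionally pin VF = 4, bypassing
; the width selection that the test exercises.
!2 = distinct !{!2, !3, !4}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.width", i32 4}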