diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b96d29e635465..9cfc51badb339 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6224,6 +6224,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Type *ValTy = I->getOperand(0)->getType();
+    InstructionCost Cost = 0;
 
     if (canTruncateToMinimalBitwidth(I, VF)) {
       [[maybe_unused]] Instruction *Op0AsInstruction =
@@ -6235,11 +6236,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
     }
 
+    // If the Cmp instruction has multiple uses in the loop, it will generate
+    // a scalar Cmp for the latch and a vector Cmp for the other uses.
+    if (I == TheLoop->getLatchCmpInst() && !I->hasOneUse())
+      Cost += TTI.getCmpSelInstrCost(I->getOpcode(), ValTy,
+                                     CmpInst::makeCmpResultType(ValTy),
+                                     cast<CmpInst>(I)->getPredicate(), CostKind,
+                                     {TTI::OK_AnyValue, TTI::OP_None},
+                                     {TTI::OK_AnyValue, TTI::OP_None}, I);
+
     VectorTy = toVectorTy(ValTy, VF);
-    return TTI.getCmpSelInstrCost(
-        I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
-        cast<CmpInst>(I)->getPredicate(), CostKind,
-        {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
+    return Cost + TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy,
+                                         CmpInst::makeCmpResultType(VectorTy),
+                                         cast<CmpInst>(I)->getPredicate(),
+                                         CostKind,
+                                         {TTI::OK_AnyValue, TTI::OP_None},
+                                         {TTI::OK_AnyValue, TTI::OP_None}, I);
   }
   case Instruction::Store:
   case Instruction::Load: {
@@ -6769,46 +6781,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     }
   }
 
-  /// Compute the cost of all exiting conditions of the loop using the legacy
-  /// cost model. This is to match the legacy behavior, which adds the cost of
-  /// all exit conditions. Note that this over-estimates the cost, as there will
-  /// be a single condition to control the vector loop.
-  SmallVector<BasicBlock *> Exiting;
-  CM.TheLoop->getExitingBlocks(Exiting);
-  SetVector<Instruction *> ExitInstrs;
-  // Collect all exit conditions.
-  for (BasicBlock *EB : Exiting) {
-    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
-    if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
-      continue;
-    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
-      ExitInstrs.insert(CondI);
-    }
-  }
-  // Compute the cost of all instructions only feeding the exit conditions.
-  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
-    Instruction *CondI = ExitInstrs[I];
-    if (!OrigLoop->contains(CondI) ||
-        !CostCtx.SkipCostComputation.insert(CondI).second)
-      continue;
-    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
-    LLVM_DEBUG({
-      dbgs() << "Cost of " << CondICost << " for VF " << VF
-             << ": exit condition instruction " << *CondI << "\n";
-    });
-    Cost += CondICost;
-    for (Value *Op : CondI->operands()) {
-      auto *OpI = dyn_cast<Instruction>(Op);
-      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
-          any_of(OpI->users(), [&ExitInstrs, this](User *U) {
-            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
-                   !ExitInstrs.contains(cast<Instruction>(U));
-          }))
-        continue;
-      ExitInstrs.insert(OpI);
-    }
-  }
-
   // Pre-compute the costs for branches except for the backedge, as the number
   // of replicate regions in a VPlan may not directly match the number of
   // branches, which would lead to different decisions.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8e91677292788..355fa92b9cb74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1104,6 +1104,36 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
     return Ctx.TTI.getArithmeticReductionCost(
         Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
   }
+  case VPInstruction::BranchOnCount: {
+    Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
+
+    // If the vector loop only executes once (VF == original trip count),
+    // ignore the cost of the cmp.
+    // TODO: We can remove this after hoisting `unrollByUF` and
+    // `optimizeForVFandUF`, which will optimize BranchOnCount away.
+    auto TC = dyn_cast_if_present<ConstantInt>(
+        getParent()->getPlan()->getTripCount()->getUnderlyingValue());
+    if (TC && VF.isFixed() && TC->getZExtValue() == VF.getFixedValue())
+      return 0;
+
+    // BranchOnCount will generate icmp_eq + br instructions, and the cost of
+    // the branch is calculated in VPRegionBlock.
+    return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy, nullptr,
+                                      CmpInst::ICMP_EQ, Ctx.CostKind);
+  }
+  case VPInstruction::BranchOnCond: {
+    // BranchOnCond is free since the branch cost is already calculated by VPBB.
+    if (vputils::onlyFirstLaneUsed(getOperand(0)))
+      return 0;
+
+    // Otherwise, BranchOnCond will generate `extractelement` to extract the
+    // condition from the vector type.
+    return Ctx.TTI.getVectorInstrCost(
+        Instruction::ExtractElement,
+        cast<VectorType>(
+            toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)),
+        Ctx.CostKind, 0, nullptr, nullptr);
+  }
   case VPInstruction::FirstActiveLane: {
     Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
     if (VF.isScalar())
@@ -1145,6 +1175,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::Not: {
+    Type *RetTy = Ctx.Types.inferScalarType(getOperand(0));
+    if (!vputils::onlyFirstLaneUsed(this))
+      RetTy = toVectorTy(RetTy, VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Xor, RetTy,
+                                          Ctx.CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    Type *SrcTy = Ctx.Types.inferScalarType(getOperand(0));
+    Type *RetTy = Ctx.Types.inferScalarType(this);
+    if (!vputils::onlyFirstLaneUsed(this)) {
+      SrcTy = toVectorTy(SrcTy, VF);
+      RetTy = toVectorTy(RetTy, VF);
+    }
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, SrcTy, RetTy, getPredicate(),
+                                      Ctx.CostKind,
+                                      {TTI::OK_AnyValue, TTI::OP_None},
+                                      {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
+  }
   case VPInstruction::ExtractLastElement: {
     // Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 6cf11be0e11f7..26e4280a6fa65 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -534,25 +534,47 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-LABEL: define void @multiple_exit_conditions( ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]] +; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP5]], 3 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP4]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: -; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048 +; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]] +; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 8 +; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2 ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1) -; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double> -; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[NEXT_GEP]], align 8 -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP1]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP20:%.*]] = or [[BROADCAST_SPLAT]], splat (i16 1) +; DEFAULT-NEXT: [[TMP9:%.*]] = uitofp [[TMP20]] to +; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1 +; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP11]] +; DEFAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 2 +; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP14]] +; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 6 +; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP17]] 
+; DEFAULT-NEXT: store [[TMP9]], ptr [[NEXT_GEP]], align 8 +; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP12]], align 8 +; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP15]], align 8 +; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP18]], align 8 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; DEFAULT-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br label %[[SCALAR_PH:.*]] +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: ; ; PRED-LABEL: define void @multiple_exit_conditions( @@ -660,16 +682,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 ; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]] +; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] ; COMMON: [[PRED_STORE_IF13]]: ; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 ; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT1]] -; COMMON: [[EXIT1]]: -; COMMON-NEXT: br label %[[SCALAR_PH1:.*]] -; COMMON: [[SCALAR_PH1]]: -; COMMON-NEXT: br [[EXIT:label %.*]] -; COMMON: [[SCALAR_PH:.*:]] +; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; COMMON: [[PRED_STORE_CONTINUE14]]: +; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; COMMON: [[MIDDLE_BLOCK]]: +; COMMON-NEXT: br label %[[EXIT:.*]] +; COMMON: [[EXIT]]: +; COMMON-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 27ca4143b5be5..40198a3d09906 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -9,11 +9,12 @@ define i64 @test(ptr %a, ptr %b) #0 { ; CHECK-LABEL: LV: Checking a loop in 'test' ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}> ; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}> ; CHECK: Cost for VF 16: 56 ; CHECK: LV: Selecting VF: 16 entry: @@ -43,12 +44,13 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK-LABEL: LV: Checking a loop in 'test_external_iv_user' ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 
8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<{{.+}}>, vp<{{.+}}> ; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<{{.+}}>, vp<{{.+}}> ; CHECK: Cost for VF 16: 57 ; CHECK: LV: Selecting VF: vscale x 2 entry: @@ -80,12 +82,13 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}> ; CHECK: Cost for VF 8: 27 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}> ; CHECK: Cost for VF 16: 41 ; CHECK: LV: Selecting VF: 16 entry: @@ -116,11 +119,14 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef ; CHECK-LABEL: LV: Checking a loop in 'test_extra_cmp_user' ; CHECK: Cost of 4 for VF 8: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 12 +; CHECK: Cost of 4 for VF 8: WIDEN ir<%{{.+}}> = icmp eq ir<%{{.+}}>, ir<16> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}> +; CHECK: Cost for VF 8: 13 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: WIDEN ir<%{{.+}}> = icmp eq ir<%{{.+}}>, ir<16> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}> ; CHECK: Cost for VF 16: 4 ; CHECK: LV: Selecting VF: 16 entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index fd6e275d098ca..6cbd8dd236c33 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -495,10 +495,10 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 ; PRED-NEXT: 
br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] ; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 -; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]] +; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 +; PRED-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP35]] ; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4 +; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP36]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] ; PRED: [[PRED_STORE_CONTINUE5]]: ; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 @@ -512,10 +512,10 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED: [[PRED_STORE_CONTINUE7]]: ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) -; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true +; PRED-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[TMP28:%.*]] = xor i1 [[TMP29]], true ; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; PRED-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: @@ -684,10 +684,10 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 ; PRED-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] ; PRED: [[PRED_STORE_IF3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] +; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 +; PRED-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP34]] ; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 +; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP35]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; PRED: [[PRED_STORE_CONTINUE4]]: ; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 @@ -701,10 +701,10 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED: [[PRED_STORE_CONTINUE6]]: ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true +; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[TMP27:%.*]] = xor i1 [[TMP28]], true ; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; PRED-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: 
[[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 0f82de629afa9..16d2329308f5b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -435,8 +435,8 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; PRED-NEXT: br label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8 +; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 ; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; PRED-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 ; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]] @@ -449,19 +449,19 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; PRED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]] ; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[TMP14]], i32 2, [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP20:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; PRED-NEXT: [[TMP21:%.*]] = or [[TMP20]], [[VEC_PHI]] -; PRED-NEXT: [[TMP16]] = select [[ACTIVE_LANE_MASK]], [[TMP21]], [[VEC_PHI]] -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; PRED-NEXT: [[TMP15:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; PRED-NEXT: [[TMP16:%.*]] = or [[TMP15]], [[VEC_PHI]] +; PRED-NEXT: [[TMP13]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], [[VEC_PHI]] +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]]) ; PRED-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP17:%.*]] = xor i1 [[TMP15]], true ; PRED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16( [[TMP16]]) +; PRED-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16( [[TMP13]]) ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret i16 [[TMP19]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll index bdf832f32964f..13a686251daef 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll @@ -21,7 +21,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: LV: Scalar loop costs: 5. 
; CHECK: Cost of 1 for VF 2: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 2: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -34,11 +33,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 16 for VF 2: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 2: 86 (Estimated cost per lane: 43. ; CHECK: Cost of 1 for VF 4: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 4: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -51,11 +49,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 2 for VF 4: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 4: 10 (Estimated cost per lane: 2. ; CHECK: Cost of 1 for VF 8: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 8: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -68,7 +65,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 2 for VF 8: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 46 (Estimated cost per lane: 5. ; CHECK: LV: Selecting VF: 4. 
define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 { @@ -134,7 +131,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 2: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 2: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 2: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -156,7 +152,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 2: 130 (Estimated cost per lane: 65. ; CHECK: Cost of 1 for VF 4: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 4: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -166,7 +162,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 4: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 4: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 4: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 4: EMIT vp<[[CAN_IV:%.]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -188,7 +183,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 4: 14 (Estimated cost per lane: 3. 
; CHECK: Cost of 1 for VF 8: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 8: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -198,7 +193,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 8: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 8: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 8: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 8: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 8: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -220,7 +214,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}} -; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 26 (Estimated cost per lane: 3. ; CHECK: Cost of 1 for VF 16: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 16: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -230,7 +224,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 16: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 16: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 16: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 16: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 16: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -252,7 +245,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 50 ; CHECK: LV: Selecting VF: 16. 
define void @cheap_icmp(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) #0 { diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index bb85b88f181f7..ec01bd068f9fc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -9,7 +9,6 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK-LABEL: 'wide_or_replaced_with_add_vpinstruction' ; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> @@ -23,11 +22,10 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%g.dst> ; CHECK: Cost of 1 for VF 2: WIDEN store vp<%6>, ir<%iv.4>, ir<%c> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2> ; CHECK: Cost of 0 for VF 2: vector loop backedge ; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; CHECK: Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> @@ -41,11 +39,10 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer ir<%g.dst> ; CHECK: Cost of 1 for VF 4: WIDEN store vp<%6>, ir<%iv.4>, ir<%c> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2> ; CHECK: Cost of 0 for VF 4: vector loop backedge ; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index e11b1ad7f09dc..511be474e74c5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -166,7 +166,6 @@ attributes #0 = { "target-cpu"="knl" } ; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0 ; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, ptr {{%.*}}, align 1 ; CHECK: 
LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1 -; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}} ; ; @a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1 @@ -175,14 +174,33 @@ attributes #0 = { "target-cpu"="knl" } define void @PR40816() #1 { ; CHECK-LABEL: define void @PR40816( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: store i32 [[TMP0]], ptr @b, align 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 2 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[TMP0]], 1 -; CHECK-NEXT: br i1 [[CMP2]], label %[[RETURN:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: store i32 0, ptr @b, align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; CHECK: [[PRED_STORE_IF1]]: +; CHECK-NEXT: store i32 1, ptr @b, align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; CHECK: [[PRED_STORE_CONTINUE2]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[RETURN1:.*]] +; CHECK: [[PRED_STORE_IF3]]: +; CHECK-NEXT: store i32 2, ptr @b, align 1 +; CHECK-NEXT: br label %[[RETURN1]] +; CHECK: [[RETURN1]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; CHECK: [[PRED_STORE_IF5]]: +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; CHECK: [[PRED_STORE_CONTINUE6]]: +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: ret void ; @@ -215,8 +233,9 @@ define void @PR40816() #1 { ; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 ; FORCE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FORCE: [[MIDDLE_BLOCK]]: -; FORCE-NEXT: br [[RETURN:label %.*]] -; FORCE: [[SCALAR_PH:.*:]] +; FORCE-NEXT: br label %[[RETURN:.*]] +; FORCE: [[RETURN]]: +; FORCE-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll index f65a9d7d45ed8..03817eb1601d2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -49,48 +49,23 @@ bb3: define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: @redundant_or_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT2]], splat 
(i1 true) -; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> , <4 x i1> [[TMP0]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 0 -; CHECK-NEXT: store i32 0, ptr [[TMP8]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 1 -; CHECK-NEXT: store i32 0, ptr [[TMP11]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[PRED_STORE_IF3:%.*]] ] +; CHECK-NEXT: br i1 [[TMP9:%.*]], label [[PRED_STORE_IF3]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK: then.1: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], true +; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1:%.*]], i1 false +; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[PRED_STORE_IF3]] +; CHECK: then.2: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[IV]] ; CHECK-NEXT: store i32 0, ptr [[TMP14]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 3 -; CHECK-NEXT: store i32 0, ptr [[TMP17]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] +; CHECK-NEXT: br label [[PRED_STORE_IF3]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[PRED_STORE_CONTINUE]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -124,48 +99,23 @@ exit: define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: @redundant_or_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> 
[[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true) -; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> , <4 x i1> [[TMP0]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 0 -; CHECK-NEXT: store i32 0, ptr [[TMP7]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 1 -; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[PRED_STORE_IF3:%.*]] ] +; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[PRED_STORE_IF3]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK: then.1: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2 +; CHECK-NEXT: [[OR:%.*]] = or i1 true, [[CMP]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1:%.*]], i1 false +; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[PRED_STORE_IF3]] +; CHECK: then.2: +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[IV]] ; CHECK-NEXT: store i32 0, ptr [[TMP13]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 3 -; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] +; CHECK-NEXT: br label [[PRED_STORE_IF3]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[PRED_STORE_CONTINUE]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -199,49 +149,23 @@ exit: define void @redundant_and_1(ptr %dst, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: @redundant_and_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = xor 
<4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> , <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> , <4 x i1> [[TMP2]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 0 -; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 1 -; CHECK-NEXT: store i32 0, ptr [[TMP12]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[PRED_STORE_IF3:%.*]] ] +; CHECK-NEXT: br i1 [[TMP10:%.*]], label [[PRED_STORE_IF3]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK: then.1: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], false +; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1:%.*]], i1 false +; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[PRED_STORE_IF3]] +; CHECK: then.2: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[IV]] ; CHECK-NEXT: store i32 0, ptr [[TMP15]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 3 -; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] +; CHECK-NEXT: br label [[PRED_STORE_IF3]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[PRED_STORE_CONTINUE]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 9453ad7c61f68..76fd086737d58 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -334,7 +334,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
 ; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]])
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 28
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 14
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1)
@@ -819,54 +819,54 @@ define i32 @g(i64 %n) {
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N]], 4294967295
 ; CHECK-NEXT: br i1 [[TMP2]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 32
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 32
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
-; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[STEP_ADD_2]] to <4 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[STEP_ADD_3]] to <4 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP15]] = or <4 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP16]] = or <4 x i32> [[TMP12]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP17]] = or <4 x i32> [[TMP13]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP18]] = or <4 x i32> [[TMP14]], [[VEC_PHI4]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <8 x i32> [[STEP_ADD]], splat (i32 8)
+; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <8 x i32> [[STEP_ADD_2]], splat (i32 8)
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[VEC_IND]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[STEP_ADD]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[STEP_ADD_2]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i32> [[STEP_ADD_3]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP15]] = or <8 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP16]] = or <8 x i32> [[TMP12]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP17]] = or <8 x i32> [[TMP13]], [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP18]] = or <8 x i32> [[TMP14]], [[VEC_PHI4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[STEP_ADD_3]], splat (i32 8)
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <4 x i32> [[TMP18]], [[BIN_RDX5]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX6]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <8 x i32> [[TMP17]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <8 x i32> [[TMP18]], [[BIN_RDX5]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[BIN_RDX6]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK: vec.epilog.iter.check:
 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF26:![0-9]+]]
 ; CHECK: vec.epilog.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -890,7 +890,7 @@ define i32 @g(i64 %n) {
 ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i32 [[INDEX9]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND10]], splat (i32 4)
 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT15]], [[N_VEC8]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP25]])
 ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC8]]
@@ -907,7 +907,7 @@ define i32 @g(i64 %n) {
 ; CHECK-NEXT: [[SELECT_I:%.*]] = select i1 [[EXITCOND]], i32 0, i32 2
 ; CHECK-NEXT: [[SELECT_NEXT]] = or i32 [[SELECT_I]], [[SELECT]]
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: [[SELECT_NEXT_LCSSA:%.*]] = phi i32 [ [[SELECT_NEXT]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[SELECT_NEXT_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index cc84fabd00ecc..002d811d46992 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -435,67 +435,16 @@ define void @test_first_order_recurrence_tried_to_scalarized(ptr %dst, i1 %c, i3
 ; CHECK-LABEL: @test_first_order_recurrence_tried_to_scalarized(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[N:%.*]] = select i1 [[C:%.*]], i32 8, i32 9
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = sub nsw i32 10, [[TMP5]]
-; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP4]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; CHECK: pred.store.if1:
-; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = sub nsw i32 10, [[TMP10]]
-; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
-; CHECK: pred.store.continue2:
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; CHECK: pred.store.if3:
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK: loop:
+; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ 4, [[ENTRY]] ], [ [[TMP18]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[TMP18]], 1
 ; CHECK-NEXT: [[TMP16:%.*]] = sub nsw i32 10, [[TMP15]]
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.if5:
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 3
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP21:%.*]] = sub nsw i32 10, [[TMP20]]
-; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP18]]
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP19]], align 4
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index a19b294541172..ae31fbe30aa94 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -552,55 +552,14 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) {
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[N]], 8
 ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK: loop.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; CHECK: pred.store.if1:
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
-; CHECK: pred.store.continue2:
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; CHECK: pred.store.if3:
-; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.if5:
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP1]], 3
-; CHECK-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[INDEX]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT:%.*]], label [[VECTOR_BODY]]
 ; CHECK: exit.loopexit:
 ; CHECK-NEXT: br label [[EXIT]]
 ; CHECK: exit:
@@ -684,7 +643,7 @@ define void @wombat(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK: scalar.ph:
@@ -699,7 +658,7 @@ define void @wombat(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
 ; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -754,7 +713,7 @@ define void @wombat2(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK: scalar.ph:
@@ -770,7 +729,7 @@ define void @wombat2(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
 ; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[MUL3]] to i60
 ; CHECK-NEXT: [[TRUNC_1]] = trunc i60 [[TRUNC_0]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -827,7 +786,7 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK: scalar.ph:
@@ -843,7 +802,7 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
 ; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32
 ; CHECK-NEXT: [[DEAD_AND:%.*]] = and i32 [[TRUNC]], 123
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -899,10 +858,9 @@ attributes #1 = { "target-cpu"="skylake-avx512" "target-features"="-avx512f" }
 ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
-; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]], [[META2]]}
-; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META2]], [[META1]]}
-; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]], [[META2]]}
-; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META2]], [[META1]]}
-; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META1]], [[META2]]}
-; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META2]], [[META1]]}
+; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index de6418066dea0..7b0b68b294f75 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -27,23 +27,30 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP10]], 1
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -7
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP28]], align 1
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD]], <8 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[TMP29]] = xor <8 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -3
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP26]], i32 -4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32>
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD4]], <4 x i8> poison, <4 x i32>
+; CHECK-NEXT: [[TMP17]] = xor <4 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP19]] = xor <4 x i8> [[REVERSE5]], [[VEC_PHI3]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], 8
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP31:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP29]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i8> [[TMP19]], [[TMP17]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> [[BIN_RDX]])
 ; CHECK-NEXT: br label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP31]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index e1140b59681fe..9c55446b1121c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -115,18 +115,18 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF-NEXT: br label %[[VECTOR_BODY:.*]]
 ; AUTOVF: [[VECTOR_BODY]]:
 ; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i64 0
-; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
-; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]],
-; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], splat (i32 202)
+; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
+; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]],
+; AUTOVF-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], splat (i32 202)
 ; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]]
-; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
-; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 2), <32 x i8> splat (i8 1)
-; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]])
-; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
-; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
-; AUTOVF-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[TMP0]], <16 x i8> poison)
+; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <16 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AUTOVF-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> splat (i8 2), <16 x i8> splat (i8 1)
+; AUTOVF-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP3]], ptr [[TMP2]], i32 1, <16 x i1> [[TMP0]])
+; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 208
+; AUTOVF-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; AUTOVF: [[MIDDLE_BLOCK]]:
 ; AUTOVF-NEXT: br label %[[FOR_END:.*]]
 ; AUTOVF: [[FOR_END]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
index 36163790706ed..1b237d42bfee2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -14,32 +14,21 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr noundef align 8 dereferenceable_or_null(16) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: bb5:
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]],
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 8)
-; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[ARR]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 -3
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32>
-; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 99, [[BB5:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1
+; CHECK-NEXT: [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0
+; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF0:![0-9]+]]
+; CHECK: bb18:
+; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-NEXT: [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]]
+; CHECK-NEXT: store i64 1, ptr [[GETELEMENTPTR19]], align 8
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90
+; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]]
 ; CHECK: bb6:
 ; CHECK-NEXT: ret void
 ;
@@ -78,9 +67,6 @@ attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" }
 ;.
-; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 23}
-; CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
-; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[META4]] = !{!"llvm.loop.estimated_trip_count", i32 24}
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 95}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
index e25be6f867862..db1a835f0816a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -923,49 +923,9 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST-LABEL: define void @switch_under_br_default_common_dest_with_case(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; COST-NEXT: [[ENTRY:.*]]:
-; COST-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; COST-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; COST-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8
-; COST-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; COST-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
-; COST-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; COST-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
-; COST-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; COST: [[VECTOR_PH]]:
-; COST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
-; COST-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; COST-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
-; COST-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
-; COST-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0
-; COST-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; COST-NEXT: br label %[[VECTOR_BODY:.*]]
-; COST: [[VECTOR_BODY]]:
-; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
-; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
-; COST-NEXT: [[TMP7:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
-; COST-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
-; COST-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
-; COST-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
-; COST-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]]
-; COST-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
-; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]])
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP10]])
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP14]])
-; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; COST-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; COST-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; COST: [[MIDDLE_BLOCK]]:
-; COST-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; COST-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; COST: [[SCALAR_PH]]:
-; COST-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
 ; COST-NEXT: br label %[[LOOP_HEADER:.*]]
 ; COST: [[LOOP_HEADER]]:
-; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT: [[C:%.*]] = icmp ule i64 [[L]], [[X]]
 ; COST-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
@@ -987,7 +947,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST: [[LOOP_LATCH]]:
 ; COST-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; COST-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
 ; COST: [[EXIT]]:
 ; COST-NEXT: ret void
 ;
@@ -1431,8 +1391,6 @@ exit:
 ; COST: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
 ; COST: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
 ; COST: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; COST: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; COST: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
 ;.
 ; FORCED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; FORCED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 2cda2533e80e0..d905d925dc834 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -30,8 +30,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: br
 ; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 ; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i32 %lftr.wideiv, %n
-; CHECK: Cost of 0 for VF 2: exit condition instruction %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 ; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi vp<{{.+}}>, vp<[[EXT:%.+]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
@@ -49,7 +47,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[TRUNC:%.+]]> = trunc ir<%add5> to i8
 ; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[EXT]]> = zext vp<[[TRUNC]]> to i32
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
-; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ;
 define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
index 272b62bdbd5aa..fd7868f34e211 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
@@ -24,12 +24,12 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr
 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1_LCSSA2]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> , [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr [[DST]], i32 1, <16 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> zeroinitializer, ptr [[DST]], i32 1, <8 x i1> [[TMP0]])
 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT_1_LOOPEXIT1:.*]]