diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 32c3435ccb38d..324e46b0feebd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -540,10 +540,6 @@ class InnerLoopVectorizer { protected: friend class LoopVectorizationPlanner; - /// Iteratively sink the scalarized operands of a predicated instruction into - /// the block that was created for it. - void sinkScalarOperands(Instruction *PredInst); - /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -628,9 +624,6 @@ class InnerLoopVectorizer { /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector LoopBypassBlocks; - /// Store instructions that were predicated. - SmallVector PredicatedInstructions; - /// Trip count of the original loop. Value *TripCount = nullptr; @@ -2382,17 +2375,13 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, if (auto *II = dyn_cast(Cloned)) AC->registerAssumption(II); - // End if-block. - VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); - bool IfPredicateInstr = Parent ? Parent->isReplicator() : false; assert( - (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || + (RepRecipe->getParent()->getParent() || + !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || all_of(RepRecipe->operands(), [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && "Expected a recipe is either within a region or all of its operands " "are defined outside the vectorized region."); - if (IfPredicateInstr) - PredicatedInstructions.push_back(Cloned); } Value * @@ -2866,9 +2855,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { if (!State.Plan->getVectorLoopRegion()) return; - for (Instruction *PI : PredicatedInstructions) - sinkScalarOperands(&*PI); - VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; @@ -2894,82 +2880,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { VF.getKnownMinValue() * UF); } -void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { - // The basic block and loop containing the predicated instruction. - auto *PredBB = PredInst->getParent(); - auto *VectorLoop = LI->getLoopFor(PredBB); - - // Initialize a worklist with the operands of the predicated instruction. - SetVector Worklist(PredInst->op_begin(), PredInst->op_end()); - - // Holds instructions that we need to analyze again. An instruction may be - // reanalyzed if we don't yet know if we can sink it or not. - SmallVector InstsToReanalyze; - - // Returns true if a given use occurs in the predicated block. Phi nodes use - // their operands in their corresponding predecessor blocks. - auto IsBlockOfUsePredicated = [&](Use &U) -> bool { - auto *I = cast(U.getUser()); - BasicBlock *BB = I->getParent(); - if (auto *Phi = dyn_cast(I)) - BB = Phi->getIncomingBlock( - PHINode::getIncomingValueNumForOperand(U.getOperandNo())); - return BB == PredBB; - }; - - // Iteratively sink the scalarized operands of the predicated instruction - // into the block we created for it. When an instruction is sunk, it's - // operands are then added to the worklist. The algorithm ends after one pass - // through the worklist doesn't sink a single instruction. - bool Changed; - do { - // Add the instructions that need to be reanalyzed to the worklist, and - // reset the changed indicator. - Worklist.insert_range(InstsToReanalyze); - InstsToReanalyze.clear(); - Changed = false; - - while (!Worklist.empty()) { - auto *I = dyn_cast(Worklist.pop_back_val()); - - // We can't sink an instruction if it is a phi node, is not in the loop, - // may have side effects or may read from memory. - // TODO: Could do more granular checking to allow sinking - // a load past non-store instructions. - if (!I || isa(I) || !VectorLoop->contains(I) || - I->mayHaveSideEffects() || I->mayReadFromMemory()) - continue; - - // If the instruction is already in PredBB, check if we can sink its - // operands. In that case, VPlan's sinkScalarOperands() succeeded in - // sinking the scalar instruction I, hence it appears in PredBB; but it - // may have failed to sink I's operands (recursively), which we try - // (again) here. - if (I->getParent() == PredBB) { - Worklist.insert_range(I->operands()); - continue; - } - - // It's legal to sink the instruction if all its uses occur in the - // predicated block. Otherwise, there's nothing to do yet, and we may - // need to reanalyze the instruction. - if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) { - InstsToReanalyze.push_back(I); - continue; - } - - // Move the instruction to the beginning of the predicated block, and add - // it's operands to the worklist. - I->moveBefore(PredBB->getFirstInsertionPt()); - Worklist.insert_range(I->operands()); - - // The sinking may have enabled other instructions to be sunk, so we will - // need to iterate. - Changed = true; - } - } while (Changed); -} - void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 585c2df08f7d6..42a51bc7be30a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -1000,22 +1000,25 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE5]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[PRED_STORE_CONTINUE5]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8 -; TFA_INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; TFA_INTERLEAVE: [[PRED_STORE_IF]]: ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]] -; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true -; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = select i1 [[TMP7]], double 1.000000e+00, double 0.000000e+00 -; TFA_INTERLEAVE-NEXT: store double [[TMP24]], ptr [[P]], align 8 -; TFA_INTERLEAVE-NEXT: br label %[[PRED_STORE_CONTINUE]] -; TFA_INTERLEAVE: [[PRED_STORE_CONTINUE]]: -; TFA_INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] -; TFA_INTERLEAVE: [[PRED_STORE_IF4]]: ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]] +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = fcmp ogt double [[TMP8]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = xor i1 [[TMP9]], true +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = xor i1 [[TMP6]], true +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = xor i1 [[TMP9]], true +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP18]], i1 false +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP20]], i1 false ; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = select i1 [[TMP10]], double 1.000000e+00, double 0.000000e+00 -; TFA_INTERLEAVE-NEXT: store double [[TMP26]], ptr [[P]], align 8 +; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP21]], double 1.000000e+00, double 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], double [[PREDPHI3]], double [[TMP26]] +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = xor i1 [[ACTIVE_LANE_MASK]], true +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = xor i1 [[ACTIVE_LANE_MASK2]], true +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor i1 [[TMP13]], true +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor i1 [[TMP14]], true +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]] +; TFA_INTERLEAVE-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[PRED_STORE_CONTINUE5]] +; TFA_INTERLEAVE: [[BB18]]: +; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label %[[PRED_STORE_CONTINUE5]] ; TFA_INTERLEAVE: [[PRED_STORE_CONTINUE5]]: ; TFA_INTERLEAVE-NEXT: [[TMP27]] = add i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll index c91ead00a950d..644f10b617eb7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll @@ -8,13 +8,13 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[VEC_IND]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index 6424fb5565a63..5917b300b4820 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -258,6 +258,8 @@ define void @example2(i32 %n, i32 %x) optsize { ; Loop has no primary induction as its integer IV has step -1 starting at ; unknown N, but can still be vectorized. +; Note: Most scalar pointer induction GEPs could be sunk into the conditional +; blocks. define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q) optsize { ; CHECK-LABEL: @example3( ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N:%.*]], 0 @@ -275,7 +277,19 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP11]], i64 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[TMP12]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[TMP15]], i64 12 ; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[Q:%.*]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP16]], i64 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[Q]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 8 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[TMP8]], i64 12 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], @@ -283,8 +297,8 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 ; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[Q:%.*]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[Q]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[NEXT_GEP7]], align 16 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[NEXT_GEP]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] @@ -292,10 +306,6 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 ; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP7]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q]], i64 [[OFFSET_IDX6]] -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP8]], i64 4 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[NEXT_GEP8]], align 16 ; CHECK-NEXT: store i32 [[TMP9]], ptr [[NEXT_GEP3]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] @@ -303,10 +313,6 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] ; CHECK: pred.store.if13: -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[TMP11]], i64 8 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[Q]], i64 [[OFFSET_IDX6]] -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[TMP12]], i64 8 ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[NEXT_GEP9]], align 16 ; CHECK-NEXT: store i32 [[TMP13]], ptr [[NEXT_GEP4]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] @@ -314,10 +320,6 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 ; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]] ; CHECK: pred.store.if15: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[TMP15]], i64 12 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[Q]], i64 [[OFFSET_IDX6]] -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[TMP16]], i64 12 ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[NEXT_GEP10]], align 16 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[NEXT_GEP5]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] @@ -445,6 +447,8 @@ define void @example23b(ptr noalias nocapture %src, ptr noalias nocapture %dst) } ; We CAN vectorize this example by folding the tail it entails. +; Note: Most scalar pointer induction GEPs could be sunk into the conditional +; blocks. define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) optsize { ; CHECK-LABEL: @example23c( ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -453,7 +457,19 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP8]], i64 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[TMP13]], i64 6 ; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX5]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[TMP14]], i64 4 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX5]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP19]], i64 8 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX5]] +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[TMP20]], i64 12 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], @@ -461,8 +477,8 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX5]] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX5]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 @@ -472,10 +488,6 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1 ; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX5]] -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[TMP7]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP8]], i64 2 ; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[NEXT_GEP2]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = zext i16 [[TMP9]] to i32 ; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 7 @@ -485,10 +497,6 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 ; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX5]] -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP14]], i64 4 ; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[NEXT_GEP3]], align 2 ; CHECK-NEXT: [[TMP16:%.*]] = zext i16 [[TMP15]] to i32 ; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7 @@ -498,10 +506,6 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 ; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE15]] ; CHECK: pred.store.if13: -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX5]] -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[TMP19]], i64 12 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[TMP20]], i64 6 ; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[NEXT_GEP4]], align 2 ; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP21]] to i32 ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7 diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index 06b6a2b29c01e..d7dc68b082f29 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -3,7 +3,8 @@ ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -lv-strided-pointer-ivs=true -S | FileCheck --check-prefixes=CHECK,STRIDED %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - +; Note: Most scalar pointer induction GEPs could be sunk into the conditional +; blocks. ; Function Attrs: nofree norecurse nounwind define void @a(ptr readnone %b) { ; CHECK-LABEL: @a( @@ -25,7 +26,13 @@ define void @a(ptr readnone %b) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 0, [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -2 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -3 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr null, i64 [[TMP2]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP14]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr null, i64 [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 -1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3 @@ -43,8 +50,6 @@ define void @a(ptr readnone %b) { ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i64 -1 ; CHECK-NEXT: store i8 95, ptr [[TMP12]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] @@ -52,8 +57,6 @@ define void @a(ptr readnone %b) { ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -2 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP14]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i64 -1 ; CHECK-NEXT: store i8 95, ptr [[TMP15]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] @@ -61,8 +64,6 @@ define void @a(ptr readnone %b) { ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 ; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -3 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr null, i64 [[TMP17]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP4]], i64 -1 ; CHECK-NEXT: store i8 95, ptr [[TMP18]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll index caa5969bbc365..13d5be1b94d15 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -82,6 +82,8 @@ for.body: br i1 %exitcond, label %for.cond.cleanup, label %for.body } +; Note: Most scalar pointer induction GEPs could be sunk into the conditional +; blocks. define void @VF1-VPWidenCanonicalIVRecipeExe(ptr %ptr1) { ; CHECK-LABEL: @VF1-VPWidenCanonicalIVRecipeExe( ; CHECK-NEXT: entry: @@ -92,6 +94,13 @@ define void @VF1-VPWidenCanonicalIVRecipeExe(ptr %ptr1) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP4]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP6]] ; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2 @@ -102,28 +111,21 @@ define void @VF1-VPWidenCanonicalIVRecipeExe(ptr %ptr1) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14 ; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP1]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: ; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP6]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP2]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: ; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 24 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP3]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.continue12: