diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b96d29e635465..04232849dced2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7212,6 +7212,7 @@ DenseMap LoopVectorizationPlanner::executePlan( VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); VPlanTransforms::simplifyRecipes(BestVPlan); + VPlanTransforms::removeRedundantAndMasks(BestVPlan); VPlanTransforms::removeBranchOnConst(BestVPlan); if (BestVPlan.getEntry()->getSingleSuccessor() == BestVPlan.getScalarPreheader()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 40b7e8df7aec9..3860e7cb6db3b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4406,3 +4406,146 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, } } } + +// Use vector.check block to determine if we can eliminate a bounds check on +// the IV if we know that we can only enter the vector block if the tripcount +// is within certain bounds. +static bool canElimAndMaskOnPHI(Instruction *I, VPIRBasicBlock *SCEVCheckBB, + Value *SCEVCheckConditional) { + + if (I->getOpcode() != Instruction::And) + return false; + + Value *Op0 = I->getOperand(0); + Value *Op1 = I->getOperand(1); + + PHINode *IndVar; + ConstantInt *Mask; + + if (Mask = dyn_cast(Op0)) + IndVar = dyn_cast(Op1); + else if (Mask = dyn_cast(Op1)) + IndVar = dyn_cast(Op0); + + if (!Mask || !IndVar) + return false; + + if (auto *CmpI = dyn_cast(SCEVCheckConditional)) { + // Check if the condition for the terminating instruction + // is doing some comparison with a constant integer. If not + // we can't elim our AND mask + Value *CmpOp0 = CmpI->getOperand(0); + Value *CmpOp1 = CmpI->getOperand(1); + auto *CmpConstant = (dyn_cast(CmpOp0)) + ? 
dyn_cast(CmpOp0) + : dyn_cast(CmpOp1); + if (!CmpConstant) + return false; + + unsigned CmpIOpcode = CmpI->getPredicate(); + if (((CmpConstant == CmpOp1 && CmpIOpcode == CmpInst::ICMP_UGT) || + (CmpConstant == CmpOp0 && CmpIOpcode == CmpInst::ICMP_ULT)) && + (CmpConstant->uge(Mask->getZExtValue()))) + return true; + } + return false; +} + +// Check that there's a path from the src BB to the dest BB +static bool CheckPathFromSrcBBToDestBB(VPBlockBase *Src, VPBlockBase *Dest) { + if (!Src || !Dest) + return false; + + for (auto *VPBB : Src->getSuccessors()) { + if (VPBB == Dest) { + return true; + } else if (VPBB->getNumSuccessors() > 0 && + CheckPathFromSrcBBToDestBB(VPBB, Dest)) + return true; + } + return false; +}; + +// Attempt to spot and eliminate no-op AND operations in loop bodies. +// For example loop Vectorization may create loops like the following. +// +// vector.scevcheck: +// %1 = add i64 %flatten.tripcount, -1 +// %2 = icmp ugt i64 %1, 4294967295 +// br i1 %2, label %scalar.ph, label %vector.ph +// vector.ph: +// %iv = phi i64 [ 0, %vector.scevcheck], [ %iv.next, %vector.ph ] +// %m = and i64 %iv, 4294967295 ; 0xffff_fffe no op +// %p = getelementptr inbounds <4 x i32>, ptr %A, i64 %m +// %load = load <4 x i32>, ptr %p, align 4 +// %1 = add <4 x i32> %load, %X +// store <4 x i32> %1, ptr %p, align 4 +// %iv.next = add nuw i64 %iv, 4 +// %c = icmp ult i64 %iv.next, %N +// br i1 %c, label %vector.ph, label %exit +// exit: +// ret void +// +// The vectorizer creates the SCEV check block to perform +// runtime IV checks. This block can be used to determine true +// range of the the IV as entry into the vector loop is only possible +// for certain tripcount values. 
+// +void VPlanTransforms::removeRedundantAndMasks(VPlan &Plan) { + if (!Plan.getVectorLoopRegion()) + return; + + auto FindSCEVCheckBlock = [&]() -> VPIRBasicBlock * { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_deep(Plan.getEntry()))) { + if (auto *IRBB = dyn_cast(VPBB)) + if (IRBB->getIRBasicBlock()->getName() == "vector.scevcheck") + return IRBB; + } + return nullptr; + }; + + auto FindPHIRecipeToReplaceAnd = [&](VPBasicBlock *VPBB, + VPSingleDefRecipe *ToReplace) -> void { + VPRecipeBase *PredRecipe = nullptr; + for (auto &PHI : VPBB->phis()) { + if (auto *VPI = dyn_cast(&PHI)) + if (ToReplace->getOperand(0) == VPI || + ToReplace->getOperand(1) == VPI) { + ToReplace->replaceAllUsesWith(VPI); + return; + } + } + }; + + if (VPIRBasicBlock *SCEVCheckBB = FindSCEVCheckBlock()) { + VPBasicBlock *VPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + + // Determine if the SCEV check BB branches to the loop preheader + // or header + if (!CheckPathFromSrcBBToDestBB(SCEVCheckBB, Plan.getVectorPreheader()) && + !CheckPathFromSrcBBToDestBB(SCEVCheckBB, Plan.getVectorLoopRegion())) + return; + + if (auto *SCEVCheckTerminatorRecipe = + dyn_cast(SCEVCheckBB->getTerminator())) { + if (SCEVCheckTerminatorRecipe->getOpcode() != VPInstruction::BranchOnCond) + return; + + VPValue *SCEVCheckCondRecipe = SCEVCheckTerminatorRecipe->getOperand(0); + + for (auto &R : VPBB->getRecipeList()) { + if (auto *VPI = dyn_cast(&R)) { + Value *V = VPI->getUnderlyingValue(); + if (!V) + continue; + + if (Instruction *I = dyn_cast(V)) + if (canElimAndMaskOnPHI(I, SCEVCheckBB, + SCEVCheckCondRecipe->getLiveInIRValue())) + return FindPHIRecipeToReplaceAnd(VPBB, VPI); + } + } + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 5a8a2bbc2975e..f6e5c4c4ea1de 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -376,6 +376,11 @@ 
struct VPlanTransforms {
   /// users in the original exit block using the VPIRInstruction wrapping to the
   /// LCSSA phi.
   static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+  /// Make use of runtime SCEV check blocks generated by the LoopVectorizer
+  /// to see if we can eliminate bounds checking AND instructions in loop
+  /// blocks.
+  static void removeRedundantAndMasks(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-transforms-remove-redundant-masks.ll b/llvm/test/Transforms/LoopVectorize/vplan-transforms-remove-redundant-masks.ll
new file mode 100644
index 0000000000000..18370ed23bb74
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-transforms-remove-redundant-masks.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=loop-vectorize -force-vector-width=2 < %s | FileCheck %s
+
+define void @elim_no_op_and(i32 %N, ptr %A, i32 %val) {
+; CHECK-LABEL: define void @elim_no_op_and(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], i32 [[VAL:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP16_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP16_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_COND1_PREHEADER_LR_PH:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER_LR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul nuw i64 [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[FLATTEN_TRIPCOUNT]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[FLATTEN_TRIPCOUNT]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[FLATTEN_TRIPCOUNT]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[FLATTEN_TRIPCOUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[VAL]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[FLATTEN_TRIPCOUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_LR_PH]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT:    [[INDVAR18:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVAR_NEXT19:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[INDVAR18]], 4294967295
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD5:%.*]] = add i32 [[TMP7]], [[VAL]]
+; CHECK-NEXT:    store i32 [[ADD5]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVAR_NEXT19]] = add nuw i64 [[INDVAR18]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT19]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp16.not = icmp eq i32 %N, 0
+  br i1 %cmp16.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %0 = zext i32 %N to i64
+  %flatten.tripcount = mul nuw i64 %0, %0
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.cond1.preheader
+  %indvar18 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvar.next19, %for.cond1.preheader ]
+  %idxprom = and i64 %indvar18, 4294967295
+  %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %idxprom
+  %1 = load i32, ptr %arrayidx, align 4
+  %add5 = add i32 %1, %val
+  store i32 %add5, ptr %arrayidx, align 4
+  %indvar.next19 = add nuw i64 %indvar18, 1
+  %exitcond.not = icmp eq i64 %indvar.next19, %flatten.tripcount
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.