1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7212,6 +7212,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(

VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan);
VPlanTransforms::removeRedundantAndMasks(BestVPlan);
VPlanTransforms::removeBranchOnConst(BestVPlan);
if (BestVPlan.getEntry()->getSingleSuccessor() ==
BestVPlan.getScalarPreheader()) {
143 changes: 143 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4406,3 +4406,146 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}

// Use the vector.scevcheck block to determine whether a bounds-check AND on
// the IV can be eliminated: the vector block is only entered when the trip
// count is within certain bounds, which in turn bounds the values the IV can
// take.
static bool canElimAndMaskOnPHI(Instruction *I, VPIRBasicBlock *SCEVCheckBB,
Value *SCEVCheckConditional) {

if (I->getOpcode() != Instruction::And)
return false;

Value *Op0 = I->getOperand(0);
Value *Op1 = I->getOperand(1);

PHINode *IndVar = nullptr;
ConstantInt *Mask = nullptr;

// Match "and" with a constant mask and a PHI, in either operand order.
if ((Mask = dyn_cast<ConstantInt>(Op0)))
IndVar = dyn_cast<PHINode>(Op1);
else if ((Mask = dyn_cast<ConstantInt>(Op1)))
IndVar = dyn_cast<PHINode>(Op0);

if (!Mask || !IndVar)
return false;

if (auto *CmpI = dyn_cast<CmpInst>(SCEVCheckConditional)) {
// Check whether the terminating branch's condition compares against a
// constant integer. If not, we cannot eliminate the AND mask.
Value *CmpOp0 = CmpI->getOperand(0);
Value *CmpOp1 = CmpI->getOperand(1);
auto *CmpConstant = dyn_cast<ConstantInt>(CmpOp0);
if (!CmpConstant)
CmpConstant = dyn_cast<ConstantInt>(CmpOp1);
if (!CmpConstant)
return false;

CmpInst::Predicate Pred = CmpI->getPredicate();
if (((CmpConstant == CmpOp1 && Pred == CmpInst::ICMP_UGT) ||
(CmpConstant == CmpOp0 && Pred == CmpInst::ICMP_ULT)) &&
CmpConstant->uge(Mask->getZExtValue()))
return true;
}
return false;
}

// Check that there's a path from the src BB to the dest BB
static bool CheckPathFromSrcBBToDestBB(VPBlockBase *Src, VPBlockBase *Dest) {
if (!Src || !Dest)
return false;

for (auto *VPBB : Src->getSuccessors()) {
if (VPBB == Dest)
return true;
if (VPBB->getNumSuccessors() > 0 &&
CheckPathFromSrcBBToDestBB(VPBB, Dest))
return true;
}
return false;
}

// Attempt to spot and eliminate no-op AND operations in vector loop bodies.
// For example, loop vectorization may create loops like the following:
//
// vector.scevcheck:
// %1 = add i64 %flatten.tripcount, -1
// %2 = icmp ugt i64 %1, 4294967295
// br i1 %2, label %scalar.ph, label %vector.ph
// vector.ph:
// %iv = phi i64 [ 0, %vector.scevcheck ], [ %iv.next, %vector.ph ]
// %m = and i64 %iv, 4294967295 ; 0xffff_ffff, a no-op
// %p = getelementptr inbounds <4 x i32>, ptr %A, i64 %m
// %load = load <4 x i32>, ptr %p, align 4
// %add = add <4 x i32> %load, %X
// store <4 x i32> %add, ptr %p, align 4
// %iv.next = add nuw i64 %iv, 4
// %c = icmp ult i64 %iv.next, %flatten.tripcount
// br i1 %c, label %vector.ph, label %exit
// exit:
// ret void
//
// The vectorizer creates the SCEV check block to perform runtime IV checks,
// so it can be used to determine the true range of the IV: entry into the
// vector loop is only possible for certain trip-count values. In the example
// above, %vector.ph is only reached when %flatten.tripcount - 1 is at most
// 4294967295, so %iv never exceeds 4294967295 and the AND with 0xffff_ffff
// leaves it unchanged.
//
void VPlanTransforms::removeRedundantAndMasks(VPlan &Plan) {
if (!Plan.getVectorLoopRegion())
return;

auto FindSCEVCheckBlock = [&]() -> VPIRBasicBlock * {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
if (auto *IRBB = dyn_cast<VPIRBasicBlock>(VPBB))
if (IRBB->getIRBasicBlock()->getName() == "vector.scevcheck")
You shouldn't be checking for the block by name, as it isn't guaranteed to work, e.g. if you have two loops in a function that are vectorized the scev check block of the second will have a number appended to its name, so this transform will fail to apply.

It's GeneratedRTChecks that's generating the check condition, and it has it in SCEVCheckCond, so I think probably you need to get that value here somehow.
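
A minimal sketch of one way to act on that (assuming the caller in LoopVectorize.cpp can forward the IR condition that GeneratedRTChecks computed; the forwarded SCEVCheckCond parameter and the helper name are illustrative, not part of this patch):

// Hypothetical: locate the runtime-check block by the condition it branches
// on, rather than by block name, given the condition value from the caller.
static VPIRBasicBlock *findSCEVCheckBlock(VPlan &Plan, Value *SCEVCheckCond) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry())))
    if (auto *IRBB = dyn_cast<VPIRBasicBlock>(VPBB))
      if (auto *Term = dyn_cast_or_null<VPInstruction>(IRBB->getTerminator()))
        if (Term->getOpcode() == VPInstruction::BranchOnCond &&
            Term->getOperand(0)->isLiveIn() &&
            Term->getOperand(0)->getLiveInIRValue() == SCEVCheckCond)
          return IRBB;
  return nullptr;
}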

return IRBB;
}
return nullptr;
};

auto FindPHIRecipeToReplaceAnd = [&](VPBasicBlock *VPBB,
VPSingleDefRecipe *ToReplace) -> void {
// If one of the AND's operands is a phi recipe in this block, the AND is
// redundant: replace all of its uses with the phi itself.
for (auto &PHI : VPBB->phis()) {
if (auto *VPI = dyn_cast<VPSingleDefRecipe>(&PHI))
if (ToReplace->getOperand(0) == VPI ||
ToReplace->getOperand(1) == VPI) {
ToReplace->replaceAllUsesWith(VPI);
return;
}
}
};

if (VPIRBasicBlock *SCEVCheckBB = FindSCEVCheckBlock()) {
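// The redundant AND, if present, is in the entry block of the vector loop
// region, alongside the header phi it masks.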
VPBasicBlock *VPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();

// Only proceed if the SCEV check block can actually reach the vector
// preheader or the vector loop region.
if (!CheckPathFromSrcBBToDestBB(SCEVCheckBB, Plan.getVectorPreheader()) &&
!CheckPathFromSrcBBToDestBB(SCEVCheckBB, Plan.getVectorLoopRegion()))
return;

if (auto *SCEVCheckTerminatorRecipe =
dyn_cast<VPInstruction>(SCEVCheckBB->getTerminator())) {
if (SCEVCheckTerminatorRecipe->getOpcode() != VPInstruction::BranchOnCond)
return;

VPValue *SCEVCheckCondRecipe = SCEVCheckTerminatorRecipe->getOperand(0);
// The branch condition is expected to be an IR value computed by the
// runtime checks, i.e. a VPlan live-in.
if (!SCEVCheckCondRecipe->isLiveIn())
return;

for (auto &R : VPBB->getRecipeList()) {
if (auto *VPI = dyn_cast<VPSingleDefRecipe>(&R)) {
Value *V = VPI->getUnderlyingValue();
if (!V)
continue;

if (Instruction *I = dyn_cast<Instruction>(V))
if (canElimAndMaskOnPHI(I, SCEVCheckBB,
SCEVCheckCondRecipe->getLiveInIRValue()))
return FindPHIRecipeToReplaceAnd(VPBB, VPI);
}
}
}
}
}
5 changes: 5 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -376,6 +376,11 @@ struct VPlanTransforms {
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);

/// Use the runtime SCEV check block generated by the loop vectorizer to
/// determine whether bounds-checking AND instructions in the vector loop are
/// redundant, and remove them if so.
static void removeRedundantAndMasks(VPlan &Plan);
};

} // namespace llvm
@@ -0,0 +1,87 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -passes=loop-vectorize -force-vector-width=2 < %s | FileCheck %s

define void @elim_no_op_and(i32 %N, ptr %A, i32 %val) {
; CHECK-LABEL: define void @elim_no_op_and(
; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], i32 [[VAL:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP16_NOT:%.*]] = icmp eq i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP16_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_COND1_PREHEADER_LR_PH:.*]]
; CHECK: [[FOR_COND1_PREHEADER_LR_PH]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = mul nuw i64 [[TMP0]], [[TMP0]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[FLATTEN_TRIPCOUNT]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[FLATTEN_TRIPCOUNT]], -1
; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
; CHECK-NEXT: br i1 [[TMP2]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[FLATTEN_TRIPCOUNT]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[FLATTEN_TRIPCOUNT]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP3]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[FLATTEN_TRIPCOUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_LR_PH]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
; CHECK: [[FOR_COND1_PREHEADER]]:
; CHECK-NEXT: [[INDVAR18:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVAR_NEXT19:%.*]], %[[FOR_COND1_PREHEADER]] ]
; CHECK-NEXT: [[IDXPROM:%.*]] = and i64 [[INDVAR18]], 4294967295
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[TMP7]], [[VAL]]
; CHECK-NEXT: store i32 [[ADD5]], ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[INDVAR_NEXT19]] = add nuw i64 [[INDVAR18]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT19]], [[FLATTEN_TRIPCOUNT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
;
entry:
%cmp16.not = icmp eq i32 %N, 0
br i1 %cmp16.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph

for.cond1.preheader.lr.ph: ; preds = %entry
%0 = zext i32 %N to i64
%flatten.tripcount = mul nuw i64 %0, %0
br label %for.cond1.preheader

for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond1.preheader
%indvar18 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvar.next19, %for.cond1.preheader ]
%idxprom = and i64 %indvar18, 4294967295
%arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %idxprom
%1 = load i32, ptr %arrayidx, align 4
%add5 = add i32 %1, %val
store i32 %add5, ptr %arrayidx, align 4
%indvar.next19 = add nuw i64 %indvar18, 1
%exitcond.not = icmp eq i64 %indvar.next19, %flatten.tripcount
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader

for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
}

;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.