Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1852,6 +1852,12 @@ class TargetTransformInfo {
/// maximum register pressure exceeds getNumberOfRegisters.
LLVM_ABI bool shouldConsiderVectorizationRegPressure() const;

/// Return true if the loop vectorizer can generate control flow (conditional
/// blocks) inside the vector region. Otherwise, the loop vectorizer will
/// generate a single block for the vector region and handle control flow via
/// a mask.
LLVM_ABI bool preferControlFlow() const;

/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,8 @@ class TargetTransformInfoImplBase {

virtual bool shouldConsiderVectorizationRegPressure() const { return false; }

  /// Default: targets do not ask the loop vectorizer to generate control flow
  /// inside the vector region; conditional operations stay masked instead.
  virtual bool preferControlFlow() const { return false; }

virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
return true;
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::preferPredicateOverEpilogue(TFI);
}

  /// Forward to the base implementation (which returns false) so that
  /// targets deriving from BasicTTIImplBase can override in one place.
  bool preferControlFlow() const override { return BaseT::preferControlFlow(); }

TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
return TTIImpl->preferPredicateOverEpilogue(TFI);
}

// Public entry point of the hook: delegate straight to the target-specific
// implementation object.
bool TargetTransformInfo::preferControlFlow() const {
  const bool PrefersControlFlow = TTIImpl->preferControlFlow();
  return PrefersControlFlow;
}

TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
bool IVUpdateMayOverflow) const {
return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {

bool shouldConsiderVectorizationRegPressure() const override { return true; }

  /// Explicitly opt out (same as the base default): RISC-V prefers handling
  /// conditional operations via masks rather than control flow inside the
  /// vector region.
  bool preferControlFlow() const override { return false; }

InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));

static cl::opt<bool> PreferControlFlow(
"prefer-control-flow", cl::init(false), cl::Hidden,
cl::desc("Generate control flow inside the vector region."));

cl::opt<bool> llvm::EnableVPlanNativePath(
"enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
Expand Down Expand Up @@ -4202,6 +4206,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
case VPInstruction::ExplicitVectorLength:
C += VPI->cost(VF, CostCtx);
break;
case VPInstruction::AnyOf:
if (!VPI->getUnderlyingValue())
C += VPI->cost(VF, CostCtx);
break;
default:
break;
}
Expand Down Expand Up @@ -8198,6 +8206,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
if (CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
if (PreferControlFlow || TTI.preferControlFlow())
VPlanTransforms::optimizeConditionalVPBB(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
Expand Down
157 changes: 157 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4486,3 +4486,160 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}

void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
  // Rebuild explicit control flow for masked stores: for each masked store,
  // greedily collect the tree of recipes computing the stored value, sink it
  // into a new "vector.if.bb" block, and guard that block with
  // AnyOf(mask) + BranchOnCond so it is skipped when no mask lane is active.
  VPDominatorTree VPDT(Plan);

  VPValue *HeaderMask = findHeaderMask(Plan);

  // Get the mask from the store recipes. Unmasked stores, stores guarded only
  // by the header mask, and stores guarded by an active-lane mask are not
  // candidates: such masks are expected to be (mostly) all-active, so
  // branching on AnyOf(mask) would not skip any work.
  // Note: capture HeaderMask by value -- it is just a pointer.
  auto GetMask = [HeaderMask](VPRecipeBase &R) -> VPValue * {
    using namespace llvm::VPlanPatternMatch;
    if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(R)) {
      VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
      if (!OrigMask || OrigMask == HeaderMask ||
          match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
                              m_VPValue(), m_VPValue())))
        return nullptr;

      return OrigMask;
    }
    return nullptr;
  };

  // First, collect all masked stores.
  SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : *VPBB) {
      if (VPValue *Mask = GetMask(R))
        MaskedStores.emplace_back(&R, Mask);
    }
  }

  if (MaskedStores.empty())
    return;

  // Recipes defining an operand of a recipe already in the current tree;
  // only these may be pulled into the tree while walking upwards.
  DenseSet<VPRecipeBase *> Candidates;
  auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
    for (VPValue *Op : R->operands())
      if (VPRecipeBase *OpR = Op->getDefiningRecipe())
        Candidates.insert(OpR);
  };

  // One def-use tree per masked store; each tree is sunk into its own
  // conditional block below. (Renamed from "Tries": these are trees, not trie
  // data structures.)
  SmallVector<SetVector<VPRecipeBase *>> Trees;
  while (!MaskedStores.empty()) {
    auto [SR, M] = MaskedStores.pop_back_val();
    Candidates.clear();
    AddOperandsToCandidates(SR);

    SetVector<VPRecipeBase *> CurrentTree;
    CurrentTree.insert(SR);

    VPBasicBlock *MaskBlock =
        M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;

    // Don't move recipes before the mask and PHI recipes.
    auto End = MaskBlock == SR->getParent()
                   ? M->getDefiningRecipe()->getReverseIterator()
                   : SR->getParent()->getFirstNonPhi()->getReverseIterator();
    // Also don't move the recipes through any recipe that may have side
    // effects or write to memory.
    for (auto It = std::next(SR->getReverseIterator()); It != End; ++It) {
      if (It->mayHaveSideEffects() || It->mayWriteToMemory()) {
        End = It;
        break;
      }
    }

    // Greedily add all recipes that are used to compute the stored value to
    // the tree. All users of the added recipe must dominate the store
    // recipe.
    for (VPRecipeBase &R : make_range(SR->getReverseIterator(), End)) {
      // Recipe is not part of the tree.
      if (!Candidates.contains(&R))
        continue;

      // Skip a recipe if any of its users neither is the store nor properly
      // dominates it -- sinking such a recipe into the conditional block
      // would break dominance for that user.
      if (any_of(R.definedValues(), [&SR = SR, &VPDT](VPValue *Def) {
            for (VPUser *U : Def->users()) {
              if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
                if (UR == SR || VPDT.properlyDominates(UR, SR))
                  continue;
              }
              return true;
            }
            return false;
          }))
        continue;

      CurrentTree.insert(&R);
      AddOperandsToCandidates(&R);
    }

    // The previous traversal could have added recipes that are used by
    // non-added recipes, which need to be removed from the list. Iterate to a
    // fixed point, as each removal can expose further escaping users.
    SmallDenseSet<VPRecipeBase *, 8> ToRemove;
    bool Changed;
    do {
      Changed = false;
      for (VPRecipeBase *R : CurrentTree) {
        if (ToRemove.contains(R))
          continue;
        if (any_of(R->definedValues(), [&](VPValue *Def) {
              for (VPUser *U : Def->users()) {
                if (auto *UR = dyn_cast<VPRecipeBase>(U))
                  if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
                    return true;
              }
              return false;
            })) {
          Changed = true;
          ToRemove.insert(R);
        }
      }
    } while (Changed);

    for (VPRecipeBase *R : ToRemove)
      CurrentTree.remove(R);

    // A tree consisting of only the store itself is not worth a branch.
    if (CurrentTree.size() > 1)
      Trees.push_back(CurrentTree);
  }

  for (const auto &List : Trees) {
    // The store was inserted first, so it is the front of the SetVector.
    VPRecipeBase *SR = List.front();
    VPValue *M = cast<VPWidenMemoryRecipe>(SR)->getMask();
    assert(M && "Mask VPValue must exist at this point");
    // The tree was built bottom-up; reversing yields original program order.
    auto Recipes = reverse(List.getArrayRef());

    // Split the current basic block at the store recipe point so that
    // a predicated block can be added in between.
    VPBasicBlock *ParentBB = SR->getParent();
    VPBasicBlock *ContBB = ParentBB->splitAt(SR->getIterator());

    // Create VPBB and insert it between ParentBB and ContBB.
    VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
    VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
    // If ParentBB was the region's exiting block, ContBB (which now ends the
    // original block's recipe list) must take over that role.
    if (ContBB->getNumSuccessors() == 0)
      ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);

    // Copy recipes into the conditional block.
    for (VPRecipeBase *R : Recipes)
      R->moveBefore(*IfBB, IfBB->end());

    // Add the condition and branch in the parent block.
    auto *ActiveLane =
        new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");

    auto *BranchOnCond =
        new VPInstruction(VPInstruction::BranchOnCond, ActiveLane);
    ParentBB->appendRecipe(ActiveLane);
    ParentBB->appendRecipe(BranchOnCond);

    // Set proper predecessors and successors for the conditional block:
    // the true edge enters IfBB, the false edge bypasses it to ContBB.
    ParentBB->clearSuccessors();
    ParentBB->setSuccessors({IfBB, ContBB});
    ContBB->clearPredecessors();
    ContBB->setPredecessors({ParentBB, IfBB});
  }
}
23 changes: 23 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,29 @@ struct VPlanTransforms {
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);

  /// Try to convert flattened control flow into a conditional vector basic
  /// block. If there are no active bits in the mask, the branch skips all
  /// masked operations. This transformation collects the masked operations
  /// bottom-up from each masked store and puts them in a new conditionally
  /// executed vector basic block. The original vector.loop block is split at
  /// the store and the newly created basic block is inserted in between.
  /// The guard is an AnyOf reduction of the mask (named %any.of.mask)
  /// followed by a BranchOnCond:
  ///
  ///     vector.loop:
  ///       ...
  ///       %any.of.mask = AnyOf(%Mask)
  ///       BranchOnCond %any.of.mask
  ///         |        \
  ///  (true) |         | (false)
  ///         v         |
  ///     vector.if.bb  |      <-- masked operations
  ///         |         |
  ///         v         v
  ///     vector.loop.split    <-- latch; back-edge or exit to middle.block
  static void optimizeConditionalVPBB(VPlan &Plan);
};

} // namespace llvm
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s

; The scalar loop stores only when (%load & %1) == %1, so vectorization emits
; masked stores. With -prefer-control-flow, the masked stores and the xor
; recipes feeding them are sunk into a conditional block (vector.if.bb)
; guarded by an any-of reduction (vector.reduce.or) of the combined masks,
; so the whole group is skipped when no lane is active.
define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
; CHECK: [[FOR_BODY_LR_PH]]:
; CHECK-NEXT: [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
; CHECK-NEXT: [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
; CHECK-NEXT: [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
; CHECK-NEXT: [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
; CHECK-NEXT: [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[SHL6]]
; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]]
; CHECK-NEXT: [[TMP13:%.*]] = freeze <4 x i1> [[TMP8]]
; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
; CHECK-NEXT: br i1 [[TMP11]], label %[[VECTOR_IF_BB:.*]], label %[[VECTOR_BODY_SPLIT]]
; CHECK: [[VECTOR_IF_BB]]:
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP4]], i32 8, <4 x i1> [[TMP8]])
; CHECK-NEXT: br label %[[VECTOR_BODY_SPLIT]]
; CHECK: [[VECTOR_BODY_SPLIT]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
; CHECK-NEXT: br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
; CHECK: [[IF_THEN9]]:
; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
; CHECK-NEXT: store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: br label %[[FOR_INC]]
; CHECK: [[FOR_INC]]:
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[FOR_END_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_END]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
entry:
%cmp1 = icmp sgt i32 %reg.4.val, 0
br i1 %cmp1, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
%sh_prom = zext nneg i32 %control1 to i64
%shl = shl nuw i64 1, %sh_prom
%sh_prom5 = zext nneg i32 %control2 to i64
%shl6 = shl nuw i64 1, %sh_prom5
%sh_prom10 = zext nneg i32 %target to i64
%shl11 = shl nuw nsw i64 1, %sh_prom10
%wide.trip.count = zext nneg i32 %reg.4.val to i64
%0 = freeze i64 %shl6
%1 = or i64 %shl, %0
br label %for.body

for.body:
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
%2 = load i64, ptr %arrayidx, align 8
%3 = and i64 %2, %1
%or.cond.not = icmp eq i64 %3, %1
br i1 %or.cond.not, label %if.then9, label %for.inc

if.then9:
%xor = xor i64 %2, %shl11
store i64 %xor, ptr %arrayidx, align 8
br label %for.inc

for.inc:
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.end.loopexit, label %for.body

for.end.loopexit:
br label %for.end

for.end:
ret void
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.