Skip to content

Commit 7ce8513

Browse files
committed
[LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. (WIP)
Building on top of llvm#148817, use ExtractLane + FirstActiveLane to support vectorizing external users when tail-folding. Currently marked as WIP because there is a regression when -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue is used: we bail out while building VPlans, so we cannot recover and switch to non-tail-folding. Ideally we would have built both VPlans up front (llvm#148882).
1 parent 84d6014 commit 7ce8513

File tree

9 files changed

+470
-191
lines changed

9 files changed

+470
-191
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1929,24 +1929,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
19291929
for (const auto &Reduction : getReductionVars())
19301930
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
19311931

1932-
// TODO: handle non-reduction outside users when tail is folded by masking.
1933-
for (auto *AE : AllowedExit) {
1934-
// Check that all users of allowed exit values are inside the loop or
1935-
// are the live-out of a reduction.
1936-
if (ReductionLiveOuts.count(AE))
1937-
continue;
1938-
for (User *U : AE->users()) {
1939-
Instruction *UI = cast<Instruction>(U);
1940-
if (TheLoop->contains(UI))
1941-
continue;
1942-
LLVM_DEBUG(
1943-
dbgs()
1944-
<< "LV: Cannot fold tail by masking, loop has an outside user for "
1945-
<< *UI << "\n");
1946-
return false;
1947-
}
1948-
}
1949-
19501932
for (const auto &Entry : getInductionVars()) {
19511933
PHINode *OrigPhi = Entry.first;
19521934
for (User *U : OrigPhi->users()) {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8446,7 +8446,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
84468446
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
84478447
/// users in the original exit block using the VPIRInstruction wrapping to the
84488448
/// LCSSA phi.
8449-
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
8449+
static bool addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
8450+
using namespace llvm::VPlanPatternMatch;
8451+
84508452
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
84518453
auto *ScalarPHVPBB = Plan.getScalarPreheader();
84528454
auto *MiddleVPBB = Plan.getMiddleBlock();
@@ -8465,6 +8467,15 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
84658467
assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
84668468
"Cannot handle loops with uncountable early exits");
84678469

8470+
// TODO: Support ExtractLane of last-active-lane with first-order
8471+
// recurrences.
8472+
8473+
if (any_of(FOR->users(), [FOR](VPUser *U) {
8474+
return match(U, m_VPInstruction<VPInstruction::ExtractLane>(
8475+
m_VPValue(), m_Specific(FOR)));
8476+
}))
8477+
return false;
8478+
84688479
// This is the second phase of vectorizing first-order recurrences, creating
84698480
// extract for users outside the loop. An overview of the transformation is
84708481
// described below. Suppose we have the following loop with some use after
@@ -8536,24 +8547,25 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
85368547
// Extract the penultimate value of the recurrence and use it as operand for
85378548
// the VPIRInstruction modeling the phi.
85388549
for (VPUser *U : FOR->users()) {
8539-
using namespace llvm::VPlanPatternMatch;
85408550
if (!match(U, m_VPInstruction<VPInstruction::ExtractLastElement>(
85418551
m_Specific(FOR))))
85428552
continue;
8553+
85438554
// For VF vscale x 1, if vscale = 1, we are unable to extract the
85448555
// penultimate value of the recurrence. Instead we rely on the existing
85458556
// extract of the last element from the result of
85468557
// VPInstruction::FirstOrderRecurrenceSplice.
85478558
// TODO: Consider vscale_range info and UF.
85488559
if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
85498560
Range))
8550-
return;
8561+
return true;
85518562
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
85528563
VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
85538564
{}, "vector.recur.extract.for.phi");
85548565
cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
85558566
}
85568567
}
8568+
return true;
85578569
}
85588570

85598571
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
@@ -8758,7 +8770,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
87588770
R->setOperand(1, WideIV->getStepValue());
87598771
}
87608772

8761-
addExitUsersForFirstOrderRecurrences(*Plan, Range);
8773+
if (!addExitUsersForFirstOrderRecurrences(*Plan, Range))
8774+
return nullptr;
87628775
DenseMap<VPValue *, VPValue *> IVEndValues;
87638776
addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
87648777

@@ -9170,7 +9183,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91709183
continue;
91719184
U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
91729185
if (match(U, m_VPInstruction<VPInstruction::ExtractLastElement>(
9173-
m_VPValue())))
9186+
m_VPValue())) ||
9187+
match(U, m_VPInstruction<VPInstruction::ExtractLane>(m_VPValue(),
9188+
m_VPValue())))
91749189
cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
91759190
}
91769191

llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@
1414
#include "VPRecipeBuilder.h"
1515
#include "VPlan.h"
1616
#include "VPlanCFG.h"
17+
#include "VPlanPatternMatch.h"
1718
#include "VPlanTransforms.h"
1819
#include "VPlanUtils.h"
1920
#include "llvm/ADT/PostOrderIterator.h"
2021

2122
using namespace llvm;
23+
using namespace VPlanPatternMatch;
2224

2325
namespace {
2426
class VPPredicator {
@@ -42,11 +44,6 @@ class VPPredicator {
4244
/// possibly inserting new recipes at \p Dst (using Builder's insertion point)
4345
VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
4446

45-
/// Returns the *entry* mask for \p VPBB.
46-
VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
47-
return BlockMaskCache.lookup(VPBB);
48-
}
49-
5047
/// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
5148
/// already have a mask.
5249
void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
@@ -66,6 +63,11 @@ class VPPredicator {
6663
}
6764

6865
public:
66+
/// Returns the *entry* mask for \p VPBB.
67+
VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
68+
return BlockMaskCache.lookup(VPBB);
69+
}
70+
6971
/// Returns the precomputed predicate of the edge from \p Src to \p Dst.
7072
VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
7173
return EdgeMaskCache.lookup({Src, Dst});
@@ -300,5 +302,45 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
300302

301303
PrevVPBB = VPBB;
302304
}
305+
306+
// If we folded the tail and introduced a header mask, any extract of the last element must be updated to only extract the last-active-lane of the header mask.
307+
if (FoldTail) {
308+
assert(Plan.getExitBlocks().size() == 1 &&
309+
"only a single-exit block is supported currently");
310+
VPBasicBlock *EB = Plan.getExitBlocks().front();
311+
assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() &&
312+
"the exit block must have middle block as single predecessor");
313+
314+
VPValue *LastActiveLane = nullptr;
315+
VPBuilder B(Plan.getMiddleBlock()->getTerminator());
316+
for (auto &P : EB->phis()) {
317+
auto *ExitIRI = cast<VPIRPhi>(&P);
318+
VPValue *Inc = ExitIRI->getIncomingValue(0);
319+
VPValue *Op;
320+
if (!match(Inc, m_VPInstruction<VPInstruction::ExtractLastElement>(
321+
m_VPValue(Op))))
322+
continue;
323+
324+
if (!LastActiveLane) {
325+
// Compute the index of the last active lane, by getting the
326+
// first-active-lane of the negated header mask (which is the first lane
327+
// the original header mask was false) and subtract 1.
328+
VPValue *HeaderMask = Predicator.getBlockInMask(
329+
Plan.getVectorLoopRegion()->getEntryBasicBlock());
330+
LastActiveLane = B.createNaryOp(
331+
Instruction::Sub,
332+
{B.createNaryOp(VPInstruction::FirstActiveLane,
333+
{B.createNot(HeaderMask)}),
334+
Plan.getOrAddLiveIn(ConstantInt::get(
335+
IntegerType::get(
336+
Plan.getScalarHeader()->getIRBasicBlock()->getContext(),
337+
64),
338+
1))});
339+
}
340+
auto *Ext =
341+
B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op});
342+
Inc->replaceAllUsesWith(Ext);
343+
}
344+
}
303345
return Predicator.getBlockMaskCache();
304346
}

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -901,8 +901,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
901901
unsigned LastOpIdx = getNumOperands() - 1;
902902
Value *Res = nullptr;
903903
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
904-
Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
905-
Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
904+
Value *TrailingZeros =
905+
State.VF.isScalar()
906+
? Builder.CreateZExt(
907+
Builder.CreateICmpEQ(State.get(getOperand(Idx)),
908+
Builder.getInt1(0)),
909+
Builder.getInt64Ty())
910+
: Builder.CreateCountTrailingZeroElems(
911+
// Value *TrailingZeros =
912+
// Builder.CreateCountTrailingZeroElems(
913+
Builder.getInt64Ty(), State.get(getOperand(Idx)), true,
914+
Name);
906915
Value *Current = Builder.CreateAdd(
907916
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
908917
if (Res) {
@@ -1093,7 +1102,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
10931102
default:
10941103
return false;
10951104
case Instruction::ExtractElement:
1096-
case VPInstruction::ExtractLane:
10971105
return Op == getOperand(1);
10981106
case Instruction::PHI:
10991107
return true;
@@ -1117,6 +1125,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
11171125
case VPInstruction::ComputeAnyOfResult:
11181126
case VPInstruction::ComputeFindIVResult:
11191127
return Op == getOperand(1);
1128+
case VPInstruction::ExtractLane:
1129+
return Op == getOperand(0);
11201130
};
11211131
llvm_unreachable("switch should return");
11221132
}

llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -315,17 +315,51 @@ for.end:
315315
define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
316316
; CHECK-LABEL: @uniform_load(
317317
; CHECK-NEXT: entry:
318+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
319+
; CHECK: vector.ph:
320+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
321+
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
322+
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
323+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
324+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
325+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
326+
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
327+
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
318328
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
319-
; CHECK: for.body:
320-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
329+
; CHECK: vector.body:
330+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
331+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[IV]], i64 1025)
321332
; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
333+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
334+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
322335
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
323-
; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
324-
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
325-
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
326-
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
336+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
337+
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
338+
; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
339+
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
340+
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
341+
; CHECK: middle.block:
342+
; CHECK-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true)
343+
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP9]], i1 true)
344+
; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
345+
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
346+
; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 2
347+
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 0
348+
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP11]]
349+
; CHECK-NEXT: br label [[FOR_END:%.*]]
350+
; CHECK: scalar.ph:
351+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
352+
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
353+
; CHECK: for.body:
354+
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
355+
; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8
356+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
357+
; CHECK-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
358+
; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
359+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 1025
360+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP13:![0-9]+]]
327361
; CHECK: for.end:
328-
; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ]
362+
; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], [[FOR_BODY1]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
329363
; CHECK-NEXT: ret i64 [[V_LCSSA]]
330364
;
331365
entry:
@@ -371,7 +405,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
371405
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
372406
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
373407
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
374-
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
408+
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
375409
; CHECK: middle.block:
376410
; CHECK-NEXT: br label [[FOR_END:%.*]]
377411
; CHECK: scalar.ph:
@@ -385,7 +419,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
385419
; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
386420
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
387421
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
388-
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
422+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
389423
; CHECK: for.end:
390424
; CHECK-NEXT: ret void
391425
;

0 commit comments

Comments
 (0)