Skip to content

Commit a6edeed

Browse files
committed
Revert "[LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. (#149042)"
This reverts commit 62d1a08. It appears to be causing runtime failures on RISC-V; see https://lab.llvm.org/buildbot/#/builders/210/builds/5221
1 parent 79cd1b7 commit a6edeed

21 files changed

+584
-1626
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2097,6 +2097,24 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
20972097
for (const auto &Reduction : getReductionVars())
20982098
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
20992099

2100+
// TODO: handle non-reduction outside users when tail is folded by masking.
2101+
for (auto *AE : AllowedExit) {
2102+
// Check that all users of allowed exit values are inside the loop or
2103+
// are the live-out of a reduction.
2104+
if (ReductionLiveOuts.count(AE))
2105+
continue;
2106+
for (User *U : AE->users()) {
2107+
Instruction *UI = cast<Instruction>(U);
2108+
if (TheLoop->contains(UI))
2109+
continue;
2110+
LLVM_DEBUG(
2111+
dbgs()
2112+
<< "LV: Cannot fold tail by masking, loop has an outside user for "
2113+
<< *UI << "\n");
2114+
return false;
2115+
}
2116+
}
2117+
21002118
for (const auto &Entry : getInductionVars()) {
21012119
PHINode *OrigPhi = Entry.first;
21022120
for (User *U : OrigPhi->users()) {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8893,8 +8893,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
88938893
if (FinalReductionResult == U || Parent->getParent())
88948894
continue;
88958895
U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
8896-
if (match(U, m_CombineOr(m_ExtractLastElement(m_VPValue()),
8897-
m_ExtractLane(m_VPValue(), m_VPValue()))))
8896+
if (match(U, m_ExtractLastElement(m_VPValue())))
88988897
cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
88998898
}
89008899

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,13 +1047,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10471047
// It produces the lane index across all unrolled iterations. Unrolling will
10481048
// add all copies of its original operand as additional operands.
10491049
FirstActiveLane,
1050-
// Calculates the last active lane index of the vector predicate operands.
1051-
// The predicates must be prefix-masks (all 1s before all 0s). Used when
1052-
// tail-folding to extract the correct live-out value from the last active
1053-
// iteration. It produces the lane index across all unrolled iterations.
1054-
// Unrolling will add all copies of its original operand as additional
1055-
// operands.
1056-
LastActiveLane,
10571050

10581051
// The opcodes below are used for VPInstructionWithType.
10591052
//

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
115115
case VPInstruction::ExtractLane:
116116
return inferScalarType(R->getOperand(1));
117117
case VPInstruction::FirstActiveLane:
118-
case VPInstruction::LastActiveLane:
119118
return Type::getIntNTy(Ctx, 64);
120119
case VPInstruction::ExtractLastElement:
121120
case VPInstruction::ExtractLastLanePerPart:

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -395,24 +395,12 @@ m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) {
395395
return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1);
396396
}
397397

398-
template <typename Op0_t, typename Op1_t>
399-
inline VPInstruction_match<VPInstruction::ExtractLane, Op0_t, Op1_t>
400-
m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1) {
401-
return m_VPInstruction<VPInstruction::ExtractLane>(Op0, Op1);
402-
}
403-
404398
template <typename Op0_t>
405399
inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
406400
m_ExtractLastLanePerPart(const Op0_t &Op0) {
407401
return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
408402
}
409403

410-
template <typename Op0_t>
411-
inline VPInstruction_match<VPInstruction::ExtractPenultimateElement, Op0_t>
412-
m_ExtractPenultimateElement(const Op0_t &Op0) {
413-
return m_VPInstruction<VPInstruction::ExtractPenultimateElement>(Op0);
414-
}
415-
416404
template <typename Op0_t, typename Op1_t, typename Op2_t>
417405
inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
418406
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
@@ -441,16 +429,6 @@ m_FirstActiveLane(const Op0_t &Op0) {
441429
return m_VPInstruction<VPInstruction::FirstActiveLane>(Op0);
442430
}
443431

444-
template <typename Op0_t>
445-
inline VPInstruction_match<VPInstruction::LastActiveLane, Op0_t>
446-
m_LastActiveLane(const Op0_t &Op0) {
447-
return m_VPInstruction<VPInstruction::LastActiveLane>(Op0);
448-
}
449-
450-
inline VPInstruction_match<VPInstruction::StepVector> m_StepVector() {
451-
return m_VPInstruction<VPInstruction::StepVector>();
452-
}
453-
454432
template <unsigned Opcode, typename Op0_t>
455433
inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) {
456434
return AllRecipe_match<Opcode, Op0_t>(Op0);

llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp

Lines changed: 5 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ class VPPredicator {
4444
/// possibly inserting new recipes at \p Dst (using Builder's insertion point)
4545
VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
4646

47+
/// Returns the *entry* mask for \p VPBB.
48+
VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
49+
return BlockMaskCache.lookup(VPBB);
50+
}
51+
4752
/// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
4853
/// already have a mask.
4954
void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
@@ -63,11 +68,6 @@ class VPPredicator {
6368
}
6469

6570
public:
66-
/// Returns the *entry* mask for \p VPBB.
67-
VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
68-
return BlockMaskCache.lookup(VPBB);
69-
}
70-
7171
/// Returns the precomputed predicate of the edge from \p Src to \p Dst.
7272
VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
7373
return EdgeMaskCache.lookup({Src, Dst});
@@ -301,34 +301,5 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
301301

302302
PrevVPBB = VPBB;
303303
}
304-
305-
// If we folded the tail and introduced a header mask, any extract of the
306-
// last element must be updated to extract from the last active lane of the
307-
// header mask instead (i.e., the lane corresponding to the last active
308-
// iteration).
309-
if (FoldTail) {
310-
assert(Plan.getExitBlocks().size() == 1 &&
311-
"only a single-exit block is supported currently");
312-
VPBasicBlock *EB = Plan.getExitBlocks().front();
313-
assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() &&
314-
"the exit block must have middle block as single predecessor");
315-
316-
VPBuilder B(Plan.getMiddleBlock()->getTerminator());
317-
for (auto &P : EB->phis()) {
318-
auto *ExitIRI = cast<VPIRPhi>(&P);
319-
VPValue *Inc = ExitIRI->getIncomingValue(0);
320-
VPValue *Op;
321-
if (!match(Inc, m_ExtractLastElement(m_VPValue(Op))))
322-
continue;
323-
324-
// Compute the index of the last active lane.
325-
VPValue *HeaderMask = Predicator.getBlockInMask(Header);
326-
VPValue *LastActiveLane =
327-
B.createNaryOp(VPInstruction::LastActiveLane, HeaderMask);
328-
auto *Ext =
329-
B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op});
330-
Inc->replaceAllUsesWith(Ext);
331-
}
332-
}
333304
return Predicator.getBlockMaskCache();
334305
}

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
547547
case VPInstruction::ExtractLastLanePerPart:
548548
case VPInstruction::ExtractPenultimateElement:
549549
case VPInstruction::FirstActiveLane:
550-
case VPInstruction::LastActiveLane:
551550
case VPInstruction::Not:
552551
case VPInstruction::Unpack:
553552
return 1;
@@ -1157,29 +1156,6 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
11571156
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
11581157
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
11591158
}
1160-
case VPInstruction::LastActiveLane: {
1161-
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1162-
if (VF.isScalar())
1163-
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1164-
CmpInst::makeCmpResultType(ScalarTy),
1165-
CmpInst::ICMP_EQ, Ctx.CostKind);
1166-
// Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1167-
auto *PredTy = toVectorTy(ScalarTy, VF);
1168-
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1169-
Type::getInt64Ty(Ctx.LLVMCtx),
1170-
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1171-
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1172-
// Add cost of NOT operation on the predicate.
1173-
Cost += Ctx.TTI.getArithmeticInstrCost(
1174-
Instruction::Xor, PredTy, Ctx.CostKind,
1175-
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1176-
{TargetTransformInfo::OK_UniformConstantValue,
1177-
TargetTransformInfo::OP_None});
1178-
// Add cost of SUB operation on the index.
1179-
Cost += Ctx.TTI.getArithmeticInstrCost(
1180-
Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
1181-
return Cost;
1182-
}
11831159
case VPInstruction::FirstOrderRecurrenceSplice: {
11841160
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
11851161
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1234,7 +1210,6 @@ bool VPInstruction::isVectorToScalar() const {
12341210
getOpcode() == Instruction::ExtractElement ||
12351211
getOpcode() == VPInstruction::ExtractLane ||
12361212
getOpcode() == VPInstruction::FirstActiveLane ||
1237-
getOpcode() == VPInstruction::LastActiveLane ||
12381213
getOpcode() == VPInstruction::ComputeAnyOfResult ||
12391214
getOpcode() == VPInstruction::ComputeFindIVResult ||
12401215
getOpcode() == VPInstruction::ComputeReductionResult ||
@@ -1301,7 +1276,6 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
13011276
case VPInstruction::ActiveLaneMask:
13021277
case VPInstruction::ExplicitVectorLength:
13031278
case VPInstruction::FirstActiveLane:
1304-
case VPInstruction::LastActiveLane:
13051279
case VPInstruction::FirstOrderRecurrenceSplice:
13061280
case VPInstruction::LogicalAnd:
13071281
case VPInstruction::Not:
@@ -1478,9 +1452,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
14781452
case VPInstruction::FirstActiveLane:
14791453
O << "first-active-lane";
14801454
break;
1481-
case VPInstruction::LastActiveLane:
1482-
O << "last-active-lane";
1483-
break;
14841455
case VPInstruction::ReductionStartVector:
14851456
O << "reduction-start-vector";
14861457
break;

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 4 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -805,8 +805,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
805805
VPValue *Op,
806806
ScalarEvolution &SE) {
807807
VPValue *Incoming, *Mask;
808-
if (!match(Op, m_ExtractLane(m_FirstActiveLane(m_VPValue(Mask)),
809-
m_VPValue(Incoming))))
808+
if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
809+
m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming))))
810810
return nullptr;
811811

812812
auto *WideIV = getOptimizableIVOf(Incoming, SE);
@@ -1274,7 +1274,8 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
12741274
}
12751275

12761276
// Look through ExtractPenultimateElement (BuildVector ....).
1277-
if (match(Def, m_ExtractPenultimateElement(m_BuildVector()))) {
1277+
if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1278+
m_BuildVector()))) {
12781279
auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
12791280
Def->replaceAllUsesWith(
12801281
BuildVector->getOperand(BuildVector->getNumOperands() - 2));
@@ -2085,32 +2086,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
20852086
// Set the first operand of RecurSplice to FOR again, after replacing
20862087
// all users.
20872088
RecurSplice->setOperand(0, FOR);
2088-
2089-
// Check for users extracting at the penultimate active lane of the FOR.
2090-
// If only a single lane is active in the current iteration, we need to
2091-
// select the last element from the previous iteration (from the FOR phi
2092-
// directly).
2093-
for (VPUser *U : RecurSplice->users()) {
2094-
if (!match(U, m_ExtractLane(m_LastActiveLane(m_VPValue()),
2095-
m_Specific(RecurSplice))))
2096-
continue;
2097-
2098-
VPBuilder B(cast<VPInstruction>(U));
2099-
VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2100-
Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2101-
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2102-
VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2103-
VPValue *PenultimateIndex =
2104-
B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2105-
VPValue *PenultimateLastIter =
2106-
B.createNaryOp(VPInstruction::ExtractLane,
2107-
{PenultimateIndex, FOR->getBackedgeValue()});
2108-
VPValue *LastPrevIter =
2109-
B.createNaryOp(VPInstruction::ExtractLastElement, FOR);
2110-
VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2111-
VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2112-
cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2113-
}
21142089
}
21152090
return true;
21162091
}
@@ -3500,34 +3475,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
35003475
ToRemove.push_back(Expr);
35013476
}
35023477

3503-
// Expand LastActiveLane into Not + FirstActiveLane + Sub.
3504-
auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3505-
if (LastActiveL &&
3506-
LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3507-
// Create Not(Mask) for all operands.
3508-
SmallVector<VPValue *, 2> NotMasks;
3509-
for (VPValue *Op : LastActiveL->operands()) {
3510-
VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3511-
NotMasks.push_back(NotMask);
3512-
}
3513-
3514-
// Create FirstActiveLane on the inverted masks.
3515-
VPValue *FirstInactiveLane = Builder.createNaryOp(
3516-
VPInstruction::FirstActiveLane, NotMasks,
3517-
LastActiveL->getDebugLoc(), "first.inactive.lane");
3518-
3519-
// Subtract 1 to get the last active lane.
3520-
VPValue *One = Plan.getOrAddLiveIn(
3521-
ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3522-
VPValue *LastLane = Builder.createNaryOp(
3523-
Instruction::Sub, {FirstInactiveLane, One},
3524-
LastActiveL->getDebugLoc(), "last.active.lane");
3525-
3526-
LastActiveL->replaceAllUsesWith(LastLane);
3527-
ToRemove.push_back(LastActiveL);
3528-
continue;
3529-
}
3530-
35313478
VPValue *VectorStep;
35323479
VPValue *ScalarStep;
35333480
if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,6 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
352352
VPValue *Op1;
353353
if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
354354
match(&R, m_FirstActiveLane(m_VPValue(Op1))) ||
355-
match(&R, m_LastActiveLane(m_VPValue(Op1))) ||
356355
match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
357356
m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
358357
match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
@@ -365,21 +364,17 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
365364
continue;
366365
}
367366
VPValue *Op0;
368-
if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) {
367+
if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>(
368+
m_VPValue(Op0), m_VPValue(Op1)))) {
369369
addUniformForAllParts(cast<VPInstruction>(&R));
370370
for (unsigned Part = 1; Part != UF; ++Part)
371371
R.addOperand(getValueForPart(Op1, Part));
372372
continue;
373373
}
374374
if (match(&R, m_ExtractLastElement(m_VPValue(Op0))) ||
375-
match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
375+
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
376+
m_VPValue(Op0)))) {
376377
addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
377-
if (isa<VPFirstOrderRecurrencePHIRecipe>(Op0)) {
378-
assert(match(&R, m_ExtractLastElement(m_VPValue())) &&
379-
"can only extract last element of FOR");
380-
continue;
381-
}
382-
383378
if (Plan.hasScalarVFOnly()) {
384379
auto *I = cast<VPInstruction>(&R);
385380
// Extracting from end with VF = 1 implies retrieving the last or

0 commit comments

Comments
 (0)