@@ -3883,7 +3883,6 @@ class BoUpSLP {
3883
3883
enum CombinedOpcode {
3884
3884
NotCombinedOp = -1,
3885
3885
MinMax = Instruction::OtherOpsEnd + 1,
3886
- FMulAdd,
3887
3886
};
3888
3887
CombinedOpcode CombinedOp = NotCombinedOp;
3889
3888
@@ -4034,9 +4033,6 @@ class BoUpSLP {
4034
4033
/// Returns true if any scalar in the list is a copyable element.
4035
4034
bool hasCopyableElements() const { return !CopyableElements.empty(); }
4036
4035
4037
- /// Returns the state of the operations.
4038
- const InstructionsState &getOperations() const { return S; }
4039
-
4040
4036
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
4041
4037
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
4042
4038
unsigned findLaneForValue(Value *V) const {
@@ -11991,82 +11987,6 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
11991
11987
}
11992
11988
}
11993
11989
11994
- static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
11995
- const InstructionsState &S,
11996
- DominatorTree &DT, const DataLayout &DL,
11997
- TargetTransformInfo &TTI,
11998
- const TargetLibraryInfo &TLI) {
11999
- assert(all_of(VL,
12000
- [](Value *V) {
12001
- return V->getType()->getScalarType()->isFloatingPointTy();
12002
- }) &&
12003
- "Can only convert to FMA for floating point types");
12004
- assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12005
-
12006
- auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12007
- FastMathFlags FMF;
12008
- FMF.set();
12009
- for (Value *V : VL) {
12010
- auto *I = dyn_cast<Instruction>(V);
12011
- if (!I)
12012
- continue;
12013
- // TODO: support for copyable elements.
12014
- Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12015
- if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12016
- continue;
12017
- if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12018
- FMF &= FPCI->getFastMathFlags();
12019
- }
12020
- return FMF.allowContract();
12021
- };
12022
- if (!CheckForContractable(VL))
12023
- return InstructionCost::getInvalid();
12024
- // fmul also should be contractable
12025
- InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12026
- SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12027
-
12028
- InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12029
- if (!OpS.valid())
12030
- return InstructionCost::getInvalid();
12031
- if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12032
- return InstructionCost::getInvalid();
12033
- if (!CheckForContractable(Operands.front()))
12034
- return InstructionCost::getInvalid();
12035
- // Compare the costs.
12036
- InstructionCost FMulPlusFAddCost = 0;
12037
- InstructionCost FMACost = 0;
12038
- constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12039
- FastMathFlags FMF;
12040
- FMF.set();
12041
- for (Value *V : VL) {
12042
- auto *I = dyn_cast<Instruction>(V);
12043
- if (!I)
12044
- continue;
12045
- if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12046
- FMF &= FPCI->getFastMathFlags();
12047
- FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12048
- }
12049
- unsigned NumOps = 0;
12050
- for (auto [V, Op] : zip(VL, Operands.front())) {
12051
- auto *I = dyn_cast<Instruction>(Op);
12052
- if (!I || !I->hasOneUse()) {
12053
- if (auto *OpI = dyn_cast<Instruction>(V))
12054
- FMACost += TTI.getInstructionCost(OpI, CostKind);
12055
- if (I)
12056
- FMACost += TTI.getInstructionCost(I, CostKind);
12057
- continue;
12058
- }
12059
- ++NumOps;
12060
- if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12061
- FMF &= FPCI->getFastMathFlags();
12062
- FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12063
- }
12064
- Type *Ty = VL.front()->getType();
12065
- IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12066
- FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12067
- return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12068
- }
12069
-
12070
11990
void BoUpSLP::transformNodes() {
12071
11991
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12072
11992
BaseGraphSize = VectorizableTree.size();
@@ -12435,25 +12355,6 @@ void BoUpSLP::transformNodes() {
12435
12355
}
12436
12356
break;
12437
12357
}
12438
- case Instruction::FSub:
12439
- case Instruction::FAdd: {
12440
- // Check if possible to convert (a*b)+c to fma.
12441
- if (E.State != TreeEntry::Vectorize ||
12442
- !E.getOperations().isAddSubLikeOp())
12443
- break;
12444
- if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
12445
- .isValid())
12446
- break;
12447
- // This node is a fmuladd node.
12448
- E.CombinedOp = TreeEntry::FMulAdd;
12449
- TreeEntry *FMulEntry = getOperandEntry(&E, 0);
12450
- if (FMulEntry->UserTreeIndex &&
12451
- FMulEntry->State == TreeEntry::Vectorize) {
12452
- // The FMul node is part of the combined fmuladd node.
12453
- FMulEntry->State = TreeEntry::CombinedVectorize;
12454
- }
12455
- break;
12456
- }
12457
12358
default:
12458
12359
break;
12459
12360
}
@@ -13686,11 +13587,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
13686
13587
}
13687
13588
return IntrinsicCost;
13688
13589
};
13689
- auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
13690
- Instruction *VI) {
13691
- InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
13692
- return Cost;
13693
- };
13694
13590
switch (ShuffleOrOp) {
13695
13591
case Instruction::PHI: {
13696
13592
// Count reused scalars.
@@ -14031,30 +13927,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14031
13927
};
14032
13928
return GetCostDiff(GetScalarCost, GetVectorCost);
14033
13929
}
14034
- case TreeEntry::FMulAdd: {
14035
- auto GetScalarCost = [&](unsigned Idx) {
14036
- if (isa<PoisonValue>(UniqueValues[Idx]))
14037
- return InstructionCost(TTI::TCC_Free);
14038
- return GetFMulAddCost(E->getOperations(),
14039
- cast<Instruction>(UniqueValues[Idx]));
14040
- };
14041
- auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14042
- FastMathFlags FMF;
14043
- FMF.set();
14044
- for (Value *V : E->Scalars) {
14045
- if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14046
- FMF &= FPCI->getFastMathFlags();
14047
- if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14048
- FMF &= FPCIOp->getFastMathFlags();
14049
- }
14050
- }
14051
- IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14052
- {VecTy, VecTy, VecTy}, FMF);
14053
- InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14054
- return VecCost + CommonCost;
14055
- };
14056
- return GetCostDiff(GetScalarCost, GetVectorCost);
14057
- }
14058
13930
case Instruction::FNeg:
14059
13931
case Instruction::Add:
14060
13932
case Instruction::FAdd:
@@ -14092,16 +13964,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14092
13964
}
14093
13965
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14094
13966
TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14095
- InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14096
- ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14097
- if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14098
- I && (ShuffleOrOp == Instruction::FAdd ||
14099
- ShuffleOrOp == Instruction::FSub)) {
14100
- InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14101
- if (IntrinsicCost.isValid())
14102
- ScalarCost = IntrinsicCost;
14103
- }
14104
- return ScalarCost;
13967
+ return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
13968
+ Op1Info, Op2Info, Operands);
14105
13969
};
14106
13970
auto GetVectorCost = [=](InstructionCost CommonCost) {
14107
13971
if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
@@ -22730,21 +22594,11 @@ class HorizontalReduction {
22730
22594
/// Try to find a reduction tree.
22731
22595
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
22732
22596
ScalarEvolution &SE, const DataLayout &DL,
22733
- const TargetLibraryInfo &TLI,
22734
- DominatorTree &DT, TargetTransformInfo &TTI) {
22597
+ const TargetLibraryInfo &TLI) {
22735
22598
RdxKind = HorizontalReduction::getRdxKind(Root);
22736
22599
if (!isVectorizable(RdxKind, Root))
22737
22600
return false;
22738
22601
22739
- // FMA reduction root - skip.
22740
- auto CheckForFMA = [&](Instruction *I) {
22741
- return RdxKind == RecurKind::FAdd &&
22742
- canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
22743
- .isValid();
22744
- };
22745
- if (CheckForFMA(Root))
22746
- return false;
22747
-
22748
22602
// Analyze "regular" integer/FP types for reductions - no target-specific
22749
22603
// types or pointers.
22750
22604
Type *Ty = Root->getType();
@@ -22782,7 +22636,7 @@ class HorizontalReduction {
22782
22636
// Also, do not try to reduce const values, if the operation is not
22783
22637
// foldable.
22784
22638
if (!EdgeInst || Level > RecursionMaxDepth ||
22785
- getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) ||
22639
+ getRdxKind(EdgeInst) != RdxKind ||
22786
22640
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
22787
22641
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
22788
22642
!isVectorizable(RdxKind, EdgeInst) ||
@@ -24351,13 +24205,13 @@ bool SLPVectorizerPass::vectorizeHorReduction(
24351
24205
Stack.emplace(SelectRoot(), 0);
24352
24206
SmallPtrSet<Value *, 8> VisitedInstrs;
24353
24207
bool Res = false;
24354
- auto TryToReduce = [this, &R, TTI = TTI ](Instruction *Inst) -> Value * {
24208
+ auto && TryToReduce = [this, &R](Instruction *Inst) -> Value * {
24355
24209
if (R.isAnalyzedReductionRoot(Inst))
24356
24210
return nullptr;
24357
24211
if (!isReductionCandidate(Inst))
24358
24212
return nullptr;
24359
24213
HorizontalReduction HorRdx;
24360
- if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI ))
24214
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
24361
24215
return nullptr;
24362
24216
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
24363
24217
};
@@ -24423,12 +24277,6 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
24423
24277
24424
24278
if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
24425
24279
return false;
24426
- // Skip potential FMA candidates.
24427
- if ((I->getOpcode() == Instruction::FAdd ||
24428
- I->getOpcode() == Instruction::FSub) &&
24429
- canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
24430
- .isValid())
24431
- return false;
24432
24280
24433
24281
Value *P = I->getParent();
24434
24282
0 commit comments