Skip to content

Commit ef68ab3

Browse files
alexey-bataev authored and memfrob committed
[SLP] Improve the cost calculation for reused/reordered scalars.
Part of D105020. Also fixes the FIXMEs that called for using a wider vector type when calculating the cost of reused scalars. This may cause regressions unless D100486 lands to improve the cost estimation for shuffles of long vectors. Differential Revision: https://reviews.llvm.org/D106060
1 parent f713fea commit ef68ab3

File tree

3 files changed

+130
-140
lines changed

3 files changed

+130
-140
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 95 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -3621,6 +3621,27 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
36213621
return Cost;
36223622
}
36233623

3624+
/// Shuffles \p Mask in accordance with the given \p SubMask.
///
/// The update is done in place on \p Mask:
///  - if \p SubMask is empty, \p Mask is left untouched;
///  - if \p Mask is empty, it becomes a copy of \p SubMask;
///  - otherwise \p Mask is replaced by a mask with SubMask.size() elements
///    whose element I is Mask[SubMask[I]], or UndefMaskElem when SubMask[I]
///    is UndefMaskElem, when SubMask[I] is not below
///    min(Mask.size(), SubMask.size()), or when the looked-up value
///    Mask[SubMask[I]] is not below that same bound.
///
/// NOTE(review): a negative SubMask entry other than UndefMaskElem is not
/// filtered out before it is used to index \p Mask - presumably callers never
/// produce one; confirm.
3625+
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
3626+
if (SubMask.empty()) // Nothing to apply.
3627+
return;
3628+
if (Mask.empty()) { // No previous mask: SubMask itself is the result.
3629+
Mask.append(SubMask.begin(), SubMask.end());
3630+
return;
3631+
}
3632+
SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size()); // Fill value is a placeholder; the loop below assigns every element.
3633+
int TermValue = std::min(Mask.size(), SubMask.size()); // Exclusive upper bound for usable indices and looked-up values.
3634+
for (int I = 0, E = SubMask.size(); I < E; ++I) {
3635+
if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
3636+
Mask[SubMask[I]] >= TermValue) { // Mask is only indexed once the two SubMask[I] checks have passed.
3637+
NewMask[I] = UndefMaskElem;
3638+
continue;
3639+
}
3640+
NewMask[I] = Mask[SubMask[I]]; // Look the SubMask entry up through the existing mask.
3641+
}
3642+
Mask.swap(NewMask); // Publish the combined mask to the caller.
3643+
}
3644+
36243645
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
36253646
ArrayRef<Value *> VectorizedVals) {
36263647
ArrayRef<Value*> VL = E->Scalars;
@@ -3633,6 +3654,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
36333654
else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
36343655
ScalarTy = IE->getOperand(1)->getType();
36353656
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
3657+
auto *FinalVecTy = VecTy;
36363658
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
36373659

36383660
// If we have computed a smaller type for the expression, update VecTy so
@@ -3643,12 +3665,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
36433665

36443666
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
36453667
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
3646-
InstructionCost ReuseShuffleCost = 0;
3647-
if (NeedToShuffleReuses) {
3648-
ReuseShuffleCost =
3649-
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
3650-
E->ReuseShuffleIndices);
3651-
}
3668+
if (NeedToShuffleReuses)
3669+
FinalVecTy =
3670+
FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
36523671
// FIXME: it tries to fix a problem with MSVC buildbots.
36533672
TargetTransformInfo &TTIRef = *TTI;
36543673
auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
@@ -3737,23 +3756,26 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
37373756
dbgs()
37383757
<< "SLP: perfect diamond match for gather bundle that starts with "
37393758
<< *VL.front() << ".\n");
3759+
if (NeedToShuffleReuses)
3760+
GatherCost =
3761+
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3762+
FinalVecTy, E->ReuseShuffleIndices);
37403763
} else {
37413764
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
37423765
<< " entries for bundle that starts with "
37433766
<< *VL.front() << ".\n");
37443767
// Detected that instead of gather we can emit a shuffle of single/two
37453768
// previously vectorized nodes. Add the cost of the permutation rather
37463769
// than gather.
3747-
GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
3770+
::addMask(Mask, E->ReuseShuffleIndices);
3771+
GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
37483772
}
3749-
return ReuseShuffleCost + GatherCost;
3773+
return GatherCost;
37503774
}
37513775
if (isSplat(VL)) {
37523776
// Found the broadcasting of the single scalar, calculate the cost as the
37533777
// broadcast.
3754-
return ReuseShuffleCost +
3755-
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
3756-
0);
3778+
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
37573779
}
37583780
if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
37593781
allSameBlock(VL) &&
@@ -3771,11 +3793,36 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
37713793
InstructionCost Cost =
37723794
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
37733795
AdjustExtractsCost(Cost, /*IsGather=*/true);
3774-
return ReuseShuffleCost + Cost;
3796+
if (NeedToShuffleReuses)
3797+
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3798+
FinalVecTy, E->ReuseShuffleIndices);
3799+
return Cost;
37753800
}
37763801
}
3802+
InstructionCost ReuseShuffleCost = 0;
3803+
if (NeedToShuffleReuses)
3804+
ReuseShuffleCost = TTI->getShuffleCost(
3805+
TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
37773806
return ReuseShuffleCost + getGatherCost(VL);
37783807
}
3808+
InstructionCost CommonCost = 0;
3809+
SmallVector<int> Mask;
3810+
if (!E->ReorderIndices.empty()) {
3811+
SmallVector<int> NewMask;
3812+
if (E->getOpcode() == Instruction::Store) {
3813+
// For stores the order is actually a mask.
3814+
NewMask.resize(E->ReorderIndices.size());
3815+
copy(E->ReorderIndices, NewMask.begin());
3816+
} else {
3817+
inversePermutation(E->ReorderIndices, NewMask);
3818+
}
3819+
::addMask(Mask, NewMask);
3820+
}
3821+
if (NeedToShuffleReuses)
3822+
::addMask(Mask, E->ReuseShuffleIndices);
3823+
if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
3824+
CommonCost =
3825+
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
37793826
assert((E->State == TreeEntry::Vectorize ||
37803827
E->State == TreeEntry::ScatterVectorize) &&
37813828
"Unhandled state");
@@ -3797,34 +3844,28 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
37973844
for (unsigned I : E->ReuseShuffleIndices) {
37983845
if (ShuffleOrOp == Instruction::ExtractElement) {
37993846
auto *EE = cast<ExtractElementInst>(VL[I]);
3800-
ReuseShuffleCost -= TTI->getVectorInstrCost(
3801-
Instruction::ExtractElement, EE->getVectorOperandType(),
3802-
*getExtractIndex(EE));
3847+
CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
3848+
EE->getVectorOperandType(),
3849+
*getExtractIndex(EE));
38033850
} else {
3804-
ReuseShuffleCost -= TTI->getVectorInstrCost(
3805-
Instruction::ExtractElement, VecTy, Idx);
3851+
CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
3852+
VecTy, Idx);
38063853
++Idx;
38073854
}
38083855
}
38093856
Idx = ReuseShuffleNumbers;
38103857
for (Value *V : VL) {
38113858
if (ShuffleOrOp == Instruction::ExtractElement) {
38123859
auto *EE = cast<ExtractElementInst>(V);
3813-
ReuseShuffleCost += TTI->getVectorInstrCost(
3814-
Instruction::ExtractElement, EE->getVectorOperandType(),
3815-
*getExtractIndex(EE));
3860+
CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
3861+
EE->getVectorOperandType(),
3862+
*getExtractIndex(EE));
38163863
} else {
38173864
--Idx;
3818-
ReuseShuffleCost += TTI->getVectorInstrCost(
3819-
Instruction::ExtractElement, VecTy, Idx);
3865+
CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
3866+
VecTy, Idx);
38203867
}
38213868
}
3822-
CommonCost = ReuseShuffleCost;
3823-
} else if (!E->ReorderIndices.empty()) {
3824-
SmallVector<int> NewMask;
3825-
inversePermutation(E->ReorderIndices, NewMask);
3826-
CommonCost = TTI->getShuffleCost(
3827-
TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
38283869
}
38293870
if (ShuffleOrOp == Instruction::ExtractValue) {
38303871
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
@@ -3915,7 +3956,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
39153956
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
39163957
TTI::getCastContextHint(VL0), CostKind, VL0);
39173958
if (NeedToShuffleReuses) {
3918-
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3959+
CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
39193960
}
39203961

39213962
// Calculate the cost of this instruction.
@@ -3925,12 +3966,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
39253966
InstructionCost VecCost = 0;
39263967
// Check if the values are candidates to demote.
39273968
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
3928-
VecCost =
3929-
ReuseShuffleCost +
3930-
TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
3931-
TTI::getCastContextHint(VL0), CostKind, VL0);
3969+
VecCost = CommonCost + TTI->getCastInstrCost(
3970+
E->getOpcode(), VecTy, SrcVecTy,
3971+
TTI::getCastContextHint(VL0), CostKind, VL0);
39323972
}
3933-
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
3973+
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
39343974
return VecCost - ScalarCost;
39353975
}
39363976
case Instruction::FCmp:
@@ -3941,7 +3981,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
39413981
TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
39423982
CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
39433983
if (NeedToShuffleReuses) {
3944-
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3984+
CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
39453985
}
39463986
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
39473987
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -3982,8 +4022,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
39824022
CmpInst::BAD_ICMP_PREDICATE, CostKind);
39834023
VecCost = std::min(VecCost, IntrinsicCost);
39844024
}
3985-
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
3986-
return ReuseShuffleCost + VecCost - ScalarCost;
4025+
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
4026+
return CommonCost + VecCost - ScalarCost;
39874027
}
39884028
case Instruction::FNeg:
39894029
case Instruction::Add:
@@ -4046,14 +4086,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
40464086
TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
40474087
Op2VK, Op1VP, Op2VP, Operands, VL0);
40484088
if (NeedToShuffleReuses) {
4049-
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
4089+
CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
40504090
}
40514091
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
40524092
InstructionCost VecCost =
40534093
TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
40544094
Op2VK, Op1VP, Op2VP, Operands, VL0);
4055-
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
4056-
return ReuseShuffleCost + VecCost - ScalarCost;
4095+
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
4096+
return CommonCost + VecCost - ScalarCost;
40574097
}
40584098
case Instruction::GetElementPtr: {
40594099
TargetTransformInfo::OperandValueKind Op1VK =
@@ -4064,21 +4104,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
40644104
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
40654105
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
40664106
if (NeedToShuffleReuses) {
4067-
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
4107+
CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
40684108
}
40694109
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
40704110
InstructionCost VecCost = TTI->getArithmeticInstrCost(
40714111
Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
4072-
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
4073-
return ReuseShuffleCost + VecCost - ScalarCost;
4112+
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
4113+
return CommonCost + VecCost - ScalarCost;
40744114
}
40754115
case Instruction::Load: {
40764116
// Cost of wide load - cost of scalar loads.
40774117
Align Alignment = cast<LoadInst>(VL0)->getAlign();
40784118
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
40794119
Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
40804120
if (NeedToShuffleReuses) {
4081-
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
4121+
CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
40824122
}
40834123
InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
40844124
InstructionCost VecLdCost;
@@ -4095,14 +4135,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
40954135
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
40964136
/*VariableMask=*/false, Alignment, CostKind, VL0);
40974137
}
4098-
if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
4099-
SmallVector<int> NewMask;
4100-
inversePermutation(E->ReorderIndices, NewMask);
4101-
VecLdCost += TTI->getShuffleCost(
4102-
TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
4103-
}
4104-
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
4105-
return ReuseShuffleCost + VecLdCost - ScalarLdCost;
4138+
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
4139+
return CommonCost + VecLdCost - ScalarLdCost;
41064140
}
41074141
case Instruction::Store: {
41084142
// We know that we can merge the stores. Calculate the cost.
@@ -4115,14 +4149,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
41154149
InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
41164150
InstructionCost VecStCost = TTI->getMemoryOpCost(
41174151
Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
4118-
if (IsReorder) {
4119-
SmallVector<int> NewMask;
4120-
inversePermutation(E->ReorderIndices, NewMask);
4121-
VecStCost += TTI->getShuffleCost(
4122-
TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
4123-
}
4124-
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
4125-
return VecStCost - ScalarStCost;
4152+
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
4153+
return CommonCost + VecStCost - ScalarStCost;
41264154
}
41274155
case Instruction::Call: {
41284156
CallInst *CI = cast<CallInst>(VL0);
@@ -4133,7 +4161,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
41334161
InstructionCost ScalarEltCost =
41344162
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
41354163
if (NeedToShuffleReuses) {
4136-
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
4164+
CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
41374165
}
41384166
InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
41394167

@@ -4145,7 +4173,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
41454173
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
41464174
<< " for " << *CI << "\n");
41474175

4148-
return ReuseShuffleCost + VecCallCost - ScalarCallCost;
4176+
return CommonCost + VecCallCost - ScalarCallCost;
41494177
}
41504178
case Instruction::ShuffleVector: {
41514179
assert(E->isAltShuffle() &&
@@ -4158,11 +4186,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
41584186
if (NeedToShuffleReuses) {
41594187
for (unsigned Idx : E->ReuseShuffleIndices) {
41604188
Instruction *I = cast<Instruction>(VL[Idx]);
4161-
ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
4189+
CommonCost -= TTI->getInstructionCost(I, CostKind);
41624190
}
41634191
for (Value *V : VL) {
41644192
Instruction *I = cast<Instruction>(V);
4165-
ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
4193+
CommonCost += TTI->getInstructionCost(I, CostKind);
41664194
}
41674195
}
41684196
for (Value *V : VL) {
@@ -4196,8 +4224,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
41964224
}
41974225
VecCost +=
41984226
TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
4199-
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
4200-
return ReuseShuffleCost + VecCost - ScalarCost;
4227+
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
4228+
return CommonCost + VecCost - ScalarCost;
42014229
}
42024230
default:
42034231
llvm_unreachable("Unknown instruction");
@@ -4929,25 +4957,7 @@ class ShuffleInstructionBuilder {
49294957
addMask(NewMask);
49304958
}
49314959

4932-
void addMask(ArrayRef<int> SubMask) {
4933-
if (SubMask.empty())
4934-
return;
4935-
if (Mask.empty()) {
4936-
Mask.append(SubMask.begin(), SubMask.end());
4937-
return;
4938-
}
4939-
SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
4940-
int TermValue = std::min(Mask.size(), SubMask.size());
4941-
for (int I = 0, E = SubMask.size(); I < E; ++I) {
4942-
if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
4943-
Mask[SubMask[I]] >= TermValue) {
4944-
NewMask[I] = UndefMaskElem;
4945-
continue;
4946-
}
4947-
NewMask[I] = Mask[SubMask[I]];
4948-
}
4949-
Mask.swap(NewMask);
4950-
}
4960+
void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); } // Delegates to the global-scope ::addMask, passing `Mask` as the mask to update with SubMask.
49514961

49524962
Value *finalize(Value *V) {
49534963
IsFinalized = true;

0 commit comments

Comments
 (0)