Skip to content

Commit 75e1cf4

Browse files
committed
[COST]Improve cost model for shuffles in SLP.
Introduced masks where they were not previously added, and improved the target-dependent cost models to avoid returning incorrect cost results once the masks are added. Differential Revision: https://reviews.llvm.org/D100486
1 parent db013b2 commit 75e1cf4

File tree

9 files changed

+217
-152
lines changed

9 files changed

+217
-152
lines changed

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
413413
void processShuffleMasks(
414414
ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
415415
unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
416-
function_ref<void(ArrayRef<int>, unsigned)> SingleInputAction,
416+
function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
417417
function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);
418418

419419
/// Compute a map of integer instructions to their minimum legal type

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,7 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
499499
void llvm::processShuffleMasks(
500500
ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
501501
unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
502-
function_ref<void(ArrayRef<int>, unsigned)> SingleInputAction,
502+
function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
503503
function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction) {
504504
SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
505505
// Try to perform better estimation of the permutation.
@@ -543,7 +543,7 @@ void llvm::processShuffleMasks(
543543
auto *It =
544544
find_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
545545
unsigned SrcReg = std::distance(Dest.begin(), It);
546-
SingleInputAction(*It, SrcReg);
546+
SingleInputAction(*It, SrcReg, I);
547547
break;
548548
}
549549
default: {

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2523,7 +2523,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
25232523
/*NumOfUsedRegs=*/1,
25242524
[&Output, &DAG = DAG, NewVT]() { Output = DAG.getUNDEF(NewVT); },
25252525
[&Output, &DAG = DAG, NewVT, &DL, &Inputs,
2526-
&BuildVector](ArrayRef<int> Mask, unsigned Idx) {
2526+
&BuildVector](ArrayRef<int> Mask, unsigned Idx, unsigned /*Unused*/) {
25272527
if (Inputs[Idx]->getOpcode() == ISD::BUILD_VECTOR)
25282528
Output = BuildVector(Inputs[Idx], Inputs[Idx], Mask);
25292529
else

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,6 +1224,60 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12241224
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
12251225
LegalVT.getVectorNumElements());
12261226

1227+
if (!Mask.empty() && NumOfDests.isValid()) {
1228+
// Try to perform better estimation of the permutation.
1229+
// 1. Split the source/destination vectors into real registers.
1230+
// 2. Do the mask analysis to identify which real registers are
1231+
// permuted. If more than 1 source registers are used for the
1232+
// destination register building, the cost for this destination register
1233+
// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1234+
// source register is used, build mask and calculate the cost as a cost
1235+
// of PermuteSingleSrc.
1236+
// Also, for the single register permute we try to identify if the
1237+
// destination register is just a copy of the source register or the
1238+
// copy of the previous destination register (the cost is
1239+
// TTI::TCC_Basic). If the source register is just reused, the cost for
1240+
// this operation is 0.
1241+
unsigned NormalizedVF = LT.second.getVectorNumElements() * NumOfSrcs;
1242+
SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
1243+
copy(Mask, NormalizedMask.begin());
1244+
unsigned E = *NumOfDests.getValue();
1245+
unsigned PrevSrcReg = 0;
1246+
ArrayRef<int> PrevRegMask;
1247+
InstructionCost Cost = 0;
1248+
processShuffleMasks(
1249+
NormalizedMask, NumOfSrcs, E, E, []() {},
1250+
[this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
1251+
&Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1252+
if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
1253+
// Check if the previous register can be just copied to the next
1254+
// one.
1255+
if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1256+
PrevRegMask != RegMask)
1257+
Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1258+
RegMask, 0, nullptr);
1259+
else
1260+
// Just a copy of previous destination register.
1261+
Cost += TTI::TCC_Basic;
1262+
return;
1263+
}
1264+
if (SrcReg != DestReg &&
1265+
any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
1266+
// Just a copy of the source register.
1267+
Cost += TTI::TCC_Basic;
1268+
}
1269+
PrevSrcReg = SrcReg;
1270+
PrevRegMask = RegMask;
1271+
},
1272+
[this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
1273+
unsigned /*Unused*/,
1274+
unsigned /*Unused*/) {
1275+
Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1276+
0, nullptr);
1277+
});
1278+
return Cost;
1279+
}
1280+
12271281
InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
12281282
return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
12291283
None, 0, nullptr);

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5059,6 +5059,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
50595059
// Process extracts in blocks of EltsPerVector to check if the source vector
50605060
// operand can be re-used directly. If not, add the cost of creating a shuffle
50615061
// to extract the values into a vector register.
5062+
SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
50625063
for (auto *V : VL) {
50635064
++Idx;
50645065

@@ -5068,6 +5069,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
50685069

50695070
// Reached the start of a new vector register.
50705071
if (Idx % EltsPerVector == 0) {
5072+
RegMask.assign(EltsPerVector, UndefMaskElem);
50715073
AllConsecutive = true;
50725074
continue;
50735075
}
@@ -5079,6 +5081,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
50795081
unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
50805082
AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
50815083
CurrentIdx % EltsPerVector == Idx % EltsPerVector;
5084+
RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
50825085
}
50835086

50845087
if (AllConsecutive)
@@ -5093,7 +5096,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
50935096
// cost to extract a vector with EltsPerVector elements.
50945097
Cost += TTI.getShuffleCost(
50955098
TargetTransformInfo::SK_PermuteSingleSrc,
5096-
FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
5099+
FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
50975100
}
50985101
return Cost;
50995102
}
@@ -5880,16 +5883,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
58805883
TTI::CastContextHint::None, CostKind);
58815884
}
58825885

5883-
SmallVector<int> Mask;
5884-
buildShuffleEntryMask(
5885-
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
5886-
[E](Instruction *I) {
5887-
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
5888-
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
5889-
},
5890-
Mask);
5891-
CommonCost =
5892-
TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
5886+
if (E->ReuseShuffleIndices.empty()) {
5887+
CommonCost =
5888+
TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
5889+
} else {
5890+
SmallVector<int> Mask;
5891+
buildShuffleEntryMask(
5892+
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
5893+
[E](Instruction *I) {
5894+
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
5895+
return I->getOpcode() == E->getAltOpcode();
5896+
},
5897+
Mask);
5898+
CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
5899+
FinalVecTy, Mask);
5900+
}
58935901
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
58945902
return CommonCost + VecCost - ScalarCost;
58955903
}
@@ -6278,7 +6286,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
62786286
Cost += SpillCost + ExtractCost;
62796287
if (FirstUsers.size() == 1) {
62806288
int Limit = ShuffleMask.front().size() * 2;
6281-
if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) &&
6289+
if (!all_of(ShuffleMask.front(),
6290+
[Limit](int Idx) { return Idx < Limit; }) ||
62826291
!ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
62836292
InstructionCost C = TTI->getShuffleCost(
62846293
TTI::SK_PermuteSingleSrc,
@@ -6327,6 +6336,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
63276336
<< "SLP: Current total cost = " << Cost << "\n");
63286337
Cost -= InsertCost;
63296338
for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
6339+
if (ShuffleMask[I].empty())
6340+
continue;
63306341
// Other elements - permutation of 2 vectors (the initial one and the
63316342
// next Ith incoming vector).
63326343
unsigned VF = ShuffleMask[I].size();

0 commit comments

Comments
 (0)