Skip to content

Commit 13d8ba7

Browse files
authored
[LV][TTI] Calculate cost of extracting last index in a scalable vector (#144086)
There are a couple of places in the loop vectoriser where we want to calculate the cost of extracting the last lane in a vector. However, we wrongly assume that asking for the cost of extracting lane (VF.getKnownMinValue() - 1) is an accurate representation of the cost of extracting the last lane. For SVE at least, this is non-trivial as it requires the use of whilelo and lastb instructions. To solve this problem I have added a new getIndexedVectorInstrCostFromEnd interface where the index is counted backwards from the end of the vector. For a vector with a given ElementCount EC, the extracted/inserted lane is EC - 1 - Index. For scalable vectors this lane is unknown at compile time. I've added an AArch64 hook that better represents the cost, and also a RISCV hook that maintains compatibility with the behaviour prior to this PR. I've also taken the liberty of adding support in VPlan for calculating the cost of VPInstruction::ExtractLastElement.
1 parent 0c622d7 commit 13d8ba7

File tree

11 files changed

+106
-23
lines changed

11 files changed

+106
-23
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,6 +1512,14 @@ class TargetTransformInfo {
15121512
TTI::TargetCostKind CostKind,
15131513
unsigned Index = -1) const;
15141514

1515+
/// \return The expected cost of inserting or extracting a lane that is \p
1516+
/// Index elements from the end of a vector, i.e. the mathematical expression
1517+
/// for the lane is (VF - 1 - Index). This is required for scalable vectors
1518+
/// where the exact lane index is unknown at compile time.
1519+
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(
1520+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
1521+
unsigned Index) const;
1522+
15151523
/// \return The expected cost of aggregate inserts and extracts. This is
15161524
/// used when the instruction is not available; a typical use case is to
15171525
/// provision the cost of vectorization/scalarization in vectorizer passes.

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,13 @@ class TargetTransformInfoImplBase {
809809
return 1;
810810
}
811811

812+
virtual InstructionCost
813+
getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
814+
TTI::TargetCostKind CostKind,
815+
unsigned Index) const {
816+
return 1;
817+
}
818+
812819
virtual InstructionCost
813820
getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
814821
const APInt &DemandedDstElts,

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1444,6 +1444,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
14441444
Op1);
14451445
}
14461446

1447+
InstructionCost
1448+
getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
1449+
TTI::TargetCostKind CostKind,
1450+
unsigned Index) const override {
1451+
unsigned NewIndex = -1;
1452+
if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
1453+
assert(Index < FVTy->getNumElements() &&
1454+
"Unexpected index from end of vector");
1455+
NewIndex = FVTy->getNumElements() - 1 - Index;
1456+
}
1457+
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex, nullptr,
1458+
nullptr);
1459+
}
1460+
14471461
InstructionCost
14481462
getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
14491463
const APInt &DemandedDstElts,

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,6 +1130,15 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
11301130
return Cost;
11311131
}
11321132

1133+
InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd(
1134+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
1135+
unsigned Index) const {
1136+
InstructionCost Cost =
1137+
TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
1138+
assert(Cost >= 0 && "TTI should not produce negative costs!");
1139+
return Cost;
1140+
}
1141+
11331142
InstructionCost TargetTransformInfo::getInsertExtractValueCost(
11341143
unsigned Opcode, TTI::TargetCostKind CostKind) const {
11351144
assert((Opcode == Instruction::InsertValue ||

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3983,6 +3983,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
39833983
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
39843984
}
39853985

3986+
InstructionCost
3987+
AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
3988+
TTI::TargetCostKind CostKind,
3989+
unsigned Index) const {
3990+
if (isa<FixedVectorType>(Val))
3991+
return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
3992+
Index);
3993+
3994+
// This typically requires both while and lastb instructions in order
3995+
// to extract the last element. If this is in a loop the while
3996+
// instruction can at least be hoisted out, although it will consume a
3997+
// predicate register. The cost should be more expensive than the base
3998+
// extract cost, which is 2 for most CPUs.
3999+
return CostKind == TTI::TCK_CodeSize
4000+
? 2
4001+
: ST->getVectorInsertExtractBaseCost() + 1;
4002+
}
4003+
39864004
InstructionCost AArch64TTIImpl::getScalarizationOverhead(
39874005
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
39884006
TTI::TargetCostKind CostKind, bool ForPoisonSrc,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,11 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
220220
TTI::TargetCostKind CostKind,
221221
unsigned Index) const override;
222222

223+
InstructionCost
224+
getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
225+
TTI::TargetCostKind CostKind,
226+
unsigned Index) const override;
227+
223228
InstructionCost
224229
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
225230
TTI::TargetCostKind CostKind) const override;

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2415,6 +2415,24 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
24152415
return BaseCost + SlideCost;
24162416
}
24172417

2418+
InstructionCost
2419+
RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
2420+
TTI::TargetCostKind CostKind,
2421+
unsigned Index) const {
2422+
if (isa<FixedVectorType>(Val))
2423+
return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
2424+
Index);
2425+
2426+
// TODO: This code replicates what LoopVectorize.cpp used to do when asking
2427+
// for the cost of extracting the last lane of a scalable vector. It probably
2428+
// needs a more accurate cost.
2429+
ElementCount EC = cast<VectorType>(Val)->getElementCount();
2430+
assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2431+
return getVectorInstrCost(Opcode, Val, CostKind,
2432+
EC.getKnownMinValue() - 1 - Index, nullptr,
2433+
nullptr);
2434+
}
2435+
24182436
InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
24192437
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
24202438
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,11 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
243243
unsigned Index, const Value *Op0,
244244
const Value *Op1) const override;
245245

246+
InstructionCost
247+
getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
248+
TTI::TargetCostKind CostKind,
249+
unsigned Index) const override;
250+
246251
InstructionCost getArithmeticInstrCost(
247252
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
248253
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5303,13 +5303,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
53035303
// VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
53045304
// the actual generated code, which involves extracting the last element of
53055305
// a scalable vector where the lane to extract is unknown at compile time.
5306-
return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5307-
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5308-
CostKind) +
5309-
(IsLoopInvariantStoreValue
5310-
? 0
5311-
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5312-
CostKind, VF.getKnownMinValue() - 1));
5306+
InstructionCost Cost =
5307+
TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5308+
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5309+
if (!IsLoopInvariantStoreValue)
5310+
Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
5311+
VectorTy, CostKind, 0);
5312+
return Cost;
53135313
}
53145314

53155315
InstructionCost

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,6 +1013,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
10131013
I32Ty, {Arg0Ty, I32Ty, I1Ty});
10141014
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
10151015
}
1016+
case VPInstruction::ExtractLastElement: {
1017+
// Add on the cost of extracting the element.
1018+
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1019+
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1020+
VecTy, Ctx.CostKind, 0);
1021+
}
10161022
case VPInstruction::ExtractPenultimateElement:
10171023
if (VF == ElementCount::getScalable(1))
10181024
return InstructionCost::getInvalid();

0 commit comments

Comments
 (0)