Skip to content

Commit 2471f47

Browse files
committed
[CostModel][AArch64] Make extractelement, with fmul user, free whenever possible
In case of Neon, if there exists an extractelement from lane != 0 such that
  1. the extractelement does not necessitate a move from vector_reg -> GPR,
  2. the extractelement result feeds into fmul, and
  3. the other operand of fmul is a scalar or an extractelement from lane 0 or a lane equivalent to 0,
then the extractelement can be merged with fmul in the backend and it incurs no cost.

e.g.
  define double @foo(<2 x double> %a) {
    %1 = extractelement <2 x double> %a, i32 0
    %2 = extractelement <2 x double> %a, i32 1
    %res = fmul double %1, %2
    ret double %res
  }

%2 and %res can be merged in the backend to generate:
  fmul d0, d0, v0.d[1]

The change was tested with SPEC FP (C/C++) on Neoverse V2.
Compile time impact: None.
Performance impact: 1.3-1.7% uplift observed on the lbm benchmark with -flto, depending on the configuration.
1 parent 924a64a commit 2471f47

File tree

9 files changed

+303
-72
lines changed

9 files changed

+303
-72
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1392,6 +1392,19 @@ class TargetTransformInfo {
13921392
unsigned Index = -1, Value *Op0 = nullptr,
13931393
Value *Op1 = nullptr) const;
13941394

1395+
/// \return The expected cost of vector Insert and Extract.
1396+
/// Use -1 to indicate that there is no information on the index value.
1397+
/// This is used when the instruction is not available; a typical use
1398+
/// case is to provision the cost of vectorization/scalarization in
1399+
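/// vectorizer passes.
/// \p Scalar is the scalar value the extract stands for; \p ScalarAndIdxToUser
/// maps each {scalar, lane} pair to its users and \p UserToScalarAndIdx maps
/// each user back to the {scalar, lane} pairs it uses, so that a target can
/// inspect the users of the extract when no instruction exists yet.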
1400+
InstructionCost getVectorInstrCost(
1401+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1402+
Value *Op0, Value *Op1, Value *Scalar,
1403+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
1404+
&ScalarAndIdxToUser,
1405+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
1406+
&UserToScalarAndIdx) const;
1407+
13951408
/// \return The expected cost of vector Insert and Extract.
13961409
/// This is used when instruction is available, and implementation
13971410
/// asserts 'I' is not nullptr.
@@ -2062,6 +2075,14 @@ class TargetTransformInfo::Concept {
20622075
TTI::TargetCostKind CostKind,
20632076
unsigned Index, Value *Op0,
20642077
Value *Op1) = 0;
2078+
2079+
virtual InstructionCost getVectorInstrCost(
2080+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2081+
Value *Op0, Value *Op1, Value *Scalar,
2082+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
2083+
&ScalarAndIdxToUser,
2084+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
2085+
&UserToScalarAndIdx) = 0;
20652086
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
20662087
TTI::TargetCostKind CostKind,
20672088
unsigned Index) = 0;
@@ -2726,6 +2747,17 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
27262747
Value *Op1) override {
27272748
return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
27282749
}
2750+
InstructionCost getVectorInstrCost(
2751+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2752+
Value *Op0, Value *Op1, Value *Scalar,
2753+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
2754+
&ScalarAndIdxToUser,
2755+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
2756+
&UserToScalarAndIdx) override {
2757+
return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2758+
Scalar, ScalarAndIdxToUser,
2759+
UserToScalarAndIdx);
2760+
}
27292761
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
27302762
TTI::TargetCostKind CostKind,
27312763
unsigned Index) override {

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,16 @@ class TargetTransformInfoImplBase {
683683
return 1;
684684
}
685685

686+
InstructionCost getVectorInstrCost(
687+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
688+
Value *Op0, Value *Op1, Value *Scalar,
689+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
690+
&ScalarAndIdxToUser,
691+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
692+
&UserToScalarAndIdx) const {
693+
return 1;
694+
}
695+
686696
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
687697
TTI::TargetCostKind CostKind,
688698
unsigned Index) const {

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,12 +1277,23 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
12771277
return 1;
12781278
}
12791279

1280-
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1281-
TTI::TargetCostKind CostKind,
1282-
unsigned Index, Value *Op0, Value *Op1) {
1280+
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1281+
TTI::TargetCostKind CostKind,
1282+
unsigned Index, Value *Op0,
1283+
Value *Op1) {
12831284
return getRegUsageForType(Val->getScalarType());
12841285
}
12851286

1287+
InstructionCost getVectorInstrCost(
1288+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1289+
Value *Op0, Value *Op1, Value *Scalar,
1290+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
1291+
&ScalarAndIdxToUser,
1292+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
1293+
&UserToScalarAndIdx) {
1294+
return getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1295+
}
1296+
12861297
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
12871298
TTI::TargetCostKind CostKind,
12881299
unsigned Index) {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,6 +1037,23 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
10371037
return Cost;
10381038
}
10391039

1040+
InstructionCost TargetTransformInfo::getVectorInstrCost(
1041+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1042+
Value *Op0, Value *Op1, Value *Scalar,
1043+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
1044+
&ScalarAndIdxToUser,
1045+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
1046+
&UserToScalarAndIdx) const {
1047+
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
1048+
// This is mentioned in the interface description and respected by all
1049+
// callers, but never asserted upon.
1050+
InstructionCost Cost = TTIImpl->getVectorInstrCost(
1051+
Opcode, Val, CostKind, Index, Op0, Op1, Scalar, ScalarAndIdxToUser,
1052+
UserToScalarAndIdx);
1053+
assert(Cost >= 0 && "TTI should not produce negative costs!");
1054+
return Cost;
1055+
}
1056+
10401057
InstructionCost
10411058
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
10421059
TTI::TargetCostKind CostKind,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 159 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,20 @@
1616
#include "llvm/CodeGen/BasicTTIImpl.h"
1717
#include "llvm/CodeGen/CostTable.h"
1818
#include "llvm/CodeGen/TargetLowering.h"
19+
#include "llvm/IR/DerivedTypes.h"
20+
#include "llvm/IR/InstrTypes.h"
21+
#include "llvm/IR/Instruction.h"
22+
#include "llvm/IR/Instructions.h"
1923
#include "llvm/IR/IntrinsicInst.h"
2024
#include "llvm/IR/Intrinsics.h"
2125
#include "llvm/IR/IntrinsicsAArch64.h"
2226
#include "llvm/IR/PatternMatch.h"
27+
#include "llvm/Support/Casting.h"
2328
#include "llvm/Support/Debug.h"
2429
#include "llvm/Transforms/InstCombine/InstCombiner.h"
2530
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
2631
#include <algorithm>
32+
#include <cassert>
2733
#include <optional>
2834
using namespace llvm;
2935
using namespace llvm::PatternMatch;
@@ -3145,12 +3151,20 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
31453151
return 0;
31463152
}
31473153

3148-
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
3149-
Type *Val,
3150-
unsigned Index,
3151-
bool HasRealUse) {
3154+
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3155+
std::variant<const Instruction *, const unsigned> InstOrOpcode, Type *Val,
3156+
unsigned Index, bool HasRealUse, Value *Scalar,
3157+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
3158+
&ScalarAndIdxToUser,
3159+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
3160+
&UserToScalarAndIdx) {
31523161
assert(Val->isVectorTy() && "This must be a vector type");
31533162

3163+
const Instruction *I =
3164+
(std::holds_alternative<const Instruction *>(InstOrOpcode)
3165+
? get<const Instruction *>(InstOrOpcode)
3166+
: nullptr);
3167+
31543168
if (Index != -1U) {
31553169
// Legalize the type.
31563170
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3194,6 +3208,134 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
31943208
// compile-time considerations.
31953209
}
31963210

3211+
// In case of Neon, if there exists extractelement from lane != 0 such that
3212+
// 1. extractelement does not necessitate a move from vector_reg -> GPR.
3213+
// 2. extractelement result feeds into fmul.
3214+
// 3. Other operand of fmul is a scalar or extractelement from lane 0 or lane
3215+
// equivalent to 0.
3216+
// then the extractelement can be merged with fmul in the backend and it
3217+
// incurs no cost.
3218+
// e.g.
3219+
// define double @foo(<2 x double> %a) {
3220+
// %1 = extractelement <2 x double> %a, i32 0
3221+
// %2 = extractelement <2 x double> %a, i32 1
3222+
// %res = fmul double %1, %2
3223+
// ret double %res
3224+
// }
3225+
// %2 and %res can be merged in the backend to generate fmul d0, d0, v0.d[1]
3226+
auto ExtractCanFuseWithFmul = [&]() {
3227+
// We bail out if the extract is from lane 0.
3228+
if (Index == 0)
3229+
return false;
3230+
3231+
// Check if the scalar element type of the vector operand of ExtractElement
3232+
// instruction is one of the allowed types.
3233+
auto IsAllowedScalarTy = [&](const Type *T) {
3234+
return T->isFloatTy() || T->isDoubleTy() ||
3235+
(T->isHalfTy() && ST->hasFullFP16());
3236+
};
3237+
3238+
// Check if the extractelement user is scalar fmul.
3239+
auto IsUserFMulScalarTy = [](const Value *EEUser) {
3240+
// Check if the user is scalar fmul.
3241+
const BinaryOperator *BO = dyn_cast_if_present<BinaryOperator>(EEUser);
3242+
return BO && BO->getOpcode() == BinaryOperator::FMul &&
3243+
!BO->getType()->isVectorTy();
3244+
};
3245+
3246+
// InstCombine combines fmul with fadd/fsub. Hence, extractelement fusion
3247+
// with fmul does not happen.
3248+
auto IsFMulUserFAddFSub = [](const Value *FMul) {
3249+
return any_of(FMul->users(), [](const User *U) {
3250+
const BinaryOperator *BO = dyn_cast_if_present<BinaryOperator>(U);
3251+
return (BO && (BO->getOpcode() == BinaryOperator::FAdd ||
3252+
BO->getOpcode() == BinaryOperator::FSub));
3253+
});
3254+
};
3255+
3256+
// Check if the type constraints on input vector type and result scalar type
3257+
// of extractelement instruction are satisfied.
3258+
auto TypeConstraintsOnEESatisfied =
3259+
[&IsAllowedScalarTy](const Type *VectorTy, const Type *ScalarTy) {
3260+
return isa<FixedVectorType>(VectorTy) && IsAllowedScalarTy(ScalarTy);
3261+
};
3262+
3263+
// Check if the extract index is from lane 0 or lane equivalent to 0 for a
3264+
// certain scalar type and a certain vector register width.
3265+
auto IsExtractLaneEquivalentToZero = [&](const unsigned &Idx,
3266+
const unsigned &EltSz) {
3267+
auto RegWidth =
3268+
getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3269+
.getFixedValue();
3270+
return (Idx == 0 || (Idx * EltSz) % RegWidth == 0);
3271+
};
3272+
3273+
if (std::holds_alternative<const unsigned>(InstOrOpcode)) {
3274+
if (!TypeConstraintsOnEESatisfied(Val, Val->getScalarType()))
3275+
return false;
3276+
const auto &ScalarIdxPair = std::make_pair(Scalar, Index);
3277+
return ScalarAndIdxToUser.find(ScalarIdxPair) !=
3278+
ScalarAndIdxToUser.end() &&
3279+
all_of(ScalarAndIdxToUser.at(ScalarIdxPair), [&](Value *U) {
3280+
if (!IsUserFMulScalarTy(U) || IsFMulUserFAddFSub(U))
3281+
return false;
3282+
// 1. Check if the other operand is extract from lane 0 or lane
3283+
// equivalent to 0.
3284+
// 2. In case of SLP, if the other operand is not extract from
3285+
// same tree, we bail out since we can not analyze that extract.
3286+
return UserToScalarAndIdx.at(U).size() == 2 &&
3287+
all_of(UserToScalarAndIdx.at(U), [&](auto &P) {
3288+
if (ScalarIdxPair == P)
3289+
return true; // Skip.
3290+
return IsExtractLaneEquivalentToZero(
3291+
P.second, Val->getScalarSizeInBits());
3292+
});
3293+
});
3294+
} else {
3295+
const ExtractElementInst *EE = cast<ExtractElementInst>(I);
3296+
3297+
const ConstantInt *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3298+
if (!IdxOp)
3299+
return false;
3300+
3301+
if (!TypeConstraintsOnEESatisfied(EE->getVectorOperand()->getType(),
3302+
EE->getType()))
3303+
return false;
3304+
3305+
return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3306+
if (!IsUserFMulScalarTy(U) || IsFMulUserFAddFSub(U))
3307+
return false;
3308+
3309+
// Check if the other operand of fmul is also an extractelement
3310+
// from lane equivalent to 0.
3311+
const BinaryOperator *BO = cast<BinaryOperator>(U);
3312+
const ExtractElementInst *OtherEE = dyn_cast<ExtractElementInst>(
3313+
BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3314+
if (OtherEE) {
3315+
const ConstantInt *IdxOp =
3316+
dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3317+
if (!IdxOp)
3318+
return false;
3319+
return IsExtractLaneEquivalentToZero(
3320+
cast<ConstantInt>(OtherEE->getIndexOperand())
3321+
->getValue()
3322+
.getZExtValue(),
3323+
OtherEE->getType()->getScalarSizeInBits());
3324+
}
3325+
return true;
3326+
});
3327+
}
3328+
return false;
3329+
};
3330+
3331+
if (std::holds_alternative<const unsigned>(InstOrOpcode)) {
3332+
const unsigned &Opcode = get<const unsigned>(InstOrOpcode);
3333+
if (Opcode == Instruction::ExtractElement && ExtractCanFuseWithFmul())
3334+
return 0;
3335+
} else if (I && I->getOpcode() == Instruction::ExtractElement &&
3336+
ExtractCanFuseWithFmul())
3337+
return 0;
3338+
31973339
// All other insert/extracts cost this much.
31983340
return ST->getVectorInsertExtractBaseCost();
31993341
}
@@ -3207,6 +3349,19 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
32073349
return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
32083350
}
32093351

3352+
InstructionCost AArch64TTIImpl::getVectorInstrCost(
3353+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3354+
Value *Op0, Value *Op1, Value *Scalar,
3355+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
3356+
&ScalarAndIdxToUser,
3357+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
3358+
&UserToScalarAndIdx) {
3359+
bool HasRealUse =
3360+
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3361+
return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse, Scalar,
3362+
ScalarAndIdxToUser, UserToScalarAndIdx);
3363+
}
3364+
32103365
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
32113366
Type *Val,
32123367
TTI::TargetCostKind CostKind,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "llvm/CodeGen/BasicTTIImpl.h"
2525
#include "llvm/IR/Function.h"
2626
#include "llvm/IR/Intrinsics.h"
27+
#include <climits>
2728
#include <cstdint>
2829
#include <optional>
2930

@@ -66,8 +67,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
6667
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
6768
// indicates whether the vector instruction is available in the input IR or
6869
// just imaginary in vectorizer passes.
69-
InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val,
70-
unsigned Index, bool HasRealUse);
70+
InstructionCost getVectorInstrCostHelper(
71+
std::variant<const Instruction *, const unsigned> InstOrOpcode, Type *Val,
72+
unsigned Index, bool HasRealUse, Value *Scalar = nullptr,
73+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
74+
&ScalarAndIdxToUser =
75+
DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>(),
76+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
77+
&UserToScalarAndIdx = DenseMap<
78+
Value *, SmallVector<std::pair<Value *, unsigned>, 4>>());
7179

7280
public:
7381
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
@@ -185,6 +193,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
185193
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
186194
TTI::TargetCostKind CostKind,
187195
unsigned Index, Value *Op0, Value *Op1);
196+
197+
InstructionCost getVectorInstrCost(
198+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
199+
Value *Op0, Value *Op1, Value *Scalar,
200+
const DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
201+
&ScalarAndIdxToUser,
202+
const DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
203+
&UserToScalarAndIdx);
204+
188205
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
189206
TTI::TargetCostKind CostKind,
190207
unsigned Index);

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11633,6 +11633,17 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1163311633
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
1163411634
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
1163511635
SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
11636+
// Keep track of {Scalar, Index} -> User and User -> {Scalar, Index}.
11637+
// On AArch64, this helps in fusing a mov instruction, associated with
11638+
// extractelement, with fmul in the backend so that extractelement is free.
11639+
DenseMap<std::pair<Value *, unsigned>, SmallVector<Value *, 4>>
11640+
ScalarAndIdxToUser;
11641+
DenseMap<Value *, SmallVector<std::pair<Value *, unsigned>, 4>>
11642+
UserToScalarAndIdx;
11643+
for (ExternalUser &EU : ExternalUses) {
11644+
UserToScalarAndIdx[EU.User].push_back({EU.Scalar, EU.Lane});
11645+
ScalarAndIdxToUser[{EU.Scalar, EU.Lane}].push_back(EU.User);
11646+
}
1163611647
for (ExternalUser &EU : ExternalUses) {
1163711648
// Uses by ephemeral values are free (because the ephemeral value will be
1163811649
// removed prior to code generation, and so the extraction will be
@@ -11739,8 +11750,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1173911750
ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
1174011751
VecTy, EU.Lane);
1174111752
} else {
11742-
ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
11743-
CostKind, EU.Lane);
11753+
ExtraCost = TTI->getVectorInstrCost(
11754+
Instruction::ExtractElement, VecTy, CostKind, EU.Lane, nullptr,
11755+
nullptr, EU.Scalar, ScalarAndIdxToUser, UserToScalarAndIdx);
1174411756
}
1174511757
// Leave the scalar instructions as is if they are cheaper than extracts.
1174611758
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||

0 commit comments

Comments
 (0)