Skip to content

Commit f4f9396

Browse files
committed
[CostModel][AArch64] Make extractelement, with fmul user, free whenever possible
In case of Neon, if there exists an extractelement from lane != 0 such that
  1. the extractelement does not necessitate a move from vector_reg -> GPR,
  2. the extractelement result feeds into fmul, and
  3. the other operand of fmul is a scalar or an extractelement from lane 0
     (or a lane equivalent to 0),
then the extractelement can be merged with fmul in the backend and it incurs
no cost.

e.g.
  define double @foo(<2 x double> %a) {
    %1 = extractelement <2 x double> %a, i32 0
    %2 = extractelement <2 x double> %a, i32 1
    %res = fmul double %1, %2
    ret double %res
  }

%2 and %res can be merged in the backend to generate:
  fmul d0, d0, v0.d[1]

The change was tested with SPEC FP (C/C++) on Neoverse-V2.
Compile-time impact: none.
Performance impact: observing a 1.3-1.7% uplift on the lbm benchmark with
-flto, depending upon the config.
1 parent 924a64a commit f4f9396

File tree

9 files changed

+261
-76
lines changed

9 files changed

+261
-76
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
2323

2424
#include "llvm/ADT/APInt.h"
25+
#include "llvm/ADT/ArrayRef.h"
2526
#include "llvm/IR/FMF.h"
2627
#include "llvm/IR/InstrTypes.h"
2728
#include "llvm/IR/PassManager.h"
@@ -1392,6 +1393,16 @@ class TargetTransformInfo {
13921393
unsigned Index = -1, Value *Op0 = nullptr,
13931394
Value *Op1 = nullptr) const;
13941395

1396+
/// \return The expected cost of vector Insert and Extract.
1397+
/// Use -1 to indicate that there is no information on the index value.
1398+
/// This is used when the instruction is not available; a typical use
1399+
/// case is to provision the cost of vectorization/scalarization in
1400+
/// vectorizer passes.
1401+
InstructionCost getVectorInstrCost(
1402+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1403+
Value *Scalar,
1404+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
1405+
13951406
/// \return The expected cost of vector Insert and Extract.
13961407
/// This is used when instruction is available, and implementation
13971408
/// asserts 'I' is not nullptr.
@@ -2062,6 +2073,12 @@ class TargetTransformInfo::Concept {
20622073
TTI::TargetCostKind CostKind,
20632074
unsigned Index, Value *Op0,
20642075
Value *Op1) = 0;
2076+
2077+
virtual InstructionCost getVectorInstrCost(
2078+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2079+
Value *Scalar,
2080+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) = 0;
2081+
20652082
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
20662083
TTI::TargetCostKind CostKind,
20672084
unsigned Index) = 0;
@@ -2726,6 +2743,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
27262743
Value *Op1) override {
27272744
return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
27282745
}
2746+
InstructionCost getVectorInstrCost(
2747+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2748+
Value *Scalar,
2749+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) override {
2750+
return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
2751+
ScalarUserAndIdx);
2752+
}
27292753
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
27302754
TTI::TargetCostKind CostKind,
27312755
unsigned Index) override {

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,13 @@ class TargetTransformInfoImplBase {
683683
return 1;
684684
}
685685

686+
InstructionCost getVectorInstrCost(
687+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
688+
Value *Scalar,
689+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
690+
return 1;
691+
}
692+
686693
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
687694
TTI::TargetCostKind CostKind,
688695
unsigned Index) const {

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#define LLVM_CODEGEN_BASICTTIIMPL_H
1818

1919
#include "llvm/ADT/APInt.h"
20-
#include "llvm/ADT/ArrayRef.h"
2120
#include "llvm/ADT/BitVector.h"
2221
#include "llvm/ADT/SmallPtrSet.h"
2322
#include "llvm/ADT/SmallVector.h"
@@ -1277,12 +1276,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
12771276
return 1;
12781277
}
12791278

1280-
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1281-
TTI::TargetCostKind CostKind,
1282-
unsigned Index, Value *Op0, Value *Op1) {
1279+
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1280+
TTI::TargetCostKind CostKind,
1281+
unsigned Index, Value *Op0,
1282+
Value *Op1) {
12831283
return getRegUsageForType(Val->getScalarType());
12841284
}
12851285

1286+
InstructionCost getVectorInstrCost(
1287+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1288+
Value *Scalar,
1289+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
1290+
return getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr, nullptr);
1291+
}
1292+
12861293
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
12871294
TTI::TargetCostKind CostKind,
12881295
unsigned Index) {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,6 +1037,19 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
10371037
return Cost;
10381038
}
10391039

1040+
InstructionCost TargetTransformInfo::getVectorInstrCost(
1041+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1042+
Value *Scalar,
1043+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
1044+
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
1045+
// This is mentioned in the interface description and respected by all
1046+
// callers, but never asserted upon.
1047+
InstructionCost Cost = TTIImpl->getVectorInstrCost(
1048+
Opcode, Val, CostKind, Index, Scalar, ScalarUserAndIdx);
1049+
assert(Cost >= 0 && "TTI should not produce negative costs!");
1050+
return Cost;
1051+
}
1052+
10401053
InstructionCost
10411054
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
10421055
TTI::TargetCostKind CostKind,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 147 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "AArch64ExpandImm.h"
1111
#include "AArch64PerfectShuffle.h"
1212
#include "MCTargetDesc/AArch64AddressingModes.h"
13+
#include "llvm/ADT/DenseMap.h"
1314
#include "llvm/Analysis/IVDescriptors.h"
1415
#include "llvm/Analysis/LoopInfo.h"
1516
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -3145,10 +3146,10 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
31453146
return 0;
31463147
}
31473148

3148-
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
3149-
Type *Val,
3150-
unsigned Index,
3151-
bool HasRealUse) {
3149+
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3150+
Type *Val, unsigned Index, bool HasRealUse, const Instruction *I,
3151+
std::optional<unsigned> Opcode, Value *Scalar,
3152+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
31523153
assert(Val->isVectorTy() && "This must be a vector type");
31533154

31543155
if (Index != -1U) {
@@ -3194,6 +3195,138 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
31943195
// compile-time considerations.
31953196
}
31963197

3198+
// In case of Neon, if there exists extractelement from lane != 0 such that
3199+
// 1. extractelement does not necessitate a move from vector_reg -> GPR.
3200+
// 2. extractelement result feeds into fmul.
3201+
// 3. Other operand of fmul is an extractelement from lane 0 or lane
3202+
// equivalent to 0.
3203+
// then the extractelement can be merged with fmul in the backend and it
3204+
// incurs no cost.
3205+
// e.g.
3206+
// define double @foo(<2 x double> %a) {
3207+
// %1 = extractelement <2 x double> %a, i32 0
3208+
// %2 = extractelement <2 x double> %a, i32 1
3209+
// %res = fmul double %1, %2
3210+
// ret double %res
3211+
// }
3212+
// %2 and %res can be merged in the backend to generate fmul d0, d0, v0.d[1]
3213+
auto ExtractCanFuseWithFmul = [&]() {
3214+
// We bail out if the extract is from lane 0.
3215+
if (Index == 0)
3216+
return false;
3217+
3218+
// Check if the scalar element type of the vector operand of ExtractElement
3219+
// instruction is one of the allowed types.
3220+
auto IsAllowedScalarTy = [&](const Type *T) {
3221+
return T->isFloatTy() || T->isDoubleTy() ||
3222+
(T->isHalfTy() && ST->hasFullFP16());
3223+
};
3224+
3225+
// Check if the extractelement user is scalar fmul.
3226+
auto IsUserFMulScalarTy = [](const Value *EEUser) {
3227+
// Check if the user is scalar fmul.
3228+
const auto *BO = dyn_cast_if_present<BinaryOperator>(EEUser);
3229+
return BO && BO->getOpcode() == BinaryOperator::FMul &&
3230+
!BO->getType()->isVectorTy();
3231+
};
3232+
3233+
// InstCombine combines fmul with fadd/fsub. Hence, extractelement fusion
3234+
// with fmul does not happen.
3235+
auto IsFMulUserFAddFSub = [](const Value *FMul) {
3236+
return any_of(FMul->users(), [](const User *U) {
3237+
const auto *BO = dyn_cast_if_present<BinaryOperator>(U);
3238+
return (BO && (BO->getOpcode() == BinaryOperator::FAdd ||
3239+
BO->getOpcode() == BinaryOperator::FSub));
3240+
});
3241+
};
3242+
3243+
// Check if the type constraints on input vector type and result scalar type
3244+
// of extractelement instruction are satisfied.
3245+
auto TypeConstraintsOnEESatisfied =
3246+
[&IsAllowedScalarTy](const Type *VectorTy, const Type *ScalarTy) {
3247+
return isa<FixedVectorType>(VectorTy) && IsAllowedScalarTy(ScalarTy);
3248+
};
3249+
3250+
// Check if the extract index is from lane 0 or lane equivalent to 0 for a
3251+
// certain scalar type and a certain vector register width.
3252+
auto IsExtractLaneEquivalentToZero = [&](const unsigned &Idx,
3253+
const unsigned &EltSz) {
3254+
auto RegWidth =
3255+
getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3256+
.getFixedValue();
3257+
return (Idx == 0 || (Idx * EltSz) % RegWidth == 0);
3258+
};
3259+
3260+
if (Opcode.has_value()) {
3261+
if (!TypeConstraintsOnEESatisfied(Val, Val->getScalarType()))
3262+
return false;
3263+
3264+
DenseMap<User *, unsigned> UserToExtractIdx;
3265+
for (auto *U : Scalar->users()) {
3266+
if (!IsUserFMulScalarTy(U) || IsFMulUserFAddFSub(U))
3267+
return false;
3268+
// Recording entry for the user is important. Index value is not
3269+
// important.
3270+
UserToExtractIdx[U];
3271+
}
3272+
for (auto &[S, U, L] : ScalarUserAndIdx) {
3273+
for (auto *U : S->users()) {
3274+
if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
3275+
auto *FMul = cast<BinaryOperator>(U);
3276+
auto *Op0 = FMul->getOperand(0);
3277+
auto *Op1 = FMul->getOperand(1);
3278+
if ((Op0 == S && Op1 == S) || (Op0 != S) || (Op1 != S)) {
3279+
UserToExtractIdx[U] = L;
3280+
break;
3281+
}
3282+
}
3283+
}
3284+
}
3285+
for (auto &[U, L] : UserToExtractIdx) {
3286+
if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3287+
!IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3288+
return false;
3289+
}
3290+
} else {
3291+
const auto *EE = cast<ExtractElementInst>(I);
3292+
3293+
const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3294+
if (!IdxOp)
3295+
return false;
3296+
3297+
if (!TypeConstraintsOnEESatisfied(EE->getVectorOperand()->getType(),
3298+
EE->getType()))
3299+
return false;
3300+
3301+
return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3302+
if (!IsUserFMulScalarTy(U) || IsFMulUserFAddFSub(U))
3303+
return false;
3304+
3305+
// Check if the other operand of extractelement is also extractelement
3306+
// from lane equivalent to 0.
3307+
const auto *BO = cast<BinaryOperator>(U);
3308+
const auto *OtherEE = dyn_cast<ExtractElementInst>(
3309+
BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3310+
if (OtherEE) {
3311+
const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3312+
if (!IdxOp)
3313+
return false;
3314+
return IsExtractLaneEquivalentToZero(
3315+
cast<ConstantInt>(OtherEE->getIndexOperand())
3316+
->getValue()
3317+
.getZExtValue(),
3318+
OtherEE->getType()->getScalarSizeInBits());
3319+
}
3320+
return true;
3321+
});
3322+
}
3323+
return true;
3324+
};
3325+
3326+
unsigned InstOpcode = I ? I->getOpcode() : Opcode.value();
3327+
if (InstOpcode == Instruction::ExtractElement && ExtractCanFuseWithFmul())
3328+
return 0;
3329+
31973330
// All other insert/extracts cost this much.
31983331
return ST->getVectorInsertExtractBaseCost();
31993332
}
@@ -3204,14 +3337,22 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
32043337
Value *Op1) {
32053338
bool HasRealUse =
32063339
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3207-
return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
3340+
return getVectorInstrCostHelper(Val, Index, HasRealUse);
3341+
}
3342+
3343+
InstructionCost AArch64TTIImpl::getVectorInstrCost(
3344+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3345+
Value *Scalar,
3346+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3347+
return getVectorInstrCostHelper(Val, Index, false, nullptr, Opcode, Scalar,
3348+
ScalarUserAndIdx);
32083349
}
32093350

32103351
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
32113352
Type *Val,
32123353
TTI::TargetCostKind CostKind,
32133354
unsigned Index) {
3214-
return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
3355+
return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */, &I);
32153356
}
32163357

32173358
InstructionCost AArch64TTIImpl::getScalarizationOverhead(

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#include "AArch64.h"
2020
#include "AArch64Subtarget.h"
2121
#include "AArch64TargetMachine.h"
22-
#include "llvm/ADT/ArrayRef.h"
2322
#include "llvm/Analysis/TargetTransformInfo.h"
2423
#include "llvm/CodeGen/BasicTTIImpl.h"
2524
#include "llvm/IR/Function.h"
@@ -66,8 +65,11 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
6665
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
6766
// indicates whether the vector instruction is available in the input IR or
6867
// just imaginary in vectorizer passes.
69-
InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val,
70-
unsigned Index, bool HasRealUse);
68+
InstructionCost getVectorInstrCostHelper(
69+
Type *Val, unsigned Index, bool HasRealUse,
70+
const Instruction *I = nullptr,
71+
std::optional<unsigned> Opcode = std::nullopt, Value *Scalar = nullptr,
72+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {});
7173

7274
public:
7375
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
@@ -185,6 +187,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
185187
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
186188
TTI::TargetCostKind CostKind,
187189
unsigned Index, Value *Op0, Value *Op1);
190+
191+
InstructionCost getVectorInstrCost(
192+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
193+
Value *Scalar,
194+
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx);
195+
188196
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
189197
TTI::TargetCostKind CostKind,
190198
unsigned Index);

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11633,6 +11633,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1163311633
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
1163411634
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
1163511635
SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
11636+
// Keep track of the {Scalar, User, Lane} tuple.
11637+
// On AArch64, this helps in fusing a mov instruction, associated with
11638+
// extractelement, with fmul in the backend so that extractelement is free.
11639+
SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
11640+
for (ExternalUser &EU : ExternalUses) {
11641+
ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
11642+
}
1163611643
for (ExternalUser &EU : ExternalUses) {
1163711644
// Uses by ephemeral values are free (because the ephemeral value will be
1163811645
// removed prior to code generation, and so the extraction will be
@@ -11739,8 +11746,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1173911746
ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
1174011747
VecTy, EU.Lane);
1174111748
} else {
11742-
ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
11743-
CostKind, EU.Lane);
11749+
ExtraCost =
11750+
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
11751+
EU.Lane, EU.Scalar, ScalarUserAndIdx);
1174411752
}
1174511753
// Leave the scalar instructions as is if they are cheaper than extracts.
1174611754
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||

0 commit comments

Comments
 (0)