Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/PassManager.h"
Expand Down Expand Up @@ -1404,6 +1405,20 @@ class TargetTransformInfo {
unsigned Index = -1, Value *Op0 = nullptr,
Value *Op1 = nullptr) const;

/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
/// case is to provision the cost of vectorization/scalarization in
/// vectorizer passes.
/// \param ScalarUserAndIdx encodes the information about extracts from a
/// vector with 'Scalar' being the value being extracted, 'User' being the
/// user of the extract (nullptr if the user is not known before
/// vectorization) and 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;

/// \return The expected cost of vector Insert and Extract.
/// This is used when instruction is available, and implementation
/// asserts 'I' is not nullptr.
Expand Down Expand Up @@ -2100,6 +2115,16 @@ class TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) = 0;

/// \param ScalarUserAndIdx encodes the information about extracts from a
/// vector with 'Scalar' being the value being extracted, 'User' being the
/// user of the extract (nullptr if the user is not known before
/// vectorization) and 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) = 0;

virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) = 0;
Expand Down Expand Up @@ -2785,6 +2810,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
Value *Op1) override {
return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}
InstructionCost getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) override {
  // Model boilerplate: forward the type-erased Concept call to the concrete
  // target implementation unchanged.
  return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
                                 ScalarUserAndIdx);
}
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) override {
Expand Down
11 changes: 11 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,17 @@ class TargetTransformInfoImplBase {
return 1;
}

/// Default implementation: targets that do not override this hook report a
/// unit cost for any vector insert/extract, ignoring the extra scalar-user
/// context entirely.
/// \param ScalarUserAndIdx encodes the information about extracts from a
/// vector with 'Scalar' being the value being extracted, 'User' being the
/// user of the extract (nullptr if the user is not known before
/// vectorization) and 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
  return 1;
}

InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) const {
Expand Down
13 changes: 12 additions & 1 deletion llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
Expand Down Expand Up @@ -1288,6 +1287,18 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return getRegUsageForType(Val->getScalarType());
}

/// \param ScalarUserAndIdx encodes the information about extracts from a
/// vector with 'Scalar' being the value being extracted, 'User' being the
/// user of the extract (nullptr if the user is not known before
/// vectorization) and 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  // Base fallback: drop the Scalar/ScalarUserAndIdx context and forward to
  // the two-operand overload with unknown operands. Targets that can exploit
  // the extra information (e.g. AArch64) override this in their TTI impl.
  return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
                                     nullptr);
}

InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) {
Expand Down
19 changes: 16 additions & 3 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1047,15 +1047,28 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
InstructionCost TargetTransformInfo::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Op0, Value *Op1) const {
  // The interface contract restricts Opcode to insertelement/extractelement;
  // enforce it here once rather than in every caller.
  assert((Opcode == Instruction::InsertElement ||
          Opcode == Instruction::ExtractElement) &&
         "Expecting Opcode to be insertelement/extractelement.");
  InstructionCost Cost =
      TTIImpl->getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  assert(Cost >= 0 && "TTI should not produce negative costs!");
  return Cost;
}

InstructionCost TargetTransformInfo::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
  // Only insert/extract element queries are meaningful for this hook, per
  // the interface contract.
  assert((Opcode == Instruction::InsertElement ||
          Opcode == Instruction::ExtractElement) &&
         "Expecting Opcode to be insertelement/extractelement.");
  // Delegate to the target implementation, then sanity-check the result.
  InstructionCost Result = TTIImpl->getVectorInstrCost(
      Opcode, Val, CostKind, Index, Scalar, ScalarUserAndIdx);
  assert(Result >= 0 && "TTI should not produce negative costs!");
  return Result;
}

InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
Expand Down
135 changes: 129 additions & 6 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
Expand Down Expand Up @@ -3177,10 +3178,10 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
return 0;
}

InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
Type *Val,
unsigned Index,
bool HasRealUse) {
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
assert(Val->isVectorTy() && "This must be a vector type");

if (Index != -1U) {
Expand Down Expand Up @@ -3226,6 +3227,119 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
// compile-time considerations.
}

// In case of Neon, if there exists extractelement from lane != 0 such that
// 1. extractelement does not necessitate a move from vector_reg -> GPR.
// 2. extractelement result feeds into fmul.
// 3. Other operand of fmul is an extractelement from lane 0 or lane
//    equivalent to 0.
// then the extractelement can be merged with fmul in the backend and it
// incurs no cost.
// e.g.
// define double @foo(<2 x double> %a) {
//   %1 = extractelement <2 x double> %a, i32 0
//   %2 = extractelement <2 x double> %a, i32 1
//   %res = fmul double %1, %2
//   ret double %res
// }
// %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
auto ExtractCanFuseWithFmul = [&]() {
  // Extracts from lane 0 are not candidates for this fusion.
  if (Index == 0)
    return false;

  // Only FP element types with a lane-indexed FMUL form qualify (half only
  // when the subtarget has full fp16 support).
  auto IsAllowedScalarTy = [&](const Type *T) {
    return T->isFloatTy() || T->isDoubleTy() ||
           (T->isHalfTy() && ST->hasFullFP16());
  };

  // Check if the extractelement user is a scalar (non-vector) fmul.
  auto IsUserFMulScalarTy = [](const Value *EEUser) {
    const auto *BO = dyn_cast_if_present<BinaryOperator>(EEUser);
    return BO && BO->getOpcode() == BinaryOperator::FMul &&
           !BO->getType()->isVectorTy();
  };

  // A lane is "equivalent to 0" when its bit offset is a multiple of the
  // fixed vector register width. (Idx/EltSz are taken by value; the original
  // const unsigned& added indirection for no benefit.)
  auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
    auto RegWidth =
        getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    return Idx == 0 || (Idx * EltSz) % RegWidth == 0;
  };

  // The vector operand must be fixed-width with an allowed element type.
  if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
    return false;

  if (Scalar) {
    // Pre-vectorization query: no extractelement exists in IR yet, so reason
    // from 'Scalar' and the (Scalar, User, Lane) tuples supplied by the
    // vectorizer.
    DenseMap<User *, unsigned> UserToExtractIdx;
    for (auto *U : Scalar->users()) {
      if (!IsUserFMulScalarTy(U))
        return false;
      // Recording an entry for the user is what matters here; the lane of
      // the partner extract is filled in below.
      UserToExtractIdx[U];
    }
    for (auto &[S, U, L] : ScalarUserAndIdx) {
      // 'SUser' avoids shadowing the structured binding 'U' above.
      for (auto *SUser : S->users()) {
        auto It = UserToExtractIdx.find(SUser);
        if (It == UserToExtractIdx.end())
          continue;
        auto *FMul = cast<BinaryOperator>(SUser);
        auto *Op0 = FMul->getOperand(0);
        auto *Op1 = FMul->getOperand(1);
        // NOTE(review): this predicate is a tautology -- at least one of the
        // three clauses holds for any Op0/Op1 -- so the lane is recorded
        // unconditionally. The intended operand-pairing restriction should
        // be confirmed and the condition tightened; kept as-is here to
        // preserve behavior.
        if ((Op0 == S && Op1 == S) || (Op0 != S) || (Op1 != S)) {
          It->second = L;
          break;
        }
      }
    }
    const unsigned EltSz = Val->getScalarSizeInBits();
    // Loop-invariant: whether this extract's own lane is lane-0-equivalent.
    const bool IndexIsZeroLike = IsExtractLaneEquivalentToZero(Index, EltSz);
    for (auto &[U, L] : UserToExtractIdx) {
      // Fusable only if this extract or its partner reads a lane equivalent
      // to lane 0.
      if (!IndexIsZeroLike && !IsExtractLaneEquivalentToZero(L, EltSz))
        return false;
    }
  } else {
    // Post-vectorization query: inspect the actual extractelement 'I'.
    const auto *EE = cast<ExtractElementInst>(I);

    const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
    if (!IdxOp)
      return false;

    return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
      if (!IsUserFMulScalarTy(U))
        return false;

      // Check if the other operand of the fmul is also an extractelement
      // from a lane equivalent to 0.
      const auto *BO = cast<BinaryOperator>(U);
      const auto *OtherEE = dyn_cast<ExtractElementInst>(
          BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
      if (OtherEE) {
        const auto *OtherIdx =
            dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
        if (!OtherIdx)
          return false;
        // Reuse the already-checked constant instead of re-casting the
        // index operand a second time.
        return IsExtractLaneEquivalentToZero(
            OtherIdx->getValue().getZExtValue(),
            OtherEE->getType()->getScalarSizeInBits());
      }
      return true;
    });
  }
  return true;
};

if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
ExtractCanFuseWithFmul())
return 0;

// All other insert/extracts cost this much.
return ST->getVectorInsertExtractBaseCost();
}
Expand All @@ -3236,14 +3350,23 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
Value *Op1) {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
}

InstructionCost AArch64TTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  // Pre-vectorization query: no real instruction exists yet, so HasRealUse
  // is false and there is no Instruction to pass; the scalar-user tuples
  // stand in for the missing IR. The helper does not take CostKind.
  return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
                                  ScalarUserAndIdx);
}

InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                   Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index) {
  // The instruction exists in the input IR, so HasRealUse is true and the
  // helper can inspect 'I' (and its users) directly.
  return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
                                  true /* HasRealUse */, &I);
}

InstructionCost AArch64TTIImpl::getScalarizationOverhead(
Expand Down
21 changes: 18 additions & 3 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include "AArch64.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
Expand Down Expand Up @@ -66,8 +65,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
// indicates whether the vector instruction is available in the input IR or
// just imaginary in vectorizer passes.
InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val,
unsigned Index, bool HasRealUse);
/// \param ScalarUserAndIdx encodes the information about extracts from a
/// vector with 'Scalar' being the value being extracted, 'User' being the
/// user of the extract (nullptr if the user is not known before
/// vectorization) and 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {});

public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
Expand Down Expand Up @@ -185,6 +190,16 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);

/// \param ScalarUserAndIdx encodes the information about extracts from a
/// vector with 'Scalar' being the value being extracted, 'User' being the
/// user of the extract (nullptr if the user is not known before
/// vectorization) and 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx);

InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index);
Expand Down
12 changes: 10 additions & 2 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12192,6 +12192,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
// Keep track of the {Scalar, User, Lane} tuples (matching the order stored
// by emplace_back below).
// On AArch64, this helps in fusing a mov instruction, associated with
// extractelement, with fmul in the backend so that extractelement is free.
SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
for (ExternalUser &EU : ExternalUses) {
ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
}
for (ExternalUser &EU : ExternalUses) {
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
Expand Down Expand Up @@ -12304,8 +12311,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, EU.Lane);
ExtraCost =
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
EU.Lane, EU.Scalar, ScalarUserAndIdx);
}
// Leave the scalar instructions as is if they are cheaper than extracts.
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
Expand Down
Loading
Loading