Skip to content

Commit 6b6acb9

Browse files
authored
(release 20.x non-upstream patch) [LV] Teach the vectorizer to cost and vectorize llvm.sincos intrinsics (#84)
This teaches the loop vectorizer that `llvm.sincos` is trivially vectorizable. Additionally, this patch updates the cost model to correctly cost intrinsics that return multiple values. Previously, the cost model assumed that only intrinsics returning a `VectorType` need scalarizing, which meant that intrinsics returning multiple vectors (which also need scalarizing) were costed far too cheaply (they were given the cost of a single function call). The `llvm.sincos` intrinsic also has a custom cost when a vector function library is available, as certain VFs can be expanded (later in code-gen) to a vector library call, reducing the cost to a single call (plus any loads needed because the vector function returns values via output pointers). --- Downstream issue: #87
1 parent e96b1db commit 6b6acb9

File tree

11 files changed

+657
-64
lines changed

11 files changed

+657
-64
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,12 +126,17 @@ class IntrinsicCostAttributes {
126126
// If ScalarizationCost is invalid (the default), the cost of scalarizing the
127127
// arguments and the return value will be computed based on types.
128128
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
129+
/* Downstream change: #87 (sincos vectorization)*/
130+
TargetLibraryInfo const *LibInfo = nullptr;
131+
/* End downstream change: #87 */
129132

130133
public:
134+
/* Downstream change: #87 (sincos vectorization)*/
131135
IntrinsicCostAttributes(
132136
Intrinsic::ID Id, const CallBase &CI,
133137
InstructionCost ScalarCost = InstructionCost::getInvalid(),
134-
bool TypeBasedOnly = false);
138+
bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr);
139+
/* End downstream change: #87 */
135140

136141
IntrinsicCostAttributes(
137142
Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
@@ -141,11 +146,14 @@ class IntrinsicCostAttributes {
141146
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
142147
ArrayRef<const Value *> Args);
143148

149+
/* Downstream change: #87 (sincos vectorization)*/
144150
IntrinsicCostAttributes(
145151
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
146152
ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
147153
const IntrinsicInst *I = nullptr,
148-
InstructionCost ScalarCost = InstructionCost::getInvalid());
154+
InstructionCost ScalarCost = InstructionCost::getInvalid(),
155+
TargetLibraryInfo const *LibInfo = nullptr);
156+
/* End downstream change: #87 */
149157

150158
Intrinsic::ID getID() const { return IID; }
151159
const IntrinsicInst *getInst() const { return II; }
@@ -155,6 +163,10 @@ class IntrinsicCostAttributes {
155163
const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
156164
const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
157165

166+
/* Downstream change: #87 (sincos vectorization)*/
167+
const TargetLibraryInfo *getLibInfo() const { return LibInfo; }
168+
/* End downstream change: #87 */
169+
158170
bool isTypeBasedOnly() const {
159171
return Arguments.empty();
160172
}

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 105 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/ADT/SmallVector.h"
2323
#include "llvm/Analysis/LoopInfo.h"
2424
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
25+
#include "llvm/Analysis/TargetLibraryInfo.h"
2526
#include "llvm/Analysis/TargetTransformInfo.h"
2627
#include "llvm/Analysis/TargetTransformInfoImpl.h"
2728
#include "llvm/Analysis/ValueTracking.h"
@@ -285,6 +286,66 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
285286
return false;
286287
}
287288

289+
/* Downstream change: #87 (sincos vectorization)*/
290+
/// Several intrinsics that return structs (including llvm.sincos[pi] and
291+
/// llvm.modf) can be lowered to a vector library call (for certain VFs). The
292+
/// vector library functions correspond to the scalar calls (e.g. sincos or
293+
/// modf), which unlike the intrinsic return values via output pointers. This
294+
/// helper checks if a vector call exists for the given intrinsic, and returns
295+
/// the cost, which includes the cost of the mask (if required), and the loads
296+
/// for values returned via output pointers. \p LC is the scalar libcall and
297+
/// \p CallRetElementIndex (optional) is the struct element which is mapped to
298+
/// the call return value. If std::nullopt is returned, then no vector library
299+
/// call is available, so the intrinsic should be assigned the default cost
300+
/// (e.g. scalarization).
301+
std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost(
302+
const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
303+
RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) {
304+
Type *RetTy = ICA.getReturnType();
305+
// Vector variants of the intrinsic can be mapped to a vector library call.
306+
auto const *LibInfo = ICA.getLibInfo();
307+
if (!LibInfo || !isa<StructType>(RetTy) ||
308+
!isVectorizedStructTy(cast<StructType>(RetTy)))
309+
return std::nullopt;
310+
311+
// Find associated libcall.
312+
const char *LCName = getTLI()->getLibcallName(LC);
313+
if (!LCName)
314+
return std::nullopt;
315+
316+
// Search for a corresponding vector variant.
317+
LLVMContext &Ctx = RetTy->getContext();
318+
ElementCount VF = getVectorizedTypeVF(RetTy);
319+
VecDesc const *VD = nullptr;
320+
for (bool Masked : {false, true}) {
321+
if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
322+
break;
323+
}
324+
if (!VD)
325+
return std::nullopt;
326+
327+
// Cost the call + mask.
328+
auto Cost =
329+
thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
330+
if (VD->isMasked())
331+
Cost += thisT()->getShuffleCost(
332+
TargetTransformInfo::SK_Broadcast,
333+
VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
334+
nullptr, {});
335+
336+
// Lowering to a library call (with output pointers) may require us to emit
337+
// reloads for the results.
338+
for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) {
339+
if (Idx == CallRetElementIndex)
340+
continue;
341+
Cost += thisT()->getMemoryOpCost(
342+
Instruction::Load, VectorTy,
343+
thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind);
344+
}
345+
return Cost;
346+
}
347+
/* End downstream change: #87 */
348+
288349
protected:
289350
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
290351
: BaseT(DL) {}
@@ -1716,9 +1777,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
17161777

17171778
Type *RetTy = ICA.getReturnType();
17181779

1719-
ElementCount RetVF =
1720-
(RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1721-
: ElementCount::getFixed(1));
1780+
/* Downstream change: #87 (sincos vectorization)*/
1781+
ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
1782+
: ElementCount::getFixed(1);
1783+
/* End downstream change: #87 */
1784+
17221785
const IntrinsicInst *I = ICA.getInst();
17231786
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
17241787
FastMathFlags FMF = ICA.getFlags();
@@ -1971,6 +2034,18 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
19712034
}
19722035
case Intrinsic::experimental_vector_match:
19732036
return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2037+
/* Downstream change: #87 (sincos vectorization)*/
2038+
case Intrinsic::sincos: {
2039+
Type *Ty = getContainedTypes(RetTy).front();
2040+
EVT VT = getTLI()->getValueType(DL, Ty);
2041+
RTLIB::Libcall LC = RTLIB::getFSINCOS(VT.getScalarType());
2042+
if (auto Cost =
2043+
getMultipleResultIntrinsicVectorLibCallCost(ICA, CostKind, LC))
2044+
return *Cost;
2045+
// Otherwise, fallback to default scalarization cost.
2046+
break;
2047+
}
2048+
/* End downstream change: #87 */
19742049
}
19752050

19762051
// Assume that we need to scalarize this intrinsic.
@@ -1979,10 +2054,15 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
19792054
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
19802055
if (RetVF.isVector() && !RetVF.isScalable()) {
19812056
ScalarizationCost = 0;
1982-
if (!RetTy->isVoidTy())
1983-
ScalarizationCost += getScalarizationOverhead(
1984-
cast<VectorType>(RetTy),
1985-
/*Insert*/ true, /*Extract*/ false, CostKind);
2057+
/* Downstream change: #87 (sincos vectorization)*/
2058+
if (!RetTy->isVoidTy()) {
2059+
for (Type *VectorTy : getContainedTypes(RetTy)) {
2060+
ScalarizationCost += getScalarizationOverhead(
2061+
cast<VectorType>(VectorTy),
2062+
/*Insert=*/true, /*Extract=*/false, CostKind);
2063+
}
2064+
}
2065+
/* End downstream change: #87 */
19862066
ScalarizationCost +=
19872067
getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
19882068
}
@@ -2637,27 +2717,34 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
26372717
// Else, assume that we need to scalarize this intrinsic. For math builtins
26382718
// this will emit a costly libcall, adding call overhead and spills. Make it
26392719
// very expensive.
2640-
if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2720+
/* Downstream change: #87 (sincos vectorization)*/
2721+
if (isVectorizedTy(RetTy)) {
2722+
ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);
2723+
26412724
// Scalable vectors cannot be scalarized, so return Invalid.
2642-
if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2643-
return isa<ScalableVectorType>(Ty);
2644-
}))
2725+
if (any_of(concat<Type *const>(RetVTys, Tys),
2726+
[](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
26452727
return InstructionCost::getInvalid();
26462728

2647-
InstructionCost ScalarizationCost =
2648-
SkipScalarizationCost
2649-
? ScalarizationCostPassed
2650-
: getScalarizationOverhead(RetVTy, /*Insert*/ true,
2651-
/*Extract*/ false, CostKind);
2729+
InstructionCost ScalarizationCost = ScalarizationCostPassed;
2730+
if (!SkipScalarizationCost) {
2731+
ScalarizationCost = 0;
2732+
for (Type *RetVTy : RetVTys) {
2733+
ScalarizationCost += getScalarizationOverhead(
2734+
cast<VectorType>(RetVTy), /*Insert=*/true,
2735+
/*Extract=*/false, CostKind);
2736+
}
2737+
}
26522738

2653-
unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2739+
unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
26542740
SmallVector<Type *, 4> ScalarTys;
26552741
for (Type *Ty : Tys) {
26562742
if (Ty->isVectorTy())
26572743
Ty = Ty->getScalarType();
26582744
ScalarTys.push_back(Ty);
26592745
}
2660-
IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2746+
IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
2747+
/* End downstream change: #87 */
26612748
InstructionCost ScalarCost =
26622749
thisT()->getIntrinsicInstrCost(Attrs, CostKind);
26632750
for (Type *Ty : Tys) {

llvm/lib/Analysis/CostModel.cpp

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717
//===----------------------------------------------------------------------===//
1818

1919
#include "llvm/Analysis/CostModel.h"
20+
#include "llvm/Analysis/TargetLibraryInfo.h"
2021
#include "llvm/Analysis/TargetTransformInfo.h"
2122
#include "llvm/IR/Function.h"
2223
#include "llvm/IR/IntrinsicInst.h"
2324
#include "llvm/IR/PassManager.h"
2425
#include "llvm/Pass.h"
2526
#include "llvm/Support/CommandLine.h"
2627
#include "llvm/Support/raw_ostream.h"
28+
2729
using namespace llvm;
2830

2931
static cl::opt<TargetTransformInfo::TargetCostKind> CostKind(
@@ -42,25 +44,36 @@ static cl::opt<bool> TypeBasedIntrinsicCost("type-based-intrinsic-cost",
4244
cl::desc("Calculate intrinsics cost based only on argument types"),
4345
cl::init(false));
4446

47+
/* Downstream change: #87 (sincos vectorization)*/
48+
static cl::opt<bool> PreferIntrinsicCost(
49+
"prefer-intrinsic-cost",
50+
cl::desc("Prefer using getIntrinsicInstrCost over getInstructionCost"),
51+
cl::init(false));
52+
/* End downstream change: #87 */
53+
4554
#define CM_NAME "cost-model"
4655
#define DEBUG_TYPE CM_NAME
4756

4857
PreservedAnalyses CostModelPrinterPass::run(Function &F,
4958
FunctionAnalysisManager &AM) {
5059
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
60+
// Downstream change: #87 (sincos vectorization)
61+
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
5162
OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n";
5263
for (BasicBlock &B : F) {
5364
for (Instruction &Inst : B) {
5465
// TODO: Use a pass parameter instead of cl::opt CostKind to determine
5566
// which cost kind to print.
5667
InstructionCost Cost;
5768
auto *II = dyn_cast<IntrinsicInst>(&Inst);
58-
if (II && TypeBasedIntrinsicCost) {
59-
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
60-
InstructionCost::getInvalid(), true);
69+
/* Downstream change: #87 (sincos vectorization)*/
70+
if (II && (PreferIntrinsicCost || TypeBasedIntrinsicCost)) {
71+
IntrinsicCostAttributes ICA(
72+
II->getIntrinsicID(), *II, InstructionCost::getInvalid(),
73+
/*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
6174
Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
62-
}
63-
else {
75+
} else {
76+
/* End downstream change: #87 */
6477
Cost = TTI.getInstructionCost(&Inst, CostKind);
6578
}
6679

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,13 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
6767
return true;
6868
}
6969

70+
/* Downstream change: #87 (sincos vectorization)*/
7071
IntrinsicCostAttributes::IntrinsicCostAttributes(
7172
Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost,
72-
bool TypeBasedOnly)
73+
bool TypeBasedOnly, const TargetLibraryInfo *LibInfo)
7374
: II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
74-
ScalarizationCost(ScalarizationCost) {
75+
ScalarizationCost(ScalarizationCost), LibInfo(LibInfo) {
76+
/* End downstream change: #87 */
7577

7678
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
7779
FMF = FPMO->getFastMathFlags();
@@ -101,13 +103,14 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
101103
ParamTys.push_back(Argument->getType());
102104
}
103105

104-
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
105-
ArrayRef<const Value *> Args,
106-
ArrayRef<Type *> Tys,
107-
FastMathFlags Flags,
108-
const IntrinsicInst *I,
109-
InstructionCost ScalarCost)
110-
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
106+
/* Downstream change: #87 (sincos vectorization)*/
107+
IntrinsicCostAttributes::IntrinsicCostAttributes(
108+
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
109+
ArrayRef<Type *> Tys, FastMathFlags Flags, const IntrinsicInst *I,
110+
InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo)
111+
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost),
112+
LibInfo(LibInfo) {
113+
/* End downstream change: #87 */
111114
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
112115
Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
113116
}

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
7272
case Intrinsic::atan2:
7373
case Intrinsic::sin:
7474
case Intrinsic::cos:
75+
// Downstream change: #87 (sincos vectorization)
76+
case Intrinsic::sincos:
7577
case Intrinsic::tan:
7678
case Intrinsic::sinh:
7779
case Intrinsic::cosh:
@@ -179,6 +181,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
179181
case Intrinsic::ucmp:
180182
case Intrinsic::scmp:
181183
return OpdIdx == -1 || OpdIdx == 0;
184+
// Downstream change: #87 (sincos vectorization)
185+
case Intrinsic::sincos:
182186
case Intrinsic::is_fpclass:
183187
case Intrinsic::vp_is_fpclass:
184188
return OpdIdx == 0;

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2884,8 +2884,11 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
28842884
std::back_inserter(ParamTys),
28852885
[&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
28862886

2887+
/* Downstream change: #87 (sincos vectorization)*/
28872888
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2888-
dyn_cast<IntrinsicInst>(CI));
2889+
dyn_cast<IntrinsicInst>(CI),
2890+
InstructionCost::getInvalid(), TLI);
2891+
/* End downstream change: #87 */
28892892
return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
28902893
}
28912894

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1149,9 +1149,12 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
11491149

11501150
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
11511151
FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1152+
/* Downstream change: #87 (sincos vectorization)*/
11521153
IntrinsicCostAttributes CostAttrs(
11531154
VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1154-
dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
1155+
dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
1156+
InstructionCost::getInvalid(), &Ctx.TLI);
1157+
/* End downstream change: #87 */
11551158
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
11561159
}
11571160

0 commit comments

Comments
 (0)