From 0763c9fb52083259c826399d72b6f3be423a42df Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 27 Feb 2025 10:06:27 +0000 Subject: [PATCH 1/2] [LV] Teach the vectorizer to cost and vectorize llvm.sincos intrinsics This teaches the loop vectorizer that `llvm.sincos` is trivially vectorizable. Additionally, this patch updates the cost model to correctly cost intrinsics that return multiple values. Previously, the cost model assumed that only intrinsics returning a `VectorType` need scalarizing, which meant it costed intrinsics that return multiple vectors (and also need scalarizing) far too cheaply (giving them the cost of a single function call). The `llvm.sincos` intrinsic also has a custom cost when a vector function library is available, as certain VFs can be expanded (later in code-gen) to a vector function, reducing the cost to a single call (plus any loads needed because the vector function returns its values via output pointers). --- .../llvm/Analysis/TargetTransformInfo.h | 7 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 113 ++++++-- llvm/lib/Analysis/CostModel.cpp | 18 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 17 +- llvm/lib/Analysis/VectorUtils.cpp | 2 + .../Transforms/Vectorize/LoopVectorize.cpp | 3 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 +- .../test/Analysis/CostModel/AArch64/sincos.ll | 60 +++++ llvm/test/Analysis/CostModel/AMDGPU/frexp.ll | 56 ++-- .../LoopVectorize/AArch64/sincos.ll | 251 ++++++++++++++++++ llvm/test/Transforms/LoopVectorize/sincos.ll | 157 +++++++++++ 11 files changed, 623 insertions(+), 64 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/AArch64/sincos.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll create mode 100644 llvm/test/Transforms/LoopVectorize/sincos.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 45077f174115..1e80f90a47a8 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -126,12 +126,13 @@ class IntrinsicCostAttributes { // If ScalarizationCost is UINT_MAX, the cost of scalarizing the // arguments and the return value will be computed based on types. InstructionCost ScalarizationCost = InstructionCost::getInvalid(); + TargetLibraryInfo const *LibInfo = nullptr; public: IntrinsicCostAttributes( Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarCost = InstructionCost::getInvalid(), - bool TypeBasedOnly = false); + bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr); IntrinsicCostAttributes( Intrinsic::ID Id, Type *RTy, ArrayRef Tys, @@ -145,7 +146,8 @@ class IntrinsicCostAttributes { Intrinsic::ID Id, Type *RTy, ArrayRef Args, ArrayRef Tys, FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr, - InstructionCost ScalarCost = InstructionCost::getInvalid()); + InstructionCost ScalarCost = InstructionCost::getInvalid(), + TargetLibraryInfo const *LibInfo = nullptr); Intrinsic::ID getID() const { return IID; } const IntrinsicInst *getInst() const { return II; } @@ -154,6 +156,7 @@ class IntrinsicCostAttributes { InstructionCost getScalarizationCost() const { return ScalarizationCost; } const SmallVectorImpl &getArgs() const { return Arguments; } const SmallVectorImpl &getArgTypes() const { return ParamTys; } + const TargetLibraryInfo *getLibInfo() const { return LibInfo; } bool isTypeBasedOnly() const { return Arguments.empty(); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 596db3923921..2e161d382bb2 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -22,6 +22,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfoImpl.h" 
#include "llvm/Analysis/ValueTracking.h" @@ -285,6 +286,64 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return false; } + /// Several intrinsics that return structs (including llvm.sincos[pi] and + /// llvm.modf) can be lowered to a vector library call (for certain VFs). The + /// vector library functions correspond to the scalar calls (e.g. sincos or + /// modf), which unlike the intrinsic return values via output pointers. This + /// helper checks if a vector call exists for the given intrinsic, and returns + /// the cost, which includes the cost of the mask (if required), and the loads + /// for values returned via output pointers. \p LC is the scalar libcall and + /// \p CallRetElementIndex (optional) is the struct element which is mapped to + /// the call return value. If std::nullopt is returned, then no vector library + /// call is available, so the intrinsic should be assigned the default cost + /// (e.g. scalarization). + std::optional getMultipleResultIntrinsicVectorLibCallCost( + const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, + RTLIB::Libcall LC, std::optional CallRetElementIndex = {}) { + Type *RetTy = ICA.getReturnType(); + // Vector variants of the intrinsic can be mapped to a vector library call. + auto const *LibInfo = ICA.getLibInfo(); + if (!LibInfo || !isa(RetTy) || + !isVectorizedStructTy(cast(RetTy))) + return std::nullopt; + + // Find associated libcall. + const char *LCName = getTLI()->getLibcallName(LC); + if (!LCName) + return std::nullopt; + + // Search for a corresponding vector variant. + LLVMContext &Ctx = RetTy->getContext(); + ElementCount VF = getVectorizedTypeVF(RetTy); + VecDesc const *VD = nullptr; + for (bool Masked : {false, true}) { + if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked))) + break; + } + if (!VD) + return std::nullopt; + + // Cost the call + mask. 
+ auto Cost = + thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind); + if (VD->isMasked()) + Cost += thisT()->getShuffleCost( + TargetTransformInfo::SK_Broadcast, + VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0, + nullptr, {}); + + // Lowering to a library call (with output pointers) may require us to emit + // reloads for the results. + for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) { + if (Idx == CallRetElementIndex) + continue; + Cost += thisT()->getMemoryOpCost( + Instruction::Load, VectorTy, + thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind); + } + return Cost; + } + protected: explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) : BaseT(DL) {} @@ -1716,9 +1775,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { Type *RetTy = ICA.getReturnType(); - ElementCount RetVF = - (RetTy->isVectorTy() ? cast(RetTy)->getElementCount() - : ElementCount::getFixed(1)); + ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy) + : ElementCount::getFixed(1); + const IntrinsicInst *I = ICA.getInst(); const SmallVectorImpl &Args = ICA.getArgs(); FastMathFlags FMF = ICA.getFlags(); @@ -1971,6 +2030,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } case Intrinsic::experimental_vector_match: return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind); + case Intrinsic::sincos: { + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + RTLIB::Libcall LC = RTLIB::getFSINCOS(VT.getScalarType()); + if (auto Cost = + getMultipleResultIntrinsicVectorLibCallCost(ICA, CostKind, LC)) + return *Cost; + // Otherwise, fallback to default scalarization cost. + break; + } } // Assume that we need to scalarize this intrinsic.) 
@@ -1979,10 +2048,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { InstructionCost ScalarizationCost = InstructionCost::getInvalid(); if (RetVF.isVector() && !RetVF.isScalable()) { ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += getScalarizationOverhead( - cast(RetTy), - /*Insert*/ true, /*Extract*/ false, CostKind); + if (!RetTy->isVoidTy()) { + for (Type *VectorTy : getContainedTypes(RetTy)) { + ScalarizationCost += getScalarizationOverhead( + cast(VectorTy), + /*Insert=*/true, /*Extract=*/false, CostKind); + } + } ScalarizationCost += getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind); } @@ -2637,27 +2709,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Else, assume that we need to scalarize this intrinsic. For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it // very expensive. - if (auto *RetVTy = dyn_cast(RetTy)) { + if (isVectorizedTy(RetTy)) { + ArrayRef RetVTys = getContainedTypes(RetTy); + // Scalable vectors cannot be scalarized, so return Invalid. - if (isa(RetTy) || any_of(Tys, [](const Type *Ty) { - return isa(Ty); - })) + if (any_of(concat(RetVTys, Tys), + [](Type *Ty) { return isa(Ty); })) return InstructionCost::getInvalid(); - InstructionCost ScalarizationCost = - SkipScalarizationCost - ? 
ScalarizationCostPassed - : getScalarizationOverhead(RetVTy, /*Insert*/ true, - /*Extract*/ false, CostKind); + InstructionCost ScalarizationCost = ScalarizationCostPassed; + if (!SkipScalarizationCost) { + ScalarizationCost = 0; + for (Type *RetVTy : RetVTys) { + ScalarizationCost += getScalarizationOverhead( + cast(RetVTy), /*Insert=*/true, + /*Extract=*/false, CostKind); + } + } - unsigned ScalarCalls = cast(RetVTy)->getNumElements(); + unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue(); SmallVector ScalarTys; for (Type *Ty : Tys) { if (Ty->isVectorTy()) Ty = Ty->getScalarType(); ScalarTys.push_back(Ty); } - IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); + IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF); InstructionCost ScalarCost = thisT()->getIntrinsicInstrCost(Attrs, CostKind); for (Type *Ty : Tys) { diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp index ee6622516a5a..68cb536bf789 100644 --- a/llvm/lib/Analysis/CostModel.cpp +++ b/llvm/lib/Analysis/CostModel.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/CostModel.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -24,6 +25,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; static cl::opt CostKind( @@ -42,12 +44,18 @@ static cl::opt TypeBasedIntrinsicCost("type-based-intrinsic-cost", cl::desc("Calculate intrinsics cost based only on argument types"), cl::init(false)); +static cl::opt PreferIntrinsicCost( + "prefer-intrinsic-cost", + cl::desc("Prefer using getIntrinsicInstrCost over getInstructionCost"), + cl::init(false)); + #define CM_NAME "cost-model" #define DEBUG_TYPE CM_NAME PreservedAnalyses CostModelPrinterPass::run(Function &F, 
FunctionAnalysisManager &AM) { auto &TTI = AM.getResult(F); + auto &TLI = AM.getResult(F); OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n"; for (BasicBlock &B : F) { for (Instruction &Inst : B) { @@ -55,12 +63,12 @@ PreservedAnalyses CostModelPrinterPass::run(Function &F, // which cost kind to print. InstructionCost Cost; auto *II = dyn_cast(&Inst); - if (II && TypeBasedIntrinsicCost) { - IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II, - InstructionCost::getInvalid(), true); + if (II && (PreferIntrinsicCost || TypeBasedIntrinsicCost)) { + IntrinsicCostAttributes ICA( + II->getIntrinsicID(), *II, InstructionCost::getInvalid(), + /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI); Cost = TTI.getIntrinsicInstrCost(ICA, CostKind); - } - else { + } else { Cost = TTI.getInstructionCost(&Inst, CostKind); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index ade398ea72f9..4dfe5406566e 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -69,9 +69,9 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) { IntrinsicCostAttributes::IntrinsicCostAttributes( Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost, - bool TypeBasedOnly) + bool TypeBasedOnly, const TargetLibraryInfo *LibInfo) : II(dyn_cast(&CI)), RetTy(CI.getType()), IID(Id), - ScalarizationCost(ScalarizationCost) { + ScalarizationCost(ScalarizationCost), LibInfo(LibInfo) { if (const auto *FPMO = dyn_cast(&CI)) FMF = FPMO->getFastMathFlags(); @@ -101,13 +101,12 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty, ParamTys.push_back(Argument->getType()); } -IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy, - ArrayRef Args, - ArrayRef Tys, - FastMathFlags Flags, - const IntrinsicInst *I, - InstructionCost ScalarCost) - : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) { 
+IntrinsicCostAttributes::IntrinsicCostAttributes( + Intrinsic::ID Id, Type *RTy, ArrayRef Args, + ArrayRef Tys, FastMathFlags Flags, const IntrinsicInst *I, + InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo) + : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost), + LibInfo(LibInfo) { ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end()); Arguments.insert(Arguments.begin(), Args.begin(), Args.end()); } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index ad80e458ab57..44678289d7c8 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -72,6 +72,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::atan2: case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::sincos: case Intrinsic::tan: case Intrinsic::sinh: case Intrinsic::cosh: @@ -179,6 +180,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::ucmp: case Intrinsic::scmp: return OpdIdx == -1 || OpdIdx == 0; + case Intrinsic::sincos: case Intrinsic::is_fpclass: case Intrinsic::vp_is_fpclass: return OpdIdx == 0; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e2aad1ccb904..2b861b2a8ede 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2885,7 +2885,8 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, [&](Type *Ty) { return maybeVectorizeType(Ty, VF); }); IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, - dyn_cast(CI)); + dyn_cast(CI), + InstructionCost::getInvalid(), TLI); return TTI.getIntrinsicInstrCost(CostAttrs, CostKind); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 99b8da7fb349..173edc60a211 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1151,7 
+1151,8 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); IntrinsicCostAttributes CostAttrs( VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, - dyn_cast_or_null(getUnderlyingValue())); + dyn_cast_or_null(getUnderlyingValue()), + InstructionCost::getInvalid(), &Ctx.TLI); return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); } diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll new file mode 100644 index 000000000000..e32c51667e0a --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" +; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s +; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print" -prefer-intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB + +define void @sincos() { +; CHECK-LABEL: 'sincos' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; +; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x 
float> poison) +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; +; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { , } @llvm.sincos.nxv8f16( poison) +; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { , } @llvm.sincos.nxv4f32( poison) +; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { , } @llvm.sincos.nxv2f64( poison) +; CHECK: Cost Model: Invalid cost for instruction: %nxv1f128 = call { , } @llvm.sincos.nxv1f128( poison) +; CHECK: Cost Model: Invalid cost for instruction: %nxv8f32 = call { , } @llvm.sincos.nxv8f32( poison) +; +; CHECK-VECLIB-LABEL: 'sincos' +; CHECK-VECLIB: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; +; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x 
double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; +; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { , } @llvm.sincos.nxv8f16( poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { , } @llvm.sincos.nxv4f32( poison) +; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { , } @llvm.sincos.nxv2f64( poison) +; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { , } @llvm.sincos.nxv1f128( poison) +; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { , } @llvm.sincos.nxv8f32( poison) +; + %f16 = call { half, half } @llvm.sincos.f16(half poison) + %f32 = call { float, float } @llvm.sincos.f32(float poison) + %f64 = call { double, double } @llvm.sincos.f64(double poison) + %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) + + %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) + %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) + %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) + %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) + %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) + + %nxv8f16 = call { , } @llvm.sincos.v8f16( poison) + %nxv4f32 = call { , } @llvm.sincos.v4f32( poison) + %nxv2f64 = call { , } @llvm.sincos.v2f64( poison) + %nxv1f128 = call { , } @llvm.sincos.v1f128( poison) + %nxv8f32 = call { , } @llvm.sincos.v8f32( poison) + + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll 
b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll index 22134d042fab..f5f4445b34b0 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll @@ -68,46 +68,46 @@ define void @frexp_f16_i32() { define void @frexp_f16_i16() { ; GFX7-LABEL: 'frexp_f16_i16' ; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) -; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) -; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) -; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) -; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) -; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) -; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) -; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; 
GFX7-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) ; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX8PLUS-LABEL: 'frexp_f16_i16' ; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) -; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) -; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) -; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) -; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) -; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) -; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) -; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) 
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) ; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX7-SIZE-LABEL: 'frexp_f16_i16' ; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) -; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) -; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) -; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) -; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } 
@llvm.frexp.v5f16.v5i16(<5 x half> undef) -; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) -; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) -; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) ; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i16' ; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) -; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) -; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) -; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) -; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) -; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) -; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) -; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v16f16 = 
call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) ; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll new file mode 100644 index 000000000000..a7e949838f76 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll @@ -0,0 +1,251 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|extractvalue|store)" --version 5 +; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.1 | FileCheck %s --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve -vector-library=ArmPL < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL +; RUN: FileCheck --input-file=%t.1 --check-prefix=CHECK-COST %s +; RUN: FileCheck --input-file=%t.2 --check-prefix=CHECK-COST-ARMPL %s +; REQUIRES: asserts + +; CHECK-COST-LABEL: sincos_f32 +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val) +; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call 
llvm.sincos(ir<%in_val>) + +; CHECK-COST-ARMPL-LABEL: sincos_f32 +; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val) +; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) + +define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @sincos_f32( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0 +; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 +; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 +; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) +; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 +; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 +; CHECK: [[EXIT:.*:]] +; +; 
CHECK-ARMPL-LABEL: define void @sincos_f32( +; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-ARMPL: [[ENTRY:.*:]] +; CHECK-ARMPL: [[VECTOR_PH:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]]) +; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]]) +; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0 +; CHECK-ARMPL: [[TMP15:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP13]], 0 +; CHECK-ARMPL: [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 1 +; CHECK-ARMPL: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP13]], 1 +; CHECK-ARMPL: store <vscale x 4 x float> [[TMP14]], ptr [[TMP19:%.*]], align 4 +; CHECK-ARMPL: store <vscale x 4 x float> [[TMP15]], ptr [[TMP22:%.*]], align 4 +; CHECK-ARMPL: store <vscale x 4 x float> [[TMP16]], ptr [[TMP24:%.*]], align 4 +; CHECK-ARMPL: store <vscale x 4 x float> [[TMP17]], ptr [[TMP27:%.*]], align 4 +; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) +; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK-ARMPL: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 +; CHECK-ARMPL: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 +; CHECK-ARMPL: [[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @llvm.sincos.f32(float %in_val) + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 =
getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK-COST-LABEL: sincos_f64 +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincos.f64(double %in_val) +; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) + +; CHECK-COST-ARMPL-LABEL: sincos_f64 +; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincos.f64(double %in_val) +; CHECK-COST-ARMPL: Cost of 12 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) + +define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @sincos_f64( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0 +; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 +; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 +; CHECK: store <2 x double> [[TMP5]], ptr 
[[TMP9:%.*]], align 8 +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]]) +; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 +; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 +; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 +; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 +; CHECK: [[EXIT:.*:]] +; +; CHECK-ARMPL-LABEL: define void @sincos_f64( +; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK-ARMPL: [[ENTRY:.*:]] +; CHECK-ARMPL: [[VECTOR_PH:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD:%.*]]) +; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD1:%.*]]) +; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP12]], 0 +; CHECK-ARMPL: [[TMP15:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP13]], 0 +; CHECK-ARMPL: [[TMP16:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP12]], 1 +; CHECK-ARMPL: [[TMP17:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP13]], 1 +; CHECK-ARMPL: store <vscale x 2 x double> [[TMP14]], ptr [[TMP19:%.*]], align 8 +; CHECK-ARMPL: store <vscale x 2 x double> [[TMP15]], ptr [[TMP22:%.*]], align 8 +; CHECK-ARMPL: store <vscale x 2 x double> [[TMP16]], ptr [[TMP24:%.*]], align 8 +; CHECK-ARMPL: store <vscale x 2 x double> [[TMP17]], ptr [[TMP27:%.*]], align 8 +; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]]) +; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 +; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 +; CHECK-ARMPL: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 +; CHECK-ARMPL: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 +; CHECK-ARMPL:
[[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv + %in_val = load double, ptr %arrayidx, align 8 + %call = tail call { double, double } @llvm.sincos.f64(double %in_val) + %extract_a = extractvalue { double, double } %call, 0 + %extract_b = extractvalue { double, double } %call, 1 + %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv + store double %extract_a, ptr %arrayidx2, align 8 + %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv + store double %extract_b, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK-COST-LABEL: predicated_sincos +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val) +; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) + +; CHECK-COST-ARMPL-LABEL: predicated_sincos +; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val) +; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call 
llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) +; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>) + +define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @predicated_sincos( +; CHECK-SAME: float [[X:%.*]], ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[IF_THEN:.*:]] +; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) +; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 +; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 +; CHECK: [[IF_MERGE:.*:]] +; CHECK: [[FOR_END:.*:]] +; +; CHECK-ARMPL-LABEL: define void @predicated_sincos( +; CHECK-ARMPL-SAME: float [[X:%.*]], ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK-ARMPL: [[ENTRY:.*:]] +; CHECK-ARMPL: [[VECTOR_PH:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP15:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; CHECK-ARMPL: [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP15]], 0 +; CHECK-ARMPL: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP15]], 1 +; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr [[TMP19:%.*]], i32 4, <vscale x 4 x i1> [[TMP14:%.*]]) +; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP17]], ptr [[TMP21:%.*]], i32 4, <vscale x 4 x i1> [[TMP14]]) +; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[IF_THEN:.*:]] +; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float }
@llvm.sincos.f32(float [[IN_VAL:%.*]]) +; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK-ARMPL: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 +; CHECK-ARMPL: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 +; CHECK-ARMPL: [[IF_MERGE:.*:]] +; CHECK-ARMPL: [[FOR_END:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %if.merge ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %if_cond = fcmp olt float %in_val, %x + br i1 %if_cond, label %if.then, label %if.merge + +if.then: + %call = tail call { float, float } @llvm.sincos.f32(float %in_val) + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + br label %if.merge + +if.merge: + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp slt i64 %iv.next, 1024 + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/sincos.ll b/llvm/test/Transforms/LoopVectorize/sincos.ll new file mode 100644 index 000000000000..c2936eb8bb8b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/sincos.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|extract|store)" --version 5 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s -S -o - | FileCheck %s + +define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @sincos_f32( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias 
writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0 +; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 +; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 +; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) +; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 +; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 +; CHECK: [[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @llvm.sincos.f32(float %in_val) + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @sincos_f64( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly 
[[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0 +; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 +; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 +; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8 +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]]) +; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 +; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 +; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 +; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 +; CHECK: [[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv + %in_val = load double, ptr %arrayidx, align 8 + %call = tail call { double, double } @llvm.sincos.f64(double %in_val) + %extract_a = extractvalue { double, double } %call, 0 + %extract_b = extractvalue { double, double } %call, 1 + %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv + store double %extract_a, ptr %arrayidx2, align 8 + %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv + store double %extract_b, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @predicated_sincos( +; CHECK-SAME: 
float [[X:%.*]], ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_BODY1:.*]]: +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY1]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY1:.*]] ] +; CHECK: [[TMP4:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 0 +; CHECK: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 1 +; CHECK: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3:%.*]], i32 0 +; CHECK: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK: [[TMP9:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK: store float [[TMP9]], ptr [[TMP8:%.*]], align 4 +; CHECK: [[TMP11:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; CHECK: store float [[TMP11]], ptr [[TMP10:%.*]], align 4 +; CHECK: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; CHECK: br i1 [[TMP12]], label %[[PRED_STORE_IF1:.*]], label %[[FOR_BODY1]] +; CHECK: [[PRED_STORE_IF1]]: +; CHECK: [[TMP15:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; CHECK: store float [[TMP15]], ptr [[TMP14:%.*]], align 4 +; CHECK: [[TMP17:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; CHECK: store float [[TMP17]], ptr [[TMP16:%.*]], align 4 +; CHECK: br label %[[FOR_BODY1]] +; CHECK: [[FOR_BODY1]]: +; CHECK: [[IF_THEN1:.*:]] +; CHECK: [[IF_THEN2:.*:]] +; CHECK: [[IF_THEN:.*:]] +; CHECK: [[IF_THEN3:.*:]] +; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) +; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 +; 
CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 +; CHECK: [[IF_MERGE:.*:]] +; CHECK: [[FOR_END:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %if.merge ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %if_cond = fcmp olt float %in_val, %x + br i1 %if_cond, label %if.then, label %if.merge + +if.then: + %call = tail call { float, float } @llvm.sincos.f32(float %in_val) + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + br label %if.merge + +if.merge: + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp slt i64 %iv.next, 1024 + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} From f1cf5988785e03fe44dc66089b44a0fc62f17e83 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 27 Feb 2025 11:10:04 +0000 Subject: [PATCH 2/2] Add a load of comments --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 9 +++++++++ llvm/include/llvm/CodeGen/BasicTTIImpl.h | 10 ++++++++++ llvm/lib/Analysis/CostModel.cpp | 5 +++++ llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++++ llvm/lib/Analysis/VectorUtils.cpp | 2 ++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 ++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 ++ 7 files changed, 34 insertions(+) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 1e80f90a47a8..d57da4f53e05 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -126,13 +126,17 @@ class IntrinsicCostAttributes { // If ScalarizationCost is UINT_MAX, the cost of scalarizing the // arguments and the 
return value will be computed based on types. InstructionCost ScalarizationCost = InstructionCost::getInvalid(); + /* Downstream change: #87 (sincos vectorization)*/ TargetLibraryInfo const *LibInfo = nullptr; + /* End downstream change: #87 */ public: + /* Downstream change: #87 (sincos vectorization)*/ IntrinsicCostAttributes( Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarCost = InstructionCost::getInvalid(), bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr); + /* End downstream change: #87 */ IntrinsicCostAttributes( Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys, @@ -142,12 +146,14 @@ class IntrinsicCostAttributes { IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args); + /* Downstream change: #87 (sincos vectorization)*/ IntrinsicCostAttributes( Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args, ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr, InstructionCost ScalarCost = InstructionCost::getInvalid(), TargetLibraryInfo const *LibInfo = nullptr); + /* End downstream change: #87 */ Intrinsic::ID getID() const { return IID; } const IntrinsicInst *getInst() const { return II; } @@ -156,7 +162,10 @@ class IntrinsicCostAttributes { InstructionCost getScalarizationCost() const { return ScalarizationCost; } const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; } const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; } + + /* Downstream change: #87 (sincos vectorization)*/ const TargetLibraryInfo *getLibInfo() const { return LibInfo; } + /* End downstream change: #87 */ bool isTypeBasedOnly() const { return Arguments.empty(); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 2e161d382bb2..7711a9fd5d92 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -286,6 +286,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return false; } + /* Downstream change: #87
(sincos vectorization)*/ /// Several intrinsics that return structs (including llvm.sincos[pi] and /// llvm.modf) can be lowered to a vector library call (for certain VFs). The /// vector library functions correspond to the scalar calls (e.g. sincos or @@ -343,6 +344,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { } return Cost; } + /* End downstream change: #87 */ protected: explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) @@ -1775,8 +1777,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { Type *RetTy = ICA.getReturnType(); + /* Downstream change: #87 (sincos vectorization)*/ ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy) : ElementCount::getFixed(1); + /* End downstream change: #87 */ const IntrinsicInst *I = ICA.getInst(); const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); @@ -2030,6 +2034,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { } case Intrinsic::experimental_vector_match: return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind); + /* Downstream change: #87 (sincos vectorization)*/ case Intrinsic::sincos: { Type *Ty = getContainedTypes(RetTy).front(); EVT VT = getTLI()->getValueType(DL, Ty); @@ -2040,6 +2045,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { // Otherwise, fallback to default scalarization cost. break; } + /* End downstream change: #87 */ } // Assume that we need to scalarize this intrinsic.)
@@ -2048,6 +2054,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { InstructionCost ScalarizationCost = InstructionCost::getInvalid(); if (RetVF.isVector() && !RetVF.isScalable()) { ScalarizationCost = 0; + /* Downstream change: #87 (sincos vectorization)*/ if (!RetTy->isVoidTy()) { for (Type *VectorTy : getContainedTypes(RetTy)) { ScalarizationCost += getScalarizationOverhead( @@ -2055,6 +2062,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { /*Insert=*/true, /*Extract=*/false, CostKind); } } + /* End downstream change: #87 */ ScalarizationCost += getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind); } @@ -2709,6 +2717,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Else, assume that we need to scalarize this intrinsic. For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it // very expensive. + /* Downstream change: #87 (sincos vectorization)*/ if (isVectorizedTy(RetTy)) { ArrayRef RetVTys = getContainedTypes(RetTy); @@ -2735,6 +2744,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { ScalarTys.push_back(Ty); } IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF); + /* End downstream change: #87 */ InstructionCost ScalarCost = thisT()->getIntrinsicInstrCost(Attrs, CostKind); for (Type *Ty : Tys) { diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp index 68cb536bf789..6e5b686cfaf1 100644 --- a/llvm/lib/Analysis/CostModel.cpp +++ b/llvm/lib/Analysis/CostModel.cpp @@ -44,10 +44,12 @@ static cl::opt TypeBasedIntrinsicCost("type-based-intrinsic-cost", cl::desc("Calculate intrinsics cost based only on argument types"), cl::init(false)); +/* Downstream change: #87 (sincos vectorization)*/ static cl::opt PreferIntrinsicCost( "prefer-intrinsic-cost", cl::desc("Prefer using getIntrinsicInstrCost over getInstructionCost"), cl::init(false)); +/* End downstream change: #87 */ #define 
CM_NAME "cost-model" #define DEBUG_TYPE CM_NAME @@ -55,6 +57,7 @@ static cl::opt PreferIntrinsicCost( PreservedAnalyses CostModelPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TTI = AM.getResult(F); + // Downstream change: #87 (sincos vectorization) auto &TLI = AM.getResult(F); OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n"; for (BasicBlock &B : F) { @@ -63,12 +66,14 @@ PreservedAnalyses CostModelPrinterPass::run(Function &F, // which cost kind to print. InstructionCost Cost; auto *II = dyn_cast(&Inst); + /* Downstream change: #87 (sincos vectorization)*/ if (II && (PreferIntrinsicCost || TypeBasedIntrinsicCost)) { IntrinsicCostAttributes ICA( II->getIntrinsicID(), *II, InstructionCost::getInvalid(), /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI); Cost = TTI.getIntrinsicInstrCost(ICA, CostKind); } else { + /* End downstream change: #87 */ Cost = TTI.getInstructionCost(&Inst, CostKind); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 4dfe5406566e..c062d43cb209 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -67,11 +67,13 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) { return true; } +/* Downstream change: #87 (sincos vectorization)*/ IntrinsicCostAttributes::IntrinsicCostAttributes( Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost, bool TypeBasedOnly, const TargetLibraryInfo *LibInfo) : II(dyn_cast(&CI)), RetTy(CI.getType()), IID(Id), ScalarizationCost(ScalarizationCost), LibInfo(LibInfo) { + /* End downstream change: #87 */ if (const auto *FPMO = dyn_cast(&CI)) FMF = FPMO->getFastMathFlags(); @@ -101,12 +103,14 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty, ParamTys.push_back(Argument->getType()); } +/* Downstream change: #87 (sincos vectorization)*/ IntrinsicCostAttributes::IntrinsicCostAttributes( Intrinsic::ID Id, Type *RTy, 
ArrayRef Args, ArrayRef Tys, FastMathFlags Flags, const IntrinsicInst *I, InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo) : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost), LibInfo(LibInfo) { + /* End downstream change: #87 */ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end()); Arguments.insert(Arguments.begin(), Args.begin(), Args.end()); } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 44678289d7c8..416cc52e93a1 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -72,6 +72,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::atan2: case Intrinsic::sin: case Intrinsic::cos: + // Downstream change: #87 (sincos vectorization) case Intrinsic::sincos: case Intrinsic::tan: case Intrinsic::sinh: @@ -180,6 +181,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::ucmp: case Intrinsic::scmp: return OpdIdx == -1 || OpdIdx == 0; + // Downstream change: #87 (sincos vectorization) case Intrinsic::sincos: case Intrinsic::is_fpclass: case Intrinsic::vp_is_fpclass: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2b861b2a8ede..7be14353f722 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2884,9 +2884,11 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, std::back_inserter(ParamTys), [&](Type *Ty) { return maybeVectorizeType(Ty, VF); }); + /* Downstream change: #87 (sincos vectorization)*/ IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, dyn_cast(CI), InstructionCost::getInvalid(), TLI); + /* End downstream change: #87 */ return TTI.getIntrinsicInstrCost(CostAttrs, CostKind); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 173edc60a211..07ea9d824342 100644 --- 
a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1149,10 +1149,12 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); + /* Downstream change: #87 (sincos vectorization)*/ IntrinsicCostAttributes CostAttrs( VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, dyn_cast_or_null(getUnderlyingValue()), InstructionCost::getInvalid(), &Ctx.TLI); + /* End downstream change: #87 */ return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); }