From 0763c9fb52083259c826399d72b6f3be423a42df Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Thu, 27 Feb 2025 10:06:27 +0000
Subject: [PATCH 1/2] [LV] Teach the vectorizer to cost and vectorize
 llvm.sincos intrinsics

This teaches the loop vectorizer that `llvm.sincos` is trivially
vectorizable. Additionally, this patch updates the cost model to cost
intrinsics that return multiple values correctly. Previously, the cost
model only thought intrinsics that return `VectorType` need scalarizing,
which meant it cost intrinsics that return multiple vectors (that need
scalarizing) way too cheap (giving it the cost of a single function
call).

The `llvm.sincos` intrinsic also has a custom cost when a vector
function library is available, as certain VFs can be expanded (later in
code-gen) to a vector function, reducing the cost to a single call (+
the possible loads from the vector function returns values via output
pointers).
---
 .../llvm/Analysis/TargetTransformInfo.h       |   7 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 113 ++++++--
 llvm/lib/Analysis/CostModel.cpp               |  18 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  17 +-
 llvm/lib/Analysis/VectorUtils.cpp             |   2 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |   3 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   3 +-
 .../test/Analysis/CostModel/AArch64/sincos.ll |  60 +++++
 llvm/test/Analysis/CostModel/AMDGPU/frexp.ll  |  56 ++--
 .../LoopVectorize/AArch64/sincos.ll           | 251 ++++++++++++++++++
 llvm/test/Transforms/LoopVectorize/sincos.ll  | 157 +++++++++++
 11 files changed, 623 insertions(+), 64 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/sincos.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/sincos.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 45077f174115..1e80f90a47a8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -126,12 +126,13 @@ class IntrinsicCostAttributes {
   // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
   // arguments and the return value will be computed based on types.
   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+  TargetLibraryInfo const *LibInfo = nullptr;
 
 public:
   IntrinsicCostAttributes(
       Intrinsic::ID Id, const CallBase &CI,
       InstructionCost ScalarCost = InstructionCost::getInvalid(),
-      bool TypeBasedOnly = false);
+      bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr);
 
   IntrinsicCostAttributes(
       Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
@@ -145,7 +146,8 @@ class IntrinsicCostAttributes {
       Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
       ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
       const IntrinsicInst *I = nullptr,
-      InstructionCost ScalarCost = InstructionCost::getInvalid());
+      InstructionCost ScalarCost = InstructionCost::getInvalid(),
+      TargetLibraryInfo const *LibInfo = nullptr);
 
   Intrinsic::ID getID() const { return IID; }
   const IntrinsicInst *getInst() const { return II; }
@@ -154,6 +156,7 @@ class IntrinsicCostAttributes {
   InstructionCost getScalarizationCost() const { return ScalarizationCost; }
   const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
   const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
+  const TargetLibraryInfo *getLibInfo() const { return LibInfo; }
 
   bool isTypeBasedOnly() const {
     return Arguments.empty();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 596db3923921..2e161d382bb2 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfoImpl.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -285,6 +286,64 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return false;
   }
 
+  /// Several intrinsics that return structs (including llvm.sincos[pi] and
+  /// llvm.modf) can be lowered to a vector library call (for certain VFs). The
+  /// vector library functions correspond to the scalar calls (e.g. sincos or
+  /// modf), which unlike the intrinsic return values via output pointers. This
+  /// helper checks if a vector call exists for the given intrinsic, and returns
+  /// the cost, which includes the cost of the mask (if required), and the loads
+  /// for values returned via output pointers. \p LC is the scalar libcall and
+  /// \p CallRetElementIndex (optional) is the struct element which is mapped to
+  /// the call return value. If std::nullopt is returned, then no vector library
+  /// call is available, so the intrinsic should be assigned the default cost
+  /// (e.g. scalarization).
+  std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost(
+      const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
+      RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) {
+    Type *RetTy = ICA.getReturnType();
+    // Vector variants of the intrinsic can be mapped to a vector library call.
+    auto const *LibInfo = ICA.getLibInfo();
+    if (!LibInfo || !isa<StructType>(RetTy) ||
+        !isVectorizedStructTy(cast<StructType>(RetTy)))
+      return std::nullopt;
+
+    // Find associated libcall.
+    const char *LCName = getTLI()->getLibcallName(LC);
+    if (!LCName)
+      return std::nullopt;
+
+    // Search for a corresponding vector variant.
+    LLVMContext &Ctx = RetTy->getContext();
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    VecDesc const *VD = nullptr;
+    for (bool Masked : {false, true}) {
+      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+        break;
+    }
+    if (!VD)
+      return std::nullopt;
+
+    // Cost the call + mask.
+    auto Cost =
+        thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
+    if (VD->isMasked())
+      Cost += thisT()->getShuffleCost(
+          TargetTransformInfo::SK_Broadcast,
+          VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
+          nullptr, {});
+
+    // Lowering to a library call (with output pointers) may require us to emit
+    // reloads for the results.
+    for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) {
+      if (Idx == CallRetElementIndex)
+        continue;
+      Cost += thisT()->getMemoryOpCost(
+          Instruction::Load, VectorTy,
+          thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind);
+    }
+    return Cost;
+  }
+
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
@@ -1716,9 +1775,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
     Type *RetTy = ICA.getReturnType();
 
-    ElementCount RetVF =
-        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
-                             : ElementCount::getFixed(1));
+    ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
+                                               : ElementCount::getFixed(1);
+
     const IntrinsicInst *I = ICA.getInst();
     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
     FastMathFlags FMF = ICA.getFlags();
@@ -1971,6 +2030,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     }
     case Intrinsic::experimental_vector_match:
       return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+    case Intrinsic::sincos: {
+      Type *Ty = getContainedTypes(RetTy).front();
+      EVT VT = getTLI()->getValueType(DL, Ty);
+      RTLIB::Libcall LC = RTLIB::getFSINCOS(VT.getScalarType());
+      if (auto Cost =
+              getMultipleResultIntrinsicVectorLibCallCost(ICA, CostKind, LC))
+        return *Cost;
+      // Otherwise, fallback to default scalarization cost.
+      break;
+    }
     }
 
     // Assume that we need to scalarize this intrinsic.)
@@ -1979,10 +2048,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
     if (RetVF.isVector() && !RetVF.isScalable()) {
       ScalarizationCost = 0;
-      if (!RetTy->isVoidTy())
-        ScalarizationCost += getScalarizationOverhead(
-            cast<VectorType>(RetTy),
-            /*Insert*/ true, /*Extract*/ false, CostKind);
+      if (!RetTy->isVoidTy()) {
+        for (Type *VectorTy : getContainedTypes(RetTy)) {
+          ScalarizationCost += getScalarizationOverhead(
+              cast<VectorType>(VectorTy),
+              /*Insert=*/true, /*Extract=*/false, CostKind);
+        }
+      }
       ScalarizationCost +=
           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
     }
@@ -2637,27 +2709,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
-    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
+    if (isVectorizedTy(RetTy)) {
+      ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);
+
       // Scalable vectors cannot be scalarized, so return Invalid.
-      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
-            return isa<ScalableVectorType>(Ty);
-          }))
+      if (any_of(concat<Type *const>(RetVTys, Tys),
+                 [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
         return InstructionCost::getInvalid();
 
-      InstructionCost ScalarizationCost =
-          SkipScalarizationCost
-              ? ScalarizationCostPassed
-              : getScalarizationOverhead(RetVTy, /*Insert*/ true,
-                                         /*Extract*/ false, CostKind);
+      InstructionCost ScalarizationCost = ScalarizationCostPassed;
+      if (!SkipScalarizationCost) {
+        ScalarizationCost = 0;
+        for (Type *RetVTy : RetVTys) {
+          ScalarizationCost += getScalarizationOverhead(
+              cast<VectorType>(RetVTy), /*Insert=*/true,
+              /*Extract=*/false, CostKind);
+        }
+      }
 
-      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
+      unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
       SmallVector<Type *, 4> ScalarTys;
       for (Type *Ty : Tys) {
         if (Ty->isVectorTy())
           Ty = Ty->getScalarType();
         ScalarTys.push_back(Ty);
       }
-      IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
+      IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
       InstructionCost ScalarCost =
           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       for (Type *Ty : Tys) {
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index ee6622516a5a..68cb536bf789 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -17,6 +17,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/CostModel.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -24,6 +25,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
+
 using namespace llvm;
 
 static cl::opt<TargetTransformInfo::TargetCostKind> CostKind(
@@ -42,12 +44,18 @@ static cl::opt<bool> TypeBasedIntrinsicCost("type-based-intrinsic-cost",
     cl::desc("Calculate intrinsics cost based only on argument types"),
     cl::init(false));
 
+static cl::opt<bool> PreferIntrinsicCost(
+    "prefer-intrinsic-cost",
+    cl::desc("Prefer using getIntrinsicInstrCost over getInstructionCost"),
+    cl::init(false));
+
 #define CM_NAME "cost-model"
 #define DEBUG_TYPE CM_NAME
 
 PreservedAnalyses CostModelPrinterPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n";
   for (BasicBlock &B : F) {
     for (Instruction &Inst : B) {
@@ -55,12 +63,12 @@ PreservedAnalyses CostModelPrinterPass::run(Function &F,
       // which cost kind to print.
       InstructionCost Cost;
       auto *II = dyn_cast<IntrinsicInst>(&Inst);
-      if (II && TypeBasedIntrinsicCost) {
-        IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
-                                    InstructionCost::getInvalid(), true);
+      if (II && (PreferIntrinsicCost || TypeBasedIntrinsicCost)) {
+        IntrinsicCostAttributes ICA(
+            II->getIntrinsicID(), *II, InstructionCost::getInvalid(),
+            /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
         Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
-      }
-      else {
+      } else {
         Cost = TTI.getInstructionCost(&Inst, CostKind);
       }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ade398ea72f9..4dfe5406566e 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -69,9 +69,9 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
 
 IntrinsicCostAttributes::IntrinsicCostAttributes(
     Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost,
-    bool TypeBasedOnly)
+    bool TypeBasedOnly, const TargetLibraryInfo *LibInfo)
     : II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
-      ScalarizationCost(ScalarizationCost) {
+      ScalarizationCost(ScalarizationCost), LibInfo(LibInfo) {
 
   if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
     FMF = FPMO->getFastMathFlags();
@@ -101,13 +101,12 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
     ParamTys.push_back(Argument->getType());
 }
 
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                                                 ArrayRef<const Value *> Args,
-                                                 ArrayRef<Type *> Tys,
-                                                 FastMathFlags Flags,
-                                                 const IntrinsicInst *I,
-                                                 InstructionCost ScalarCost)
-    : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+IntrinsicCostAttributes::IntrinsicCostAttributes(
+    Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
+    ArrayRef<Type *> Tys, FastMathFlags Flags, const IntrinsicInst *I,
+    InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo)
+    : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost),
+      LibInfo(LibInfo) {
   ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
   Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
 }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index ad80e458ab57..44678289d7c8 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -72,6 +72,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::atan2:
   case Intrinsic::sin:
   case Intrinsic::cos:
+  case Intrinsic::sincos:
   case Intrinsic::tan:
   case Intrinsic::sinh:
   case Intrinsic::cosh:
@@ -179,6 +180,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
     return OpdIdx == -1 || OpdIdx == 0;
+  case Intrinsic::sincos:
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
     return OpdIdx == 0;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2aad1ccb904..2b861b2a8ede 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2885,7 +2885,8 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
 
   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
-                                    dyn_cast<IntrinsicInst>(CI));
+                                    dyn_cast<IntrinsicInst>(CI),
+                                    InstructionCost::getInvalid(), TLI);
   return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 99b8da7fb349..173edc60a211 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1151,7 +1151,8 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
   IntrinsicCostAttributes CostAttrs(
       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
-      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
+      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+      InstructionCost::getInvalid(), &Ctx.TLI);
   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
 }
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
new file mode 100644
index 000000000000..e32c51667e0a
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -prefer-intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB
+
+define void @sincos() {
+; CHECK-LABEL: 'sincos'
+; CHECK:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
+; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
+; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
+; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+;
+; CHECK:  Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK:  Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK:  Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+;
+; CHECK:  Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+; CHECK:  Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+; CHECK:  Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+; CHECK:  Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+; CHECK:  Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+;
+; CHECK-VECLIB-LABEL: 'sincos'
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+;
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+;
+; CHECK-VECLIB:  Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+; CHECK-VECLIB:  Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+; CHECK-VECLIB:  Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+;
+  %f16 = call { half, half } @llvm.sincos.f16(half poison)
+  %f32 = call { float, float } @llvm.sincos.f32(float poison)
+  %f64 = call { double, double } @llvm.sincos.f64(double poison)
+  %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+
+  %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+  %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+  %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+  %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+  %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+
+  %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.v8f16(<vscale x 8 x half> poison)
+  %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.v4f32(<vscale x 4 x float> poison)
+  %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.v2f64(<vscale x 2 x double> poison)
+  %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.v1f128(<vscale x 1 x fp128> poison)
+  %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.v8f32(<vscale x 8 x float> poison)
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
index 22134d042fab..f5f4445b34b0 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
@@ -68,46 +68,46 @@ define void @frexp_f16_i32() {
 define void @frexp_f16_i16() {
 ; GFX7-LABEL: 'frexp_f16_i16'
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8PLUS-LABEL: 'frexp_f16_i16'
 ; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
 ; GFX8PLUS-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'frexp_f16_i16'
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i16'
 ; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
 ; GFX8PLUS-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll
new file mode 100644
index 000000000000..a7e949838f76
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sincos.ll
@@ -0,0 +1,251 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|extractvalue|store)" --version 5
+; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.1 | FileCheck %s --check-prefix=CHECK
+; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve -vector-library=ArmPL < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL
+; RUN: FileCheck --input-file=%t.1 --check-prefix=CHECK-COST %s
+; RUN: FileCheck --input-file=%t.2 --check-prefix=CHECK-COST-ARMPL %s
+; REQUIRES: asserts
+
+; CHECK-COST-LABEL: sincos_f32
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+; CHECK-COST-ARMPL-LABEL: sincos_f32
+; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @sincos_f32(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[VECTOR_PH:.*:]]
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0
+; CHECK:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
+; CHECK:    store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4
+; CHECK:    store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[SCALAR_PH:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK:    [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK:    [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK:    store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK:    store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK:  [[EXIT:.*:]]
+;
+; CHECK-ARMPL-LABEL: define void @sincos_f32(
+; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-ARMPL:  [[ENTRY:.*:]]
+; CHECK-ARMPL:  [[VECTOR_PH:.*:]]
+; CHECK-ARMPL:  [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL:    [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
+; CHECK-ARMPL:    [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
+; CHECK-ARMPL:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
+; CHECK-ARMPL:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP13]], 0
+; CHECK-ARMPL:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 1
+; CHECK-ARMPL:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP13]], 1
+; CHECK-ARMPL:    store <vscale x 4 x float> [[TMP14]], ptr [[TMP19:%.*]], align 4
+; CHECK-ARMPL:    store <vscale x 4 x float> [[TMP15]], ptr [[TMP22:%.*]], align 4
+; CHECK-ARMPL:    store <vscale x 4 x float> [[TMP16]], ptr [[TMP24:%.*]], align 4
+; CHECK-ARMPL:    store <vscale x 4 x float> [[TMP17]], ptr [[TMP27:%.*]], align 4
+; CHECK-ARMPL:  [[MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL:  [[SCALAR_PH:.*:]]
+; CHECK-ARMPL:  [[FOR_BODY:.*:]]
+; CHECK-ARMPL:    [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK-ARMPL:    [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK-ARMPL:    [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK-ARMPL:    store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK-ARMPL:    store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK-ARMPL:  [[EXIT:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-COST-LABEL: sincos_f64
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+; CHECK-COST-ARMPL-LABEL: sincos_f64
+; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
+; CHECK-COST-ARMPL: Cost of 12 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @sincos_f64(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[VECTOR_PH:.*:]]
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0
+; CHECK:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1
+; CHECK:    store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8
+; CHECK:    store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[SCALAR_PH:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]])
+; CHECK:    [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0
+; CHECK:    [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1
+; CHECK:    store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8
+; CHECK:    store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8
+; CHECK:  [[EXIT:.*:]]
+;
+; CHECK-ARMPL-LABEL: define void @sincos_f64(
+; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
+; CHECK-ARMPL:  [[ENTRY:.*:]]
+; CHECK-ARMPL:  [[VECTOR_PH:.*:]]
+; CHECK-ARMPL:  [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL:    [[TMP12:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD:%.*]])
+; CHECK-ARMPL:    [[TMP13:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD1:%.*]])
+; CHECK-ARMPL:    [[TMP14:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP12]], 0
+; CHECK-ARMPL:    [[TMP15:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP13]], 0
+; CHECK-ARMPL:    [[TMP16:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP12]], 1
+; CHECK-ARMPL:    [[TMP17:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP13]], 1
+; CHECK-ARMPL:    store <vscale x 2 x double> [[TMP14]], ptr [[TMP19:%.*]], align 8
+; CHECK-ARMPL:    store <vscale x 2 x double> [[TMP15]], ptr [[TMP22:%.*]], align 8
+; CHECK-ARMPL:    store <vscale x 2 x double> [[TMP16]], ptr [[TMP24:%.*]], align 8
+; CHECK-ARMPL:    store <vscale x 2 x double> [[TMP17]], ptr [[TMP27:%.*]], align 8
+; CHECK-ARMPL:  [[MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL:  [[SCALAR_PH:.*:]]
+; CHECK-ARMPL:  [[FOR_BODY:.*:]]
+; CHECK-ARMPL:    [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]])
+; CHECK-ARMPL:    [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0
+; CHECK-ARMPL:    [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1
+; CHECK-ARMPL:    store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8
+; CHECK-ARMPL:    store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8
+; CHECK-ARMPL:  [[EXIT:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+  %in_val = load double, ptr %arrayidx, align 8
+  %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
+  %extract_a = extractvalue { double, double } %call, 0
+  %extract_b = extractvalue { double, double } %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+  store double %extract_a, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+  store double %extract_b, ptr %arrayidx4, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-COST-LABEL: predicated_sincos
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+; CHECK-COST-ARMPL-LABEL: predicated_sincos
+; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @predicated_sincos(
+; CHECK-SAME: float [[X:%.*]], ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:  [[IF_THEN:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK:    [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK:    [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK:    store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK:    store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK:  [[IF_MERGE:.*:]]
+; CHECK:  [[FOR_END:.*:]]
+;
+; CHECK-ARMPL-LABEL: define void @predicated_sincos(
+; CHECK-ARMPL-SAME: float [[X:%.*]], ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
+; CHECK-ARMPL:  [[ENTRY:.*:]]
+; CHECK-ARMPL:  [[VECTOR_PH:.*:]]
+; CHECK-ARMPL:  [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL:    [[TMP15:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+; CHECK-ARMPL:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP15]], 0
+; CHECK-ARMPL:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP15]], 1
+; CHECK-ARMPL:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr [[TMP19:%.*]], i32 4, <vscale x 4 x i1> [[TMP14:%.*]])
+; CHECK-ARMPL:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP17]], ptr [[TMP21:%.*]], i32 4, <vscale x 4 x i1> [[TMP14]])
+; CHECK-ARMPL:  [[MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL:  [[SCALAR_PH:.*:]]
+; CHECK-ARMPL:  [[FOR_BODY:.*:]]
+; CHECK-ARMPL:  [[IF_THEN:.*:]]
+; CHECK-ARMPL:    [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK-ARMPL:    [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK-ARMPL:    [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK-ARMPL:    store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK-ARMPL:    store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK-ARMPL:  [[IF_MERGE:.*:]]
+; CHECK-ARMPL:  [[FOR_END:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %if.merge ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %if_cond = fcmp olt float %in_val, %x
+  br i1 %if_cond, label %if.then, label %if.merge
+
+if.then:
+  %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  br label %if.merge
+
+if.merge:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cond = icmp slt i64 %iv.next, 1024
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/sincos.ll b/llvm/test/Transforms/LoopVectorize/sincos.ll
new file mode 100644
index 000000000000..c2936eb8bb8b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/sincos.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|extract|store)" --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s -S -o - | FileCheck %s
+
+define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @sincos_f32(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[VECTOR_PH:.*:]]
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0
+; CHECK:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
+; CHECK:    store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4
+; CHECK:    store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[SCALAR_PH:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK:    [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK:    [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK:    store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK:    store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK:  [[EXIT:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @sincos_f64(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[VECTOR_PH:.*:]]
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0
+; CHECK:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1
+; CHECK:    store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8
+; CHECK:    store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[SCALAR_PH:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]])
+; CHECK:    [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0
+; CHECK:    [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1
+; CHECK:    store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8
+; CHECK:    store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8
+; CHECK:  [[EXIT:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+  %in_val = load double, ptr %arrayidx, align 8
+  %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
+  %extract_a = extractvalue { double, double } %call, 0
+  %extract_b = extractvalue { double, double } %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+  store double %extract_a, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+  store double %extract_b, ptr %arrayidx4, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @predicated_sincos(
+; CHECK-SAME: float [[X:%.*]], ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[VECTOR_BODY1:.*]]:
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY1]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY1:.*]] ]
+; CHECK:    [[TMP4:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 0
+; CHECK:    [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 1
+; CHECK:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3:%.*]], i32 0
+; CHECK:    br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:  [[PRED_STORE_IF]]:
+; CHECK:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; CHECK:    store float [[TMP9]], ptr [[TMP8:%.*]], align 4
+; CHECK:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK:    store float [[TMP11]], ptr [[TMP10:%.*]], align 4
+; CHECK:    br label %[[PRED_STORE_CONTINUE]]
+; CHECK:  [[PRED_STORE_CONTINUE]]:
+; CHECK:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK:    br i1 [[TMP12]], label %[[PRED_STORE_IF1:.*]], label %[[FOR_BODY1]]
+; CHECK:  [[PRED_STORE_IF1]]:
+; CHECK:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
+; CHECK:    store float [[TMP15]], ptr [[TMP14:%.*]], align 4
+; CHECK:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; CHECK:    store float [[TMP17]], ptr [[TMP16:%.*]], align 4
+; CHECK:    br label %[[FOR_BODY1]]
+; CHECK:  [[FOR_BODY1]]:
+; CHECK:  [[IF_THEN1:.*:]]
+; CHECK:  [[IF_THEN2:.*:]]
+; CHECK:  [[IF_THEN:.*:]]
+; CHECK:  [[IF_THEN3:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK:    [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK:    [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK:    store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK:    store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK:  [[IF_MERGE:.*:]]
+; CHECK:  [[FOR_END:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %if.merge ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %if_cond = fcmp olt float %in_val, %x
+  br i1 %if_cond, label %if.then, label %if.merge
+
+if.then:
+  %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  br label %if.merge
+
+if.merge:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cond = icmp slt i64 %iv.next, 1024
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

From f1cf5988785e03fe44dc66089b44a0fc62f17e83 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Thu, 27 Feb 2025 11:10:04 +0000
Subject: [PATCH 2/2] Add a load of comments

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h |  9 +++++++++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h         | 10 ++++++++++
 llvm/lib/Analysis/CostModel.cpp                  |  5 +++++
 llvm/lib/Analysis/TargetTransformInfo.cpp        |  4 ++++
 llvm/lib/Analysis/VectorUtils.cpp                |  2 ++
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  |  2 ++
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp   |  2 ++
 7 files changed, 34 insertions(+)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1e80f90a47a8..d57da4f53e05 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -126,13 +126,17 @@ class IntrinsicCostAttributes {
   // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
   // arguments and the return value will be computed based on types.
   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+  /* Downstream change: #87 (sincos vectorization)*/
   TargetLibraryInfo const *LibInfo = nullptr;
+  /* End downstream change: #87 */
 
 public:
+  /* Downstream change: #87 (sincos vectorization)*/
   IntrinsicCostAttributes(
       Intrinsic::ID Id, const CallBase &CI,
       InstructionCost ScalarCost = InstructionCost::getInvalid(),
       bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr);
+  /* End downstream change: #87 */
 
   IntrinsicCostAttributes(
       Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
@@ -142,12 +146,14 @@ class IntrinsicCostAttributes {
   IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                           ArrayRef<const Value *> Args);
 
+  /* Downstream change: #87 (sincos vectorization)*/
   IntrinsicCostAttributes(
       Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
       ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
       const IntrinsicInst *I = nullptr,
       InstructionCost ScalarCost = InstructionCost::getInvalid(),
       TargetLibraryInfo const *LibInfo = nullptr);
+  /* End downstream change: #87 */
 
   Intrinsic::ID getID() const { return IID; }
   const IntrinsicInst *getInst() const { return II; }
@@ -156,7 +162,10 @@ class IntrinsicCostAttributes {
   InstructionCost getScalarizationCost() const { return ScalarizationCost; }
   const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
   const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
+
+  /* Downstream change: #87 (sincos vectorization)*/
   const TargetLibraryInfo *getLibInfo() const { return LibInfo; }
+  /* End downstream change: #87 */
 
   bool isTypeBasedOnly() const {
     return Arguments.empty();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 2e161d382bb2..7711a9fd5d92 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -286,6 +286,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return false;
   }
 
+  /* Downstream change: #87 (sincos vectorization)*/
   /// Several intrinsics that return structs (including llvm.sincos[pi] and
   /// llvm.modf) can be lowered to a vector library call (for certain VFs). The
   /// vector library functions correspond to the scalar calls (e.g. sincos or
@@ -343,6 +344,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     }
     return Cost;
   }
+  /* End downstream change: #87 */
 
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
@@ -1775,8 +1777,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
     Type *RetTy = ICA.getReturnType();
 
+    /* Downstream change: #87 (sincos vectorization)*/
     ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
                                                : ElementCount::getFixed(1);
+    /* End downstream change: #87 */
 
     const IntrinsicInst *I = ICA.getInst();
     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
@@ -2030,6 +2034,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     }
     case Intrinsic::experimental_vector_match:
       return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+      /* Downstream change: #87 (sincos vectorization)*/
     case Intrinsic::sincos: {
       Type *Ty = getContainedTypes(RetTy).front();
       EVT VT = getTLI()->getValueType(DL, Ty);
@@ -2040,6 +2045,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       // Otherwise, fallback to default scalarization cost.
       break;
     }
+      /* End downstream change: #87 */
     }
 
     // Assume that we need to scalarize this intrinsic.)
@@ -2048,6 +2054,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
     if (RetVF.isVector() && !RetVF.isScalable()) {
       ScalarizationCost = 0;
+      /* Downstream change: #87 (sincos vectorization)*/
       if (!RetTy->isVoidTy()) {
         for (Type *VectorTy : getContainedTypes(RetTy)) {
           ScalarizationCost += getScalarizationOverhead(
@@ -2055,6 +2062,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
               /*Insert=*/true, /*Extract=*/false, CostKind);
         }
       }
+      /* End downstream change: #87 */
       ScalarizationCost +=
           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
     }
@@ -2709,6 +2717,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
+    /* Downstream change: #87 (sincos vectorization)*/
     if (isVectorizedTy(RetTy)) {
       ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);
 
@@ -2735,6 +2744,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
         ScalarTys.push_back(Ty);
       }
       IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
+      /* End downstream change: #87 */
       InstructionCost ScalarCost =
           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       for (Type *Ty : Tys) {
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index 68cb536bf789..6e5b686cfaf1 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -44,10 +44,12 @@ static cl::opt<bool> TypeBasedIntrinsicCost("type-based-intrinsic-cost",
     cl::desc("Calculate intrinsics cost based only on argument types"),
     cl::init(false));
 
+/* Downstream change: #87 (sincos vectorization)*/
 static cl::opt<bool> PreferIntrinsicCost(
     "prefer-intrinsic-cost",
     cl::desc("Prefer using getIntrinsicInstrCost over getInstructionCost"),
     cl::init(false));
+/* End downstream change: #87 */
 
 #define CM_NAME "cost-model"
 #define DEBUG_TYPE CM_NAME
@@ -55,6 +57,7 @@ static cl::opt<bool> PreferIntrinsicCost(
 PreservedAnalyses CostModelPrinterPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  // Downstream change: #87 (sincos vectorization)
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n";
   for (BasicBlock &B : F) {
@@ -63,12 +66,14 @@ PreservedAnalyses CostModelPrinterPass::run(Function &F,
       // which cost kind to print.
       InstructionCost Cost;
       auto *II = dyn_cast<IntrinsicInst>(&Inst);
+      /* Downstream change: #87 (sincos vectorization)*/
       if (II && (PreferIntrinsicCost || TypeBasedIntrinsicCost)) {
         IntrinsicCostAttributes ICA(
             II->getIntrinsicID(), *II, InstructionCost::getInvalid(),
             /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
         Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
       } else {
+        /* End downstream change: #87 */
         Cost = TTI.getInstructionCost(&Inst, CostKind);
       }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4dfe5406566e..c062d43cb209 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -67,11 +67,13 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
   return true;
 }
 
+/* Downstream change: #87 (sincos vectorization)*/
 IntrinsicCostAttributes::IntrinsicCostAttributes(
     Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost,
     bool TypeBasedOnly, const TargetLibraryInfo *LibInfo)
     : II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
       ScalarizationCost(ScalarizationCost), LibInfo(LibInfo) {
+  /* End downstream change: #87 */
 
   if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
     FMF = FPMO->getFastMathFlags();
@@ -101,12 +103,14 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
     ParamTys.push_back(Argument->getType());
 }
 
+/* Downstream change: #87 (sincos vectorization)*/
 IntrinsicCostAttributes::IntrinsicCostAttributes(
     Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
     ArrayRef<Type *> Tys, FastMathFlags Flags, const IntrinsicInst *I,
     InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo)
     : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost),
       LibInfo(LibInfo) {
+  /* End downstream change: #87 */
   ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
   Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
 }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 44678289d7c8..416cc52e93a1 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -72,6 +72,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::atan2:
   case Intrinsic::sin:
   case Intrinsic::cos:
+  // Downstream change: #87 (sincos vectorization)
   case Intrinsic::sincos:
   case Intrinsic::tan:
   case Intrinsic::sinh:
@@ -180,6 +181,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
     return OpdIdx == -1 || OpdIdx == 0;
+    // Downstream change: #87 (sincos vectorization)
   case Intrinsic::sincos:
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2b861b2a8ede..7be14353f722 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2884,9 +2884,11 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                  std::back_inserter(ParamTys),
                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
 
+  /* Downstream change: #87 (sincos vectorization)*/
   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                     dyn_cast<IntrinsicInst>(CI),
                                     InstructionCost::getInvalid(), TLI);
+  /* End downstream change: #87 */
   return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 173edc60a211..07ea9d824342 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1149,10 +1149,12 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
 
   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+  /* Downstream change: #87 (sincos vectorization)*/
   IntrinsicCostAttributes CostAttrs(
       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
       InstructionCost::getInvalid(), &Ctx.TLI);
+  /* End downstream change: #87 */
   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
 }