From cc6fcd30933e16d7ebae0b7c16664e7e0b5f320a Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 20 Sep 2024 13:52:48 +0000 Subject: [PATCH 1/4] [LV] Add initial support for vectorizing literal struct return values This patch adds initial support for vectorizing literal struct return values. Currently, this is limited to the case where the struct is homogeneous (all elements have the same type) and not packed. All users of the call must be `extractvalue` instructions. The intended use case for this is vectorizing intrinsics such as: ``` declare { float, float } @llvm.sincos.f32(float %x) ``` Mapping them to structure-returning library calls such as: ``` declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>) ``` Or their widened form (such as `@llvm.sincos.v4f32` in this case). Implementing this required two main changes: 1. Supporting widening `extractvalue` 2. Adding support for vectorized struct types in LV * This is mostly limited to parts of the cost model and scalarization Since the supported use case is narrow, the required changes are relatively small. --- .../llvm/Analysis/TargetTransformInfo.h | 14 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 15 +- .../Vectorize/LoopVectorizationLegality.h | 10 - llvm/lib/Analysis/TargetTransformInfo.cpp | 10 + .../Vectorize/LoopVectorizationLegality.cpp | 13 +- .../Transforms/Vectorize/LoopVectorize.cpp | 101 +++++---- llvm/lib/Transforms/Vectorize/VPlan.cpp | 27 ++- llvm/lib/Transforms/Vectorize/VPlan.h | 7 +- .../Transforms/Vectorize/VPlanAnalysis.cpp | 6 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 +- .../AArch64/scalable-struct-return.ll | 34 ++- .../AArch64/struct-return-cost.ll | 199 ++++++++++++++++++ .../Transforms/LoopVectorize/struct-return.ll | 111 ++++++++-- .../vplan-widen-struct-return.ll | 122 +++++++++++ 14 files changed, 580 insertions(+), 103 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f07a4aea34d2..2ec116d0a9b4 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1473,6 +1473,12 @@ class TargetTransformInfo { TTI::TargetCostKind CostKind, unsigned Index = -1) const; + /// \return The expected cost of aggregate inserts and extracts. This is + /// used when the instruction is not available; a typical use case is to + /// provision the cost of vectorization/scalarization in vectorizer passes. + InstructionCost getInsertExtractValueCost(unsigned Opcode, + TTI::TargetCostKind CostKind) const; + /// \return The cost of replication shuffle of \p VF elements typed \p EltTy /// \p ReplicationFactor times.
/// @@ -2205,6 +2211,9 @@ class TargetTransformInfo::Concept { const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) = 0; + virtual InstructionCost + getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0; + virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -2926,6 +2935,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind); } + InstructionCost + getInsertExtractValueCost(unsigned Opcode, + TTI::TargetCostKind CostKind) override { + return Impl.getInsertExtractValueCost(Opcode, CostKind); + } InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index dcef4a1abcfa..4996c405bdda 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -745,6 +745,17 @@ class TargetTransformInfoImplBase { return 1; } + InstructionCost + getInsertExtractValueCost(unsigned Opcode, + TTI::TargetCostKind CostKind) const { + // Note: The `insertvalue` cost here is chosen to match the default case of + // getInstructionCost() -- as prior to adding this helper `insertvalue` was + // not handled. + if (Opcode == Instruction::InsertValue) + return CostKind == TTI::TCK_RecipThroughput ? -1 : TTI::TCC_Basic; + return TTI::TCC_Free; + } + InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -1296,9 +1307,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { case Instruction::PHI: case Instruction::Switch: return TargetTTI->getCFInstrCost(Opcode, CostKind, I); - case Instruction::ExtractValue: case Instruction::Freeze: return TTI::TCC_Free; + case Instruction::ExtractValue: + case Instruction::InsertValue: + return TargetTTI->getInsertExtractValueCost(Opcode, CostKind); case Instruction::Alloca: if (cast<AllocaInst>(U)->isStaticAlloca()) return TTI::TCC_Free; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 3c5cf1ebe6ba..e959d93b5727 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -416,10 +416,6 @@ class LoopVectorizationLegality { /// has a vectorized variant available. bool hasVectorCallVariants() const { return VecCallVariantsFound; } - /// Returns true if there is at least one function call in the loop which - /// returns a struct type and needs to be vectorized. - bool hasStructVectorCall() const { return StructVecCallFound; } - unsigned getNumStores() const { return LAI->getNumStores(); } unsigned getNumLoads() const { return LAI->getNumLoads(); } @@ -639,12 +635,6 @@ class LoopVectorizationLegality { /// the use of those function variants. bool VecCallVariantsFound = false; - /// If we find a call (to be vectorized) that returns a struct type, record - /// that so we can bail out until this is supported. - /// TODO: Remove this flag once vectorizing calls with struct returns is - /// supported.
- bool StructVecCallFound = false; - /// Keep track of all the countable and uncountable exiting blocks if /// the exact backedge taken count is not computable. SmallVector<BasicBlock *, 4> CountableExitingBlocks; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8b9722d047ed..820a66329406 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1113,6 +1113,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, return Cost; } +InstructionCost TargetTransformInfo::getInsertExtractValueCost( + unsigned Opcode, TTI::TargetCostKind CostKind) const { + assert((Opcode == Instruction::InsertValue || + Opcode == Instruction::ExtractValue) && + "Expecting Opcode to be insertvalue/extractvalue."); + InstructionCost Cost = TTIImpl->getInsertExtractValueCost(Opcode, CostKind); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getReplicationShuffleCost( Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index e3599315e224..420cbc5384ce 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -954,7 +954,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !VFDatabase::getMappings(*CI).empty()) VecCallVariantsFound = true; - auto CanWidenInstructionTy = [this](Instruction const &Inst) { + auto CanWidenInstructionTy = [](Instruction const &Inst) { Type *InstTy = Inst.getType(); if (!isa<StructType>(InstTy)) return canVectorizeTy(InstTy); @@ -962,15 +962,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // For now, we only recognize struct values returned from calls where // all users are extractvalue as vectorizable. All element types of the // struct must be types that can be widened. - if (isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && - all_of(Inst.users(), IsaPred<ExtractValueInst>)) { - // TODO: Remove the `StructVecCallFound` flag once vectorizing calls - // with struct returns is supported. - StructVecCallFound = true; - return true; - } - - return false; + return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && + all_of(Inst.users(), IsaPred<ExtractValueInst>); }; // Check that the instruction return type is vectorizable. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c4b159117e2e..73288b429e69 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2350,7 +2350,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State) { - assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + assert((!Instr->getType()->isAggregateType() || + canVectorizeTy(Instr->getType())) && + "Expected vectorizable or non-aggregate type."); // Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2855,10 +2857,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, return ScalarCallCost; } -static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { - if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) - return Elt; - return VectorType::get(Elt, VF); +static Type *maybeVectorizeType(Type *Ty, ElementCount VF) { + if (VF.isScalar() || !canVectorizeTy(Ty)) + return Ty; + return toVectorizedTy(Ty, VF); } InstructionCost @@ -3605,13 +3607,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { } } - // ExtractValue instructions must be uniform, because the operands are - // known to be loop-invariant. if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { - assert(IsOutOfScope(EVI->getAggregateOperand()) && - "Expected aggregate value to be loop invariant"); - AddToWorklistIfAllowed(EVI); - continue; + if (IsOutOfScope(EVI->getAggregateOperand())) { + AddToWorklistIfAllowed(EVI); + continue; + } + // Only ExtractValue instructions where the aggregate value comes from a + // call are allowed to be non-uniform. + assert(isa<CallInst>(EVI->getAggregateOperand()) && + "Expected aggregate value to be call return value"); } // If there's no pointer operand, there's nothing to do. @@ -4492,8 +4496,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, llvm_unreachable("unhandled recipe"); } - auto WillWiden = [&TTI, VF](Type *ScalarTy) { - Type *VectorTy = toVectorTy(ScalarTy, VF); + auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) { unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) return false; @@ -4505,7 +4508,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // explicitly ask TTI about the register class uses for each part. return NumLegalParts <= VF.getKnownMinValue(); } - // Two or more parts that share a register - are vectorized. + // Two or more elements that share a register - are vectorized. return NumLegalParts < VF.getKnownMinValue(); }; @@ -4524,7 +4527,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); if (!Visited.insert({ScalarTy}).second) continue; - if (WillWiden(ScalarTy)) + Type *WideTy = toVectorizedTy(ScalarTy, VF); + if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors)) return true; } } @@ -5481,10 +5485,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { - ScalarCost += TTI.getScalarizationOverhead( - cast<VectorType>(toVectorTy(I->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, - /*Extract*/ false, CostKind); + Type *WideTy = toVectorizedTy(I->getType(), VF); + for (Type *VectorTy : getContainedTypes(WideTy)) { + ScalarCost += TTI.getScalarizationOverhead( + cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert=*/true, + /*Extract=*/false, CostKind); + } ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -5495,15 +5502,18 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // overhead.
for (Use &U : I->operands()) if (auto *J = dyn_cast<Instruction>(U.get())) { - assert(VectorType::isValidElementType(J->getType()) && + assert(canVectorizeTy(J->getType()) && "Instruction has non-scalar type"); if (CanBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J, VF)) { - ScalarCost += TTI.getScalarizationOverhead( - cast<VectorType>(toVectorTy(J->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, - /*Extract*/ true, CostKind); + Type *WideTy = toVectorizedTy(J->getType(), VF); + for (Type *VectorTy : getContainedTypes(WideTy)) { + ScalarCost += TTI.getScalarizationOverhead( + cast<VectorType>(VectorTy), + APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, + /*Extract*/ true, CostKind); + } } } @@ -5982,13 +5992,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, return 0; InstructionCost Cost = 0; - Type *RetTy = toVectorTy(I->getType(), VF); + Type *RetTy = toVectorizedTy(I->getType(), VF); if (!RetTy->isVoidTy() && - (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead( - cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), - /*Insert*/ true, - /*Extract*/ false, CostKind); + (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) { + + for (Type *VectorTy : getContainedTypes(RetTy)) { + Cost += TTI.getScalarizationOverhead( + cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()), + /*Insert=*/true, + /*Extract=*/false, CostKind); + } + } // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -6246,9 +6260,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. - Type *RetTy = toVectorTy(ScalarRetTy, VF); + Type *RetTy = toVectorizedTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) - Tys.push_back(toVectorTy(ScalarTy, VF)); + Tys.push_back(toVectorizedTy(ScalarTy, VF)); // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic.
@@ -6438,7 +6452,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else - VectorTy = toVectorTy(RetTy, VF); + VectorTy = toVectorizedTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && !TTI.getNumberOfParts(VectorTy)) @@ -8560,7 +8574,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::Shl: case Instruction::Sub: case Instruction::Xor: - case Instruction::Freeze: + case Instruction::Freeze: { SmallVector<VPValue *> NewOps(Operands); if (Instruction::isBinaryOp(I->getOpcode())) { // The legacy cost model uses SCEV to check if some of the operands are @@ -8585,6 +8599,16 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, NewOps[1] = GetConstantViaSCEV(NewOps[1]); } return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); + } + case Instruction::ExtractValue: { + SmallVector<VPValue *> NewOps(Operands); + Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); + auto *EVI = cast<ExtractValueInst>(I); + assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); + unsigned Idx = EVI->getIndices()[0]; + NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); + } }; } @@ -9865,7 +9889,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison); } - State.packScalarIntoVectorValue(this, *State.Lane); + State.packScalarIntoVectorizedValue(this, *State.Lane); } return; } @@ -10382,13 +10406,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasStructVectorCall()) { - reportVectorizationFailure("Auto-vectorization of calls that return struct " - "types is not yet supported", - "StructCallVectorizationUnsupported", ORE, L); - return false; - } - // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 35da93ee3b40..f46b13bc2815 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -334,10 +334,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { } else { // Initialize packing with insertelements to start from undef.
assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); + Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); set(Def, Undef); for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - packScalarIntoVectorValue(Def, Lane); + packScalarIntoVectorizedValue(Def, Lane); VectorValue = get(Def); } Builder.restoreIP(OldIP); @@ -390,13 +390,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { Builder.SetCurrentDebugLocation(DIL); } -void VPTransformState::packScalarIntoVectorValue(VPValue *Def, - const VPLane &Lane) { +void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, + const VPLane &Lane) { Value *ScalarInst = get(Def, Lane); - Value *VectorValue = get(Def); - VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, Lane.getAsRuntimeExpr(Builder, VF)); - set(Def, VectorValue); + Value *WideValue = get(Def); + Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF); + if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) { + // We must handle each element of a vectorized struct type. + for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) { + Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I); + Value *VectorValue = Builder.CreateExtractValue(WideValue, I); + VectorValue = + Builder.CreateInsertElement(VectorValue, ScalarValue, LaneExpr); + WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I); + } + } else { + WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr); + } + set(Def, WideValue); } BasicBlock * diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a1ff684b2b80..0122188526b2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -276,7 +276,7 @@ struct VPTransformState { set(Def, V, VPLane(0)); return; } - assert((VF.isScalar() || V->getType()->isVectorTy()) && + assert((VF.isScalar() || isVectorizedTy(V->getType())) && "scalar values must be stored as (0, 0)"); Data.VPV2Vector[Def] = V; } @@ -325,8 +325,9 @@ struct VPTransformState { /// Set the debug location in the builder using the debug location \p DL. void setDebugLocFrom(DebugLoc DL); - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane); + /// Construct the vectorized value of a scalarized value \p V one lane at a + /// time. + void packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane); /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
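For reference, the struct path added to packScalarIntoVectorizedValue above emits an extractvalue/insertelement/insertvalue chain per lane. A minimal IR sketch of one lane (lane 0) of a { float, float } result at VF=2 — value names here are illustrative, not taken from the patch:

```
; %call0 is the lane-0 scalar call result; %wide is the current wide value of
; type { <2 x float>, <2 x float> } (initialized to poison before lane 0).
%s0   = extractvalue { float, float } %call0, 0              ; scalar element 0
%v0   = extractvalue { <2 x float>, <2 x float> } %wide, 0   ; current vector for element 0
%v0.l = insertelement <2 x float> %v0, float %s0, i32 0      ; place lane 0
%w0   = insertvalue { <2 x float>, <2 x float> } %wide, <2 x float> %v0.l, 0
%s1   = extractvalue { float, float } %call0, 1              ; scalar element 1
%v1   = extractvalue { <2 x float>, <2 x float> } %w0, 1
%v1.l = insertelement <2 x float> %v1, float %s1, i32 0
%w1   = insertvalue { <2 x float>, <2 x float> } %w0, <2 x float> %v1.l, 1
```

This is the sequence exercised by the struct_return_f32_replicate test in struct-return.ll further down.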
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 27357ff04b5f..0af52addc4b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -124,6 +124,12 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { case Instruction::FNeg: case Instruction::Freeze: return inferScalarType(R->getOperand(0)); + case Instruction::ExtractValue: { + assert(R->getNumOperands() == 2 && "expected single level extractvalue"); + auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0))); + auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue()); + return StructTy->getTypeAtIndex(CI->getZExtValue()); + } default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2679ed6b26b5..89dba6f452c7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1116,7 +1116,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); SmallVector<Type *> ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) ParamTys.push_back( @@ -1422,6 +1422,14 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); break; } + case Instruction::ExtractValue: { + assert(getNumOperands() == 2 && "expected single level extractvalue"); + Value *Op = State.get(getOperand(0)); + auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); + Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue()); + State.set(this, Extract); + break; + } case Instruction::Freeze: { Value *Op = State.get(getOperand(0)); @@ -1523,6 +1531,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, Ctx.CostKind); } + case Instruction::ExtractValue: { + return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, + Ctx.CostKind); + } case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll index 77781f95b085..2fde624624ee 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll @@ -1,15 +1,18 @@ -; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s -; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s +; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s target triple = "aarch64-unknown-linux-gnu" ; Tests basic vectorization of scalable homogeneous struct literal returns. -; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 1 +; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_A]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_B]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) entry: br label %for.body @@ -32,11 +35,15 @@ exit: ret void } -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 1 +; CHECK: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_A]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_B]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) entry: br label %for.body @@ -59,11 +66,16 @@ exit: ret void } -; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) +; CHECK: entry: +; CHECK: br i1 false, label %scalar.ph, label %vector.memcheck +; CHECK: vector.memcheck: +; CHECK: vector.body: +; CHECK: call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; CHECK: for.body: +; CHECK: call { float, float } @foo(float [[LOAD:%.*]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll new file mode 100644 index 000000000000..c72149324373 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll @@ -0,0 +1,199 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|@)" --version 5 +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize < %s -S -o - 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST +; REQUIRES: asserts + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-COST-LABEL: struct_return_widen +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { half, half } @foo(half %in_val) +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 +; +; CHECK-COST: Cost of 10 for VF 2: WIDEN-CALL ir<%call> = call @foo(ir<%in_val>) (using library function: fixed_vec_foo) +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> + +define void @struct_return_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_widen( ; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { ; CHECK: [[ENTRY:.*:]] ; CHECK: [[VECTOR_PH:.*:]] ; CHECK: [[VECTOR_BODY:.*:]] ; CHECK: [[TMP2:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD:%.*]]) ; CHECK: [[TMP3:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD1:%.*]]) ; CHECK: [[MIDDLE_BLOCK:.*:]] ; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] ; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR2:[0-9]+]] ; CHECK: [[EXIT:.*:]] ; entry: br label %for.body for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv + %in_val = load half, ptr %arrayidx, align 2 + %call = tail call { half, half } @foo(half %in_val) #0 + %extract_a = extractvalue { half, half } %call, 0 + %extract_b = extractvalue { half, half } %call, 1 + %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv + store half %extract_a, ptr %arrayidx2, align 2 + %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv + store half %extract_b, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK-COST-LABEL: struct_return_replicate +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { half, half } @foo(half %in_val) +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 +; +; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> + +define void @struct_return_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @struct_return_replicate( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP4:%.*]] = tail call { half, half } @foo(half [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] +; CHECK: [[TMP6:%.*]] = tail call { half, half } @foo(half [[TMP5:%.*]]) #[[ATTR3]] +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]] +; CHECK: [[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv + %in_val = load half, ptr %arrayidx, align 2 + ; #1 does not have a fixed-size vector mapping (so replication is used) + %call = tail call { half, half } @foo(half %in_val) #1 + %extract_a = extractvalue { half, half } %call, 0 + %extract_b = extractvalue { half, half } %call, 1 + %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv + store half %extract_a, ptr %arrayidx2, align 2 + %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv + store half %extract_b, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK-COST-LABEL: struct_return_scalable +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail 
call { half, half } @foo(half %in_val) ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 ; ; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; ; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; ; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; ; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(ir<%in_val>) ; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; ; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) ; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; ; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) ; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; ; CHECK-COST: Cost of 10 for VF vscale x 8: WIDEN-CALL ir<%call> = call @foo(ir<%in_val>, ir<true>) (using library function: scalable_vec_masked_foo) ; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> define void @struct_return_scalable(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) #2 { ; CHECK-LABEL: define void @struct_return_scalable( ; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK: [[ENTRY:.*:]] ; CHECK: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK: [[VECTOR_PH:.*:]] ; CHECK: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK: [[VECTOR_BODY:.*:]] ; CHECK: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; CHECK: [[TMP12:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @scalable_vec_masked_foo(<vscale x 8 x half> [[WIDE_LOAD:%.*]], <vscale x 8 x i1> splat (i1 true)) ; CHECK: [[TMP13:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @scalable_vec_masked_foo(<vscale x 8 x half> [[WIDE_LOAD1:%.*]], <vscale x 8 x i1> splat (i1 true)) ; CHECK: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK: [[MIDDLE_BLOCK:.*:]] ; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] ; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]] ; CHECK: [[EXIT:.*:]] ; entry: br label %for.body for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv +
%in_val = load half, ptr %arrayidx, align 2 %call = tail call { half, half } @foo(half %in_val) #1 %extract_a = extractvalue { half, half } %call, 0 %extract_b = extractvalue { half, half } %call, 1 %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv store half %extract_a, ptr %arrayidx2, align 2 %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv store half %extract_b, ptr %arrayidx4, align 2 %iv.next = add nuw nsw i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %exit, label %for.body exit: ret void } declare { half, half } @foo(half) declare { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half>) declare { <vscale x 8 x half>, <vscale x 8 x half> } @scalable_vec_masked_foo(<vscale x 8 x half>, <vscale x 8 x i1>) attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" } attributes #2 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index 9f98e8af2e98..1b2a809a552d 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -1,15 +1,20 @@ -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s ; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" ; Tests basic vectorization of homogeneous struct literal returns. -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 1 +; CHECK: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4 +; CHECK: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4 entry: br label %for.body @@ -32,11 +37,16 @@ exit: ret void } -; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1 +; CHECK: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8 +; CHECK: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8 entry: br label %for.body @@ -59,11 +69,36 @@ exit: ret void } -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop +; Note: Later instcombines reduce this down quite a lot. define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_replicate -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}}) +; CHECK: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}}) +; // Lane 0 +; CHECK: [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0 +; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK: [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0 +; CHECK: [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1 +; CHECK: [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1 +; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0 +; CHECK: [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1 +; // Lane 1 +; CHECK: [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0 +; CHECK: [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0 +; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1 +; CHECK: [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0 +; CHECK: [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1 +; CHECK: [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1 +; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1 +; CHECK: [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1 +; // Store wide values: +; CHECK: [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0 +; CHECK: [[VEC_B_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 1 +; CHECK: store <2 x float> [[VEC_A_EXT]], ptr {{%.*}}, align 4 +; CHECK: store <2 x float> [[VEC_B_EXT]], ptr {{%.*}}, align 4 entry: br label %for.body @@ -87,11 +122,17 @@ exit: ret void } -; TODO: Support vectorization in this case. 
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) +; CHECK: entry: +; CHECK: br i1 false, label %scalar.ph, label %vector.memcheck +; CHECK: vector.memcheck: +; CHECK: vector.body: +; CHECK: call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: for.body: +; CHECK: call { float, float } @foo(float [[LOAD:%.*]]) entry: br label %for.body @@ -143,11 +184,11 @@ exit: ret void } -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias writeonly %out_a) { ; CHECK-LABEL: define void @struct_return_i32_three_results_widen +; CHECK: vector.body: +; CHECK: call { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32> [[WIDE_LOAD:%.*]]) -; CHECK-NOT: vector.body: entry: br label %for.body @@ -167,6 +208,40 @@ exit: ret void } +; Test crafted to exercise computePredInstDiscount with struct results +; (mainly checking that it does not crash). +; CHECK-REMARKS: remark: {{.*}} vectorized loop +define void @scalarized_predicated_struct_return(ptr %a) optsize { +; CHECK-LABEL: define void @scalarized_predicated_struct_return +; CHECK: vector.body: +; CHECK: pred.store.if: +; CHECK: tail call { i64, i64 } @bar_i64(i64 %5) +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv + %in_val = load i64, ptr %arrayidx, align 8 + %sgt_zero = icmp sgt i64 %in_val, 0 + br i1 %sgt_zero, label %if.then, label %for.inc + +if.then: + %call = tail call { i64, i64 } @bar_i64(i64 %in_val) #6 + %extract_a = extractvalue { i64, i64 } %call, 0 + %div = udiv i64 %extract_a, %in_val + store i64 %div, ptr %arrayidx, align 8 + br label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + ; Negative test. Widening structs of vectors is not supported.
; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_of_vectors(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { @@ -390,13 +465,14 @@ declare { [2 x float] } @foo_arrays(float) declare { float, [1 x float] } @foo_one_non_widenable_element(float) declare { <1 x float>, <1 x float> } @foo_vectors(<1 x float>) declare { i32, i32, i32 } @qux(i32) +declare { i64, i64 } @bar_i64(i64) declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>) declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>) declare { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float>) declare { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32>) - declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>) +declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @scalable_vec_masked_bar_i64(<vscale x 2 x i64>, <vscale x 2 x i1>) attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar)" } @@ -404,3 +480,4 @@ attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_baz(fixed_vec attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" } attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar_named(fixed_vec_bar)" } attributes #5 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_qux(fixed_vec_qux)" } +attributes #6 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_bar_i64(scalable_vec_masked_bar_i64)" } diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll new file mode 100644 index 000000000000..bb61398ae5a6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll @@ -0,0 +1,122 @@ +; REQUIRES: asserts +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s +define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_widen' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: ; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]> ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%in_val>) (using library function: fixed_vec_foo) ; CHECK-NEXT: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-NEXT: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2> ; CHECK-NEXT: WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a> ; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<[[STEPS]]> ;
CHECK-NEXT: vp<[[OUT_B_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx4> ; CHECK-NEXT: WIDEN store vp<[[OUT_B_VEC_PTR]]>, ir<%extract_b> ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } entry: br label %for.body for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv %in_val = load float, ptr %arrayidx, align 4 %call = tail call { float, float } @foo(float %in_val) #0 %extract_a = extractvalue { float, float } %call, 0 %extract_b = extractvalue { float, float } %call, 1 %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv store float %extract_a, ptr %arrayidx2, align 4 %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv store float %extract_b, ptr %arrayidx4, align 4 %iv.next = add nuw nsw i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %exit, label %for.body exit: ret void } define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_replicate' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: ; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]> ; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%in_val>) ; CHECK-NEXT: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-NEXT: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2> ; CHECK-NEXT: WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a> ; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[OUT_B_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx4> ; CHECK-NEXT: WIDEN store vp<[[OUT_B_VEC_PTR]]>, ir<%extract_b> ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } entry: br label %for.body for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv %in_val = load float, ptr %arrayidx, align 4 ; #1 does not have a fixed-size vector mapping (so replication is used) %call = tail call { float, float } @foo(float %in_val) #1 %extract_a = extractvalue { float, float } %call, 0 %extract_b = extractvalue { float, float } %call, 1 %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv store float %extract_a, ptr %arrayidx2, align 4 %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv store
float %extract_b, ptr %arrayidx4, align 4 %iv.next = add nuw nsw i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %exit, label %for.body exit: ret void } declare { float, float } @foo(float) declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>) declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>) attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" } From 8065001d363604822226896be2d41ea3709078ec Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 12 Feb 2025 16:14:30 +0000 Subject: [PATCH 2/4] Add a load of comments --- .../llvm/Analysis/TargetTransformInfo.h | 6 +++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 4 +++ .../Vectorize/LoopVectorizationLegality.h | 8 ++++++ llvm/lib/Analysis/TargetTransformInfo.cpp | 2 ++ .../Vectorize/LoopVectorizationLegality.cpp | 2 ++ .../Transforms/Vectorize/LoopVectorize.cpp | 26 +++++++++++++++++++ llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 +++ llvm/lib/Transforms/Vectorize/VPlan.h | 3 +++ .../Transforms/Vectorize/VPlanAnalysis.cpp | 2 ++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ++++ 10 files changed, 62 insertions(+) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 2ec116d0a9b4..45077f174115 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1473,11 +1473,13 @@ class TargetTransformInfo { TTI::TargetCostKind CostKind, unsigned Index = -1) const; + /* Downstream change: #87 (sincos vectorization)*/ /// \return The expected cost of aggregate inserts and extracts. This is /// used when the instruction is not available; a typical use case is to /// provision the cost of vectorization/scalarization in vectorizer passes. InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) const; + /* End downstream change: #87 */ /// \return The cost of replication shuffle of \p VF elements typed \p EltTy /// \p ReplicationFactor times.
@@ -2211,8 +2213,10 @@ class TargetTransformInfo::Concept { const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) = 0; + /* Downstream change: #87 (sincos vectorization)*/ virtual InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0; + /* End downstream change: #87 */ virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, @@ -2935,11 +2939,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind); } + /* Downstream change: #87 (sincos vectorization)*/ InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) override { return Impl.getInsertExtractValueCost(Opcode, CostKind); } + /* End downstream change: #87 */ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 4996c405bdda..c0dfd9aa72d8 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -745,6 +745,7 @@ class TargetTransformInfoImplBase { return 1; } + /* Downstream change: #87 (sincos vectorization)*/ InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) const { @@ -755,6 +756,7 @@ class TargetTransformInfoImplBase { return CostKind == TTI::TCK_RecipThroughput ? -1 : TTI::TCC_Basic; return TTI::TCC_Free; } + /* End downstream change: #87 */ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, @@ -1309,9 +1311,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { return TargetTTI->getCFInstrCost(Opcode, CostKind, I); case Instruction::Freeze: return TTI::TCC_Free; + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: case Instruction::InsertValue: return TargetTTI->getInsertExtractValueCost(Opcode, CostKind); + /* End downstream change: #87 */ case Instruction::Alloca: if (cast<AllocaInst>(U)->isStaticAlloca()) return TTI::TCC_Free; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index e959d93b5727..30465e991547 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -416,6 +416,10 @@ class LoopVectorizationLegality { /// has a vectorized variant available. bool hasVectorCallVariants() const { return VecCallVariantsFound; } + /* Downstream change: #87 (sincos vectorization)*/ + // Removed hasStructVectorCall() + /* End downstream change: #87 */ + unsigned getNumStores() const { return LAI->getNumStores(); } unsigned getNumLoads() const { return LAI->getNumLoads(); } @@ -639,6 +639,10 @@ class LoopVectorizationLegality { /// the use of those function variants. bool VecCallVariantsFound = false; + /* Downstream change: #87 (sincos vectorization)*/ + // Removed StructVecCallFound + /* End downstream change: #87 */ + /// Keep track of all the countable and uncountable exiting blocks if /// the exact backedge taken count is not computable.
SmallVector<BasicBlock *, 4> CountableExitingBlocks; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 820a66329406..ade398ea72f9 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1113,6 +1113,7 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, return Cost; } +/* Downstream change: #87 (sincos vectorization)*/ InstructionCost TargetTransformInfo::getInsertExtractValueCost( unsigned Opcode, TTI::TargetCostKind CostKind) const { assert((Opcode == Instruction::InsertValue || @@ -1122,6 +1123,7 @@ InstructionCost TargetTransformInfo::getInsertExtractValueCost( assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } +/* End downstream change: #87 */ InstructionCost TargetTransformInfo::getReplicationShuffleCost( Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 420cbc5384ce..6f33ac923ba3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -954,6 +954,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !VFDatabase::getMappings(*CI).empty()) VecCallVariantsFound = true; + /* Downstream change: #87 (sincos vectorization)*/ auto CanWidenInstructionTy = [](Instruction const &Inst) { Type *InstTy = Inst.getType(); if (!isa<StructType>(InstTy)) @@ -965,6 +966,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && all_of(Inst.users(), IsaPred<ExtractValueInst>); }; + /* End downstream change: #87 */ // Check that the instruction return type is vectorizable. // We can't vectorize casts from vector type to scalar type. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 73288b429e69..2c9da6940a8b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2350,9 +2350,11 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State) { + /* Downstream change: #87 (sincos vectorization)*/ assert((!Instr->getType()->isAggregateType() || canVectorizeTy(Instr->getType())) && "Expected vectorizable or non-aggregate type."); + /* End downstream change: #87 */ // Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2857,11 +2859,13 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, return ScalarCallCost; } +/* Downstream change: #87 (sincos vectorization)*/ static Type *maybeVectorizeType(Type *Ty, ElementCount VF) { if (VF.isScalar() || !canVectorizeTy(Ty)) return Ty; return toVectorizedTy(Ty, VF); } +/* End downstream change: #87 */ InstructionCost LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, @@ -3607,6 +3611,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { } } + /* Downstream change: #87 (sincos vectorization)*/ if (auto *EVI = dyn_cast(&I)) { if (IsOutOfScope(EVI->getAggregateOperand())) { AddToWorklistIfAllowed(EVI); @@ -3617,6 +3622,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { assert(isa(EVI->getAggregateOperand()) && "Expected aggregate value to be call return value"); } + /* End downstream change: #87 */ // If there's no pointer operand, there's nothing to do. auto *Ptr = getLoadStorePointerOperand(&I); @@ -4496,6 +4502,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, llvm_unreachable("unhandled recipe"); } + /* Downstream change: #87 (sincos vectorization)*/ auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) { unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) @@ -4511,6 +4518,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // Two or more elements that share a register - are vectorized. return NumLegalParts < VF.getKnownMinValue(); }; + /* End downstream change: #87 */ // If no def nor is a store, e.g., branches, continue - no value to check. if (R.getNumDefinedValues() == 0 && @@ -4528,8 +4536,10 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, if (!Visited.insert({ScalarTy}).second) continue; Type *WideTy = toVectorizedTy(ScalarTy, VF); + /* Downstream change: #87 (sincos vectorization)*/ if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors)) return true; + /* End downstream change: #87 */ } } @@ -5485,6 +5495,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { + /* Downstream change: #87 (sincos vectorization)*/ Type *WideTy = toVectorizedTy(I->getType(), VF); for (Type *VectorTy : getContainedTypes(WideTy)) { ScalarCost += TTI.getScalarizationOverhead( @@ -5492,6 +5503,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( /*Insert=*/true, /*Extract=*/false, CostKind); } + /* End downstream change: #87 */ ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -5502,6 +5514,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // overhead. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) { + /* Downstream change: #87 (sincos vectorization)*/ assert(canVectorizeTy(J->getType()) && "Instruction has non-scalar type"); if (CanBeScalarized(J)) @@ -5515,6 +5528,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( /*Extract*/ true, CostKind); } } + /* End downstream change: #87 */ } // Scale the total scalar cost by block probability. 
@@ -5992,6 +6006,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, return 0; InstructionCost Cost = 0; + /* Downstream change: #87 (sincos vectorization)*/ Type *RetTy = toVectorizedTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) { @@ -6003,6 +6018,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, /*Extract=*/false, CostKind); } } + /* End downstream change: #87 */ // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6260,9 +6276,11 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. + /* Downstream change: #87 (sincos vectorization)*/ Type *RetTy = toVectorizedTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) Tys.push_back(toVectorizedTy(ScalarTy, VF)); + /* End downstream change: #87 */ // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. @@ -6452,6 +6470,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else + // Downstream change: #87 (sincos vectorization) VectorTy = toVectorizedTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && @@ -8600,6 +8619,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, } return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); } + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { SmallVector NewOps(Operands); Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); @@ -8609,6 +8629,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); } + /* End downstream change: #87 */ }; } @@ -9889,6 +9910,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison); } + // Downstream change: #87 (sincos vectorization) State.packScalarIntoVectorizedValue(this, *State.Lane); } return; @@ -10406,6 +10428,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + /* Downstream change: #87 (sincos vectorization)*/ + // Remove StructCallVectorizationUnsupported failure. + /* End downstream change: #87 */ + // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f46b13bc2815..6a3e5f4b46cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -334,10 +334,12 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { } else { // Initialize packing with insertelements to start from undef. 
assert(!VF.isScalable() && "VF is assumed to be non scalable."); + /* Downstream change: #87 (sincos vectorization)*/ Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); set(Def, Undef); for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorizedValue(Def, Lane); + /* End downstream change: #87 */ VectorValue = get(Def); } Builder.restoreIP(OldIP); @@ -390,6 +392,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { Builder.SetCurrentDebugLocation(DIL); } +/* Downstream change: #87 (sincos vectorization)*/ void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane) { Value *ScalarInst = get(Def, Lane); @@ -409,6 +412,7 @@ void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, } set(Def, WideValue); } +/* End downstream change: #87 */ BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0122188526b2..aa7a208c2eb3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -276,6 +276,7 @@ struct VPTransformState { set(Def, V, VPLane(0)); return; } + // Downstream change: #87 (sincos vectorization) assert((VF.isScalar() || isVectorizedTy(V->getType())) && "scalar values must be stored as (0, 0)"); Data.VPV2Vector[Def] = V; @@ -325,9 +326,11 @@ struct VPTransformState { /// Set the debug location in the builder using the debug location \p DL. void setDebugLocFrom(DebugLoc DL); + /* Downstream change: #87 (sincos vectorization)*/ /// Construct the vectorized value of a scalarized value \p V one lane at a /// time. void packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane); + /* End downstream change: #87 */ /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 0af52addc4b6..b2cb6276f16c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -124,12 +124,14 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { case Instruction::FNeg: case Instruction::Freeze: return inferScalarType(R->getOperand(0)); + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { assert(R->getNumOperands() == 2 && "expected single level extractvalue"); auto *StructTy = cast(inferScalarType(R->getOperand(0))); auto *CI = cast(R->getOperand(1)->getLiveInIRValue()); return StructTy->getTypeAtIndex(CI->getZExtValue()); } + /* End downstream change: #87 */ default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 89dba6f452c7..2074759e3b2d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1116,6 +1116,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } + // Downstream change: #87 (sincos vectorization) Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); SmallVector ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) @@ -1422,6 +1423,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.addMetadata(V, dyn_cast_or_null(getUnderlyingValue())); break; } + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { assert(getNumOperands() == 2 && "expected single level extractvalue"); Value *Op = State.get(getOperand(0)); @@ -1430,6 +1432,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.set(this, Extract); break; } + /* End downstream change: #87 */ case Instruction::Freeze: { Value *Op = State.get(getOperand(0)); @@ -1531,10 +1534,12 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, Ctx.CostKind); } + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, Ctx.CostKind); } + /* End downstream change: #87 */ case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); From 199a9ec06db71418d55daee17ca0f444af5f5062 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 17 Feb 2025 17:07:26 +0000 Subject: [PATCH 3/4] Revert "Add a load of comments" This reverts commit 8065001d363604822226896be2d41ea3709078ec. 
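This is a comment-only revert: it strips every `/* Downstream change: #87 (sincos vectorization)*/` ... `/* End downstream change: #87 */` marker pair that the previous commit added, with no functional change. As a sketch of the convention being removed (recreated from the TargetTransformInfo.h hunk, not a literal excerpt of the diff):

```
/* Downstream change: #87 (sincos vectorization)*/
/// \return The expected cost of aggregate inserts and extracts.
InstructionCost getInsertExtractValueCost(unsigned Opcode,
                                          TTI::TargetCostKind CostKind) const;
/* End downstream change: #87 */
```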
--- .../llvm/Analysis/TargetTransformInfo.h | 6 ----- .../llvm/Analysis/TargetTransformInfoImpl.h | 4 --- .../Vectorize/LoopVectorizationLegality.h | 8 ------ llvm/lib/Analysis/TargetTransformInfo.cpp | 2 -- .../Vectorize/LoopVectorizationLegality.cpp | 2 -- .../Transforms/Vectorize/LoopVectorize.cpp | 26 ------------------- llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 --- .../Transforms/Vectorize/VPlanAnalysis.cpp | 2 -- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ---- 10 files changed, 62 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 45077f174115..2ec116d0a9b4 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1473,13 +1473,11 @@ class TargetTransformInfo { TTI::TargetCostKind CostKind, unsigned Index = -1) const; - /* Downstream change: #87 (sincos vectorization)*/ /// \return The expected cost of aggregate inserts and extracts. This is /// used when the instruction is not available; a typical use case is to /// provision the cost of vectorization/scalarization in vectorizer passes. InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) const; - /* End downstream change: #87 */ /// \return The cost of replication shuffle of \p VF elements typed \p EltTy /// \p ReplicationFactor times. @@ -2213,10 +2211,8 @@ class TargetTransformInfo::Concept { const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) = 0; - /* Downstream change: #87 (sincos vectorization)*/ virtual InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0; - /* End downstream change: #87 */ virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, @@ -2939,13 +2935,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind); } - /* Downstream change: #87 (sincos vectorization)*/ InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) override { return Impl.getInsertExtractValueCost(Opcode, CostKind); } - /* End downstream change: #87 */ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index c0dfd9aa72d8..4996c405bdda 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -745,7 +745,6 @@ class TargetTransformInfoImplBase { return 1; } - /* Downstream change: #87 (sincos vectorization)*/ InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) const { @@ -756,7 +755,6 @@ class TargetTransformInfoImplBase { return CostKind == TTI::TCK_RecipThroughput ? 
-1 : TTI::TCC_Basic; return TTI::TCC_Free; } - /* End downstream change: #87 */ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, @@ -1311,11 +1309,9 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { return TargetTTI->getCFInstrCost(Opcode, CostKind, I); case Instruction::Freeze: return TTI::TCC_Free; - /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: case Instruction::InsertValue: return TargetTTI->getInsertExtractValueCost(Opcode, CostKind); - /* End downstream change: #87 */ case Instruction::Alloca: if (cast(U)->isStaticAlloca()) return TTI::TCC_Free; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 30465e991547..e959d93b5727 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -416,10 +416,6 @@ class LoopVectorizationLegality { /// has a vectorized variant available. bool hasVectorCallVariants() const { return VecCallVariantsFound; } - /* Downstream change: #87 (sincos vectorization)*/ - // Removed hasStructVectorCall() - /* End downstream change: #87 */ - unsigned getNumStores() const { return LAI->getNumStores(); } unsigned getNumLoads() const { return LAI->getNumLoads(); } @@ -639,10 +635,6 @@ class LoopVectorizationLegality { /// the use of those function variants. bool VecCallVariantsFound = false; - /* Downstream change: #87 (sincos vectorization)*/ - // Removed StructVecCallFound - /* End downstream change: #87 */ - /// Keep track of all the countable and uncountable exiting blocks if /// the exact backedge taken count is not computable. 
SmallVector<BasicBlock *, 4> CountableExitingBlocks; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index ade398ea72f9..820a66329406 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1113,7 +1113,6 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, return Cost; } -/* Downstream change: #87 (sincos vectorization)*/ InstructionCost TargetTransformInfo::getInsertExtractValueCost( unsigned Opcode, TTI::TargetCostKind CostKind) const { assert((Opcode == Instruction::InsertValue || @@ -1123,7 +1122,6 @@ InstructionCost TargetTransformInfo::getInsertExtractValueCost( assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } -/* End downstream change: #87 */ InstructionCost TargetTransformInfo::getReplicationShuffleCost( Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 6f33ac923ba3..420cbc5384ce 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -954,7 +954,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !VFDatabase::getMappings(*CI).empty()) VecCallVariantsFound = true; - /* Downstream change: #87 (sincos vectorization)*/ auto CanWidenInstructionTy = [](Instruction const &Inst) { Type *InstTy = Inst.getType(); if (!isa<StructType>(InstTy)) @@ -966,7 +965,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && all_of(Inst.users(), IsaPred<ExtractValueInst>); }; - /* End downstream change: #87 */ // Check that the instruction return type is vectorizable. // We can't vectorize casts from vector type to scalar type. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2c9da6940a8b..73288b429e69 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2350,11 +2350,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State) { - /* Downstream change: #87 (sincos vectorization)*/ assert((!Instr->getType()->isAggregateType() || canVectorizeTy(Instr->getType())) && "Expected vectorizable or non-aggregate type."); - /* End downstream change: #87 */ // Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2859,13 +2857,11 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, return ScalarCallCost; } -/* Downstream change: #87 (sincos vectorization)*/ static Type *maybeVectorizeType(Type *Ty, ElementCount VF) { if (VF.isScalar() || !canVectorizeTy(Ty)) return Ty; return toVectorizedTy(Ty, VF); } -/* End downstream change: #87 */ InstructionCost LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, @@ -3611,7 +3607,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { } } - /* Downstream change: #87 (sincos vectorization)*/ if (auto *EVI = dyn_cast(&I)) { if (IsOutOfScope(EVI->getAggregateOperand())) { AddToWorklistIfAllowed(EVI); @@ -3622,7 +3617,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { assert(isa(EVI->getAggregateOperand()) && "Expected aggregate value to be call return value"); } - /* End downstream change: #87 */ // If there's no pointer operand, there's nothing to do. auto *Ptr = getLoadStorePointerOperand(&I); @@ -4502,7 +4496,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, llvm_unreachable("unhandled recipe"); } - /* Downstream change: #87 (sincos vectorization)*/ auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) { unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) @@ -4518,7 +4511,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // Two or more elements that share a register - are vectorized. return NumLegalParts < VF.getKnownMinValue(); }; - /* End downstream change: #87 */ // If no def nor is a store, e.g., branches, continue - no value to check. if (R.getNumDefinedValues() == 0 && @@ -4536,10 +4528,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, if (!Visited.insert({ScalarTy}).second) continue; Type *WideTy = toVectorizedTy(ScalarTy, VF); - /* Downstream change: #87 (sincos vectorization)*/ if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors)) return true; - /* End downstream change: #87 */ } } @@ -5495,7 +5485,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { - /* Downstream change: #87 (sincos vectorization)*/ Type *WideTy = toVectorizedTy(I->getType(), VF); for (Type *VectorTy : getContainedTypes(WideTy)) { ScalarCost += TTI.getScalarizationOverhead( @@ -5503,7 +5492,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( /*Insert=*/true, /*Extract=*/false, CostKind); } - /* End downstream change: #87 */ ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -5514,7 +5502,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // overhead. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) { - /* Downstream change: #87 (sincos vectorization)*/ assert(canVectorizeTy(J->getType()) && "Instruction has non-scalar type"); if (CanBeScalarized(J)) @@ -5528,7 +5515,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( /*Extract*/ true, CostKind); } } - /* End downstream change: #87 */ } // Scale the total scalar cost by block probability. 
@@ -6006,7 +5992,6 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, return 0; InstructionCost Cost = 0; - /* Downstream change: #87 (sincos vectorization)*/ Type *RetTy = toVectorizedTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) { @@ -6018,7 +6003,6 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, /*Extract=*/false, CostKind); } } - /* End downstream change: #87 */ // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6276,11 +6260,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. - /* Downstream change: #87 (sincos vectorization)*/ Type *RetTy = toVectorizedTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) Tys.push_back(toVectorizedTy(ScalarTy, VF)); - /* End downstream change: #87 */ // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. @@ -6470,7 +6452,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else - // Downstream change: #87 (sincos vectorization) VectorTy = toVectorizedTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && @@ -8619,7 +8600,6 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, } return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); } - /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { SmallVector NewOps(Operands); Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); @@ -8629,7 +8609,6 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); } - /* End downstream change: #87 */ }; } @@ -9910,7 +9889,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison); } - // Downstream change: #87 (sincos vectorization) State.packScalarIntoVectorizedValue(this, *State.Lane); } return; @@ -10428,10 +10406,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - /* Downstream change: #87 (sincos vectorization)*/ - // Remove StructCallVectorizationUnsupported failure. - /* End downstream change: #87 */ - // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6a3e5f4b46cb..f46b13bc2815 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -334,12 +334,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { } else { // Initialize packing with insertelements to start from undef. 
assert(!VF.isScalable() && "VF is assumed to be non scalable."); - /* Downstream change: #87 (sincos vectorization)*/ Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); set(Def, Undef); for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorizedValue(Def, Lane); - /* End downstream change: #87 */ VectorValue = get(Def); } Builder.restoreIP(OldIP); @@ -392,7 +390,6 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { Builder.SetCurrentDebugLocation(DIL); } -/* Downstream change: #87 (sincos vectorization)*/ void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane) { Value *ScalarInst = get(Def, Lane); @@ -412,7 +409,6 @@ void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, } set(Def, WideValue); } -/* End downstream change: #87 */ BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index aa7a208c2eb3..0122188526b2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -276,7 +276,6 @@ struct VPTransformState { set(Def, V, VPLane(0)); return; } - // Downstream change: #87 (sincos vectorization) assert((VF.isScalar() || isVectorizedTy(V->getType())) && "scalar values must be stored as (0, 0)"); Data.VPV2Vector[Def] = V; @@ -326,11 +325,9 @@ struct VPTransformState { /// Set the debug location in the builder using the debug location \p DL. void setDebugLocFrom(DebugLoc DL); - /* Downstream change: #87 (sincos vectorization)*/ /// Construct the vectorized value of a scalarized value \p V one lane at a /// time. void packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane); - /* End downstream change: #87 */ /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index b2cb6276f16c..0af52addc4b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -124,14 +124,12 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { case Instruction::FNeg: case Instruction::Freeze: return inferScalarType(R->getOperand(0)); - /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { assert(R->getNumOperands() == 2 && "expected single level extractvalue"); auto *StructTy = cast(inferScalarType(R->getOperand(0))); auto *CI = cast(R->getOperand(1)->getLiveInIRValue()); return StructTy->getTypeAtIndex(CI->getZExtValue()); } - /* End downstream change: #87 */ default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2074759e3b2d..89dba6f452c7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1116,7 +1116,6 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - // Downstream change: #87 (sincos vectorization) Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); SmallVector ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) @@ -1423,7 +1422,6 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.addMetadata(V, dyn_cast_or_null(getUnderlyingValue())); break; } - /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { assert(getNumOperands() == 2 && "expected single level extractvalue"); Value *Op = State.get(getOperand(0)); @@ -1432,7 +1430,6 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.set(this, Extract); break; } - /* End downstream change: #87 */ case Instruction::Freeze: { Value *Op = State.get(getOperand(0)); @@ -1534,12 +1531,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, Ctx.CostKind); } - /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, Ctx.CostKind); } - /* End downstream change: #87 */ case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); From 545a1bde9ecec807d618aeb7ecea4bc2ada63b8b Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 18 Feb 2025 10:15:19 +0000 Subject: [PATCH 4/4] Revert "Revert "Add a load of comments"" This reverts commit 199a9ec06db71418d55daee17ca0f444af5f5062. 
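This restores the downstream `#87` markers exactly as they were. For context, the central API they bracket is the `getInsertExtractValueCost` TTI hook; a minimal sketch of a caller (mirroring the `VPWidenRecipe::computeCost` hunk below, where `Ctx` is the VPlan cost context):

```
InstructionCost Cost = Ctx.TTI.getInsertExtractValueCost(
    Instruction::ExtractValue, Ctx.CostKind);
```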
--- .../llvm/Analysis/TargetTransformInfo.h | 6 +++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 4 +++ .../Vectorize/LoopVectorizationLegality.h | 8 ++++++ llvm/lib/Analysis/TargetTransformInfo.cpp | 2 ++ .../Vectorize/LoopVectorizationLegality.cpp | 2 ++ .../Transforms/Vectorize/LoopVectorize.cpp | 26 +++++++++++++++++++ llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 +++ llvm/lib/Transforms/Vectorize/VPlan.h | 3 +++ .../Transforms/Vectorize/VPlanAnalysis.cpp | 2 ++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ++++ 10 files changed, 62 insertions(+) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 2ec116d0a9b4..45077f174115 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1473,11 +1473,13 @@ class TargetTransformInfo { TTI::TargetCostKind CostKind, unsigned Index = -1) const; + /* Downstream change: #87 (sincos vectorization)*/ /// \return The expected cost of aggregate inserts and extracts. This is /// used when the instruction is not available; a typical use case is to /// provision the cost of vectorization/scalarization in vectorizer passes. InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) const; + /* End downstream change: #87 */ /// \return The cost of replication shuffle of \p VF elements typed \p EltTy /// \p ReplicationFactor times. @@ -2211,8 +2213,10 @@ class TargetTransformInfo::Concept { const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) = 0; + /* Downstream change: #87 (sincos vectorization)*/ virtual InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0; + /* End downstream change: #87 */ virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, @@ -2935,11 +2939,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind); } + /* Downstream change: #87 (sincos vectorization)*/ InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) override { return Impl.getInsertExtractValueCost(Opcode, CostKind); } + /* End downstream change: #87 */ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 4996c405bdda..c0dfd9aa72d8 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -745,6 +745,7 @@ class TargetTransformInfoImplBase { return 1; } + /* Downstream change: #87 (sincos vectorization)*/ InstructionCost getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) const { @@ -755,6 +756,7 @@ class TargetTransformInfoImplBase { return CostKind == TTI::TCK_RecipThroughput ? 
-1 : TTI::TCC_Basic; return TTI::TCC_Free; } + /* End downstream change: #87 */ InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, @@ -1309,9 +1311,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { return TargetTTI->getCFInstrCost(Opcode, CostKind, I); case Instruction::Freeze: return TTI::TCC_Free; + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: case Instruction::InsertValue: return TargetTTI->getInsertExtractValueCost(Opcode, CostKind); + /* End downstream change: #87 */ case Instruction::Alloca: if (cast(U)->isStaticAlloca()) return TTI::TCC_Free; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index e959d93b5727..30465e991547 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -416,6 +416,10 @@ class LoopVectorizationLegality { /// has a vectorized variant available. bool hasVectorCallVariants() const { return VecCallVariantsFound; } + /* Downstream change: #87 (sincos vectorization)*/ + // Removed hasStructVectorCall() + /* End downstream change: #87 */ + unsigned getNumStores() const { return LAI->getNumStores(); } unsigned getNumLoads() const { return LAI->getNumLoads(); } @@ -635,6 +639,10 @@ class LoopVectorizationLegality { /// the use of those function variants. bool VecCallVariantsFound = false; + /* Downstream change: #87 (sincos vectorization)*/ + // Removed StructVecCallFound + /* End downstream change: #87 */ + /// Keep track of all the countable and uncountable exiting blocks if /// the exact backedge taken count is not computable. 
SmallVector<BasicBlock *, 4> CountableExitingBlocks; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 820a66329406..ade398ea72f9 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1113,6 +1113,7 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, return Cost; } +/* Downstream change: #87 (sincos vectorization)*/ InstructionCost TargetTransformInfo::getInsertExtractValueCost( unsigned Opcode, TTI::TargetCostKind CostKind) const { assert((Opcode == Instruction::InsertValue || @@ -1122,6 +1123,7 @@ InstructionCost TargetTransformInfo::getInsertExtractValueCost( assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } +/* End downstream change: #87 */ InstructionCost TargetTransformInfo::getReplicationShuffleCost( Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 420cbc5384ce..6f33ac923ba3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -954,6 +954,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !VFDatabase::getMappings(*CI).empty()) VecCallVariantsFound = true; + /* Downstream change: #87 (sincos vectorization)*/ auto CanWidenInstructionTy = [](Instruction const &Inst) { Type *InstTy = Inst.getType(); if (!isa<StructType>(InstTy)) @@ -965,6 +966,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && all_of(Inst.users(), IsaPred<ExtractValueInst>); }; + /* End downstream change: #87 */ // Check that the instruction return type is vectorizable. // We can't vectorize casts from vector type to scalar type. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 73288b429e69..2c9da6940a8b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2350,9 +2350,11 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State) { + /* Downstream change: #87 (sincos vectorization)*/ assert((!Instr->getType()->isAggregateType() || canVectorizeTy(Instr->getType())) && "Expected vectorizable or non-aggregate type."); + /* End downstream change: #87 */ // Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2857,11 +2859,13 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, return ScalarCallCost; } +/* Downstream change: #87 (sincos vectorization)*/ static Type *maybeVectorizeType(Type *Ty, ElementCount VF) { if (VF.isScalar() || !canVectorizeTy(Ty)) return Ty; return toVectorizedTy(Ty, VF); } +/* End downstream change: #87 */ InstructionCost LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, @@ -3607,6 +3611,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { } } + /* Downstream change: #87 (sincos vectorization)*/ if (auto *EVI = dyn_cast(&I)) { if (IsOutOfScope(EVI->getAggregateOperand())) { AddToWorklistIfAllowed(EVI); @@ -3617,6 +3622,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { assert(isa(EVI->getAggregateOperand()) && "Expected aggregate value to be call return value"); } + /* End downstream change: #87 */ // If there's no pointer operand, there's nothing to do. auto *Ptr = getLoadStorePointerOperand(&I); @@ -4496,6 +4502,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, llvm_unreachable("unhandled recipe"); } + /* Downstream change: #87 (sincos vectorization)*/ auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) { unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) @@ -4511,6 +4518,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // Two or more elements that share a register - are vectorized. return NumLegalParts < VF.getKnownMinValue(); }; + /* End downstream change: #87 */ // If no def nor is a store, e.g., branches, continue - no value to check. if (R.getNumDefinedValues() == 0 && @@ -4528,8 +4536,10 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, if (!Visited.insert({ScalarTy}).second) continue; Type *WideTy = toVectorizedTy(ScalarTy, VF); + /* Downstream change: #87 (sincos vectorization)*/ if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors)) return true; + /* End downstream change: #87 */ } } @@ -5485,6 +5495,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { + /* Downstream change: #87 (sincos vectorization)*/ Type *WideTy = toVectorizedTy(I->getType(), VF); for (Type *VectorTy : getContainedTypes(WideTy)) { ScalarCost += TTI.getScalarizationOverhead( @@ -5492,6 +5503,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( /*Insert=*/true, /*Extract=*/false, CostKind); } + /* End downstream change: #87 */ ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -5502,6 +5514,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // overhead. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) { + /* Downstream change: #87 (sincos vectorization)*/ assert(canVectorizeTy(J->getType()) && "Instruction has non-scalar type"); if (CanBeScalarized(J)) @@ -5515,6 +5528,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( /*Extract*/ true, CostKind); } } + /* End downstream change: #87 */ } // Scale the total scalar cost by block probability. 
@@ -5992,6 +6006,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, return 0; InstructionCost Cost = 0; + /* Downstream change: #87 (sincos vectorization)*/ Type *RetTy = toVectorizedTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) { @@ -6003,6 +6018,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, /*Extract=*/false, CostKind); } } + /* End downstream change: #87 */ // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6260,9 +6276,11 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. + /* Downstream change: #87 (sincos vectorization)*/ Type *RetTy = toVectorizedTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) Tys.push_back(toVectorizedTy(ScalarTy, VF)); + /* End downstream change: #87 */ // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. @@ -6452,6 +6470,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else + // Downstream change: #87 (sincos vectorization) VectorTy = toVectorizedTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && @@ -8600,6 +8619,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, } return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); } + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { SmallVector NewOps(Operands); Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); @@ -8609,6 +8629,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); } + /* End downstream change: #87 */ }; } @@ -9889,6 +9910,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison); } + // Downstream change: #87 (sincos vectorization) State.packScalarIntoVectorizedValue(this, *State.Lane); } return; @@ -10406,6 +10428,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + /* Downstream change: #87 (sincos vectorization)*/ + // Remove StructCallVectorizationUnsupported failure. + /* End downstream change: #87 */ + // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f46b13bc2815..6a3e5f4b46cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -334,10 +334,12 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { } else { // Initialize packing with insertelements to start from undef. 
assert(!VF.isScalable() && "VF is assumed to be non scalable."); + /* Downstream change: #87 (sincos vectorization)*/ Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); set(Def, Undef); for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorizedValue(Def, Lane); + /* End downstream change: #87 */ VectorValue = get(Def); } Builder.restoreIP(OldIP); @@ -390,6 +392,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { Builder.SetCurrentDebugLocation(DIL); } +/* Downstream change: #87 (sincos vectorization)*/ void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane) { Value *ScalarInst = get(Def, Lane); @@ -409,6 +412,7 @@ void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, } set(Def, WideValue); } +/* End downstream change: #87 */ BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0122188526b2..aa7a208c2eb3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -276,6 +276,7 @@ struct VPTransformState { set(Def, V, VPLane(0)); return; } + // Downstream change: #87 (sincos vectorization) assert((VF.isScalar() || isVectorizedTy(V->getType())) && "scalar values must be stored as (0, 0)"); Data.VPV2Vector[Def] = V; @@ -325,9 +326,11 @@ struct VPTransformState { /// Set the debug location in the builder using the debug location \p DL. void setDebugLocFrom(DebugLoc DL); + /* Downstream change: #87 (sincos vectorization)*/ /// Construct the vectorized value of a scalarized value \p V one lane at a /// time. void packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane); + /* End downstream change: #87 */ /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 0af52addc4b6..b2cb6276f16c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -124,12 +124,14 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { case Instruction::FNeg: case Instruction::Freeze: return inferScalarType(R->getOperand(0)); + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { assert(R->getNumOperands() == 2 && "expected single level extractvalue"); auto *StructTy = cast(inferScalarType(R->getOperand(0))); auto *CI = cast(R->getOperand(1)->getLiveInIRValue()); return StructTy->getTypeAtIndex(CI->getZExtValue()); } + /* End downstream change: #87 */ default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 89dba6f452c7..2074759e3b2d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1116,6 +1116,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } + // Downstream change: #87 (sincos vectorization) Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); SmallVector ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) @@ -1422,6 +1423,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.addMetadata(V, dyn_cast_or_null(getUnderlyingValue())); break; } + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { assert(getNumOperands() == 2 && "expected single level extractvalue"); Value *Op = State.get(getOperand(0)); @@ -1430,6 +1432,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.set(this, Extract); break; } + /* End downstream change: #87 */ case Instruction::Freeze: { Value *Op = State.get(getOperand(0)); @@ -1531,10 +1534,12 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, Ctx.CostKind); } + /* Downstream change: #87 (sincos vectorization)*/ case Instruction::ExtractValue: { return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, Ctx.CostKind); } + /* End downstream change: #87 */ case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue());
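Note for downstream consumers of this series: the net effect of patches 2-4 is that the `#87` markers remain in the final tree. The default behaviour behind the new hook, sketched here from the `TargetTransformInfoImplBase` context lines in the hunks above, is:

```
InstructionCost
getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) const {
  // insertvalue keeps the pre-existing default of getInstructionCost():
  // -1 for TCK_RecipThroughput, otherwise TCC_Basic.
  if (Opcode == Instruction::InsertValue)
    return CostKind == TTI::TCK_RecipThroughput ? -1 : TTI::TCC_Basic;
  // extractvalue is free by default.
  return TTI::TCC_Free;
}
```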