From 28e3bc67af84fecaf9b48f833ca096ff138922c1 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 18 Aug 2025 14:16:00 +0000
Subject: [PATCH 1/2] [LV][TTI] Calculate cost of extracting last index in a
 scalable vector

There are a couple of places in the loop vectoriser where we want to
calculate the cost of extracting the last lane in a vector. However, we
wrongly assume that asking for the cost of extracting lane
(VF.getKnownMinValue() - 1) is an accurate representation of the cost of
extracting the last lane. For SVE at least, this is non-trivial as it
requires the use of whilelo and lastb instructions.

To solve this problem I have added a new getVectorInstrCostFromEnd
interface, where the index counts backwards from the end of the vector:
for a vector with ElementCount EC, the extracted/inserted lane is
(EC - 1 - Index). For scalable vectors the exact lane is unknown at
compile time. I've added an AArch64 hook that better represents the
cost, and also a RISCV hook that maintains compatibility with the
behaviour prior to this PR.

I've also taken the liberty of adding support in VPlan for calculating
the cost of VPInstruction::ExtractLastElement.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  8 ++++++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  7 ++++++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 13 ++++++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     | 10 ++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 17 +++++++++++++
 .../AArch64/AArch64TargetTransformInfo.h      |  4 +++
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 17 +++++++++++++
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  4 +++
 .../Transforms/Vectorize/LoopVectorize.cpp    | 14 +++++------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 +++++
 .../LoopVectorize/AArch64/masked-call.ll      | 25 +++++++------------
 11 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9186419715cc4..19bd70a91606e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1512,6 +1512,14 @@ class TargetTransformInfo {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index = -1) const;
 
+  /// \return The expected cost of inserting or extracting a lane that is \p
+  /// Index from the end of a vector, i.e. the mathematical expression for
+  /// the lane is (VF - 1 - Index). This is required for scalable vectors where
+  /// the exact lane index is unknown at compile time.
+  LLVM_ABI InstructionCost
+  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                            TTI::TargetCostKind CostKind, unsigned Index) const;
+
   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
   /// provision the cost of vectorization/scalarization in vectorizer passes.
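[Illustrative sketch, not part of the patch: a minimal example of how a
cost-model client might query the cost of extracting the last lane through the
hook declared above. The helper name is hypothetical, and the hook is shown
under its PATCH 1/2 name, getVectorInstrCostFromEnd; PATCH 2/2 below renames it
to getIndexedVectorInstrCostFromEnd.]

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Hypothetical helper: cost of extracting the last lane of VecTy. Passing
    // Index = 0 means lane (EC - 1), which is well-defined for both fixed-length
    // and scalable vectors, unlike the old pattern of passing
    // VF.getKnownMinValue() - 1, which can under-cost the SVE whilelo + lastb
    // sequence.
    static InstructionCost
    getLastLaneExtractCost(const TargetTransformInfo &TTI, VectorType *VecTy,
                           TargetTransformInfo::TargetCostKind CostKind) {
      return TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
                                           CostKind, /*Index=*/0);
    }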
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 200cbafbaa6e2..138ee3bbed40d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -809,6 +809,13 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  virtual InstructionCost
+  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                            TTI::TargetCostKind CostKind,
+                            unsigned Index) const {
+    return 1;
+  }
+
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index aa9d1f0a1ccea..124b93804a630 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1444,6 +1444,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        Op1);
   }
 
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override {
+    unsigned NewIndex = -1;
+    if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
+      assert(Index < FVTy->getNumElements() &&
+             "Unexpected index from end of vector");
+      NewIndex = FVTy->getNumElements() - 1 - Index;
+    }
+    return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex, nullptr,
+                                       nullptr);
+  }
+
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3141060a710ce..262fe51e41739 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,6 +1130,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
+InstructionCost
+TargetTransformInfo::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
+                                               unsigned Index) const {
+  InstructionCost Cost =
+      TTIImpl->getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getInsertExtractValueCost(
     unsigned Opcode, TTI::TargetCostKind CostKind) const {
   assert((Opcode == Instruction::InsertValue ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fc332d5320181..c10025727b884 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3986,6 +3986,23 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
   return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
+InstructionCost
+AArch64TTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                          TTI::TargetCostKind CostKind,
+                                          unsigned Index) const {
+  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
+    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+
+  // This typically requires both while and lastb instructions in order
+  // to extract the last element. If this is in a loop the while
+  // instruction can at least be hoisted out, although it will consume a
+  // predicate register. The cost should be more expensive than the base
+  // extract cost, which is 2 for most CPUs.
+  return CostKind == TTI::TCK_CodeSize
+             ? 2
+             : ST->getVectorInsertExtractBaseCost() + 1;
+}
+
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
     TTI::TargetCostKind CostKind, bool ForPoisonSrc,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9c96fdd427814..36706096cf964 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,6 +220,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const override;
 
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override;
+
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
                                          TTI::TargetCostKind CostKind) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 85b3059d87da7..396bb196bc1d7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2415,6 +2415,23 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return BaseCost + SlideCost;
 }
 
+InstructionCost
+RISCVTTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                        TTI::TargetCostKind CostKind,
+                                        unsigned Index) const {
+  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
+    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+
+  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
+  // for the cost of extracting the last lane of a scalable vector. It probably
+  // needs a more accurate cost.
+  ElementCount EC = cast<VectorType>(Val)->getElementCount();
+  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
+  return getVectorInstrCost(Opcode, Val, CostKind,
+                            EC.getKnownMinValue() - 1 - Index, nullptr,
+                            nullptr);
+}
+
 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6a1f4b3e3bedf..f502904645d0e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,6 +243,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override;
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 675a230bd2c94..fde5c0ba85b82 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5297,13 +5297,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
   // the actual generated code, which involves extracting the last element of
   // a scalable vector where the lane to extract is unknown at compile time.
-  return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
-                             CostKind) +
-         (IsLoopInvariantStoreValue
-              ? 0
-              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       CostKind, VF.getKnownMinValue() - 1));
+  InstructionCost Cost =
+      TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
+      TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
+  if (!IsLoopInvariantStoreValue)
+    Cost += TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VectorTy,
+                                          CostKind, 0);
+  return Cost;
 }
 
 InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fa62547d374cd..9c7ecb30f0ac2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1011,6 +1011,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::ExtractLastElement: {
+    // Add on the cost of extracting the element.
+    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
+                                             Ctx.CostKind, 0);
+  }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 58f2af73bd04c..11bb4d234f3f3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -903,30 +903,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; TFNONE-NEXT: [[ENTRY:.*]]:
 ; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
-; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE: [[VECTOR_PH]]:
-; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; TFNONE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; TFNONE: [[VECTOR_BODY]]:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TFNONE-NEXT: [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
-; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
 ; TFNONE-NEXT: store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; TFNONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TFNONE: [[MIDDLE_BLOCK]]:

From 96232207a5cbd54c010b7ad43e3331b6a481ab7a Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 18 Aug 2025 14:16:27 +0000
Subject: [PATCH 2/2] Address review comments

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h      | 12 ++++++------
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h  |  6 +++---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h              |  7 ++++---
 llvm/lib/Analysis/TargetTransformInfo.cpp             |  9 ++++-----
 .../Target/AArch64/AArch64TargetTransformInfo.cpp     | 11 ++++++-----
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h  |  7 ++++---
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp    | 11 ++++++-----
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h      |  7 ++++---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp       |  4 ++--
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp        |  4 ++--
 10 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 19bd70a91606e..1e03209e888bf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1513,12 +1513,12 @@ class TargetTransformInfo {
                                      unsigned Index = -1) const;
 
   /// \return The expected cost of inserting or extracting a lane that is \p
-  /// Index from the end of a vector, i.e. the mathematical expression for
-  /// the lane is (VF - 1 - Index). This is required for scalable vectors where
-  /// the exact lane index is unknown at compile time.
-  LLVM_ABI InstructionCost
-  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                            TTI::TargetCostKind CostKind, unsigned Index) const;
+  /// Index elements from the end of a vector, i.e. the mathematical expression
+  /// for the lane is (VF - 1 - Index). This is required for scalable vectors
+  /// where the exact lane index is unknown at compile time.
+  LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+      unsigned Index) const;
 
   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 138ee3bbed40d..252acf381a8e1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -810,9 +810,9 @@ class TargetTransformInfoImplBase {
   }
 
   virtual InstructionCost
-  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                            TTI::TargetCostKind CostKind,
-                            unsigned Index) const {
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const {
     return 1;
   }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 124b93804a630..27320b510b950 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1444,9 +1444,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        Op1);
   }
 
-  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                            TTI::TargetCostKind CostKind,
-                                            unsigned Index) const override {
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override {
     unsigned NewIndex = -1;
     if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
       assert(Index < FVTy->getNumElements() &&
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 262fe51e41739..323ab8b1ddad1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,12 +1130,11 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
-InstructionCost
-TargetTransformInfo::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                               TTI::TargetCostKind CostKind,
-                                               unsigned Index) const {
+InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd(
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+    unsigned Index) const {
   InstructionCost Cost =
-      TTIImpl->getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+      TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c10025727b884..cd3b85dd52173 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3987,11 +3987,12 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
 }
 
 InstructionCost
-AArch64TTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                          TTI::TargetCostKind CostKind,
-                                          unsigned Index) const {
-  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
-    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                                 TTI::TargetCostKind CostKind,
+                                                 unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
 
   // This typically requires both while and lastb instructions in order
   // to extract the last element. If this is in a loop the while
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 36706096cf964..42ae962b3b426 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,9 +220,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const override;
 
-  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                            TTI::TargetCostKind CostKind,
-                                            unsigned Index) const override;
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
 
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 396bb196bc1d7..c707fb110b10c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2416,11 +2416,12 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 }
 
 InstructionCost
-RISCVTTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                        TTI::TargetCostKind CostKind,
-                                        unsigned Index) const {
-  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
-    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
+                                               unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
 
   // TODO: This code replicates what LoopVectorize.cpp used to do when asking
   // for the cost of extracting the last lane of a scalable vector. It probably
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index f502904645d0e..b632f25b963f7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,9 +243,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
-  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                            TTI::TargetCostKind CostKind,
-                                            unsigned Index) const override;
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fde5c0ba85b82..4f795b4d0fb9e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5301,8 +5301,8 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
       TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
       TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
   if (!IsLoopInvariantStoreValue)
-    Cost += TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VectorTy,
-                                          CostKind, 0);
+    Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+                                                 VectorTy, CostKind, 0);
   return Cost;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9c7ecb30f0ac2..2a5d14cb6fa09 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1014,8 +1014,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   case VPInstruction::ExtractLastElement: {
     // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
-                                             Ctx.CostKind, 0);
+    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+                                                    VecTy, Ctx.CostKind, 0);
   }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))