From 28e3bc67af84fecaf9b48f833ca096ff138922c1 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 18 Aug 2025 14:16:00 +0000
Subject: [PATCH 1/2] [LV][TTI] Calculate cost of extracting last index in a
 scalable vector

There are a couple of places in the loop vectoriser where we want to
calculate the cost of extracting the last lane in a vector. However, we
wrongly assume that asking for the cost of extracting lane
(VF.getKnownMinValue() - 1) is an accurate representation of the cost of
extracting the last lane. For SVE at least, this is non-trivial as it
requires the use of whilelo and lastb instructions.

To solve this problem I have added a new getVectorInstrCostFromEnd
interface, where the index counts backwards from the end of the vector:
for a vector with ElementCount EC, the extracted/inserted lane is
(EC - 1 - Index). For scalable vectors the exact lane is unknown at
compile time. I've added an AArch64 hook that better represents the
cost, and also a RISCV hook that maintains compatibility with the
behaviour prior to this PR.

I've also taken the liberty of adding support in VPlan for calculating
the cost of VPInstruction::ExtractLastElement.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  8 ++++++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  7 ++++++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 13 ++++++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     | 10 ++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 17 +++++++++++++
 .../AArch64/AArch64TargetTransformInfo.h      |  4 +++
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 17 +++++++++++++
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  4 +++
 .../Transforms/Vectorize/LoopVectorize.cpp    | 14 +++++------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 +++++
 .../LoopVectorize/AArch64/masked-call.ll      | 25 +++++++------------
 11 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9186419715cc4..19bd70a91606e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1512,6 +1512,14 @@ class TargetTransformInfo {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index = -1) const;
 
+  /// \return The expected cost of inserting or extracting a lane that is \p
+  /// Index from the end of a vector, i.e. the mathematical expression for
+  /// the lane is (VF - 1 - Index). This is required for scalable vectors where
+  /// the exact lane index is unknown at compile time.
+  LLVM_ABI InstructionCost
+  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                            TTI::TargetCostKind CostKind, unsigned Index) const;
+
   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
   /// provision the cost of vectorization/scalarization in vectorizer passes.
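[Illustrative sketch, not part of the patch: a minimal example of how a
cost-model client might query the cost of extracting the last lane through the
hook declared above. The helper name is hypothetical, and the hook is shown
under its PATCH 1/2 name, getVectorInstrCostFromEnd; PATCH 2/2 below renames it
to getIndexedVectorInstrCostFromEnd.]

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Hypothetical helper: cost of extracting the last lane of VecTy. Passing
    // Index = 0 means lane (EC - 1), which is well-defined for both fixed-length
    // and scalable vectors, unlike the old pattern of passing
    // VF.getKnownMinValue() - 1, which can under-cost the SVE whilelo + lastb
    // sequence.
    static InstructionCost
    getLastLaneExtractCost(const TargetTransformInfo &TTI, VectorType *VecTy,
                           TargetTransformInfo::TargetCostKind CostKind) {
      return TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
                                           CostKind, /*Index=*/0);
    }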
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 200cbafbaa6e2..138ee3bbed40d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -809,6 +809,13 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  virtual InstructionCost
+  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                            TTI::TargetCostKind CostKind,
+                            unsigned Index) const {
+    return 1;
+  }
+
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index aa9d1f0a1ccea..124b93804a630 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1444,6 +1444,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        Op1);
   }
 
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override {
+    unsigned NewIndex = -1;
+    if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
+      assert(Index < FVTy->getNumElements() &&
+             "Unexpected index from end of vector");
+      NewIndex = FVTy->getNumElements() - 1 - Index;
+    }
+    return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex, nullptr,
+                                       nullptr);
+  }
+
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3141060a710ce..262fe51e41739 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,6 +1130,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
+InstructionCost
+TargetTransformInfo::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
+                                               unsigned Index) const {
+  InstructionCost Cost =
+      TTIImpl->getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getInsertExtractValueCost(
     unsigned Opcode, TTI::TargetCostKind CostKind) const {
   assert((Opcode == Instruction::InsertValue ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fc332d5320181..c10025727b884 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3986,6 +3986,23 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
   return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
+InstructionCost
+AArch64TTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                          TTI::TargetCostKind CostKind,
+                                          unsigned Index) const {
+  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
+    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+
+  // This typically requires both while and lastb instructions in order
+  // to extract the last element. If this is in a loop the while
+  // instruction can at least be hoisted out, although it will consume a
+  // predicate register. The cost should be more expensive than the base
+  // extract cost, which is 2 for most CPUs.
+  return CostKind == TTI::TCK_CodeSize
+             ? 2
+             : ST->getVectorInsertExtractBaseCost() + 1;
+}
+
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
     TTI::TargetCostKind CostKind, bool ForPoisonSrc,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9c96fdd427814..36706096cf964 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,6 +220,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const override;
 
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override;
+
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
                                          TTI::TargetCostKind CostKind) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 85b3059d87da7..396bb196bc1d7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2415,6 +2415,23 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return BaseCost + SlideCost;
 }
 
+InstructionCost
+RISCVTTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                        TTI::TargetCostKind CostKind,
+                                        unsigned Index) const {
+  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
+    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+
+  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
+  // for the cost of extracting the last lane of a scalable vector. It probably
+  // needs a more accurate cost.
+  ElementCount EC = cast<VectorType>(Val)->getElementCount();
+  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
+  return getVectorInstrCost(Opcode, Val, CostKind,
+                            EC.getKnownMinValue() - 1 - Index, nullptr,
+                            nullptr);
+}
+
 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6a1f4b3e3bedf..f502904645d0e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,6 +243,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override;
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 675a230bd2c94..fde5c0ba85b82 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5297,13 +5297,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
   // the actual generated code, which involves extracting the last element of
   // a scalable vector where the lane to extract is unknown at compile time.
-  return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
-                             CostKind) +
-         (IsLoopInvariantStoreValue
-              ? 0
-              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       CostKind, VF.getKnownMinValue() - 1));
+  InstructionCost Cost =
+      TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
+      TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
+  if (!IsLoopInvariantStoreValue)
+    Cost += TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VectorTy,
+                                          CostKind, 0);
+  return Cost;
 }
 
 InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fa62547d374cd..9c7ecb30f0ac2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1011,6 +1011,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::ExtractLastElement: {
+    // Add on the cost of extracting the element.
+    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
+                                             Ctx.CostKind, 0);
+  }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 58f2af73bd04c..11bb4d234f3f3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -903,30 +903,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; TFNONE-NEXT: [[ENTRY:.*]]:
 ; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
-; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE: [[VECTOR_PH]]:
-; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; TFNONE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; TFNONE: [[VECTOR_BODY]]:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TFNONE-NEXT: [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
-; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
 ; TFNONE-NEXT: store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; TFNONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TFNONE: [[MIDDLE_BLOCK]]:

From 96232207a5cbd54c010b7ad43e3331b6a481ab7a Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 18 Aug 2025 14:16:27 +0000
Subject: [PATCH 2/2] Address review comments

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h      | 12 ++++++------
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h  |  6 +++---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h              |  7 ++++---
 llvm/lib/Analysis/TargetTransformInfo.cpp             |  9 ++++-----
 .../Target/AArch64/AArch64TargetTransformInfo.cpp     | 11 ++++++-----
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h  |  7 ++++---
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp    | 11 ++++++-----
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h      |  7 ++++---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp       |  4 ++--
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp        |  4 ++--
 10 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 19bd70a91606e..1e03209e888bf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1513,12 +1513,12 @@ class TargetTransformInfo {
                                      unsigned Index = -1) const;
 
   /// \return The expected cost of inserting or extracting a lane that is \p
-  /// Index from the end of a vector, i.e. the mathematical expression for
-  /// the lane is (VF - 1 - Index). This is required for scalable vectors where
-  /// the exact lane index is unknown at compile time.
-  LLVM_ABI InstructionCost
-  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                            TTI::TargetCostKind CostKind, unsigned Index) const;
+  /// Index elements from the end of a vector, i.e. the mathematical expression
+  /// for the lane is (VF - 1 - Index). This is required for scalable vectors
+  /// where the exact lane index is unknown at compile time.
+  LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+      unsigned Index) const;
 
   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 138ee3bbed40d..252acf381a8e1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -810,9 +810,9 @@ class TargetTransformInfoImplBase {
   }
 
   virtual InstructionCost
-  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                            TTI::TargetCostKind CostKind,
-                            unsigned Index) const {
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const {
     return 1;
   }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 124b93804a630..27320b510b950 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1444,9 +1444,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        Op1);
   }
 
-  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                            TTI::TargetCostKind CostKind,
-                                            unsigned Index) const override {
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override {
     unsigned NewIndex = -1;
     if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
       assert(Index < FVTy->getNumElements() &&
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 262fe51e41739..323ab8b1ddad1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,12 +1130,11 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
-InstructionCost
-TargetTransformInfo::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                               TTI::TargetCostKind CostKind,
-                                               unsigned Index) const {
+InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd(
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+    unsigned Index) const {
   InstructionCost Cost =
-      TTIImpl->getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+      TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c10025727b884..cd3b85dd52173 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3987,11 +3987,12 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
 }
 
 InstructionCost
-AArch64TTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                          TTI::TargetCostKind CostKind,
-                                          unsigned Index) const {
-  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
-    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                                 TTI::TargetCostKind CostKind,
+                                                 unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
 
   // This typically requires both while and lastb instructions in order
   // to extract the last element. If this is in a loop the while
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 36706096cf964..42ae962b3b426 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,9 +220,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const override;
 
-  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                            TTI::TargetCostKind CostKind,
-                                            unsigned Index) const override;
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
 
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 396bb196bc1d7..c707fb110b10c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2416,11 +2416,12 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 }
 
 InstructionCost
-RISCVTTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                        TTI::TargetCostKind CostKind,
-                                        unsigned Index) const {
-  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
-    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
+                                               unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
 
   // TODO: This code replicates what LoopVectorize.cpp used to do when asking
   // for the cost of extracting the last lane of a scalable vector. It probably
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index f502904645d0e..b632f25b963f7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,9 +243,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
-  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
-                                            TTI::TargetCostKind CostKind,
-                                            unsigned Index) const override;
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fde5c0ba85b82..4f795b4d0fb9e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5301,8 +5301,8 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
       TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
       TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
   if (!IsLoopInvariantStoreValue)
-    Cost += TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VectorTy,
-                                          CostKind, 0);
+    Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+                                                 VectorTy, CostKind, 0);
   return Cost;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9c7ecb30f0ac2..2a5d14cb6fa09 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1014,8 +1014,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   case VPInstruction::ExtractLastElement: {
     // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
-                                             Ctx.CostKind, 0);
+    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+                                                    VecTy, Ctx.CostKind, 0);
   }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))