8 changes: 8 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1512,6 +1512,14 @@ class TargetTransformInfo {
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index = -1) const;
 
+  /// \return The expected cost of inserting or extracting a lane that is \p
+  /// Index elements from the end of a vector, i.e. the mathematical expression
+  /// for the lane is (VF - 1 - Index). This is required for scalable vectors
+  /// where the exact lane index is unknown at compile time.
+  LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+      unsigned Index) const;
+
   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
   /// provision the cost of vectorization/scalarization in vectorizer passes.
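To make the new hook's indexing convention concrete: Index counts backwards from the end of the vector, so Index 0 names the last lane and Index 1 the penultimate one. The following standalone sketch models that arithmetic; it is editorial illustration, not part of the patch, and laneFromEnd is a made-up name.

#include <cassert>

// Model of the from-the-end convention documented above: for a vector
// with VF elements, the lane accessed is (VF - 1 - Index). For scalable
// vectors VF is only known at run time, which is why TTI needs a
// dedicated hook instead of a plain lane number.
unsigned laneFromEnd(unsigned VF, unsigned Index) {
  assert(Index < VF && "Index must lie within the vector");
  return VF - 1 - Index;
}

int main() {
  assert(laneFromEnd(4, 0) == 3); // last lane of a <4 x T> vector
  assert(laneFromEnd(4, 1) == 2); // penultimate lane
  return 0;
}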
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -809,6 +809,13 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  virtual InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const {
+    return 1;
+  }
+
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
14 changes: 14 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1444,6 +1444,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        Op1);
   }
 
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override {
+    unsigned NewIndex = -1;
+    if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
+      assert(Index < FVTy->getNumElements() &&
+             "Unexpected index from end of vector");
+      NewIndex = FVTy->getNumElements() - 1 - Index;
+    }
+    return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex, nullptr,
+                                       nullptr);
+  }
+
   InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
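The default above resolves a concrete lane only for fixed vectors; for scalable types it forwards -1, the existing "lane unknown" sentinel of getVectorInstrCost. A condensed standalone model of that dispatch, with illustrative names and std::optional standing in for the fixed/scalable distinction:

#include <cassert>
#include <optional>

// Mirrors the fallback above: a known element count yields a concrete
// lane (N - 1 - Index); an unknown (scalable) count keeps the -1
// sentinel, exactly as NewIndex does in the real code.
unsigned resolveLane(std::optional<unsigned> FixedNumElts, unsigned Index) {
  unsigned NewIndex = -1u; // sentinel: lane unknown at compile time
  if (FixedNumElts) {
    assert(Index < *FixedNumElts && "Unexpected index from end of vector");
    NewIndex = *FixedNumElts - 1 - Index;
  }
  return NewIndex;
}

int main() {
  assert(resolveLane(8, 0) == 7);    // fixed <8 x T>: last lane is 7
  assert(resolveLane({}, 0) == -1u); // scalable: lane stays unknown
  return 0;
}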
9 changes: 9 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,6 +1130,15 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd(
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+    unsigned Index) const {
+  InstructionCost Cost =
+      TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getInsertExtractValueCost(
     unsigned Opcode, TTI::TargetCostKind CostKind) const {
   assert((Opcode == Instruction::InsertValue ||
18 changes: 18 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3986,6 +3986,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
   return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
+InstructionCost
+AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                                 TTI::TargetCostKind CostKind,
+                                                 unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
+
+  // This typically requires both while and lastb instructions in order
+  // to extract the last element. If this is in a loop the while
+  // instruction can at least be hoisted out, although it will consume a
+  // predicate register. The cost should be more expensive than the base
+  // extract cost, which is 2 for most CPUs.
+  return CostKind == TTI::TCK_CodeSize
+             ? 2
+             : ST->getVectorInsertExtractBaseCost() + 1;
+}
+
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
     TTI::TargetCostKind CostKind, bool ForPoisonSrc,
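As a worked reading of the return expression above, assuming ST->getVectorInsertExtractBaseCost() returns 2 (the value the comment cites for most CPUs): a code-size query costs 2, roughly the while/lastb pair itself, and any other cost kind yields 3. A small sketch under that assumption, with illustrative names:

#include <cassert>

// Restates the AArch64 formula above. BaseCost stands in for
// ST->getVectorInsertExtractBaseCost(); 2 is typical per the comment.
unsigned sveExtractFromEndCost(bool IsCodeSize, unsigned BaseCost) {
  // Code size: the while + lastb pair. Otherwise: one more than the
  // base extract cost, since lastb is dearer than a fixed-lane extract
  // even when the while can be hoisted out of a loop.
  return IsCodeSize ? 2 : BaseCost + 1;
}

int main() {
  assert(sveExtractFromEndCost(/*IsCodeSize=*/true, 2) == 2);
  assert(sveExtractFromEndCost(/*IsCodeSize=*/false, 2) == 3);
  return 0;
}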
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,6 +220,11 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                    TTI::TargetCostKind CostKind,
                                    unsigned Index) const override;
 
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
+
   InstructionCost
   getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                          TTI::TargetCostKind CostKind) const override;
18 changes: 18 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2415,6 +2415,24 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return BaseCost + SlideCost;
 }
 
+InstructionCost
+RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
+                                               unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
+
+  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
+  // for the cost of extracting the last lane of a scalable vector. It probably
+  // needs a more accurate cost.
+  ElementCount EC = cast<VectorType>(Val)->getElementCount();
+  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
+  return getVectorInstrCost(Opcode, Val, CostKind,
+                            EC.getKnownMinValue() - 1 - Index, nullptr,
+                            nullptr);
+}
+
 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
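The TODO above exists because the known-minimum lane is only a placeholder: at run time the extracted lane also scales with vscale, so the priced lane and the real lane diverge. A standalone sketch of that gap, with illustrative names:

#include <cassert>

// Contrast between the lane the RISC-V fallback above prices and the
// lane the generated code actually touches for a scalable vector with
// known-minimum element count MinEC.
unsigned pricedLane(unsigned MinEC, unsigned Index) {
  return MinEC - 1 - Index; // what the cost query asks for
}

unsigned runtimeLane(unsigned MinEC, unsigned VScale, unsigned Index) {
  return MinEC * VScale - 1 - Index; // what the hardware really does
}

int main() {
  // <vscale x 4 x i32> with vscale = 4 at run time, Index = 0:
  assert(pricedLane(4, 0) == 3);      // placeholder: lane 3
  assert(runtimeLane(4, 4, 0) == 15); // reality: lane 15
  return 0;
}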
5 changes: 5 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,6 +243,11 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;
 
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
14 changes: 7 additions & 7 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5297,13 +5297,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
   // the actual generated code, which involves extracting the last element of
   // a scalable vector where the lane to extract is unknown at compile time.
-  return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
-                             CostKind) +
-         (IsLoopInvariantStoreValue
-              ? 0
-              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       CostKind, VF.getKnownMinValue() - 1));
+  InstructionCost Cost =
+      TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
+      TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
+  if (!IsLoopInvariantStoreValue)
+    Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+                                                 VectorTy, CostKind, 0);
+  return Cost;
 }
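Read together with the target changes above, the rewritten call site sums three pieces: address computation, the store itself, and, when the stored value varies per iteration, a target-priced extract of the last lane. A tiny model of that sum, using 3 for the extract as in the AArch64 example earlier and assumed unit costs for the other two terms:

#include <cassert>

// Condensed model of the uniform-store cost computed above: the extract
// term is only paid when the stored value is not loop invariant.
unsigned uniformStoreCost(unsigned AddrCost, unsigned StoreCost,
                          unsigned ExtractLastCost, bool LoopInvariantValue) {
  unsigned Cost = AddrCost + StoreCost;
  if (!LoopInvariantValue)
    Cost += ExtractLastCost; // TTI.getIndexedVectorInstrCostFromEnd(..., 0)
  return Cost;
}

int main() {
  assert(uniformStoreCost(1, 1, 3, /*LoopInvariantValue=*/false) == 5);
  assert(uniformStoreCost(1, 1, 3, /*LoopInvariantValue=*/true) == 2);
  return 0;
}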
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1011,6 +1011,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::ExtractLastElement: {
+    // Add on the cost of extracting the element.
+    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+                                                    VecTy, Ctx.CostKind, 0);
+  }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
25 changes: 9 additions & 16 deletions llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -903,30 +903,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; TFNONE-NEXT: [[ENTRY:.*]]:
 ; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
-; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE: [[VECTOR_PH]]:
-; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; TFNONE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; TFNONE: [[VECTOR_BODY]]:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TFNONE-NEXT: [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
-; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
 ; TFNONE-NEXT: store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; TFNONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TFNONE: [[MIDDLE_BLOCK]]:
Expand Down