Skip to content

Commit 02d3738

Browse files
authored
[AArch64,TTI] Remove RealUse check for vector insert/extract costs. (#146526)
getVectorInstrCostHelper would return costs of zero for vector inserts/extracts that move data between GPR and vector registers, if there was no 'real' use, i.e. there was no corresponding existing instruction. This meant that passes like LoopVectorize and SLPVectorizer, which likely are the main users of the interface, would underestimate the cost of inserts/extracts that move data between GPR and vector registers, which have non-trivial costs. The patch removes the special case and only returns costs of zero for lane 0 if there is no need to transfer between integer and vector registers. This impacts a number of SLP tests, and most of them look like general improvements. I think the change should make things more accurate for any AArch64 target, but if not, it could also just be Apple CPU specific. I am seeing +2% end-to-end improvements on SLP-heavy workloads. PR: #146526
1 parent 9f79374 commit 02d3738

27 files changed

+891
-351
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3724,7 +3724,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
37243724

37253725
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
37263726
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3727-
bool HasRealUse, const Instruction *I, Value *Scalar,
3727+
const Instruction *I, Value *Scalar,
37283728
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
37293729
assert(Val->isVectorTy() && "This must be a vector type");
37303730

@@ -3744,12 +3744,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
37443744
}
37453745

37463746
// The element at index zero is already inside the vector.
3747-
// - For a physical (HasRealUse==true) insert-element or extract-element
3747+
// - For an insert-element or extract-element
37483748
// instruction that extracts integers, an explicit FPR -> GPR move is
37493749
// needed. So it has non-zero cost.
3750-
// - For the rest of cases (virtual instruction or element type is float),
3751-
// consider the instruction free.
3752-
if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3750+
if (Index == 0 && !Val->getScalarType()->isIntegerTy())
37533751
return 0;
37543752

37553753
// This is recognising a LD1 single-element structure to one lane of one
@@ -3899,25 +3897,28 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
38993897
unsigned Index,
39003898
const Value *Op0,
39013899
const Value *Op1) const {
3902-
bool HasRealUse =
3903-
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3904-
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
3900+
// Treat insert at lane 0 into a poison vector as having zero cost. This
3901+
// ensures vector broadcasts via an insert + shuffle (and will be lowered to a
3902+
// single dup) are treated as cheap.
3903+
if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
3904+
isa<PoisonValue>(Op0))
3905+
return 0;
3906+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
39053907
}
39063908

39073909
InstructionCost AArch64TTIImpl::getVectorInstrCost(
39083910
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
39093911
Value *Scalar,
39103912
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3911-
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
3912-
Scalar, ScalarUserAndIdx);
3913+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
3914+
ScalarUserAndIdx);
39133915
}
39143916

39153917
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
39163918
Type *Val,
39173919
TTI::TargetCostKind CostKind,
39183920
unsigned Index) const {
3919-
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
3920-
true /* HasRealUse */, &I);
3921+
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
39213922
}
39223923

39233924
InstructionCost AArch64TTIImpl::getScalarizationOverhead(

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
6565

6666
// A helper function called by 'getVectorInstrCost'.
6767
//
68-
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
69-
// indicates whether the vector instruction is available in the input IR or
70-
// just imaginary in vectorizer passes.
71-
/// \param ScalarUserAndIdx encodes the information about extracts from a
68+
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost';
69+
/// \param ScalarUserAndIdx encodes the information about extracts from a
7270
/// vector with 'Scalar' being the value being extracted,'User' being the user
7371
/// of the extract(nullptr if user is not known before vectorization) and
7472
/// 'Idx' being the extract lane.
7573
InstructionCost getVectorInstrCostHelper(
7674
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
77-
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
75+
const Instruction *I = nullptr, Value *Scalar = nullptr,
7876
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
7977

8078
public:

llvm/test/Analysis/CostModel/AArch64/reduce-and.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-or.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)

0 commit comments

Comments
 (0)