Skip to content

Commit 4436d1d

Browse files
authored
[AArch64] Give a higher cost for more expensive SVE FCMP instructions (#153816)
This adds a higher cost for SVE FCM** comparison instructions, which often have a lower throughput than their Neon equivalents because the Neon instructions can be executed on more vector pipelines. The patch takes the slightly unorthodox approach of using the information in the scheduling model to compare the throughput of an FCMEQ_PPzZZ_S (SVE) against an FCMEQv4f32 (Neon). This probably isn't how things will want to work in the long run, where all the information would come more directly from the scheduling model, but that still needs to be proven out. The downside of using the scheduling model information is that if a core does not have a scheduling model but wants a different cost, an alternative approach will be needed (although that may be a good reason to create a new scheduling model). The alternatives would be either to add a subtarget feature for the affected cores or to always enable the higher cost.
1 parent a0c2d6e commit 4436d1d

File tree

4 files changed

+603
-268
lines changed

4 files changed

+603
-268
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4409,6 +4409,32 @@ AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
44094409
return 1;
44104410
}
44114411

4412+
/// Check whether Opcode1 has less throughput according to the scheduling
4413+
/// model than Opcode2.
4414+
bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4415+
unsigned Opcode1, unsigned Opcode2) const {
4416+
const MCSchedModel &Sched = ST->getSchedModel();
4417+
const TargetInstrInfo *TII = ST->getInstrInfo();
4418+
if (!Sched.hasInstrSchedModel())
4419+
return false;
4420+
4421+
const MCSchedClassDesc *SCD1 =
4422+
Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4423+
const MCSchedClassDesc *SCD2 =
4424+
Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4425+
// We cannot handle variant scheduling classes without an MI. If we need to
4426+
// support them for any of the instructions we query the information of we
4427+
// might need to add a way to resolve them without a MI or not use the
4428+
// scheduling info.
4429+
assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4430+
"Cannot handle variant scheduling classes without an MI");
4431+
if (!SCD1->isValid() || !SCD2->isValid())
4432+
return false;
4433+
4434+
return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4435+
MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4436+
}
4437+
44124438
InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
44134439
unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
44144440
TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
@@ -4506,6 +4532,12 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
45064532
(VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
45074533
Factor = 3; // fcmxx+fcmyy+or
45084534

4535+
if (isa<ScalableVectorType>(ValTy) &&
4536+
CostKind == TTI::TCK_RecipThroughput &&
4537+
hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4538+
AArch64::FCMEQv4f32))
4539+
Factor *= 2;
4540+
45094541
return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
45104542
}
45114543

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,11 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
174174

175175
bool prefersVectorizedAddressing() const override;
176176

177+
/// Check whether Opcode1 has less throughput according to the scheduling
178+
/// model than Opcode2.
179+
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1,
180+
unsigned Opcode2) const;
181+
177182
InstructionCost
178183
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
179184
unsigned AddressSpace,

llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,10 @@ define <vscale x 32 x i1> @cmp_nxv32i1() {
5858
; Check fcmp for legal FP vectors
5959
define void @cmp_legal_fp() #0 {
6060
; CHECK-LABEL: 'cmp_legal_fp'
61-
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %1 = fcmp oge <vscale x 2 x double> undef, undef
62-
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %2 = fcmp oge <vscale x 4 x float> undef, undef
63-
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %3 = fcmp oge <vscale x 8 x half> undef, undef
64-
; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %4 = fcmp oge <vscale x 8 x bfloat> undef, undef
61+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %1 = fcmp oge <vscale x 2 x double> undef, undef
62+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %2 = fcmp oge <vscale x 4 x float> undef, undef
63+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %3 = fcmp oge <vscale x 8 x half> undef, undef
64+
; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %4 = fcmp oge <vscale x 8 x bfloat> undef, undef
6565
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
6666
;
6767
%1 = fcmp oge <vscale x 2 x double> undef, undef
@@ -74,7 +74,7 @@ define void @cmp_legal_fp() #0 {
7474
; Check fcmp for an illegal FP vector
7575
define <vscale x 16 x i1> @cmp_nxv16f16() {
7676
; CHECK-LABEL: 'cmp_nxv16f16'
77-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %res = fcmp oge <vscale x 16 x half> undef, undef
77+
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %res = fcmp oge <vscale x 16 x half> undef, undef
7878
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 16 x i1> %res
7979
;
8080
%res = fcmp oge <vscale x 16 x half> undef, undef

0 commit comments

Comments
 (0)