Skip to content

Commit b73771c

Browse files
authored
[AArch64] Increase scatter overhead on Neoverse-V2 (#101296)
This patch increases the scatter overhead on Neoverse-V2 to 13. This benefits the s128 kernel from the TSVC_2 test suite. SPEC 17, RAJAPerf, and Spatter are unaffected by this patch. This patch boosts the performance of the s128 kernel from the TSVC_2 test suite by about 40%, as it enables vectorization. Also, perform minor code refactoring for the gather-related part.
1 parent b8f560f commit b73771c

File tree

4 files changed

+40
-6
lines changed

4 files changed

+40
-6
lines changed

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,12 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
233233
PrefLoopAlignment = Align(32);
234234
MaxBytesForLoopAlignment = 16;
235235
break;
236+
case NeoverseV2:
237+
// Specialize cost for Neoverse-V2.
238+
ScatterOverhead = 13;
239+
LLVM_FALLTHROUGH;
236240
case NeoverseN2:
237241
case NeoverseN3:
238-
case NeoverseV2:
239242
case NeoverseV3:
240243
PrefFunctionAlignment = Align(16);
241244
PrefLoopAlignment = Align(32);

llvm/lib/Target/AArch64/AArch64Subtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
5959
uint8_t MaxInterleaveFactor = 2;
6060
uint8_t VectorInsertExtractBaseCost = 2;
6161
uint16_t CacheLineSize = 0;
62+
// Default scatter/gather overhead.
63+
unsigned ScatterOverhead = 10;
64+
unsigned GatherOverhead = 10;
6265
uint16_t PrefetchDistance = 0;
6366
uint16_t MinPrefetchStride = 1;
6467
unsigned MaxPrefetchIterationsAhead = UINT_MAX;
@@ -225,6 +228,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
225228
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
226229
unsigned getVectorInsertExtractBaseCost() const;
227230
unsigned getCacheLineSize() const override { return CacheLineSize; }
231+
unsigned getScatterOverhead() const { return ScatterOverhead; }
232+
unsigned getGatherOverhead() const { return GatherOverhead; }
228233
unsigned getPrefetchDistance() const override { return PrefetchDistance; }
229234
unsigned getMinPrefetchStride(unsigned NumMemAccesses,
230235
unsigned NumStridedMemAccesses,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3411,8 +3411,26 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
34113411
return LT.first;
34123412
}
34133413

3414-
static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3415-
return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3414+
// This function returns gather/scatter overhead either from
3415+
// user-provided value or specialized values per-target from \p ST.
3416+
static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3417+
const AArch64Subtarget *ST) {
3418+
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3419+
"Should be called on only load or stores.");
3420+
switch (Opcode) {
3421+
case Instruction::Load:
3422+
if (SVEGatherOverhead.getNumOccurrences() > 0)
3423+
return SVEGatherOverhead;
3424+
return ST->getGatherOverhead();
3425+
break;
3426+
case Instruction::Store:
3427+
if (SVEScatterOverhead.getNumOccurrences() > 0)
3428+
return SVEScatterOverhead;
3429+
return ST->getScatterOverhead();
3430+
break;
3431+
default:
3432+
llvm_unreachable("Shouldn't have reached here");
3433+
}
34163434
}
34173435

34183436
InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
@@ -3444,9 +3462,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
34443462
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
34453463
{TTI::OK_AnyValue, TTI::OP_None}, I);
34463464
// Add on an overhead cost for using gathers/scatters.
3447-
// TODO: At the moment this is applied unilaterally for all CPUs, but at some
3448-
// point we may want a per-CPU overhead.
3449-
MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3465+
MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
34503466
return LT.first * MemOpCost * getMaxNumElements(LegalVF);
34513467
}
34523468

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-V2
2+
; RUN: opt -mtriple aarch64 -mattr=+sve2 -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-GENERIC
3+
; CHECK-V2: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
4+
; CHECK-GENERIC: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4f32
5+
6+
define void @masked_scatter_nxv8f32_i64(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, <vscale x 4 x i64> %V) #0 {
7+
call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
8+
ret void
9+
}
10+

0 commit comments

Comments
 (0)