Skip to content

Commit 9bccf61

Browse files
authored
[AArch64][LV] Set MaxInterleaving to 4 for Neoverse V2 and V3 (llvm#100385)
Set the maximum interleaving factor to 4, aligning with the number of available SIMD pipelines. This increases the number of vector instructions in the vectorised loop body, enhancing performance during its execution. However, for very low iteration counts, the vectorised body might not execute at all, leaving only the epilogue loop to run. This issue affects e.g. cam4_r from SPEC FP, which experienced a performance regression. To address this, the patch reduces the minimum epilogue vectorisation factor from 16 to 8, enabling the epilogue to be vectorised and largely mitigating the regression.
1 parent 2b5214b commit 9bccf61

File tree

13 files changed

+176
-6
lines changed

13 files changed

+176
-6
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,10 @@ class TargetTransformInfo {
630630
AssumptionCache &AC, TargetLibraryInfo *LibInfo,
631631
HardwareLoopInfo &HWLoopInfo) const;
632632

633+
// Query the target for the minimum vectorization factor at which epilogue
634+
// vectorization should be considered.
635+
unsigned getEpilogueVectorizationMinVF() const;
636+
633637
/// Query the target whether it would be preferred to create a predicated
634638
/// vector loop, which can avoid the need to emit a scalar epilogue loop.
635639
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;
@@ -1912,6 +1916,7 @@ class TargetTransformInfo::Concept {
19121916
AssumptionCache &AC,
19131917
TargetLibraryInfo *LibInfo,
19141918
HardwareLoopInfo &HWLoopInfo) = 0;
1919+
virtual unsigned getEpilogueVectorizationMinVF() = 0;
19151920
virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0;
19161921
virtual TailFoldingStyle
19171922
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
@@ -2392,6 +2397,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
23922397
HardwareLoopInfo &HWLoopInfo) override {
23932398
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
23942399
}
2400+
unsigned getEpilogueVectorizationMinVF() override {
2401+
return Impl.getEpilogueVectorizationMinVF();
2402+
}
23952403
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override {
23962404
return Impl.preferPredicateOverEpilogue(TFI);
23972405
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ class TargetTransformInfoImplBase {
199199
return false;
200200
}
201201

202+
unsigned getEpilogueVectorizationMinVF() const { return 16; }
203+
202204
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }
203205

204206
TailFoldingStyle

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
666666
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
667667
}
668668

669+
unsigned getEpilogueVectorizationMinVF() {
670+
return BaseT::getEpilogueVectorizationMinVF();
671+
}
672+
669673
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
670674
return BaseT::preferPredicateOverEpilogue(TFI);
671675
}

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
359359
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
360360
}
361361

362+
unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const {
363+
return TTIImpl->getEpilogueVectorizationMinVF();
364+
}
365+
362366
bool TargetTransformInfo::preferPredicateOverEpilogue(
363367
TailFoldingInfo *TFI) const {
364368
return TTIImpl->preferPredicateOverEpilogue(TFI);

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
255255
MaxBytesForLoopAlignment = 16;
256256
break;
257257
case NeoverseV2:
258-
// Specialize cost for Neoverse-V2.
258+
case NeoverseV3:
259+
EpilogueVectorizationMinVF = 8;
260+
MaxInterleaveFactor = 4;
259261
ScatterOverhead = 13;
260262
LLVM_FALLTHROUGH;
261263
case NeoverseN2:
262264
case NeoverseN3:
263-
case NeoverseV3:
264265
PrefFunctionAlignment = Align(16);
265266
PrefLoopAlignment = Align(32);
266267
MaxBytesForLoopAlignment = 16;

llvm/lib/Target/AArch64/AArch64Subtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
5656
bool ATTRIBUTE = DEFAULT;
5757
#include "AArch64GenSubtargetInfo.inc"
5858

59+
unsigned EpilogueVectorizationMinVF = 16;
5960
uint8_t MaxInterleaveFactor = 2;
6061
uint8_t VectorInsertExtractBaseCost = 2;
6162
uint16_t CacheLineSize = 0;
@@ -237,6 +238,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
237238
hasFuseAdrpAdd() || hasFuseLiterals();
238239
}
239240

241+
unsigned getEpilogueVectorizationMinVF() const {
242+
return EpilogueVectorizationMinVF;
243+
}
240244
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
241245
unsigned getVectorInsertExtractBaseCost() const;
242246
unsigned getCacheLineSize() const override { return CacheLineSize; }

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4736,6 +4736,10 @@ static bool containsDecreasingPointers(Loop *TheLoop,
47364736
return false;
47374737
}
47384738

4739+
unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
4740+
return ST->getEpilogueVectorizationMinVF();
4741+
}
4742+
47394743
bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
47404744
if (!ST->hasSVE())
47414745
return false;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
391391
return ST->useFixedOverScalableIfEqualCost();
392392
}
393393

394+
unsigned getEpilogueVectorizationMinVF() const;
395+
394396
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
395397

396398
bool supportsScalableVectors() const {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ static cl::opt<unsigned> EpilogueVectorizationForceVF(
186186
"loops."));
187187

188188
static cl::opt<unsigned> EpilogueVectorizationMinVF(
189-
"epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189+
"epilogue-vectorization-minimum-VF", cl::Hidden,
190190
cl::desc("Only loops with vectorization factor equal to or larger than "
191191
"the specified value are considered for epilogue vectorization."));
192192

@@ -4701,8 +4701,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
47014701
// See related "TODO: extend to support scalable VFs." in
47024702
// selectEpilogueVectorizationFactor.
47034703
unsigned Multiplier = VF.isFixed() ? IC : 1;
4704-
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
4705-
EpilogueVectorizationMinVF;
4704+
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4705+
? EpilogueVectorizationMinVF
4706+
: TTI.getEpilogueVectorizationMinVF();
4707+
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
47064708
}
47074709

47084710
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(

llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
66
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
77
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
8+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
9+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
810

911
; Tests for selecting interleave counts for loops with loads and stores.
1012

@@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
213215
; INTERLEAVE-2: exit:
214216
; INTERLEAVE-2-NEXT: ret void
215217
;
218+
; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
219+
; INTERLEAVE-4-VLA: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
220+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
221+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
222+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
223+
;
216224
entry:
217225
br label %loop
218226

0 commit comments

Comments
 (0)