Skip to content

Commit 6323d9c

Browse files
committed
[MachineScheduler][AArch64] Skip Neoverse V2 Pre-RA MISched for large vector intrinsic codes
Skip the Pre-RA MachineScheduler for large hand-written vector intrinsic codes when targeting the Neoverse V2. The motivation to skip the scheduler is the same as this abandoned patch: #127784. But this reimplementation is much more focused and fine-grained and based on the following heuristic: - only skip the pre-ra machine scheduler for large (hand-written) vector intrinsic code, - do this only for the Neoverse V2 (a wide micro-architecture). The intuition of this patch is that: - scheduling based on instruction latency isn't useful for a very wide micro-architecture (which is why GCC also partly stopped doing this), - however, the machine scheduler also performs some optimisations: i) load/store clustering, and ii) copy elimination. These are useful optimisations, and that's why disabling the machine scheduler in general isn't a good idea, i.e. this results in some regressions. - but the function where the machine scheduler and register allocator are not working well together is a large, hand-written vector code. Thus, one could argue that scheduling this kind of code is against the programmer's intent, so let's not do that, which avoids complications later down in the optimisation pipeline. The heuristic is trying to recognise large hand-written intrinsic code by calculating a percentage of vector code and other instructions in a function and skips the machine scheduler if certain threshold values are exceeded. I.e., if a function is more than 70% vector code, contains more than 2800 IR instructions and more than 425 intrinsics, don't schedule this function. This obviously is a heuristic, but is hopefully narrow enough to not cause regressions (I haven't found any). The alternative is to look into regalloc, which is where the problems occur with the placement of spill/reload code. However, there will be heuristics involved there too, and so this seems like a valid heuristic and looking into regalloc is an orthogonal exercise.
1 parent 001cc34 commit 6323d9c

File tree

9 files changed

+181
-0
lines changed

9 files changed

+181
-0
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,6 +1019,10 @@ class TargetTransformInfo {
10191019
/// Enable matching of interleaved access groups.
10201020
bool enableInterleavedAccessVectorization() const;
10211021

1022+
/// Disable the machine scheduler for a large function with a lot of
1023+
/// (hand-written) vector code and intrinsics.
1024+
bool skipPreRASchedLargeVecFunc() const;
1025+
10221026
/// Enable matching of interleaved access groups that contain predicated
10231027
/// accesses or gaps and therefore vectorized using masked
10241028
/// vector loads/stores.

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,8 @@ class TargetTransformInfoImplBase {
499499

500500
virtual bool enableInterleavedAccessVectorization() const { return false; }
501501

502+
virtual bool skipPreRASchedLargeVecFunc() const { return false; }
503+
502504
virtual bool enableMaskedInterleavedAccessVectorization() const {
503505
return false;
504506
}

llvm/include/llvm/CodeGen/TargetSubtargetInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,10 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
184184
return false;
185185
}
186186

187+
virtual bool enableSkipPreRASchedLargeVecFunc() const {
188+
return false;
189+
}
190+
187191
/// True if the subtarget should run MachineScheduler after aggressive
188192
/// coalescing.
189193
///

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
677677
return TTIImpl->enableInterleavedAccessVectorization();
678678
}
679679

680+
bool TargetTransformInfo::skipPreRASchedLargeVecFunc() const {
681+
return TTIImpl->skipPreRASchedLargeVecFunc();
682+
}
683+
680684
bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
681685
return TTIImpl->enableMaskedInterleavedAccessVectorization();
682686
}

llvm/lib/CodeGen/MachineScheduler.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/ADT/Statistic.h"
2222
#include "llvm/ADT/iterator_range.h"
2323
#include "llvm/Analysis/AliasAnalysis.h"
24+
#include "llvm/Analysis/TargetTransformInfo.h"
2425
#include "llvm/CodeGen/LiveInterval.h"
2526
#include "llvm/CodeGen/LiveIntervals.h"
2627
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -49,6 +50,7 @@
4950
#include "llvm/CodeGen/TargetSubtargetInfo.h"
5051
#include "llvm/CodeGenTypes/MachineValueType.h"
5152
#include "llvm/Config/llvm-config.h"
53+
#include "llvm/IR/IntrinsicInst.h"
5254
#include "llvm/InitializePasses.h"
5355
#include "llvm/MC/LaneBitmask.h"
5456
#include "llvm/Pass.h"
@@ -110,6 +112,21 @@ cl::opt<bool> VerifyScheduling(
110112
"verify-misched", cl::Hidden,
111113
cl::desc("Verify machine instrs before and after machine scheduling"));
112114

115+
// Heuristics for skipping pre-RA machine scheduling for large functions,
116+
// containing (handwritten) intrinsic vector-code.
117+
cl::opt<unsigned> LargeFunctionThreshold(
118+
"misched-large-func-threshold", cl::Hidden, cl::init(2800),
119+
cl::desc("The minimum number of IR instructions in a large (hand-written) "
120+
"intrinsic vector code function"));
121+
cl::opt<unsigned> NbOfIntrinsicsThreshold(
122+
"misched-intrinsics-threshold", cl::Hidden, cl::init(425),
123+
cl::desc("The minimum number of intrinsic instructions in a large "
124+
"(hand-written) intrinsic vector code function"));
125+
cl::opt<unsigned> VectorCodeDensityPercentageThreshold(
126+
"misched-vector-density-threshold", cl::Hidden, cl::init(70),
127+
cl::desc("Minimum percentage of vector instructions compared to scalar in "
128+
"a large (hand-written) intrinsic vector code function"));
129+
113130
#ifndef NDEBUG
114131
cl::opt<bool> ViewMISchedDAGs(
115132
"view-misched-dags", cl::Hidden,
@@ -319,6 +336,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
319336
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
320337
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
321338
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
339+
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
322340
INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE,
323341
"Machine Instruction Scheduler", false, false)
324342

@@ -336,6 +354,7 @@ void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
336354
AU.addPreserved<SlotIndexesWrapperPass>();
337355
AU.addRequired<LiveIntervalsWrapperPass>();
338356
AU.addPreserved<LiveIntervalsWrapperPass>();
357+
AU.addRequired<TargetTransformInfoWrapperPass>();
339358
MachineFunctionPass::getAnalysisUsage(AU);
340359
}
341360

@@ -557,6 +576,47 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
557576
return false;
558577
}
559578

579+
// Try to recognise large hand-written intrinsic vector code, and skip the
580+
// machine scheduler for this function if the target and TTI hook are okay
581+
// with this.
582+
const TargetSubtargetInfo &STI = MF.getSubtarget();
583+
const MCSchedModel &SchedModel = STI.getSchedModel();
584+
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(MF.getFunction());
585+
586+
if (TTI.skipPreRASchedLargeVecFunc()) {
587+
uint64_t InstructionCount = 0;
588+
uint64_t IntrinsicCount = 0;
589+
uint64_t VectorTypeCount = 0;
590+
for (auto &BB : MF.getFunction()) {
591+
for (Instruction &I : BB) {
592+
InstructionCount++;
593+
if (isa<IntrinsicInst>(I))
594+
IntrinsicCount++;
595+
Type *T = I.getType();
596+
if (T && T->isVectorTy())
597+
VectorTypeCount++;
598+
}
599+
}
600+
601+
unsigned VecDensity = (VectorTypeCount / (double) InstructionCount) * 100;
602+
603+
LLVM_DEBUG(dbgs() << "Instruction count: " << InstructionCount << ", ";
604+
dbgs() << "threshold: " << LargeFunctionThreshold << "\n";
605+
dbgs() << "Intrinsic count: " << IntrinsicCount << ", ";
606+
dbgs() << "threshold: " << NbOfIntrinsicsThreshold << "\n";
607+
dbgs() << "Vector density: " << VecDensity << ", ";
608+
dbgs() << "threshold: " << VectorCodeDensityPercentageThreshold
609+
<< "\n";);
610+
611+
if (InstructionCount > LargeFunctionThreshold &&
612+
IntrinsicCount > NbOfIntrinsicsThreshold &&
613+
VecDensity > VectorCodeDensityPercentageThreshold) {
614+
LLVM_DEBUG(
615+
dbgs() << "Skipping MISched for very vector and intrinsic heavy code");
616+
return false;
617+
}
618+
}
619+
560620
LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs()));
561621

562622
auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
268268
MaxBytesForLoopAlignment = 16;
269269
break;
270270
case NeoverseV2:
271+
SkipPreRASchedLargeVecFunc = true;
272+
LLVM_FALLTHROUGH;
271273
case NeoverseV3:
272274
EpilogueVectorizationMinVF = 8;
273275
MaxInterleaveFactor = 4;

llvm/lib/Target/AArch64/AArch64Subtarget.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
7171
unsigned MaxBytesForLoopAlignment = 0;
7272
unsigned MinimumJumpTableEntries = 4;
7373
unsigned MaxJumpTableSize = 0;
74+
bool SkipPreRASchedLargeVecFunc = false;
7475

7576
// ReserveXRegister[i] - X#i is not available as a general purpose register.
7677
BitVector ReserveXRegister;
@@ -160,6 +161,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
160161
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
161162
bool enableSubRegLiveness() const override { return EnableSubregLiveness; }
162163

164+
/// Returns true if the subtarget should consider skipping the pre-RA
165+
/// machine scheduler for large (hand-written) intrinsic vector functions.
166+
bool enableSkipPreRASchedLargeVecFunc() const override {
167+
return SkipPreRASchedLargeVecFunc;
168+
}
169+
163170
bool enableMachinePipeliner() const override;
164171
bool useDFAforSMS() const override { return false; }
165172

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
118118

119119
bool enableInterleavedAccessVectorization() const override { return true; }
120120

121+
bool skipPreRASchedLargeVecFunc() const override {
122+
return ST->enableSkipPreRASchedLargeVecFunc();
123+
}
124+
121125
bool enableMaskedInterleavedAccessVectorization() const override {
122126
return ST->hasSVE();
123127
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v1 | FileCheck %s --check-prefix=SCHED
3+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 | FileCheck %s --check-prefix=SCHED
4+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=NOSCHED
5+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=31 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED
6+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=3 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED
7+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=32 | FileCheck %s --check-prefix=SCHED
8+
9+
define void @test_fma_loop(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_out, i32 %n) {
10+
; SCHED-LABEL: test_fma_loop:
11+
; SCHED: // %bb.0: // %entry
12+
; SCHED-NEXT: cbz w4, .LBB0_2
13+
; SCHED-NEXT: .p2align 5, , 16
14+
; SCHED-NEXT: .LBB0_1: // %loop
15+
; SCHED-NEXT: // =>This Inner Loop Header: Depth=1
16+
; SCHED-NEXT: ldr q0, [x0], #16
17+
; SCHED-NEXT: ldp q1, q2, [x1]
18+
; SCHED-NEXT: subs w4, w4, #1
19+
; SCHED-NEXT: ldp q3, q4, [x2]
20+
; SCHED-NEXT: fmla v3.4s, v1.4s, v0.4s
21+
; SCHED-NEXT: ldr q0, [x1, #32]
22+
; SCHED-NEXT: ldr q1, [x2, #32]
23+
; SCHED-NEXT: add x1, x1, #48
24+
; SCHED-NEXT: add x2, x2, #48
25+
; SCHED-NEXT: fmla v4.4s, v2.4s, v3.4s
26+
; SCHED-NEXT: fmla v1.4s, v0.4s, v4.4s
27+
; SCHED-NEXT: str q1, [x3], #16
28+
; SCHED-NEXT: b.ne .LBB0_1
29+
; SCHED-NEXT: .LBB0_2: // %exit
30+
; SCHED-NEXT: ret
31+
;
32+
; NOSCHED-LABEL: test_fma_loop:
33+
; NOSCHED: // %bb.0: // %entry
34+
; NOSCHED-NEXT: cbz w4, .LBB0_2
35+
; NOSCHED-NEXT: .p2align 5, , 16
36+
; NOSCHED-NEXT: .LBB0_1: // %loop
37+
; NOSCHED-NEXT: // =>This Inner Loop Header: Depth=1
38+
; NOSCHED-NEXT: ldr q0, [x0], #16
39+
; NOSCHED-NEXT: ldr q1, [x1]
40+
; NOSCHED-NEXT: ldr q2, [x2]
41+
; NOSCHED-NEXT: subs w4, w4, #1
42+
; NOSCHED-NEXT: fmla v2.4s, v1.4s, v0.4s
43+
; NOSCHED-NEXT: ldp q0, q3, [x1, #16]
44+
; NOSCHED-NEXT: ldp q1, q4, [x2, #16]
45+
; NOSCHED-NEXT: add x1, x1, #48
46+
; NOSCHED-NEXT: add x2, x2, #48
47+
; NOSCHED-NEXT: fmla v1.4s, v0.4s, v2.4s
48+
; NOSCHED-NEXT: fmla v4.4s, v3.4s, v1.4s
49+
; NOSCHED-NEXT: str q4, [x3], #16
50+
; NOSCHED-NEXT: b.ne .LBB0_1
51+
; NOSCHED-NEXT: .LBB0_2: // %exit
52+
; NOSCHED-NEXT: ret
53+
entry:
54+
%cmp = icmp eq i32 %n, 0
55+
br i1 %cmp, label %exit, label %loop
56+
57+
loop:
58+
%iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
59+
%ptr_a.addr = phi ptr [ %ptr_a, %entry ], [ %ptr_a.next, %loop ]
60+
%ptr_b.addr = phi ptr [ %ptr_b, %entry ], [ %ptr_b.next, %loop ]
61+
%ptr_c.addr = phi ptr [ %ptr_c, %entry ], [ %ptr_c.next, %loop ]
62+
%ptr_out.addr = phi ptr [ %ptr_out, %entry ], [ %ptr_out.next, %loop ]
63+
64+
%a = load <4 x float>, ptr %ptr_a.addr
65+
%b1 = load <4 x float>, ptr %ptr_b.addr
66+
%c1 = load <4 x float>, ptr %ptr_c.addr
67+
%res1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b1, <4 x float> %c1)
68+
69+
%ptr_b2 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 1
70+
%ptr_c2 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 1
71+
%b2 = load <4 x float>, ptr %ptr_b2
72+
%c2 = load <4 x float>, ptr %ptr_c2
73+
%ptr_b3 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 2
74+
%ptr_c3 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 2
75+
%b3 = load <4 x float>, ptr %ptr_b3
76+
%c3 = load <4 x float>, ptr %ptr_c3
77+
78+
%res2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res1, <4 x float> %b2, <4 x float> %c2)
79+
%res3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res2, <4 x float> %b3, <4 x float> %c3)
80+
81+
store <4 x float> %res3, ptr %ptr_out.addr
82+
83+
%ptr_a.next = getelementptr <4 x float>, ptr %ptr_a.addr, i64 1
84+
%ptr_b.next = getelementptr <4 x float>, ptr %ptr_b.addr, i64 3
85+
%ptr_c.next = getelementptr <4 x float>, ptr %ptr_c.addr, i64 3
86+
%ptr_out.next = getelementptr <4 x float>, ptr %ptr_out.addr, i64 1
87+
88+
%iv.next = sub i32 %iv, 1
89+
%cmp.next = icmp ne i32 %iv.next, 0
90+
br i1 %cmp.next, label %loop, label %exit
91+
92+
exit:
93+
ret void
94+
}

0 commit comments

Comments
 (0)