-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[MachineScheduler][AArch64] Skip Neoverse V2 Pre-RA MISched for large vector intrinsic codes #139557
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MachineScheduler][AArch64] Skip Neoverse V2 Pre-RA MISched for large vector intrinsic codes #139557
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
| #include "llvm/ADT/Statistic.h" | ||
| #include "llvm/ADT/iterator_range.h" | ||
| #include "llvm/Analysis/AliasAnalysis.h" | ||
| #include "llvm/Analysis/TargetTransformInfo.h" | ||
| #include "llvm/CodeGen/LiveInterval.h" | ||
| #include "llvm/CodeGen/LiveIntervals.h" | ||
| #include "llvm/CodeGen/MachineBasicBlock.h" | ||
|
|
@@ -49,6 +50,7 @@ | |
| #include "llvm/CodeGen/TargetSubtargetInfo.h" | ||
| #include "llvm/CodeGenTypes/MachineValueType.h" | ||
| #include "llvm/Config/llvm-config.h" | ||
| #include "llvm/IR/IntrinsicInst.h" | ||
| #include "llvm/InitializePasses.h" | ||
| #include "llvm/MC/LaneBitmask.h" | ||
| #include "llvm/Pass.h" | ||
|
|
@@ -110,6 +112,21 @@ cl::opt<bool> VerifyScheduling( | |
| "verify-misched", cl::Hidden, | ||
| cl::desc("Verify machine instrs before and after machine scheduling")); | ||
|
|
||
| // Heuristics for skipping pre-RA machine scheduling for large functions, | ||
| // containing (handwritten) intrinsic vector-code. | ||
| cl::opt<unsigned> LargeFunctionThreshold( | ||
| "misched-large-func-threshold", cl::Hidden, cl::init(2800), | ||
| cl::desc("The minimum number of IR instructions in a large (hand-written) " | ||
| "intrinsic vector code function")); | ||
| cl::opt<unsigned> NbOfIntrinsicsThreshold( | ||
| "misched-intrinsics-threshold", cl::Hidden, cl::init(425), | ||
| cl::desc("The minimum number of intrinsic instructions in a large " | ||
| "(hand-written) intrinsic vector code function")); | ||
| cl::opt<unsigned> VectorCodeDensityPercentageThreshold( | ||
| "misched-vector-density-threshold", cl::Hidden, cl::init(70), | ||
| cl::desc("Minimum percentage of vector instructions compared to scalar in " | ||
| "a large (hand-written) intrinsic vector code function")); | ||
|
|
||
| #ifndef NDEBUG | ||
| cl::opt<bool> ViewMISchedDAGs( | ||
| "view-misched-dags", cl::Hidden, | ||
|
|
@@ -319,6 +336,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) | |
| INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) | ||
| INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) | ||
| INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) | ||
| INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) | ||
| INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE, | ||
| "Machine Instruction Scheduler", false, false) | ||
|
|
||
|
|
@@ -336,6 +354,7 @@ void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { | |
| AU.addPreserved<SlotIndexesWrapperPass>(); | ||
| AU.addRequired<LiveIntervalsWrapperPass>(); | ||
| AU.addPreserved<LiveIntervalsWrapperPass>(); | ||
| AU.addRequired<TargetTransformInfoWrapperPass>(); | ||
| MachineFunctionPass::getAnalysisUsage(AU); | ||
| } | ||
|
|
||
|
|
@@ -557,6 +576,47 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) { | |
| return false; | ||
| } | ||
|
|
||
| // Try to recognise large hand-written intrinsic vector code, and skip the | ||
| // machine scheduler for this function if the target and TTI hook are okay | ||
| // with this. | ||
| const TargetSubtargetInfo &STI = MF.getSubtarget(); | ||
| const MCSchedModel &SchedModel = STI.getSchedModel(); | ||
| auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(MF.getFunction()); | ||
|
|
||
| if (TTI.skipPreRASchedLargeVecFunc()) { | ||
| uint64_t InstructionCount = 0; | ||
| uint64_t IntrinsicCount = 0; | ||
| uint64_t VectorTypeCount = 0; | ||
| for (auto &BB : MF.getFunction()) { | ||
| for (Instruction &I : BB) { | ||
| InstructionCount++; | ||
| if (isa<IntrinsicInst>(I)) | ||
| IntrinsicCount++; | ||
| Type *T = I.getType(); | ||
| if (T && T->isVectorTy()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't be null |
||
| VectorTypeCount++; | ||
| } | ||
| } | ||
|
Comment on lines
+586
to
+599
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Really should not be writing an IR-based heuristic in a machine pass. This is baking in a lot of assumptions about the architecture and how the IR will be lowered. You have better information from the current machine instructions.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see what you mean, but I intentionally iterated over the IR to extract high-level information that is not available in MIR: the vector intrinsics have already been lowered (e.g. to FMAs) in MIR and are no longer recognisable as intrinsics.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That makes more sense to me; this IR processing is extremely vague as written. |
||
|
|
||
| unsigned VecDensity = (VectorTypeCount / (double) InstructionCount) * 100; | ||
|
|
||
| LLVM_DEBUG(dbgs() << "Instruction count: " << InstructionCount << ", "; | ||
| dbgs() << "threshold: " << LargeFunctionThreshold << "\n"; | ||
| dbgs() << "Intrinsic count: " << IntrinsicCount << ", "; | ||
| dbgs() << "threshold: " << NbOfIntrinsicsThreshold << "\n"; | ||
| dbgs() << "Vector density: " << VecDensity << ", "; | ||
| dbgs() << "threshold: " << VectorCodeDensityPercentageThreshold | ||
| << "\n";); | ||
|
|
||
| if (InstructionCount > LargeFunctionThreshold && | ||
| IntrinsicCount > NbOfIntrinsicsThreshold && | ||
| VecDensity > VectorCodeDensityPercentageThreshold) { | ||
| LLVM_DEBUG( | ||
| dbgs() << "Skipping MISched for very vector and intrinsic heavy code"); | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs())); | ||
|
|
||
| auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,94 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
| ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v1 | FileCheck %s --check-prefix=SCHED | ||
| ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 | FileCheck %s --check-prefix=SCHED | ||
| ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=NOSCHED | ||
| ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=31 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED | ||
| ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=3 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED | ||
| ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=32 | FileCheck %s --check-prefix=SCHED | ||
|
|
||
| define void @test_fma_loop(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_out, i32 %n) { | ||
| ; SCHED-LABEL: test_fma_loop: | ||
| ; SCHED: // %bb.0: // %entry | ||
| ; SCHED-NEXT: cbz w4, .LBB0_2 | ||
| ; SCHED-NEXT: .p2align 5, , 16 | ||
| ; SCHED-NEXT: .LBB0_1: // %loop | ||
| ; SCHED-NEXT: // =>This Inner Loop Header: Depth=1 | ||
| ; SCHED-NEXT: ldr q0, [x0], #16 | ||
| ; SCHED-NEXT: ldp q1, q2, [x1] | ||
| ; SCHED-NEXT: subs w4, w4, #1 | ||
| ; SCHED-NEXT: ldp q3, q4, [x2] | ||
| ; SCHED-NEXT: fmla v3.4s, v1.4s, v0.4s | ||
| ; SCHED-NEXT: ldr q0, [x1, #32] | ||
| ; SCHED-NEXT: ldr q1, [x2, #32] | ||
| ; SCHED-NEXT: add x1, x1, #48 | ||
| ; SCHED-NEXT: add x2, x2, #48 | ||
| ; SCHED-NEXT: fmla v4.4s, v2.4s, v3.4s | ||
| ; SCHED-NEXT: fmla v1.4s, v0.4s, v4.4s | ||
| ; SCHED-NEXT: str q1, [x3], #16 | ||
| ; SCHED-NEXT: b.ne .LBB0_1 | ||
| ; SCHED-NEXT: .LBB0_2: // %exit | ||
| ; SCHED-NEXT: ret | ||
| ; | ||
| ; NOSCHED-LABEL: test_fma_loop: | ||
| ; NOSCHED: // %bb.0: // %entry | ||
| ; NOSCHED-NEXT: cbz w4, .LBB0_2 | ||
| ; NOSCHED-NEXT: .p2align 5, , 16 | ||
| ; NOSCHED-NEXT: .LBB0_1: // %loop | ||
| ; NOSCHED-NEXT: // =>This Inner Loop Header: Depth=1 | ||
| ; NOSCHED-NEXT: ldr q0, [x0], #16 | ||
| ; NOSCHED-NEXT: ldr q1, [x1] | ||
| ; NOSCHED-NEXT: ldr q2, [x2] | ||
| ; NOSCHED-NEXT: subs w4, w4, #1 | ||
| ; NOSCHED-NEXT: fmla v2.4s, v1.4s, v0.4s | ||
| ; NOSCHED-NEXT: ldp q0, q3, [x1, #16] | ||
| ; NOSCHED-NEXT: ldp q1, q4, [x2, #16] | ||
| ; NOSCHED-NEXT: add x1, x1, #48 | ||
| ; NOSCHED-NEXT: add x2, x2, #48 | ||
| ; NOSCHED-NEXT: fmla v1.4s, v0.4s, v2.4s | ||
| ; NOSCHED-NEXT: fmla v4.4s, v3.4s, v1.4s | ||
| ; NOSCHED-NEXT: str q4, [x3], #16 | ||
| ; NOSCHED-NEXT: b.ne .LBB0_1 | ||
| ; NOSCHED-NEXT: .LBB0_2: // %exit | ||
| ; NOSCHED-NEXT: ret | ||
| entry: | ||
| %cmp = icmp eq i32 %n, 0 | ||
| br i1 %cmp, label %exit, label %loop | ||
|
|
||
| loop: | ||
| %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] | ||
| %ptr_a.addr = phi ptr [ %ptr_a, %entry ], [ %ptr_a.next, %loop ] | ||
| %ptr_b.addr = phi ptr [ %ptr_b, %entry ], [ %ptr_b.next, %loop ] | ||
| %ptr_c.addr = phi ptr [ %ptr_c, %entry ], [ %ptr_c.next, %loop ] | ||
| %ptr_out.addr = phi ptr [ %ptr_out, %entry ], [ %ptr_out.next, %loop ] | ||
|
|
||
| %a = load <4 x float>, ptr %ptr_a.addr | ||
| %b1 = load <4 x float>, ptr %ptr_b.addr | ||
| %c1 = load <4 x float>, ptr %ptr_c.addr | ||
| %res1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b1, <4 x float> %c1) | ||
|
|
||
| %ptr_b2 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 1 | ||
| %ptr_c2 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 1 | ||
| %b2 = load <4 x float>, ptr %ptr_b2 | ||
| %c2 = load <4 x float>, ptr %ptr_c2 | ||
| %ptr_b3 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 2 | ||
| %ptr_c3 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 2 | ||
| %b3 = load <4 x float>, ptr %ptr_b3 | ||
| %c3 = load <4 x float>, ptr %ptr_c3 | ||
|
|
||
| %res2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res1, <4 x float> %b2, <4 x float> %c2) | ||
| %res3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res2, <4 x float> %b3, <4 x float> %c3) | ||
|
|
||
| store <4 x float> %res3, ptr %ptr_out.addr | ||
|
|
||
| %ptr_a.next = getelementptr <4 x float>, ptr %ptr_a.addr, i64 1 | ||
| %ptr_b.next = getelementptr <4 x float>, ptr %ptr_b.addr, i64 3 | ||
| %ptr_c.next = getelementptr <4 x float>, ptr %ptr_c.addr, i64 3 | ||
| %ptr_out.next = getelementptr <4 x float>, ptr %ptr_out.addr, i64 1 | ||
|
|
||
| %iv.next = sub i32 %iv, 1 | ||
| %cmp.next = icmp ne i32 %iv.next, 0 | ||
| br i1 %cmp.next, label %loop, label %exit | ||
|
|
||
| exit: | ||
| ret void | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to skip debug intrinsics too