diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f4f66447d1c3d..42a1025e10024 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1019,6 +1019,10 @@ class TargetTransformInfo { /// Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; + /// Disable the machine scheduler for a large function with a lot of + /// (hand-written) vector code and intrinsics. + bool skipPreRASchedLargeVecFunc() const; + /// Enable matching of interleaved access groups that contain predicated /// accesses or gaps and therefore vectorized using masked /// vector loads/stores. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 02d6435e61b4d..8d8f02338a3b0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -499,6 +499,8 @@ class TargetTransformInfoImplBase { virtual bool enableInterleavedAccessVectorization() const { return false; } + virtual bool skipPreRASchedLargeVecFunc() const { return false; } + virtual bool enableMaskedInterleavedAccessVectorization() const { return false; } diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index 1230349956973..ab901f969f948 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -184,6 +184,10 @@ class TargetSubtargetInfo : public MCSubtargetInfo { return false; } + virtual bool enableSkipPreRASchedLargeVecFunc() const { + return false; + } + /// True if the subtarget should run MachineScheduler after aggressive /// coalescing. 
/// diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3ced70e113bf7..1422cfcdcb762 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -677,6 +677,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } +bool TargetTransformInfo::skipPreRASchedLargeVecFunc() const { + return TTIImpl->skipPreRASchedLargeVecFunc(); +} + bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const { return TTIImpl->enableMaskedInterleavedAccessVectorization(); } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 0c3ffb1bbaa6f..83dc71c880cb8 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -49,6 +50,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGenTypes/MachineValueType.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Pass.h" @@ -110,6 +112,21 @@ cl::opt VerifyScheduling( "verify-misched", cl::Hidden, cl::desc("Verify machine instrs before and after machine scheduling")); +// Heuristics for skipping pre-RA machine scheduling for large functions, +// containing (handwritten) intrinsic vector-code. 
+cl::opt LargeFunctionThreshold( + "misched-large-func-threshold", cl::Hidden, cl::init(2800), + cl::desc("The minimum number of IR instructions in a large (hand-written) " + "intrinsic vector code function")); +cl::opt NbOfIntrinsicsThreshold( + "misched-intrinsics-threshold", cl::Hidden, cl::init(425), + cl::desc("The minimum number of intrinsic instructions in a large " + "(hand-written) intrinsic vector code function")); +cl::opt VectorCodeDensityPercentageThreshold( + "misched-vector-density-threshold", cl::Hidden, cl::init(70), + cl::desc("Minimum percentage of vector instructions compared to scalar in " + "a large (hand-written) intrinsic vector code function")); + #ifndef NDEBUG cl::opt ViewMISchedDAGs( "view-misched-dags", cl::Hidden, @@ -319,6 +336,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) @@ -336,6 +354,7 @@ void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -557,6 +576,47 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) { return false; } + // Try to recognise large hand-written intrinsic vector code, and skip the + // machine scheduler for this function if the target and TTI hook are okay + // with this. 
+ const TargetSubtargetInfo &STI = MF.getSubtarget(); + const MCSchedModel &SchedModel = STI.getSchedModel(); + auto &TTI = getAnalysis().getTTI(MF.getFunction()); + + if (TTI.skipPreRASchedLargeVecFunc()) { + uint64_t InstructionCount = 0; + uint64_t IntrinsicCount = 0; + uint64_t VectorTypeCount = 0; + for (auto &BB : MF.getFunction()) { + for (Instruction &I : BB) { + InstructionCount++; + if (isa(I)) + IntrinsicCount++; + Type *T = I.getType(); + if (T && T->isVectorTy()) + VectorTypeCount++; + } + } + + unsigned VecDensity = (VectorTypeCount / (double) InstructionCount) * 100; + + LLVM_DEBUG(dbgs() << "Instruction count: " << InstructionCount << ", "; + dbgs() << "threshold: " << LargeFunctionThreshold << "\n"; + dbgs() << "Intrinsic count: " << IntrinsicCount << ", "; + dbgs() << "threshold: " << NbOfIntrinsicsThreshold << "\n"; + dbgs() << "Vector density: " << VecDensity << ", "; + dbgs() << "threshold: " << VectorCodeDensityPercentageThreshold + << "\n";); + + if (InstructionCount > LargeFunctionThreshold && + IntrinsicCount > NbOfIntrinsicsThreshold && + VecDensity > VectorCodeDensityPercentageThreshold) { + LLVM_DEBUG( + dbgs() << "Skipping MISched for very vector and intrinsic heavy code"); + return false; + } + } + LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs())); auto &MLI = getAnalysis().getLI(); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 7b4ded6322098..7b1e26ba1fad2 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -268,6 +268,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { MaxBytesForLoopAlignment = 16; break; case NeoverseV2: + SkipPreRASchedLargeVecFunc = true; + LLVM_FALLTHROUGH; case NeoverseV3: EpilogueVectorizationMinVF = 8; MaxInterleaveFactor = 4; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index f5ffc72cae537..5e1801e821e1b 
100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -71,6 +71,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { unsigned MaxBytesForLoopAlignment = 0; unsigned MinimumJumpTableEntries = 4; unsigned MaxJumpTableSize = 0; + bool SkipPreRASchedLargeVecFunc = false; // ReserveXRegister[i] - X#i is not available as a general purpose register. BitVector ReserveXRegister; @@ -160,6 +161,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool enablePostRAScheduler() const override { return usePostRAScheduler(); } bool enableSubRegLiveness() const override { return EnableSubregLiveness; } + /// Returns true if the subtarget should consider skipping the pre-RA + /// machine scheduler for large (hand-written) intrinsic vector functions. + bool enableSkipPreRASchedLargeVecFunc() const override { + return SkipPreRASchedLargeVecFunc; + } + bool enableMachinePipeliner() const override; bool useDFAforSMS() const override { return false; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index be6bca2225eac..8d26ec2b6149f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -118,6 +118,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { bool enableInterleavedAccessVectorization() const override { return true; } + bool skipPreRASchedLargeVecFunc() const override { return ST->enableSkipPreRASchedLargeVecFunc(); } + bool enableMaskedInterleavedAccessVectorization() const override { return ST->hasSVE(); } diff --git a/llvm/test/CodeGen/AArch64/skip-misched-large-vec-func.ll b/llvm/test/CodeGen/AArch64/skip-misched-large-vec-func.ll new file mode 100644 index 0000000000000..93e9051ade118 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/skip-misched-large-vec-func.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v1 | FileCheck %s --check-prefix=SCHED +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 | FileCheck %s --check-prefix=SCHED +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=NOSCHED +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=31 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=3 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=32 | FileCheck %s --check-prefix=SCHED + +define void @test_fma_loop(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_out, i32 %n) { +; SCHED-LABEL: test_fma_loop: +; SCHED: // %bb.0: // %entry +; SCHED-NEXT: cbz w4, .LBB0_2 +; SCHED-NEXT: .p2align 5, , 16 +; SCHED-NEXT: .LBB0_1: // %loop +; SCHED-NEXT: // =>This Inner Loop Header: Depth=1 +; SCHED-NEXT: ldr q0, [x0], #16 +; SCHED-NEXT: ldp q1, q2, [x1] +; SCHED-NEXT: subs w4, w4, #1 +; SCHED-NEXT: ldp q3, q4, [x2] +; SCHED-NEXT: fmla v3.4s, v1.4s, v0.4s +; SCHED-NEXT: ldr q0, [x1, #32] +; SCHED-NEXT: ldr q1, [x2, #32] +; SCHED-NEXT: add x1, x1, #48 +; SCHED-NEXT: add x2, x2, #48 +; SCHED-NEXT: fmla v4.4s, v2.4s, v3.4s +; SCHED-NEXT: fmla v1.4s, v0.4s, v4.4s +; SCHED-NEXT: str q1, [x3], #16 +; SCHED-NEXT: b.ne .LBB0_1 +; SCHED-NEXT: .LBB0_2: // %exit +; SCHED-NEXT: ret +; +; NOSCHED-LABEL: test_fma_loop: +; NOSCHED: // %bb.0: // %entry +; NOSCHED-NEXT: cbz w4, .LBB0_2 +; NOSCHED-NEXT: .p2align 5, , 16 +; NOSCHED-NEXT: 
.LBB0_1: // %loop +; NOSCHED-NEXT: // =>This Inner Loop Header: Depth=1 +; NOSCHED-NEXT: ldr q0, [x0], #16 +; NOSCHED-NEXT: ldr q1, [x1] +; NOSCHED-NEXT: ldr q2, [x2] +; NOSCHED-NEXT: subs w4, w4, #1 +; NOSCHED-NEXT: fmla v2.4s, v1.4s, v0.4s +; NOSCHED-NEXT: ldp q0, q3, [x1, #16] +; NOSCHED-NEXT: ldp q1, q4, [x2, #16] +; NOSCHED-NEXT: add x1, x1, #48 +; NOSCHED-NEXT: add x2, x2, #48 +; NOSCHED-NEXT: fmla v1.4s, v0.4s, v2.4s +; NOSCHED-NEXT: fmla v4.4s, v3.4s, v1.4s +; NOSCHED-NEXT: str q4, [x3], #16 +; NOSCHED-NEXT: b.ne .LBB0_1 +; NOSCHED-NEXT: .LBB0_2: // %exit +; NOSCHED-NEXT: ret +entry: + %cmp = icmp eq i32 %n, 0 + br i1 %cmp, label %exit, label %loop + +loop: + %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] + %ptr_a.addr = phi ptr [ %ptr_a, %entry ], [ %ptr_a.next, %loop ] + %ptr_b.addr = phi ptr [ %ptr_b, %entry ], [ %ptr_b.next, %loop ] + %ptr_c.addr = phi ptr [ %ptr_c, %entry ], [ %ptr_c.next, %loop ] + %ptr_out.addr = phi ptr [ %ptr_out, %entry ], [ %ptr_out.next, %loop ] + + %a = load <4 x float>, ptr %ptr_a.addr + %b1 = load <4 x float>, ptr %ptr_b.addr + %c1 = load <4 x float>, ptr %ptr_c.addr + %res1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b1, <4 x float> %c1) + + %ptr_b2 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 1 + %ptr_c2 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 1 + %b2 = load <4 x float>, ptr %ptr_b2 + %c2 = load <4 x float>, ptr %ptr_c2 + %ptr_b3 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 2 + %ptr_c3 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 2 + %b3 = load <4 x float>, ptr %ptr_b3 + %c3 = load <4 x float>, ptr %ptr_c3 + + %res2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res1, <4 x float> %b2, <4 x float> %c2) + %res3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res2, <4 x float> %b3, <4 x float> %c3) + + store <4 x float> %res3, ptr %ptr_out.addr + + %ptr_a.next = getelementptr <4 x float>, ptr %ptr_a.addr, i64 1 + %ptr_b.next = getelementptr <4 x float>, ptr %ptr_b.addr, i64 3 + 
%ptr_c.next = getelementptr <4 x float>, ptr %ptr_c.addr, i64 3 + %ptr_out.next = getelementptr <4 x float>, ptr %ptr_out.addr, i64 1 + + %iv.next = sub i32 %iv, 1 + %cmp.next = icmp ne i32 %iv.next, 0 + br i1 %cmp.next, label %loop, label %exit + +exit: + ret void +}