[MachineScheduler][AArch64] Skip Neoverse V2 Pre-RA MISched for large vector intrinsic codes

sjoerdmeijer · sjoerdmeijer · commit 6323d9c73766 · 2025-05-12T07:06:14.000-07:00
Skip the Pre-RA MachineScheduler for large hand-written vector intrinsic codes when targeting the Neoverse V2. The motivation to skip the scheduler is the same as this abandoned patch: #127784. But this reimplementation is much more focused and fine-grained and based on the following heuristic: - only skip the pre-ra machine scheduler for large (hand-written) vector intrinsic code, - do this only for the Neoverse V2 (a wide micro-architecture). The intuition of this patch is that: - scheduling based on instruction latency isn't useful for a very wide micro-architecture (which is why GCC also partly stopped doing this), - however, the machine scheduler also performs some optimisations: i) load/store clustering, and ii) copy elimination. These are useful optimisations, and that's why disabling the machine scheduler in general isn't a good idea, i.e. this results in some regressions. - but the function where the machine scheduler and register allocator are not working well together is a large, hand-written vector code. Thus, one could argue that scheduling this kind of code is against the programmer's intent, so let's not do that, which avoids complications later down in the optimisation pipeline. The heuristic is trying to recognise large hand-written intrinsic code by calculating a percentage of vector code and other instructions in a function and skips the machine scheduler if certain threshold values are exceeded. I.e., if a function is more than 70% vector code, contains more than 2800 IR instructions and more than 425 intrinsics, don't schedule this function. This obviously is a heuristic, but is hopefully narrow enough to not cause regressions (I haven't found any). The alternative is to look into regalloc, which is where the problems occur with the placement of spill/reload code. However, there will be heuristics involved there too, and so this seems like a valid heuristic and looking into regalloc is an orthogonal exercise.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1019,6 +1019,10 @@ class TargetTransformInfo {
   /// Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
+  /// Return true if the pre-RA machine scheduler may be skipped for a large
+  /// function with a lot of (hand-written) vector code and intrinsics.
+  bool skipPreRASchedLargeVecFunc() const;
+
   /// Enable matching of interleaved access groups that contain predicated
   /// accesses or gaps and therefore vectorized using masked
   /// vector loads/stores.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -499,6 +499,8 @@ class TargetTransformInfoImplBase {
 
   virtual bool enableInterleavedAccessVectorization() const { return false; }
 
+  virtual bool skipPreRASchedLargeVecFunc() const { return false; }
+
   virtual bool enableMaskedInterleavedAccessVectorization() const {
     return false;
   }
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -184,6 +184,10 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
     return false;
   }
 
+  /// True if the subtarget wants the pre-RA machine scheduler to be skipped
+  /// for large (hand-written) vector functions. Returns false by default.
+  virtual bool enableSkipPreRASchedLargeVecFunc() const { return false; }
+
   /// True if the subtarget should run MachineScheduler after aggressive
   /// coalescing.
   ///
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -677,6 +677,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::skipPreRASchedLargeVecFunc() const {
+  return TTIImpl->skipPreRASchedLargeVecFunc();
+}
+
 bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
   return TTIImpl->enableMaskedInterleavedAccessVectorization();
 }
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -49,6 +50,7 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/CodeGenTypes/MachineValueType.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Pass.h"
@@ -110,6 +112,21 @@ cl::opt<bool> VerifyScheduling(
     "verify-misched", cl::Hidden,
     cl::desc("Verify machine instrs before and after machine scheduling"));
 
+// Heuristic thresholds for skipping pre-RA machine scheduling of large
+// (hand-written) intrinsic vector functions; each must be strictly exceeded.
+static cl::opt<unsigned> LargeFunctionThreshold(
+    "misched-large-func-threshold", cl::Hidden, cl::init(2800),
+    cl::desc("The minimum number of IR instructions in a large (hand-written) "
+             "intrinsic vector code function"));
+static cl::opt<unsigned> NbOfIntrinsicsThreshold(
+    "misched-intrinsics-threshold", cl::Hidden, cl::init(425),
+    cl::desc("The minimum number of intrinsic instructions in a large "
+             "(hand-written) intrinsic vector code function"));
+static cl::opt<unsigned> VectorCodeDensityPercentageThreshold(
+    "misched-vector-density-threshold", cl::Hidden, cl::init(70),
+    cl::desc("Minimum percentage of vector instructions compared to scalar in "
+             "a large (hand-written) intrinsic vector code function"));
+
 #ifndef NDEBUG
 cl::opt<bool> ViewMISchedDAGs(
     "view-misched-dags", cl::Hidden,
@@ -319,6 +336,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE,
                     "Machine Instruction Scheduler", false, false)
 
@@ -336,6 +354,7 @@ void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<SlotIndexesWrapperPass>();
   AU.addRequired<LiveIntervalsWrapperPass>();
   AU.addPreserved<LiveIntervalsWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -557,6 +576,47 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
     return false;
   }
 
+  // Try to recognise large hand-written intrinsic vector code in the IR behind
+  // MF, and skip the machine scheduler for this function if the target's TTI
+  // hook opts in.
+  const Function &F = MF.getFunction();
+  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  if (TTI.skipPreRASchedLargeVecFunc()) {
+    uint64_t InstructionCount = 0;
+    uint64_t IntrinsicCount = 0;
+    uint64_t VectorTypeCount = 0;
+    for (const BasicBlock &BB : F) {
+      for (const Instruction &I : BB) {
+        ++InstructionCount;
+        if (isa<IntrinsicInst>(I))
+          ++IntrinsicCount;
+        // Only the instruction's own type is inspected, not its operands.
+        if (I.getType()->isVectorTy())
+          ++VectorTypeCount;
+      }
+    }
+
+    // Integer percentage, rounded down; no division when the count is zero.
+    const uint64_t VecDensity =
+        InstructionCount ? VectorTypeCount * 100 / InstructionCount : 0;
+    LLVM_DEBUG(dbgs() << "Instruction count: " << InstructionCount << ", ";
+               dbgs() << "threshold: " << LargeFunctionThreshold << "\n";
+               dbgs() << "Intrinsic count: " << IntrinsicCount << ", ";
+               dbgs() << "threshold: " << NbOfIntrinsicsThreshold << "\n";
+               dbgs() << "Vector density: " << VecDensity << ", ";
+               dbgs() << "threshold: " << VectorCodeDensityPercentageThreshold
+                      << "\n";);
+
+    if (InstructionCount > LargeFunctionThreshold &&
+        IntrinsicCount > NbOfIntrinsicsThreshold &&
+        VecDensity > VectorCodeDensityPercentageThreshold) {
+      LLVM_DEBUG(dbgs() << "Skipping MISched for very vector and intrinsic "
+                           "heavy code\n");
+      return false;
+    }
+  }
+
   LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs()));
 
   auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -268,6 +268,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     MaxBytesForLoopAlignment = 16;
     break;
   case NeoverseV2:
+    SkipPreRASchedLargeVecFunc = true;
+    [[fallthrough]]; // Also apply the NeoverseV3 settings below.
   case NeoverseV3:
     EpilogueVectorizationMinVF = 8;
     MaxInterleaveFactor = 4;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -71,6 +71,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   unsigned MaxBytesForLoopAlignment = 0;
   unsigned MinimumJumpTableEntries = 4;
   unsigned MaxJumpTableSize = 0;
+  bool SkipPreRASchedLargeVecFunc = false;
 
   // ReserveXRegister[i] - X#i is not available as a general purpose register.
   BitVector ReserveXRegister;
@@ -160,6 +161,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
   bool enableSubRegLiveness() const override { return EnableSubregLiveness; }
 
+  /// Returns true if the subtarget should consider skipping the pre-RA
+  /// machine scheduler for large (hand-written) intrinsic vector functions.
+  bool enableSkipPreRASchedLargeVecFunc() const override {
+    return SkipPreRASchedLargeVecFunc;
+  }
+
   bool enableMachinePipeliner() const override;
   bool useDFAforSMS() const override { return false; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -118,6 +118,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
   bool enableInterleavedAccessVectorization() const override { return true; }
 
+  bool skipPreRASchedLargeVecFunc() const override {
+    return ST->enableSkipPreRASchedLargeVecFunc();
+  }
+
   bool enableMaskedInterleavedAccessVectorization() const override {
     return ST->hasSVE();
   }
diff --git a/llvm/test/CodeGen/AArch64/skip-misched-large-vec-func.ll b/llvm/test/CodeGen/AArch64/skip-misched-large-vec-func.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v1 | FileCheck %s --check-prefix=SCHED
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 | FileCheck %s --check-prefix=SCHED
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=NOSCHED
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=31 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=3 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=32 | FileCheck %s --check-prefix=SCHED
+
+define void @test_fma_loop(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_out, i32 %n) {
+; SCHED-LABEL: test_fma_loop:
+; SCHED:       // %bb.0: // %entry
+; SCHED-NEXT:    cbz w4, .LBB0_2
+; SCHED-NEXT:    .p2align 5, , 16
+; SCHED-NEXT:  .LBB0_1: // %loop
+; SCHED-NEXT:    // =>This Inner Loop Header: Depth=1
+; SCHED-NEXT:    ldr q0, [x0], #16
+; SCHED-NEXT:    ldp q1, q2, [x1]
+; SCHED-NEXT:    subs w4, w4, #1
+; SCHED-NEXT:    ldp q3, q4, [x2]
+; SCHED-NEXT:    fmla v3.4s, v1.4s, v0.4s
+; SCHED-NEXT:    ldr q0, [x1, #32]
+; SCHED-NEXT:    ldr q1, [x2, #32]
+; SCHED-NEXT:    add x1, x1, #48
+; SCHED-NEXT:    add x2, x2, #48
+; SCHED-NEXT:    fmla v4.4s, v2.4s, v3.4s
+; SCHED-NEXT:    fmla v1.4s, v0.4s, v4.4s
+; SCHED-NEXT:    str q1, [x3], #16
+; SCHED-NEXT:    b.ne .LBB0_1
+; SCHED-NEXT:  .LBB0_2: // %exit
+; SCHED-NEXT:    ret
+;
+; NOSCHED-LABEL: test_fma_loop:
+; NOSCHED:       // %bb.0: // %entry
+; NOSCHED-NEXT:    cbz w4, .LBB0_2
+; NOSCHED-NEXT:    .p2align 5, , 16
+; NOSCHED-NEXT:  .LBB0_1: // %loop
+; NOSCHED-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOSCHED-NEXT:    ldr q0, [x0], #16
+; NOSCHED-NEXT:    ldr q1, [x1]
+; NOSCHED-NEXT:    ldr q2, [x2]
+; NOSCHED-NEXT:    subs w4, w4, #1
+; NOSCHED-NEXT:    fmla v2.4s, v1.4s, v0.4s
+; NOSCHED-NEXT:    ldp q0, q3, [x1, #16]
+; NOSCHED-NEXT:    ldp q1, q4, [x2, #16]
+; NOSCHED-NEXT:    add x1, x1, #48
+; NOSCHED-NEXT:    add x2, x2, #48
+; NOSCHED-NEXT:    fmla v1.4s, v0.4s, v2.4s
+; NOSCHED-NEXT:    fmla v4.4s, v3.4s, v1.4s
+; NOSCHED-NEXT:    str q4, [x3], #16
+; NOSCHED-NEXT:    b.ne .LBB0_1
+; NOSCHED-NEXT:  .LBB0_2: // %exit
+; NOSCHED-NEXT:    ret
+entry:
+  %cmp = icmp eq i32 %n, 0
+  br i1 %cmp, label %exit, label %loop
+
+loop:
+  %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
+  %ptr_a.addr = phi ptr [ %ptr_a, %entry ], [ %ptr_a.next, %loop ]
+  %ptr_b.addr = phi ptr [ %ptr_b, %entry ], [ %ptr_b.next, %loop ]
+  %ptr_c.addr = phi ptr [ %ptr_c, %entry ], [ %ptr_c.next, %loop ]
+  %ptr_out.addr = phi ptr [ %ptr_out, %entry ], [ %ptr_out.next, %loop ]
+
+  %a = load <4 x float>, ptr %ptr_a.addr
+  %b1 = load <4 x float>, ptr %ptr_b.addr
+  %c1 = load <4 x float>, ptr %ptr_c.addr
+  %res1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b1, <4 x float> %c1)
+
+  %ptr_b2 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 1
+  %ptr_c2 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 1
+  %b2 = load <4 x float>, ptr %ptr_b2
+  %c2 = load <4 x float>, ptr %ptr_c2
+  %ptr_b3 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 2
+  %ptr_c3 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 2
+  %b3 = load <4 x float>, ptr %ptr_b3
+  %c3 = load <4 x float>, ptr %ptr_c3
+
+  %res2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res1, <4 x float> %b2, <4 x float> %c2)
+  %res3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res2, <4 x float> %b3, <4 x float> %c3)
+
+  store <4 x float> %res3, ptr %ptr_out.addr
+
+  %ptr_a.next = getelementptr <4 x float>, ptr %ptr_a.addr, i64 1
+  %ptr_b.next = getelementptr <4 x float>, ptr %ptr_b.addr, i64 3
+  %ptr_c.next = getelementptr <4 x float>, ptr %ptr_c.addr, i64 3
+  %ptr_out.next = getelementptr <4 x float>, ptr %ptr_out.addr, i64 1
+
+  %iv.next = sub i32 %iv, 1
+  %cmp.next = icmp ne i32 %iv.next, 0
+  br i1 %cmp.next, label %loop, label %exit
+
+exit:
+  ret void
+}

Original file line number	Diff line number	Diff line change
`@@ -499,6 +499,8 @@ class TargetTransformInfoImplBase {`
`499`	`499`
`500`	`500`	`virtual bool enableInterleavedAccessVectorization() const { return false; }`
`501`	`501`
	`502`	`+ virtual bool skipPreRASchedLargeVecFunc() const { return false; }`
	`503`	`+`
`502`	`504`	`virtual bool enableMaskedInterleavedAccessVectorization() const {`
`503`	`505`	`return false;`
`504`	`506`	`}`
Original file line number	Diff line number	Diff line change
`@@ -677,6 +677,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {`
`677`	`677`	`return TTIImpl->enableInterleavedAccessVectorization();`
`678`	`678`	`}`
`679`	`679`
	`680`	`+bool TargetTransformInfo::skipPreRASchedLargeVecFunc() const {`
	`681`	`+ return TTIImpl->skipPreRASchedLargeVecFunc();`
	`682`	`+}`
	`683`	`+`
`680`	`684`	`bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {`
`681`	`685`	`return TTIImpl->enableMaskedInterleavedAccessVectorization();`
`682`	`686`	`}`
Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {`
`118`	`118`
`119`	`119`	`bool enableInterleavedAccessVectorization() const override { return true; }`
`120`	`120`
	`121`	`+ bool skipPreRASchedLargeVecFunc() const override {`
	`122`	`+ return ST->enableSkipPreRASchedLargeVecFunc();`
	`123`	`+ }`
	`124`	`+`
`121`	`125`	`bool enableMaskedInterleavedAccessVectorization() const override {`
`122`	`126`	`return ST->hasSVE();`
`123`	`127`	`}`