Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,10 @@ class TargetTransformInfo {
/// Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;

/// Return true if the target allows the pre-RA machine scheduler to be
/// skipped for a large function with a lot of (hand-written) vector code and
/// intrinsics.
bool skipPreRASchedLargeVecFunc() const;

/// Enable matching of interleaved access groups that contain predicated
/// accesses or gaps and therefore vectorized using masked
/// vector loads/stores.
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,8 @@ class TargetTransformInfoImplBase {

virtual bool enableInterleavedAccessVectorization() const { return false; }

// Default: targets do not opt in to skipping the pre-RA machine scheduler.
virtual bool skipPreRASchedLargeVecFunc() const { return false; }

virtual bool enableMaskedInterleavedAccessVectorization() const {
return false;
}
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
return false;
}

/// True if the subtarget should consider skipping the pre-RA machine
/// scheduler for large (hand-written) intrinsic vector functions. This base
/// implementation returns false.
virtual bool enableSkipPreRASchedLargeVecFunc() const {
return false;
}

/// True if the subtarget should run MachineScheduler after aggressive
/// coalescing.
///
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
return TTIImpl->enableInterleavedAccessVectorization();
}

// Forwards the query to the underlying TTI implementation.
bool TargetTransformInfo::skipPreRASchedLargeVecFunc() const {
return TTIImpl->skipPreRASchedLargeVecFunc();
}

bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
return TTIImpl->enableMaskedInterleavedAccessVectorization();
}
Expand Down
60 changes: 60 additions & 0 deletions llvm/lib/CodeGen/MachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
Expand Down Expand Up @@ -49,6 +50,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Pass.h"
Expand Down Expand Up @@ -110,6 +112,21 @@ cl::opt<bool> VerifyScheduling(
"verify-misched", cl::Hidden,
cl::desc("Verify machine instrs before and after machine scheduling"));

// Heuristics for skipping pre-RA machine scheduling for large functions
// containing (hand-written) intrinsic vector code. Scheduling is skipped only
// when a function exceeds all three thresholds; each check is a strict
// greater-than, so a value equal to its threshold does not qualify.

// Compared against the number of IR instructions in the function.
cl::opt<unsigned> LargeFunctionThreshold(
"misched-large-func-threshold", cl::Hidden, cl::init(2800),
cl::desc("The minimum number of IR instructions in a large (hand-written) "
"intrinsic vector code function"));
// Compared against the number of IR intrinsic calls (IntrinsicInst) in the
// function.
cl::opt<unsigned> NbOfIntrinsicsThreshold(
"misched-intrinsics-threshold", cl::Hidden, cl::init(425),
cl::desc("The minimum number of intrinsic instructions in a large "
"(hand-written) intrinsic vector code function"));
// Compared against the percentage of IR instructions whose result type is a
// vector type.
cl::opt<unsigned> VectorCodeDensityPercentageThreshold(
"misched-vector-density-threshold", cl::Hidden, cl::init(70),
cl::desc("Minimum percentage of vector instructions compared to scalar in "
"a large (hand-written) intrinsic vector code function"));

#ifndef NDEBUG
cl::opt<bool> ViewMISchedDAGs(
"view-misched-dags", cl::Hidden,
Expand Down Expand Up @@ -319,6 +336,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE,
"Machine Instruction Scheduler", false, false)

Expand All @@ -336,6 +354,7 @@ void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addRequired<LiveIntervalsWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}

Expand Down Expand Up @@ -557,6 +576,47 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
return false;
}

// Try to recognise large hand-written intrinsic vector code, and skip the
// machine scheduler for this function if the target and TTI hook are okay
// with this.
const TargetSubtargetInfo &STI = MF.getSubtarget();
const MCSchedModel &SchedModel = STI.getSchedModel();
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(MF.getFunction());

if (TTI.skipPreRASchedLargeVecFunc()) {
uint64_t InstructionCount = 0;
uint64_t IntrinsicCount = 0;
uint64_t VectorTypeCount = 0;
for (auto &BB : MF.getFunction()) {
for (Instruction &I : BB) {
InstructionCount++;
if (isa<IntrinsicInst>(I))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to skip debug intrinsics too

IntrinsicCount++;
Type *T = I.getType();
if (T && T->isVectorTy())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't be null

VectorTypeCount++;
}
}
Comment on lines +586 to +599
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really should not be writing an IR based heuristic in a machine pass. This is baking in a lot of assumptions about the architecture and how the IR will be lowered. You have better information from the current machine instructions

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see what you mean, but I intentionally iterated over the IR to extract high-level information that is not available in MIR, i.e. the vector intrinsics are lowered (FMAs) in MIR and are no longer recognisable.
I can calculate the heuristic on the MIR too, but then I will have to change it and drop the number of intrinsics from the heuristic calculation, which then becomes "recognising a large and very vector-code-dense function". That is slightly less specific, so if that is acceptable, it's easy to implement.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes more sense to me, this IR processing is extremely vague as written


unsigned VecDensity = (VectorTypeCount / (double) InstructionCount) * 100;

LLVM_DEBUG(dbgs() << "Instruction count: " << InstructionCount << ", ";
dbgs() << "threshold: " << LargeFunctionThreshold << "\n";
dbgs() << "Intrinsic count: " << IntrinsicCount << ", ";
dbgs() << "threshold: " << NbOfIntrinsicsThreshold << "\n";
dbgs() << "Vector density: " << VecDensity << ", ";
dbgs() << "threshold: " << VectorCodeDensityPercentageThreshold
<< "\n";);

if (InstructionCount > LargeFunctionThreshold &&
IntrinsicCount > NbOfIntrinsicsThreshold &&
VecDensity > VectorCodeDensityPercentageThreshold) {
LLVM_DEBUG(
dbgs() << "Skipping MISched for very vector and intrinsic heavy code");
return false;
}
}

LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs()));

auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 16;
break;
case NeoverseV2:
SkipPreRASchedLargeVecFunc = true;
LLVM_FALLTHROUGH;
case NeoverseV3:
EpilogueVectorizationMinVF = 8;
MaxInterleaveFactor = 4;
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
unsigned MaxBytesForLoopAlignment = 0;
unsigned MinimumJumpTableEntries = 4;
unsigned MaxJumpTableSize = 0;
bool SkipPreRASchedLargeVecFunc = false;

// ReserveXRegister[i] - X#i is not available as a general purpose register.
BitVector ReserveXRegister;
Expand Down Expand Up @@ -160,6 +161,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
bool enableSubRegLiveness() const override { return EnableSubregLiveness; }

/// Returns true if the subtarget should consider skipping the pre-RA
/// machine scheduler for large (hand-written) intrinsic vector functions.
bool enableSkipPreRASchedLargeVecFunc() const override {
return SkipPreRASchedLargeVecFunc;
}

bool enableMachinePipeliner() const override;
bool useDFAforSMS() const override { return false; }

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {

bool enableInterleavedAccessVectorization() const override { return true; }

// Defers to the subtarget's opt-in for skipping the pre-RA machine scheduler.
bool skipPreRASchedLargeVecFunc() const override {
return ST->enableSkipPreRASchedLargeVecFunc();
}

bool enableMaskedInterleavedAccessVectorization() const override {
return ST->hasSVE();
}
Expand Down
94 changes: 94 additions & 0 deletions llvm/test/CodeGen/AArch64/skip-misched-large-vec-func.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; neoverse-v1, and neoverse-v2 with the default thresholds, are expected to
; produce the scheduled output.
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v1 | FileCheck %s --check-prefix=SCHED
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 | FileCheck %s --check-prefix=SCHED
; With all three thresholds lowered, neoverse-v2 is expected to produce the
; unscheduled output.
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=NOSCHED
; Raising any single threshold by one relative to the previous configuration is
; expected to bring the scheduled output back.
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=31 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=3 -misched-vector-density-threshold=31 | FileCheck %s --check-prefix=SCHED
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -misched-large-func-threshold=30 -misched-intrinsics-threshold=2 -misched-vector-density-threshold=32 | FileCheck %s --check-prefix=SCHED

define void @test_fma_loop(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_out, i32 %n) {
; SCHED-LABEL: test_fma_loop:
; SCHED: // %bb.0: // %entry
; SCHED-NEXT: cbz w4, .LBB0_2
; SCHED-NEXT: .p2align 5, , 16
; SCHED-NEXT: .LBB0_1: // %loop
; SCHED-NEXT: // =>This Inner Loop Header: Depth=1
; SCHED-NEXT: ldr q0, [x0], #16
; SCHED-NEXT: ldp q1, q2, [x1]
; SCHED-NEXT: subs w4, w4, #1
; SCHED-NEXT: ldp q3, q4, [x2]
; SCHED-NEXT: fmla v3.4s, v1.4s, v0.4s
; SCHED-NEXT: ldr q0, [x1, #32]
; SCHED-NEXT: ldr q1, [x2, #32]
; SCHED-NEXT: add x1, x1, #48
; SCHED-NEXT: add x2, x2, #48
; SCHED-NEXT: fmla v4.4s, v2.4s, v3.4s
; SCHED-NEXT: fmla v1.4s, v0.4s, v4.4s
; SCHED-NEXT: str q1, [x3], #16
; SCHED-NEXT: b.ne .LBB0_1
; SCHED-NEXT: .LBB0_2: // %exit
; SCHED-NEXT: ret
;
; NOSCHED-LABEL: test_fma_loop:
; NOSCHED: // %bb.0: // %entry
; NOSCHED-NEXT: cbz w4, .LBB0_2
; NOSCHED-NEXT: .p2align 5, , 16
; NOSCHED-NEXT: .LBB0_1: // %loop
; NOSCHED-NEXT: // =>This Inner Loop Header: Depth=1
; NOSCHED-NEXT: ldr q0, [x0], #16
; NOSCHED-NEXT: ldr q1, [x1]
; NOSCHED-NEXT: ldr q2, [x2]
; NOSCHED-NEXT: subs w4, w4, #1
; NOSCHED-NEXT: fmla v2.4s, v1.4s, v0.4s
; NOSCHED-NEXT: ldp q0, q3, [x1, #16]
; NOSCHED-NEXT: ldp q1, q4, [x2, #16]
; NOSCHED-NEXT: add x1, x1, #48
; NOSCHED-NEXT: add x2, x2, #48
; NOSCHED-NEXT: fmla v1.4s, v0.4s, v2.4s
; NOSCHED-NEXT: fmla v4.4s, v3.4s, v1.4s
; NOSCHED-NEXT: str q4, [x3], #16
; NOSCHED-NEXT: b.ne .LBB0_1
; NOSCHED-NEXT: .LBB0_2: // %exit
; NOSCHED-NEXT: ret
entry:
%cmp = icmp eq i32 %n, 0
br i1 %cmp, label %exit, label %loop

loop:
%iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
%ptr_a.addr = phi ptr [ %ptr_a, %entry ], [ %ptr_a.next, %loop ]
%ptr_b.addr = phi ptr [ %ptr_b, %entry ], [ %ptr_b.next, %loop ]
%ptr_c.addr = phi ptr [ %ptr_c, %entry ], [ %ptr_c.next, %loop ]
%ptr_out.addr = phi ptr [ %ptr_out, %entry ], [ %ptr_out.next, %loop ]

%a = load <4 x float>, ptr %ptr_a.addr
%b1 = load <4 x float>, ptr %ptr_b.addr
%c1 = load <4 x float>, ptr %ptr_c.addr
%res1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b1, <4 x float> %c1)

%ptr_b2 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 1
%ptr_c2 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 1
%b2 = load <4 x float>, ptr %ptr_b2
%c2 = load <4 x float>, ptr %ptr_c2
%ptr_b3 = getelementptr <4 x float>, ptr %ptr_b.addr, i64 2
%ptr_c3 = getelementptr <4 x float>, ptr %ptr_c.addr, i64 2
%b3 = load <4 x float>, ptr %ptr_b3
%c3 = load <4 x float>, ptr %ptr_c3

%res2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res1, <4 x float> %b2, <4 x float> %c2)
%res3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %res2, <4 x float> %b3, <4 x float> %c3)

store <4 x float> %res3, ptr %ptr_out.addr

%ptr_a.next = getelementptr <4 x float>, ptr %ptr_a.addr, i64 1
%ptr_b.next = getelementptr <4 x float>, ptr %ptr_b.addr, i64 3
%ptr_c.next = getelementptr <4 x float>, ptr %ptr_c.addr, i64 3
%ptr_out.next = getelementptr <4 x float>, ptr %ptr_out.addr, i64 1

%iv.next = sub i32 %iv, 1
%cmp.next = icmp ne i32 %iv.next, 0
br i1 %cmp.next, label %loop, label %exit

exit:
ret void
}
Loading