Skip to content

Commit cacb6c5

Browse files
committed
[X86][BreakFalseDeps] Using reverse order for undef register selection
BreakFalseDeps picks the best register for undef operands when an instruction has a false dependency. The problem is that if the instruction is close to the beginning of the function, ReachingDefAnalysis is overly optimistic about which registers are unused, which results in collisions with registers just defined in the caller. This patch changes the selection of the undef register to use reverse order, which reduces the probability of register collisions between caller and callee. It brings improvements in some of our internal benchmarks with a negligible effect on other benchmarks.
1 parent 571e024 commit cacb6c5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+918
-898
lines changed

llvm/include/llvm/CodeGen/RegisterClassInfo.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ class RegisterClassInfo {
4949
// entry is valid when its tag matches.
5050
unsigned Tag = 0;
5151

52+
bool Reverse = false;
53+
5254
const MachineFunction *MF = nullptr;
5355
const TargetRegisterInfo *TRI = nullptr;
5456

@@ -87,7 +89,7 @@ class RegisterClassInfo {
8789

8890
/// runOnMachineFunction - Prepare to answer questions about MF. This must be called
8991
/// before any other methods are used.
90-
void runOnMachineFunction(const MachineFunction &MF);
92+
void runOnMachineFunction(const MachineFunction &MF, bool Rev = false);
9193

9294
/// getNumAllocatableRegs - Returns the number of actually allocatable
9395
/// registers in RC in the current function.

llvm/include/llvm/CodeGen/TargetRegisterInfo.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ class TargetRegisterClass {
6767
const bool CoveredBySubRegs;
6868
const unsigned *SuperClasses;
6969
const uint16_t SuperClassesSize;
70-
ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction&);
70+
ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction &, bool Rev);
7171

7272
/// Return the register class ID number.
7373
unsigned getID() const { return MC->getID(); }
@@ -198,8 +198,9 @@ class TargetRegisterClass {
198198
/// other criteria.
199199
///
200200
/// By default, this method returns all registers in the class.
201-
ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF) const {
202-
return OrderFunc ? OrderFunc(MF) : getRegisters();
201+
ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF,
202+
bool Rev = false) const {
203+
return OrderFunc ? OrderFunc(MF, Rev) : getRegisters();
203204
}
204205

205206
/// Returns the combination of all lane masks of register in this class.

llvm/include/llvm/Target/Target.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
314314
// to use in a given machine function. The code will be inserted in a
315315
// function like this:
316316
//
317-
// static inline unsigned f(const MachineFunction &MF) { ... }
317+
// static inline unsigned f(const MachineFunction &MF, bool Rev) { ... }
318318
//
319319
// The function should return 0 to select the default order defined by
320320
// MemberList, 1 to select the first AltOrders entry and so on.

llvm/lib/CodeGen/BreakFalseDeps.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) {
286286
TRI = MF->getSubtarget().getRegisterInfo();
287287
RDA = &getAnalysis<ReachingDefAnalysis>();
288288

289-
RegClassInfo.runOnMachineFunction(mf);
289+
RegClassInfo.runOnMachineFunction(mf, /*Rev=*/true);
290290

291291
LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n");
292292

llvm/lib/CodeGen/RegisterClassInfo.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,16 @@ StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
3939

4040
RegisterClassInfo::RegisterClassInfo() = default;
4141

42-
void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
42+
void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf,
43+
bool Rev) {
4344
bool Update = false;
4445
MF = &mf;
4546

4647
auto &STI = MF->getSubtarget();
4748

4849
// Allocate new array the first time we see a new target.
49-
if (STI.getRegisterInfo() != TRI) {
50+
if (STI.getRegisterInfo() != TRI || Reverse != Rev) {
51+
Reverse = Rev;
5052
TRI = STI.getRegisterInfo();
5153
RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
5254
Update = true;
@@ -142,7 +144,12 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
142144

143145
// FIXME: Once targets reserve registers instead of removing them from the
144146
// allocation order, we can simply use begin/end here.
145-
ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF);
147+
ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF, Reverse);
148+
std::vector<MCPhysReg> ReverseOrder;
149+
if (Reverse) {
150+
llvm::append_range(ReverseOrder, reverse(RawOrder));
151+
RawOrder = ArrayRef<MCPhysReg>(ReverseOrder);
152+
}
146153
for (unsigned PhysReg : RawOrder) {
147154
// Remove reserved registers from the allocation order.
148155
if (Reserved.test(PhysReg))

llvm/lib/Target/X86/X86RegisterInfo.td

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -802,17 +802,37 @@ def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i
802802
512, (sequence "ZMM%u", 0, 15)>;
803803

804804
// Scalar AVX-512 floating point registers.
805-
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
805+
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)> {
806+
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
807+
let AltOrderSelect = [{
808+
return Rev;
809+
}];
810+
}
806811

807-
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
812+
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)> {
813+
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
814+
let AltOrderSelect = [{
815+
return Rev;
816+
}];
817+
}
808818

809819
def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;}
810820

811821
// Extended VR128 and VR256 for AVX-512 instructions
812822
def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v8bf16, v16i8, v8i16, v4i32, v2i64, f128],
813-
128, (add FR32X)>;
823+
128, (add FR32X)> {
824+
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
825+
let AltOrderSelect = [{
826+
return Rev;
827+
}];
828+
}
814829
def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v16bf16, v32i8, v16i16, v8i32, v4i64],
815-
256, (sequence "YMM%u", 0, 31)>;
830+
256, (sequence "YMM%u", 0, 31)> {
831+
let AltOrders = [(add (sequence "YMM%u", 16, 31), (sequence "YMM%u", 0, 15))];
832+
let AltOrderSelect = [{
833+
return Rev;
834+
}];
835+
}
816836

817837
// Mask registers
818838
def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}

llvm/test/CodeGen/X86/avx-cvt.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
108108
define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
109109
; CHECK-LABEL: funcA:
110110
; CHECK: # %bb.0:
111-
; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
111+
; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm0
112112
; CHECK-NEXT: retq
113113
%tmp1 = load i64, ptr %e, align 8
114114
%conv = sitofp i64 %tmp1 to double
@@ -118,7 +118,7 @@ define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
118118
define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
119119
; CHECK-LABEL: funcB:
120120
; CHECK: # %bb.0:
121-
; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
121+
; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm15, %xmm0
122122
; CHECK-NEXT: retq
123123
%tmp1 = load i32, ptr %e, align 4
124124
%conv = sitofp i32 %tmp1 to double
@@ -128,7 +128,7 @@ define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
128128
define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
129129
; CHECK-LABEL: funcC:
130130
; CHECK: # %bb.0:
131-
; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
131+
; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm15, %xmm0
132132
; CHECK-NEXT: retq
133133
%tmp1 = load i32, ptr %e, align 4
134134
%conv = sitofp i32 %tmp1 to float
@@ -138,7 +138,7 @@ define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
138138
define float @funcD(ptr nocapture %e) nounwind uwtable readonly ssp {
139139
; CHECK-LABEL: funcD:
140140
; CHECK: # %bb.0:
141-
; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
141+
; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm0
142142
; CHECK-NEXT: retq
143143
%tmp1 = load i64, ptr %e, align 8
144144
%conv = sitofp i64 %tmp1 to float
@@ -183,7 +183,7 @@ declare float @llvm.floor.f32(float %p)
183183
define float @floor_f32_load(ptr %aptr) optsize {
184184
; CHECK-LABEL: floor_f32_load:
185185
; CHECK: # %bb.0:
186-
; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0
186+
; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0
187187
; CHECK-NEXT: retq
188188
%a = load float, ptr %aptr
189189
%res = call float @llvm.floor.f32(float %a)
@@ -193,7 +193,7 @@ define float @floor_f32_load(ptr %aptr) optsize {
193193
define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
194194
; CHECK-LABEL: floor_f32_load_pgso:
195195
; CHECK: # %bb.0:
196-
; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0
196+
; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0
197197
; CHECK-NEXT: retq
198198
%a = load float, ptr %aptr
199199
%res = call float @llvm.floor.f32(float %a)
@@ -203,7 +203,7 @@ define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
203203
define double @nearbyint_f64_load(ptr %aptr) optsize {
204204
; CHECK-LABEL: nearbyint_f64_load:
205205
; CHECK: # %bb.0:
206-
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0
206+
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0
207207
; CHECK-NEXT: retq
208208
%a = load double, ptr %aptr
209209
%res = call double @llvm.nearbyint.f64(double %a)
@@ -213,7 +213,7 @@ define double @nearbyint_f64_load(ptr %aptr) optsize {
213213
define double @nearbyint_f64_load_pgso(ptr %aptr) !prof !14 {
214214
; CHECK-LABEL: nearbyint_f64_load_pgso:
215215
; CHECK: # %bb.0:
216-
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0
216+
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0
217217
; CHECK-NEXT: retq
218218
%a = load double, ptr %aptr
219219
%res = call double @llvm.nearbyint.f64(double %a)

0 commit comments

Comments
 (0)