Skip to content

Commit 8b212b6

Browse files
author
Vasileios Porpodas
committed
[Spill2Reg] Use AVX opcodes when available
This patch updates the vector spill/reload instructions to use the AVX opcodes by default if the target supports them. This can be turned off with the -spill2reg-no-avx flag. Original review: https://reviews.llvm.org/D118951
1 parent 8252cd3 commit 8b212b6

17 files changed

+441
-49
lines changed

llvm/include/llvm/CodeGen/TargetInstrInfo.h

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2303,21 +2303,19 @@ class TargetInstrInfo : public MCInstrInfo {
23032303
}
23042304

23052305
/// Inserts \p SrcReg into the destination Spill2Reg register \p DstReg.
2306-
virtual MachineInstr *
2307-
spill2RegInsertToS2RReg(Register S2RReg, Register SrcReg, int OperationBits,
2308-
MachineBasicBlock *MBB,
2309-
MachineBasicBlock::iterator InsertBeforeIt,
2310-
const TargetRegisterInfo *TRI) const {
2306+
virtual MachineInstr *spill2RegInsertToS2RReg(
2307+
Register S2RReg, Register SrcReg, int OperationBits,
2308+
MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertBeforeIt,
2309+
const TargetRegisterInfo *TRI, const TargetSubtargetInfo *STI) const {
23112310
llvm_unreachable(
23122311
"Target didn't implement TargetInstrInfo::spill2RegInsertToS2RReg!");
23132312
}
23142313

23152314
/// Extracts from \p S2RReg into \p DstReg.
2316-
virtual MachineInstr *
2317-
spill2RegExtractFromS2RReg(Register DstReg, Register S2RReg,
2318-
int OperationBits, MachineBasicBlock *InsertMBB,
2319-
MachineBasicBlock::iterator InsertBeforeIt,
2320-
const TargetRegisterInfo *TRI) const {
2315+
virtual MachineInstr *spill2RegExtractFromS2RReg(
2316+
Register DstReg, Register S2RReg, int OperationBits,
2317+
MachineBasicBlock *InsertMBB, MachineBasicBlock::iterator InsertBeforeIt,
2318+
const TargetRegisterInfo *TRI, const TargetSubtargetInfo *STI) const {
23212319
llvm_unreachable("Target didn't implement "
23222320
"TargetInstrInfo::spill2RegExtractFromS2RReg!");
23232321
}

llvm/include/llvm/CodeGen/TargetRegisterInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,6 +1249,7 @@ class TargetRegisterInfo : public MCRegisterInfo {
12491249

12501250
virtual const TargetRegisterClass *
12511251
getCandidateRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
1252+
const TargetSubtargetInfo *STI,
12521253
Register SpilledReg) const {
12531254
llvm_unreachable("Target didn't implement "
12541255
"TargetInstrInfo::getCandidateRegisterClassForSpill2Reg!");

llvm/lib/CodeGen/Spill2Reg.cpp

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,10 @@ class Spill2Reg : public MachineFunctionPass {
9999
/// \Returns the register class of the register being spilled.
100100
const TargetRegisterClass *
101101
getSpilledRegClass(const TargetInstrInfo *TII,
102-
const TargetRegisterInfo *TRI) const {
102+
const TargetRegisterInfo *TRI,
103+
const TargetSubtargetInfo *STI) const {
103104
auto Reg0 = Spills.front().MO->getReg();
104-
return TRI->getCandidateRegisterClassForSpill2Reg(TRI, Reg0);
105+
return TRI->getCandidateRegisterClassForSpill2Reg(TRI, STI, Reg0);
105106
}
106107

107108
#ifndef NDEBUG
@@ -220,15 +221,6 @@ void Spill2Reg::collectSpillsAndReloads() {
220221
}
221222
unsigned SpillBits = TRI->getRegSizeInBits(MO->getReg(), *MRI);
222223
Entry.Spills.emplace_back(Spill, MO, SpillBits);
223-
224-
// If any of the reloads collected so far is in the same MBB then mark
225-
// it as non live-in. This is used in `updateLiveIns()` where we update
226-
// the liveins of MBBs to include the new vector register. Doing this
227-
// now avoids an MBB walk in `updateLiveIns()` which should save
228-
// compilation time.
229-
for (auto &MID : Entry.Reloads)
230-
if (MID.MI->getParent() == &MBB)
231-
MID.IsLiveIn = false;
232224
} else if (const MachineOperand *MO =
233225
TII->isLoadFromStackSlotMO(MI, StackSlot)) {
234226
MachineInstr *Reload = &MI;
@@ -352,7 +344,7 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
352344

353345
TII->spill2RegInsertToS2RReg(
354346
VectorReg, OldReg, SpillData.SpillBits, StackSpill->getParent(),
355-
/*InsertBeforeIt=*/StackSpill->getIterator(), TRI);
347+
/*InsertBeforeIt=*/StackSpill->getIterator(), TRI, &MF->getSubtarget());
356348

357349
// Mark VectorReg as live in the instr's BB.
358350
LRUs[StackSpill->getParent()].addReg(VectorReg);
@@ -369,7 +361,8 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
369361

370362
TII->spill2RegExtractFromS2RReg(
371363
OldReg, VectorReg, ReloadData.SpillBits, StackReload->getParent(),
372-
/*InsertBeforeIt=*/StackReload->getIterator(), TRI);
364+
/*InsertBeforeIt=*/StackReload->getIterator(), TRI,
365+
&MF->getSubtarget());
373366

374367
// Mark VectorReg as live in the instr's BB.
375368
LRUs[StackReload->getParent()].addReg(VectorReg);
@@ -480,8 +473,8 @@ void Spill2Reg::generateCode() {
480473
calculateLiveRegs(Entry, LRU);
481474

482475
// Look for a physical register that is not in LRU.
483-
std::optional<MCRegister> PhysVectorRegOpt =
484-
tryGetFreePhysicalReg(Entry.getSpilledRegClass(TII, TRI), LRU);
476+
std::optional<MCRegister> PhysVectorRegOpt = tryGetFreePhysicalReg(
477+
Entry.getSpilledRegClass(TII, TRI, &MF->getSubtarget()), LRU);
485478
if (!PhysVectorRegOpt)
486479
continue;
487480

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11018,14 +11018,24 @@ bool X86InstrInfo::isSpill2RegProfitable(const MachineInstr *MI,
1101811018
return MemHeuristic && VecHeuristic;
1101911019
}
1102011020

11021-
static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert) {
11021+
extern bool useAVX(const TargetSubtargetInfo *STI);
11022+
11023+
static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert,
11024+
const TargetSubtargetInfo *STI) {
11025+
bool UseAVX = useAVX(STI);
1102211026
switch (Bits) {
1102311027
case 8:
1102411028
case 16:
1102511029
case 32:
11026-
return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
11030+
if (UseAVX)
11031+
return Insert ? X86::VMOVDI2PDIZrr : X86::VMOVPDI2DIZrr;
11032+
else
11033+
return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
1102711034
case 64:
11028-
return Insert ? X86::MOV64toPQIrr : X86::MOVPQIto64rr;
11035+
if (UseAVX)
11036+
return Insert ? X86::VMOV64toPQIZrr : X86::VMOVPQIto64Zrr;
11037+
else
11038+
return Insert ? X86::MOV64toPQIrr : X86::MOVPQIto64rr;
1102911039
default:
1103011040
llvm_unreachable("Unsupported bits");
1103111041
}
@@ -11063,11 +11073,11 @@ X86InstrInfo::getMovdCompatibleReg(MCRegister OldReg, uint32_t OldRegBits,
1106311073

1106411074
MachineInstr *X86InstrInfo::spill2RegInsertToS2RReg(
1106511075
Register S2RReg, Register SrcReg, int OperationBits, MachineBasicBlock *MBB,
11066-
MachineBasicBlock::iterator InsertBeforeIt,
11067-
const TargetRegisterInfo *TRI) const {
11076+
MachineBasicBlock::iterator InsertBeforeIt, const TargetRegisterInfo *TRI,
11077+
const TargetSubtargetInfo *STI) const {
1106811078
DebugLoc DL;
1106911079
unsigned InsertOpcode =
11070-
getInsertOrExtractOpcode(OperationBits, true /*insert*/);
11080+
getInsertOrExtractOpcode(OperationBits, true /*insert*/, STI);
1107111081
const MCInstrDesc &InsertMCID = get(InsertOpcode);
1107211082
// `movd` does not support 8/16 bit operands. Instead, we use a 32-bit
1107311083
// register. For example:
@@ -11083,10 +11093,10 @@ MachineInstr *X86InstrInfo::spill2RegInsertToS2RReg(
1108311093
MachineInstr *X86InstrInfo::spill2RegExtractFromS2RReg(
1108411094
Register DstReg, Register S2RReg, int OperationBits,
1108511095
MachineBasicBlock *InsertMBB, MachineBasicBlock::iterator InsertBeforeIt,
11086-
const TargetRegisterInfo *TRI) const {
11096+
const TargetRegisterInfo *TRI, const TargetSubtargetInfo *STI) const {
1108711097
DebugLoc DL;
1108811098
unsigned ExtractOpcode =
11089-
getInsertOrExtractOpcode(OperationBits, false /*extract*/);
11099+
getInsertOrExtractOpcode(OperationBits, false /*extract*/, STI);
1109011100
const MCInstrDesc &ExtractMCID = get(ExtractOpcode);
1109111101
// `movd` does not support 8/16 bit operands. Instead, we use a 32-bit
1109211102
// register. For example:

llvm/lib/Target/X86/X86InstrInfo.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -752,13 +752,15 @@ class X86InstrInfo final : public X86GenInstrInfo {
752752
spill2RegInsertToS2RReg(Register S2RReg, Register SrcReg, int OperationBits,
753753
MachineBasicBlock *MBB,
754754
MachineBasicBlock::iterator InsertBeforeIt,
755-
const TargetRegisterInfo *TRI) const override;
755+
const TargetRegisterInfo *TRI,
756+
const TargetSubtargetInfo *STI) const override;
756757

757758
MachineInstr *
758759
spill2RegExtractFromS2RReg(Register DstReg, Register S2RReg,
759760
int OperationBits, MachineBasicBlock *InsertMBB,
760761
MachineBasicBlock::iterator InsertBeforeIt,
761-
const TargetRegisterInfo *TRI) const override;
762+
const TargetRegisterInfo *TRI,
763+
const TargetSubtargetInfo *STI) const override;
762764
};
763765
} // namespace llvm
764766

llvm/lib/Target/X86/X86RegisterInfo.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ static cl::opt<bool>
5050
cl::desc("Disable two address hints for register "
5151
"allocation"));
5252

53+
static cl::opt<bool> Spill2RegNoAVX(
54+
"spill2reg-no-avx", cl::Hidden, cl::init(false),
55+
cl::desc("Don't use AVX instructions even if the targets supports them."));
56+
5357
X86RegisterInfo::X86RegisterInfo(const Triple &TT)
5458
: X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP),
5559
X86_MC::getDwarfRegFlavour(TT, false),
@@ -1279,10 +1283,17 @@ bool X86RegisterInfo::targetSupportsSpill2Reg(
12791283
return X86STI->hasSSE41();
12801284
}
12811285

1286+
bool useAVX(const TargetSubtargetInfo *STI) {
1287+
const X86Subtarget *X86STI = static_cast<const X86Subtarget *>(STI);
1288+
bool UseAVX = X86STI->hasAVX() && !Spill2RegNoAVX;
1289+
return UseAVX;
1290+
}
1291+
12821292
const TargetRegisterClass *
12831293
X86RegisterInfo::getCandidateRegisterClassForSpill2Reg(
1284-
const TargetRegisterInfo *TRI, Register SpilledReg) const {
1285-
const TargetRegisterClass *VecRegClass =
1286-
TRI->getRegClass(X86::VR128RegClassID);
1294+
const TargetRegisterInfo *TRI, const TargetSubtargetInfo *STI,
1295+
Register SpilledReg) const {
1296+
const TargetRegisterClass *VecRegClass = TRI->getRegClass(
1297+
useAVX(STI) ? X86::VR128XRegClassID : X86::VR128RegClassID);
12871298
return VecRegClass;
12881299
}

llvm/lib/Target/X86/X86RegisterInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ class X86RegisterInfo final : public X86GenRegisterInfo {
184184

185185
const TargetRegisterClass *
186186
getCandidateRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
187+
const TargetSubtargetInfo *STI,
187188
Register SpilledReg) const override;
188189
};
189190

llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
3+
; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx | FileCheck --check-prefix=AVX %s
34

45
; End-to-end check that Spill2Reg works with 16-bit registers.
56

@@ -130,6 +131,90 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
130131
; CHECK-NEXT: popq %rbp
131132
; CHECK-NEXT: .cfi_def_cfa_offset 8
132133
; CHECK-NEXT: retq
134+
;
135+
; AVX-LABEL: _Z5spillv:
136+
; AVX: # %bb.0: # %entry
137+
; AVX-NEXT: pushq %rbp
138+
; AVX-NEXT: .cfi_def_cfa_offset 16
139+
; AVX-NEXT: pushq %r15
140+
; AVX-NEXT: .cfi_def_cfa_offset 24
141+
; AVX-NEXT: pushq %r14
142+
; AVX-NEXT: .cfi_def_cfa_offset 32
143+
; AVX-NEXT: pushq %r13
144+
; AVX-NEXT: .cfi_def_cfa_offset 40
145+
; AVX-NEXT: pushq %r12
146+
; AVX-NEXT: .cfi_def_cfa_offset 48
147+
; AVX-NEXT: pushq %rbx
148+
; AVX-NEXT: .cfi_def_cfa_offset 56
149+
; AVX-NEXT: .cfi_offset %rbx, -56
150+
; AVX-NEXT: .cfi_offset %r12, -48
151+
; AVX-NEXT: .cfi_offset %r13, -40
152+
; AVX-NEXT: .cfi_offset %r14, -32
153+
; AVX-NEXT: .cfi_offset %r15, -24
154+
; AVX-NEXT: .cfi_offset %rbp, -16
155+
; AVX-NEXT: movw D0(%rip), %ax
156+
; AVX-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
157+
; AVX-NEXT: movzwl D1(%rip), %ecx
158+
; AVX-NEXT: movzwl D2(%rip), %edx
159+
; AVX-NEXT: movzwl D3(%rip), %esi
160+
; AVX-NEXT: movzwl D4(%rip), %edi
161+
; AVX-NEXT: movzwl D5(%rip), %r8d
162+
; AVX-NEXT: movzwl D6(%rip), %r9d
163+
; AVX-NEXT: movzwl D7(%rip), %r10d
164+
; AVX-NEXT: movzwl D8(%rip), %r11d
165+
; AVX-NEXT: movzwl D9(%rip), %ebx
166+
; AVX-NEXT: movzwl D10(%rip), %ebp
167+
; AVX-NEXT: movzwl D11(%rip), %r14d
168+
; AVX-NEXT: movzwl D12(%rip), %r15d
169+
; AVX-NEXT: movzwl D13(%rip), %r12d
170+
; AVX-NEXT: movzwl D14(%rip), %r13d
171+
; AVX-NEXT: movw D15(%rip), %ax
172+
; AVX-NEXT: vmovd %eax, %xmm0
173+
; AVX-NEXT: movw D16(%rip), %ax
174+
; AVX-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
175+
; AVX-NEXT: movw D17(%rip), %ax
176+
; AVX-NEXT: vmovd %eax, %xmm1
177+
; AVX-NEXT: movzwl D18(%rip), %eax
178+
; AVX-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
179+
; AVX-NEXT: #APP
180+
; AVX-NEXT: #NO_APP
181+
; AVX-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
182+
; AVX-NEXT: movw %ax, U0(%rip)
183+
; AVX-NEXT: movw %cx, U1(%rip)
184+
; AVX-NEXT: movw %dx, U2(%rip)
185+
; AVX-NEXT: movw %si, U3(%rip)
186+
; AVX-NEXT: movw %di, U4(%rip)
187+
; AVX-NEXT: movw %r8w, U5(%rip)
188+
; AVX-NEXT: movw %r9w, U6(%rip)
189+
; AVX-NEXT: movw %r10w, U7(%rip)
190+
; AVX-NEXT: movw %r11w, U8(%rip)
191+
; AVX-NEXT: movw %bx, U9(%rip)
192+
; AVX-NEXT: movw %bp, U10(%rip)
193+
; AVX-NEXT: movw %r14w, U11(%rip)
194+
; AVX-NEXT: movw %r15w, U12(%rip)
195+
; AVX-NEXT: movw %r12w, U13(%rip)
196+
; AVX-NEXT: movw %r13w, U14(%rip)
197+
; AVX-NEXT: vmovd %xmm0, %eax
198+
; AVX-NEXT: movw %ax, U15(%rip)
199+
; AVX-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
200+
; AVX-NEXT: movw %ax, U16(%rip)
201+
; AVX-NEXT: vmovd %xmm1, %eax
202+
; AVX-NEXT: movw %ax, U17(%rip)
203+
; AVX-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
204+
; AVX-NEXT: movw %ax, U18(%rip)
205+
; AVX-NEXT: popq %rbx
206+
; AVX-NEXT: .cfi_def_cfa_offset 48
207+
; AVX-NEXT: popq %r12
208+
; AVX-NEXT: .cfi_def_cfa_offset 40
209+
; AVX-NEXT: popq %r13
210+
; AVX-NEXT: .cfi_def_cfa_offset 32
211+
; AVX-NEXT: popq %r14
212+
; AVX-NEXT: .cfi_def_cfa_offset 24
213+
; AVX-NEXT: popq %r15
214+
; AVX-NEXT: .cfi_def_cfa_offset 16
215+
; AVX-NEXT: popq %rbp
216+
; AVX-NEXT: .cfi_def_cfa_offset 8
217+
; AVX-NEXT: retq
133218
entry:
134219
%0 = load i16, i16* @D0
135220
%1 = load i16, i16* @D1

0 commit comments

Comments
 (0)