Skip to content

Commit 879f258

Browse files
authored
(cherry-pick 6.4) [AMDGPU] Prevent m0 from being used as v_readlane/v_readfirstlane dst (llvm#1081)
Auto-submit by Jenkins
2 parents 8b8daf0 + 5ca2bd0 commit 879f258

File tree

150 files changed

+3766
-3718
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

150 files changed

+3766
-3718
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
733733

734734
for (unsigned i = 0; i < NumParts; ++i) {
735735
Register SrcPart = SrcParts[i];
736-
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
736+
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
737737
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738738

739739
const TargetRegisterClass *Constrained =

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -289,6 +289,7 @@ DECODE_OPERAND_REG_8(VReg_512)
289289
DECODE_OPERAND_REG_8(VReg_1024)
290290

291291
DECODE_OPERAND_REG_7(SReg_32, OPW32)
292+
DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
292293
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
293294
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
294295
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1058,6 +1058,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10581058
<< " is being turned to v_readfirstlane_b32"
10591059
<< " Score: " << C.second.Score << "\n");
10601060
Register DstReg = MI->getOperand(0).getReg();
1061+
MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
1062+
10611063
Register SrcReg = MI->getOperand(1).getReg();
10621064
unsigned SubReg = MI->getOperand(1).getSubReg();
10631065
const TargetRegisterClass *SrcRC =
@@ -1081,7 +1083,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10811083
Result, *MRI, MI->getOperand(1), SrcRC,
10821084
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
10831085
Register PartialDst =
1084-
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1086+
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
10851087
BuildMI(*MBB, *Result, Result->getDebugLoc(),
10861088
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
10871089
.addReg(PartialSrc);

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -439,6 +439,7 @@ class PrologEpilogSGPRSpillBuilder {
439439

440440
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
441441
TmpVGPR, FI, FrameReg, DwordOff);
442+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
442443
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
443444
.addReg(TmpVGPR, RegState::Kill);
444445
DwordOff += 4;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4500,7 +4500,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
45004500
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
45014501
Register PhiExec = MRI.createVirtualRegister(BoolRC);
45024502
Register NewExec = MRI.createVirtualRegister(BoolRC);
4503-
Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4503+
Register CurrentIdxReg =
4504+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
45044505
Register CondReg = MRI.createVirtualRegister(BoolRC);
45054506

45064507
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
@@ -4916,7 +4917,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49164917
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
49174918

49184919
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4919-
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4920+
Register LaneValueReg =
4921+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
49204922

49214923
bool IsWave32 = ST.isWave32();
49224924
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5164,18 +5166,18 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
51645166
? AMDGPU::S_ADDC_U32
51655167
: AMDGPU::S_SUBB_U32;
51665168
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5167-
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5169+
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51685170
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
51695171
.addReg(Src0.getReg());
51705172
Src0.setReg(RegOp0);
51715173
}
51725174
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5173-
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5175+
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51745176
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
51755177
.addReg(Src1.getReg());
51765178
Src1.setReg(RegOp1);
51775179
}
5178-
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5180+
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51795181
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
51805182
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
51815183
.addReg(Src2.getReg());

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 21 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2230,6 +2230,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
22302230

22312231
case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
22322232
MI.setDesc(get(AMDGPU::V_READLANE_B32));
2233+
MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2234+
&AMDGPU::SReg_32_XM0RegClass);
22332235
break;
22342236

22352237
case AMDGPU::V_MOV_B64_PSEUDO: {
@@ -6352,7 +6354,7 @@ static void emitLoadScalarOpsFromVGPRLoop(
63526354
Register VScalarOp = ScalarOp->getReg();
63536355

63546356
if (NumSubRegs == 1) {
6355-
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6357+
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
63566358

63576359
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
63586360
.addReg(VScalarOp);
@@ -6383,8 +6385,10 @@ static void emitLoadScalarOpsFromVGPRLoop(
63836385
"Unhandled register size");
63846386

63856387
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6386-
Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6387-
Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6388+
Register CurRegLo =
6389+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6390+
Register CurRegHi =
6391+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
63886392

63896393
// Read the next variant <- also loop target.
63906394
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
@@ -7440,9 +7444,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
74407444
if (Inst.isCopy() && DstReg.isPhysical() &&
74417445
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
74427446
// TODO: Only works for 32 bit registers.
7443-
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7444-
get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7445-
.add(Inst.getOperand(1));
7447+
if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
7448+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7449+
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
7450+
.add(Inst.getOperand(1));
7451+
} else {
7452+
Register NewDst =
7453+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7454+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7455+
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7456+
.add(Inst.getOperand(1));
7457+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7458+
DstReg)
7459+
.addReg(NewDst);
7460+
}
74467461
Inst.eraseFromParent();
74477462
return;
74487463
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 24 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2126,6 +2126,8 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
21262126
// Don't need to write VGPR out.
21272127
}
21282128

2129+
MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2130+
21292131
// Restore clobbered registers in the specified restore block.
21302132
MI = RestoreMBB.end();
21312133
SB.setMI(&RestoreMBB, MI);
@@ -2140,6 +2142,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
21402142
SB.NumSubRegs == 1
21412143
? SB.SuperReg
21422144
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2145+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
21432146
bool LastSubReg = (i + 1 == e);
21442147
auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
21452148
SubReg)
@@ -2662,12 +2665,16 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26622665
if (IsSALU && !LiveSCC)
26632666
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
26642667
if (IsSALU && LiveSCC) {
2665-
Register NewDest =
2666-
IsCopy ? ResultReg
2667-
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
2668-
Shift, false, 0);
2669-
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
2670-
NewDest)
2668+
Register NewDest;
2669+
if (IsCopy) {
2670+
MF->getRegInfo().constrainRegClass(ResultReg,
2671+
&AMDGPU::SReg_32_XM0RegClass);
2672+
NewDest = ResultReg;
2673+
} else {
2674+
NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2675+
Shift, false, 0);
2676+
}
2677+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
26712678
.addReg(TmpResultReg);
26722679
ResultReg = NewDest;
26732680
}
@@ -2781,10 +2788,17 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
27812788
.addReg(TmpResultReg);
27822789
}
27832790

2784-
Register NewDest = IsCopy ? ResultReg
2785-
: RS->scavengeRegisterBackwards(
2786-
AMDGPU::SReg_32RegClass, *Add,
2787-
false, 0, /*AllowSpill=*/true);
2791+
Register NewDest;
2792+
if (IsCopy) {
2793+
MF->getRegInfo().constrainRegClass(ResultReg,
2794+
&AMDGPU::SReg_32_XM0RegClass);
2795+
NewDest = ResultReg;
2796+
} else {
2797+
NewDest = RS->scavengeRegisterBackwards(
2798+
AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
2799+
/*AllowSpill=*/true);
2800+
}
2801+
27882802
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
27892803
NewDest)
27902804
.addReg(TmpResultReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
241241
} // End isMoveImm = 1
242242

243243
def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
244-
let DstRC = RegisterOperand<SReg_32>;
244+
let DstRC = RegisterOperand<SReg_32_XM0>;
245245
let Src0RC32 = VRegOrLdsSrc_32;
246246
let Asm32 = " $vdst, $src0";
247247
}

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -685,7 +685,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
685685
}
686686

687687
def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
688-
let Outs32 = (outs SReg_32:$vdst);
688+
let Outs32 = (outs SReg_32_XM0:$vdst);
689689
let Outs64 = Outs32;
690690
let Ins32 = (ins VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1);
691691
let Ins64 = Ins32;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ body: |
1212
; CHECK-NEXT: ALL VALUES UNIFORM
1313
%0:vgpr_32 = IMPLICIT_DEF
1414
%1:vgpr_32 = IMPLICIT_DEF
15-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
15+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
1616
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
1717
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
1818
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
@@ -33,7 +33,7 @@ body: |
3333
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
3434
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
3535
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
36-
%5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
36+
%5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
3737
S_ENDPGM 0
3838
...
3939

0 commit comments

Comments
 (0)