Skip to content

Commit f8f2573

Browse files
authored
(cherry-pick) [AMDGPU] Prevent m0 from being used as v_readlane/v_readfirstlane dst (llvm#1577)
Combined cherry-pick of 0f0d3fb and 5231736 from amd-staging. Fixes SWDEV-513763.
1 parent 8595f09 commit f8f2573

File tree

155 files changed

+3907
-3876
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

155 files changed

+3907
-3876
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
733733

734734
for (unsigned i = 0; i < NumParts; ++i) {
735735
Register SrcPart = SrcParts[i];
736-
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
736+
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
737737
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738738

739739
const TargetRegisterClass *Constrained =

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@ DECODE_OPERAND_REG_8(VReg_512)
271271
DECODE_OPERAND_REG_8(VReg_1024)
272272

273273
DECODE_OPERAND_REG_7(SReg_32, OPW32)
274+
DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
274275
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
275276
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
276277
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10691069
<< " is being turned to v_readfirstlane_b32"
10701070
<< " Score: " << C.second.Score << "\n");
10711071
Register DstReg = MI->getOperand(0).getReg();
1072+
MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
1073+
10721074
Register SrcReg = MI->getOperand(1).getReg();
10731075
unsigned SubReg = MI->getOperand(1).getSubReg();
10741076
const TargetRegisterClass *SrcRC =
@@ -1092,7 +1094,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10921094
Result, *MRI, MI->getOperand(1), SrcRC,
10931095
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
10941096
Register PartialDst =
1095-
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1097+
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
10961098
BuildMI(*MBB, *Result, Result->getDebugLoc(),
10971099
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
10981100
.addReg(PartialSrc);

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ class PrologEpilogSGPRSpillBuilder {
439439

440440
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
441441
TmpVGPR, FI, FrameReg, DwordOff);
442+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
442443
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
443444
.addReg(TmpVGPR, RegState::Kill);
444445
DwordOff += 4;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4568,7 +4568,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
45684568
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
45694569
Register PhiExec = MRI.createVirtualRegister(BoolRC);
45704570
Register NewExec = MRI.createVirtualRegister(BoolRC);
4571-
Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4571+
Register CurrentIdxReg =
4572+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
45724573
Register CondReg = MRI.createVirtualRegister(BoolRC);
45734574

45744575
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
@@ -4995,7 +4996,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49954996
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
49964997

49974998
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4998-
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4999+
Register LaneValueReg =
5000+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
49995001

50005002
bool IsWave32 = ST.isWave32();
50015003
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5254,18 +5256,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
52545256
? AMDGPU::S_ADDC_U32
52555257
: AMDGPU::S_SUBB_U32;
52565258
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5257-
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5259+
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
52585260
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
52595261
.addReg(Src0.getReg());
52605262
Src0.setReg(RegOp0);
52615263
}
52625264
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5263-
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5265+
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
52645266
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
52655267
.addReg(Src1.getReg());
52665268
Src1.setReg(RegOp1);
52675269
}
5268-
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5270+
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
52695271
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
52705272
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
52715273
.addReg(Src2.getReg());

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2239,6 +2239,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
22392239

22402240
case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
22412241
MI.setDesc(get(AMDGPU::V_READLANE_B32));
2242+
MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2243+
&AMDGPU::SReg_32_XM0RegClass);
22422244
break;
22432245

22442246
case AMDGPU::V_MOV_B64_PSEUDO: {
@@ -6527,7 +6529,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
65276529
Register VScalarOp = ScalarOp->getReg();
65286530

65296531
if (NumSubRegs == 1) {
6530-
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6532+
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
65316533

65326534
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
65336535
.addReg(VScalarOp);
@@ -6559,8 +6561,10 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
65596561
"Unhandled register size");
65606562

65616563
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6562-
Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6563-
Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6564+
Register CurRegLo =
6565+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6566+
Register CurRegHi =
6567+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
65646568

65656569
// Read the next variant <- also loop target.
65666570
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
@@ -7667,9 +7671,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
76677671
if (Inst.isCopy() && DstReg.isPhysical() &&
76687672
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
76697673
// TODO: Only works for 32 bit registers.
7670-
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7671-
get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7672-
.add(Inst.getOperand(1));
7674+
if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
7675+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7676+
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
7677+
.add(Inst.getOperand(1));
7678+
} else {
7679+
Register NewDst =
7680+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7681+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7682+
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7683+
.add(Inst.getOperand(1));
7684+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7685+
DstReg)
7686+
.addReg(NewDst);
7687+
}
76737688
Inst.eraseFromParent();
76747689
return;
76757690
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2296,6 +2296,8 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
22962296
// Don't need to write VGPR out.
22972297
}
22982298

2299+
MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2300+
22992301
// Restore clobbered registers in the specified restore block.
23002302
MI = RestoreMBB.end();
23012303
SB.setMI(&RestoreMBB, MI);
@@ -2310,6 +2312,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
23102312
SB.NumSubRegs == 1
23112313
? SB.SuperReg
23122314
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2315+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
23132316
bool LastSubReg = (i + 1 == e);
23142317
auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
23152318
SubReg)
@@ -3149,10 +3152,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
31493152
if (IsSALU && !LiveSCC)
31503153
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
31513154
if (IsSALU && LiveSCC) {
3152-
Register NewDest =
3153-
IsCopy ? ResultReg
3154-
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
3155-
Shift, false, 0);
3155+
Register NewDest;
3156+
if (IsCopy) {
3157+
MF->getRegInfo().constrainRegClass(ResultReg,
3158+
&AMDGPU::SReg_32_XM0RegClass);
3159+
NewDest = ResultReg;
3160+
} else {
3161+
NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3162+
Shift, false, 0);
3163+
}
31563164
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
31573165
.addReg(TmpResultReg);
31583166
ResultReg = NewDest;
@@ -3275,10 +3283,16 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
32753283
.addReg(TmpResultReg);
32763284
}
32773285

3278-
Register NewDest = IsCopy ? ResultReg
3279-
: RS->scavengeRegisterBackwards(
3280-
AMDGPU::SReg_32RegClass, *Add,
3281-
false, 0, /*AllowSpill=*/true);
3286+
Register NewDest;
3287+
if (IsCopy) {
3288+
MF->getRegInfo().constrainRegClass(ResultReg,
3289+
&AMDGPU::SReg_32_XM0RegClass);
3290+
NewDest = ResultReg;
3291+
} else {
3292+
NewDest = RS->scavengeRegisterBackwards(
3293+
AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3294+
/*AllowSpill=*/true);
3295+
}
32823296
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
32833297
NewDest)
32843298
.addReg(TmpResultReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
243243
} // End isMoveImm = 1
244244

245245
def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
246-
let DstRC = RegisterOperand<SReg_32>;
246+
let DstRC = RegisterOperand<SReg_32_XM0>;
247247
let Src0RC32 = VRegOrLdsSrc_32;
248248
let Asm32 = " $vdst, $src0";
249249
}

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
763763
}
764764

765765
def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
766-
let Outs32 = (outs SReg_32:$vdst);
766+
let Outs32 = (outs SReg_32_XM0:$vdst);
767767
let Outs64 = Outs32;
768768
let Ins32 = (ins VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1);
769769
let Ins64 = Ins32;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ body: |
1212
; CHECK-NEXT: ALL VALUES UNIFORM
1313
%0:vgpr_32 = IMPLICIT_DEF
1414
%1:vgpr_32 = IMPLICIT_DEF
15-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
15+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
1616
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
1717
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
1818
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
@@ -33,7 +33,7 @@ body: |
3333
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
3434
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
3535
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
36-
%5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
36+
%5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
3737
S_ENDPGM 0
3838
...
3939

0 commit comments

Comments
 (0)