Skip to content

Commit 98e63d4

Browse files
committed
(cherry-pick) [AMDGPU] Do not allow M0 as v_readfirstlane_b32 dst (llvm#128851)
M0 can only be written to by the SALU, so `v_readfirstlane_b32 m0` is effectively useless. Represent this by restricting the destination register class (RC) of that instruction to `SReg_32_XM0`, which excludes M0. There are a lot of test changes due to the register class changing, but most changes are trivial. In some cases, an extra register and an `s_mov_b32` are needed. Fixes SWDEV-513269.
1 parent 9d8e2dc commit 98e63d4

File tree

143 files changed

+3710
-3668
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+3710
-3668
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
733733

734734
for (unsigned i = 0; i < NumParts; ++i) {
735735
Register SrcPart = SrcParts[i];
736-
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
736+
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
737737
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738738

739739
const TargetRegisterClass *Constrained =

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ DECODE_OPERAND_REG_8(VReg_512)
289289
DECODE_OPERAND_REG_8(VReg_1024)
290290

291291
DECODE_OPERAND_REG_7(SReg_32, OPW32)
292+
DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
292293
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
293294
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
294295
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10581058
<< " is being turned to v_readfirstlane_b32"
10591059
<< " Score: " << C.second.Score << "\n");
10601060
Register DstReg = MI->getOperand(0).getReg();
1061+
MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
1062+
10611063
Register SrcReg = MI->getOperand(1).getReg();
10621064
unsigned SubReg = MI->getOperand(1).getSubReg();
10631065
const TargetRegisterClass *SrcRC =
@@ -1081,7 +1083,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10811083
Result, *MRI, MI->getOperand(1), SrcRC,
10821084
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
10831085
Register PartialDst =
1084-
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1086+
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
10851087
BuildMI(*MBB, *Result, Result->getDebugLoc(),
10861088
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
10871089
.addReg(PartialSrc);

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ class PrologEpilogSGPRSpillBuilder {
439439

440440
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
441441
TmpVGPR, FI, FrameReg, DwordOff);
442+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
442443
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
443444
.addReg(TmpVGPR, RegState::Kill);
444445
DwordOff += 4;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4500,7 +4500,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
45004500
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
45014501
Register PhiExec = MRI.createVirtualRegister(BoolRC);
45024502
Register NewExec = MRI.createVirtualRegister(BoolRC);
4503-
Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4503+
Register CurrentIdxReg =
4504+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
45044505
Register CondReg = MRI.createVirtualRegister(BoolRC);
45054506

45064507
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
@@ -5164,18 +5165,18 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
51645165
? AMDGPU::S_ADDC_U32
51655166
: AMDGPU::S_SUBB_U32;
51665167
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5167-
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5168+
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51685169
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
51695170
.addReg(Src0.getReg());
51705171
Src0.setReg(RegOp0);
51715172
}
51725173
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5173-
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5174+
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51745175
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
51755176
.addReg(Src1.getReg());
51765177
Src1.setReg(RegOp1);
51775178
}
5178-
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5179+
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51795180
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
51805181
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
51815182
.addReg(Src2.getReg());

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6352,7 +6352,7 @@ static void emitLoadScalarOpsFromVGPRLoop(
63526352
Register VScalarOp = ScalarOp->getReg();
63536353

63546354
if (NumSubRegs == 1) {
6355-
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6355+
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
63566356

63576357
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
63586358
.addReg(VScalarOp);
@@ -6383,8 +6383,10 @@ static void emitLoadScalarOpsFromVGPRLoop(
63836383
"Unhandled register size");
63846384

63856385
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6386-
Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6387-
Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6386+
Register CurRegLo =
6387+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6388+
Register CurRegHi =
6389+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
63886390

63896391
// Read the next variant <- also loop target.
63906392
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
@@ -7440,9 +7442,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
74407442
if (Inst.isCopy() && DstReg.isPhysical() &&
74417443
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
74427444
// TODO: Only works for 32 bit registers.
7443-
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7444-
get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7445-
.add(Inst.getOperand(1));
7445+
if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
7446+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7447+
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
7448+
.add(Inst.getOperand(1));
7449+
} else {
7450+
Register NewDst =
7451+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7452+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7453+
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7454+
.add(Inst.getOperand(1));
7455+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7456+
DstReg)
7457+
.addReg(NewDst);
7458+
}
74467459
Inst.eraseFromParent();
74477460
return;
74487461
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2662,12 +2662,16 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26622662
if (IsSALU && !LiveSCC)
26632663
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
26642664
if (IsSALU && LiveSCC) {
2665-
Register NewDest =
2666-
IsCopy ? ResultReg
2667-
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
2668-
Shift, false, 0);
2669-
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
2670-
NewDest)
2665+
Register NewDest;
2666+
if (IsCopy) {
2667+
MF->getRegInfo().constrainRegClass(ResultReg,
2668+
&AMDGPU::SReg_32_XM0RegClass);
2669+
NewDest = ResultReg;
2670+
} else {
2671+
NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2672+
Shift, false, 0);
2673+
}
2674+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
26712675
.addReg(TmpResultReg);
26722676
ResultReg = NewDest;
26732677
}
@@ -2781,10 +2785,17 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
27812785
.addReg(TmpResultReg);
27822786
}
27832787

2784-
Register NewDest = IsCopy ? ResultReg
2785-
: RS->scavengeRegisterBackwards(
2786-
AMDGPU::SReg_32RegClass, *Add,
2787-
false, 0, /*AllowSpill=*/true);
2788+
Register NewDest;
2789+
if (IsCopy) {
2790+
MF->getRegInfo().constrainRegClass(ResultReg,
2791+
&AMDGPU::SReg_32_XM0RegClass);
2792+
NewDest = ResultReg;
2793+
} else {
2794+
NewDest = RS->scavengeRegisterBackwards(
2795+
AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
2796+
/*AllowSpill=*/true);
2797+
}
2798+
27882799
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
27892800
NewDest)
27902801
.addReg(TmpResultReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
241241
} // End isMoveImm = 1
242242

243243
def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
244-
let DstRC = RegisterOperand<SReg_32>;
244+
let DstRC = RegisterOperand<SReg_32_XM0>;
245245
let Src0RC32 = VRegOrLdsSrc_32;
246246
let Asm32 = " $vdst, $src0";
247247
}

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ body: |
1212
; CHECK-NEXT: ALL VALUES UNIFORM
1313
%0:vgpr_32 = IMPLICIT_DEF
1414
%1:vgpr_32 = IMPLICIT_DEF
15-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
15+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
1616
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
1717
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
1818
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
@@ -33,7 +33,7 @@ body: |
3333
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
3434
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
3535
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
36-
%5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
36+
%5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
3737
S_ENDPGM 0
3838
...
3939

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ body: |
1414
%0:vreg_64 = IMPLICIT_DEF
1515
%1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
1616
%2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
17-
%3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
17+
%3:sreg_32_xm0 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
1818
S_ENDPGM 0
1919
...
2020

@@ -50,7 +50,7 @@ body: |
5050
%1:vreg_64 = IMPLICIT_DEF
5151
%2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
5252
%3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
53-
%4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
53+
%4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
5454
S_ENDPGM 0
5555
...
5656

@@ -104,7 +104,7 @@ body: |
104104
105105
%0:vgpr_32 = IMPLICIT_DEF
106106
%1:vgpr_32 = IMPLICIT_DEF
107-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
107+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
108108
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
109109
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
110110
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec

0 commit comments

Comments
 (0)