Commit 52ed03d

AMDGPU: Simplify foldImmediate with register class based checks (llvm#154682)
Generalize the code over the properties of the mov instruction, rather than maintaining parallel logic to figure out which type of mov to use. I've maintained the behavior with 16-bit physical SGPRs, though I think that behavior is broken and corrupts any value that happens to be live in the high bits. It just happens there is no way to separately write those bits with a real instruction, but I don't think we should be building assumptions around that property.

This is NFC-ish: it now does a better job with imm pseudos, which practically won't reach here, and it will make it easier to support more folds in a future patch. I added a couple of new tests with 16-bit extracts of 64-bit sources.
1 parent 4325a07 commit 52ed03d
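The heart of the patch is the candidate loop in the diff below: rather than computing the mov opcode up front from the destination's size and bank, it walks a fixed list of mov opcodes (S_MOV_B32, V_MOV_B32_e32, S_MOV_B64, V_MOV_B64_PSEUDO, V_ACCVGPR_WRITE_B32_e64) and keeps the first one whose destination register class can be matched and whose immediate operand can encode the (possibly subreg-extracted) value. A minimal standalone sketch of that selection pattern, where `MovCandidate`, `pickMov`, and the inline-constant range are toy stand-ins for this illustration rather than the real MachineInstr/MCInstrDesc APIs:

```cpp
// Illustrative sketch only: MovCandidate, pickMov, and the toy inline-constant
// rule are hypothetical; the actual patch iterates MCInstrDesc/register-class
// data for the real AMDGPU mov opcodes.
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct MovCandidate {
  std::string Name;     // mnemonic of the candidate mov
  unsigned DstSizeBits; // size of its destination register class
  bool IsVectorBank;    // VGPR/AGPR-like vs. SGPR-like destination
  bool AllowsLiteral;   // whether a full literal immediate is acceptable
};

// Pick the first candidate whose destination class matches the fold target and
// whose immediate operand can hold the value; mirrors the "try each mov and
// move on to the next on a mismatch" structure of the patch.
std::optional<MovCandidate>
pickMov(const std::vector<MovCandidate> &Candidates, unsigned WantBits,
        bool WantVectorBank, int64_t Imm) {
  for (const MovCandidate &C : Candidates) {
    if (C.DstSizeBits != WantBits || C.IsVectorBank != WantVectorBank)
      continue; // wrong register class, try the next mov
    bool IsInline = Imm >= -16 && Imm <= 64; // toy inline-constant rule
    if (!C.AllowsLiteral && !IsInline)
      continue; // immediate not encodable by this mov
    return C;
  }
  return std::nullopt; // no mov fits, so the fold is rejected
}

int main() {
  std::vector<MovCandidate> Movs = {
      {"S_MOV_B32", 32, false, true},      {"V_MOV_B32_e32", 32, true, true},
      {"S_MOV_B64", 64, false, true},      {"V_MOV_B64_PSEUDO", 64, true, true},
      {"ACCVGPR_WRITE_B32", 32, true, false}};
  if (auto M = pickMov(Movs, 64, false, 1234))
    std::cout << "fold with " << M->Name << "\n"; // prints S_MOV_B64
}
```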

2 files changed: +138 −42 lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 75 additions & 36 deletions
@@ -3573,54 +3573,93 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
   assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
 
   Register DstReg = UseMI.getOperand(0).getReg();
-  unsigned OpSize = getOpSize(UseMI, 0);
-  bool Is16Bit = OpSize == 2;
-  bool Is64Bit = OpSize == 8;
-  bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
-  unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
-                                         : AMDGPU::V_MOV_B32_e32
-                               : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
-                                         : AMDGPU::S_MOV_B32;
-
-  std::optional<int64_t> SubRegImm =
-      extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg());
-
-  APInt Imm(Is64Bit ? 64 : 32, *SubRegImm,
-            /*isSigned=*/true, /*implicitTrunc=*/true);
-
-  if (RI.isAGPR(*MRI, DstReg)) {
-    if (Is64Bit || !isInlineConstant(Imm))
-      return false;
-    NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
-  }
+  Register UseSubReg = UseMI.getOperand(1).getSubReg();
+
+  const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
+
+  bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
+                 RI.getSubRegIdxSize(UseSubReg) == 16;
 
   if (Is16Bit) {
-    if (isVGPRCopy)
+    if (RI.hasVGPRs(DstRC))
       return false; // Do not clobber vgpr_hi16
 
-    if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+    if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
       return false;
-
-    UseMI.getOperand(0).setSubReg(0);
-    if (DstReg.isPhysical()) {
-      DstReg = RI.get32BitRegister(DstReg);
-      UseMI.getOperand(0).setReg(DstReg);
-    }
-    assert(UseMI.getOperand(1).getReg().isVirtual());
   }
 
   MachineFunction *MF = UseMI.getMF();
-  const MCInstrDesc &NewMCID = get(NewOpc);
-  const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF);
 
-  if (DstReg.isPhysical()) {
-    if (!NewDefRC->contains(DstReg))
-      return false;
-  } else if (!MRI->constrainRegClass(DstReg, NewDefRC))
+  unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
+  MCRegister MovDstPhysReg =
+      DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
+
+  std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
+
+  // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
+  for (unsigned MovOp :
+       {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+        AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
+    const MCInstrDesc &MovDesc = get(MovOp);
+
+    const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
+    if (Is16Bit) {
+      // We just need to find a correctly sized register class, so the
+      // subregister index compatibility doesn't matter since we're statically
+      // extracting the immediate value.
+      MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
+      if (!MovDstRC)
+        continue;
+
+      if (MovDstPhysReg) {
+        // FIXME: We probably should not do this. If there is a live value in
+        // the high half of the register, it will be corrupted.
+        MovDstPhysReg =
+            RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
+        if (!MovDstPhysReg)
+          continue;
+      }
+    }
+
+    // Result class isn't the right size, try the next instruction.
+    if (MovDstPhysReg) {
+      if (!MovDstRC->contains(MovDstPhysReg))
+        return false;
+    } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
+      // TODO: This will be overly conservative in the case of 16-bit virtual
+      // SGPRs. We could hack up the virtual register uses to use a compatible
+      // 32-bit class.
+      continue;
+    }
+
+    const MCOperandInfo &OpInfo = MovDesc.operands()[1];
+
+    // Ensure the interpreted immediate value is a valid operand in the new
+    // mov.
+    //
+    // FIXME: isImmOperandLegal should have form that doesn't require existing
+    // MachineInstr or MachineOperand
+    if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
+        !isInlineConstant(*SubRegImm, OpInfo.OperandType))
+      break;
+
+    NewOpc = MovOp;
+    break;
+  }
+
+  if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
     return false;
 
+  if (Is16Bit) {
+    UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+    if (MovDstPhysReg)
+      UseMI.getOperand(0).setReg(MovDstPhysReg);
+    assert(UseMI.getOperand(1).getReg().isVirtual());
+  }
+
+  const MCInstrDesc &NewMCID = get(NewOpc);
   UseMI.setDesc(NewMCID);
-  UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+  UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
   UseMI.addImplicitDefUseOperands(*MF);
   return true;
 }

llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir

Lines changed: 63 additions & 6 deletions
@@ -188,8 +188,7 @@ body: |
 
     ; GCN-LABEL: name: fold_sreg_64_to_sreg_64
    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-    ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B64_]]
     %0:sreg_64 = S_MOV_B64 1311768467750121200
     %1:sreg_64 = COPY killed %0
     SI_RETURN_TO_EPILOG %1
@@ -761,8 +760,8 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b32_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]]
     %0:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
     %1:av_32 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
@@ -800,9 +799,67 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
     %1:av_64_align2 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
 ...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].lo16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG %1
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].hi16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 64
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...