Skip to content

Commit 254c576

Browse files
committed
16-bit SGPR folding
1 parent 0f1175c commit 254c576

25 files changed

+704
-1058
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 91 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
#include "AMDGPU.h"
1313
#include "GCNSubtarget.h"
1414
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15+
#include "SIInstrInfo.h"
1516
#include "SIMachineFunctionInfo.h"
17+
#include "SIRegisterInfo.h"
1618
#include "llvm/ADT/DepthFirstIterator.h"
19+
#include "llvm/CodeGen/MachineFunction.h"
1720
#include "llvm/CodeGen/MachineFunctionPass.h"
1821
#include "llvm/CodeGen/MachineOperand.h"
1922

@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
576579
}
577580

578581
MachineOperand *New = Fold.OpToFold;
582+
// TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
583+
// Rework once the VS_16 register class is updated to include proper
584+
// 16-bit SGPRs instead of 32-bit ones.
585+
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
586+
Old.setSubReg(AMDGPU::NoSubRegister);
579587
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
580588
Old.setIsUndef(New->isUndef());
581589
return true;
@@ -947,9 +955,15 @@ void SIFoldOperandsImpl::foldOperand(
947955
return;
948956

949957
// FIXME: Fold operands with subregs.
950-
if (UseOp->isReg() && OpToFold.isReg() &&
951-
(UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
952-
return;
958+
if (UseOp->isReg() && OpToFold.isReg()) {
959+
if (UseOp->isImplicit())
960+
return;
961+
// Allow folding from SGPRs to 16-bit VGPRs.
962+
if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
963+
(UseOp->getSubReg() != AMDGPU::lo16 ||
964+
!TRI->isSGPRReg(*MRI, OpToFold.getReg())))
965+
return;
966+
}
953967

954968
// Special case for REG_SEQUENCE: We can't fold literals into
955969
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1030,14 +1044,20 @@ void SIFoldOperandsImpl::foldOperand(
10301044
return;
10311045

10321046
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1033-
if (!DestReg.isPhysical()) {
1034-
if (DestRC == &AMDGPU::AGPR_32RegClass &&
1035-
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1036-
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
1037-
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1038-
CopiesToReplace.push_back(UseMI);
1039-
return;
1040-
}
1047+
if (DestRC == &AMDGPU::AGPR_32RegClass &&
1048+
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1049+
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
1050+
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1051+
CopiesToReplace.push_back(UseMI);
1052+
return;
1053+
}
1054+
1055+
// Allow immediates COPY'd into sgpr_lo16 to be folded further, while
1056+
// still remaining legal if they are not folded further
1057+
if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
1058+
assert(ST->useRealTrue16Insts());
1059+
MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
1060+
DestRC = &AMDGPU::SGPR_32RegClass;
10411061
}
10421062

10431063
// In order to fold immediates into copies, we need to change the
@@ -1073,9 +1093,43 @@ void SIFoldOperandsImpl::foldOperand(
10731093
UseMI->getOperand(0).getReg().isVirtual() &&
10741094
!UseMI->getOperand(1).getSubReg()) {
10751095
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
1096+
unsigned Size = TII->getOpSize(*UseMI, 1);
10761097
Register UseReg = OpToFold.getReg();
10771098
UseMI->getOperand(1).setReg(UseReg);
1078-
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1099+
unsigned SubRegIdx = OpToFold.getSubReg();
1100+
// Hack to allow 32-bit SGPRs to be folded into True16 instructions
1101+
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1102+
// VS_16RegClass
1103+
//
1104+
// Excerpt from AMDGPUGenRegisterInfo.inc
1105+
// NoSubRegister, //0
1106+
// hi16, // 1
1107+
// lo16, // 2
1108+
// sub0, // 3
1109+
// ...
1110+
// sub1, // 11
1111+
// sub1_hi16, // 12
1112+
// sub1_lo16, // 13
1113+
static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1114+
if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1115+
TRI->isSGPRReg(*MRI, UseReg)) {
1116+
// Produce the 32 bit subregister index to which the 16-bit subregister
1117+
// is aligned.
1118+
if (SubRegIdx > AMDGPU::sub1) {
1119+
LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1120+
M |= M.getLane(M.getHighestLane() - 1);
1121+
SmallVector<unsigned, 4> Indexes;
1122+
TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1123+
Indexes);
1124+
assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1125+
SubRegIdx = Indexes[0];
1126+
// 32-bit registers do not have a sub0 index
1127+
} else if (TII->getOpSize(*UseMI, 1) == 4)
1128+
SubRegIdx = 0;
1129+
else
1130+
SubRegIdx = AMDGPU::sub0;
1131+
}
1132+
UseMI->getOperand(1).setSubReg(SubRegIdx);
10791133
UseMI->getOperand(1).setIsKill(false);
10801134
CopiesToReplace.push_back(UseMI);
10811135
OpToFold.setIsKill(false);
@@ -1713,6 +1767,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
17131767
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
17141768
return false;
17151769

1770+
// True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
1771+
// Can remove this code if proper 16-bit SGPRs are implemented
1772+
// Example: Pre-peephole-opt
1773+
// %29:sgpr_lo16 = COPY %16.lo16:sreg_32
1774+
// %32:sreg_32 = COPY %29:sgpr_lo16
1775+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1776+
// Post-peephole-opt and DCE
1777+
// %32:sreg_32 = COPY %16.lo16:sreg_32
1778+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1779+
// After this transform
1780+
// %32:sreg_32 = COPY %16:sreg_32
1781+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1782+
// After the fold operands pass
1783+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
1784+
if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
1785+
OpToFold.getSubReg()) {
1786+
const TargetRegisterClass *DstRC =
1787+
MRI->getRegClass(MI.getOperand(0).getReg());
1788+
if (DstRC == &AMDGPU::SReg_32RegClass &&
1789+
DstRC == MRI->getRegClass(OpToFold.getReg())) {
1790+
assert(OpToFold.getSubReg() == AMDGPU::lo16);
1791+
OpToFold.setSubReg(0);
1792+
}
1793+
}
1794+
17161795
// Prevent folding operands backwards in the function. For example,
17171796
// the COPY opcode must not be replaced by 1 in this example:
17181797
//

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -38259,16 +38259,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3825938259
; GFX11TRUE16-LABEL: s_select_v2bf16:
3826038260
; GFX11TRUE16: ; %bb.0:
3826138261
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
38262-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
3826338262
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
38264-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38265-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
38266-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38267-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38268-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38269-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
38270-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38271-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38263+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
38264+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
38265+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38266+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38267+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
38268+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38269+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
3827238270
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
3827338271
; GFX11TRUE16-NEXT: ; return to shader part epilog
3827438272
;
@@ -38376,19 +38374,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3837638374
;
3837738375
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
3837838376
; GFX11TRUE16: ; %bb.0:
38379-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
38380-
; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
38377+
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
3838138378
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3838238379
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3838338380
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38384-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
38385-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38386-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38387-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38388-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
38389-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38390-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38391-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
38381+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
38382+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38383+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38384+
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
38385+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38386+
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
38387+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
3839238388
; GFX11TRUE16-NEXT: ; return to shader part epilog
3839338389
;
3839438390
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40095,30 +40091,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
4009540091
;
4009640092
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
4009740093
; GFX11TRUE16: ; %bb.0:
40098-
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
40094+
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
40095+
; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
4009940096
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4010040097
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
40101-
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
40102-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40103-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
40104-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
40105-
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
4010640098
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
4010740099
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
40108-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
40109-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
40110-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
40111-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
40112-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
40113-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
40114-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
40115-
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
40116-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
40117-
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
40118-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
40100+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40101+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
40102+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
40103+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
40104+
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
40105+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
40106+
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
40107+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
40108+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
40109+
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
4011940110
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
40120-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
40121-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
40111+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
40112+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
4012240113
; GFX11TRUE16-NEXT: ; return to shader part epilog
4012340114
;
4012440115
; GFX11FAKE16-LABEL: s_vselect_v4bf16:

0 commit comments

Comments
 (0)