88 changes: 84 additions & 4 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,8 +12,11 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

@@ -576,6 +579,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}

MachineOperand *New = Fold.OpToFold;
// Rework once the VS_16 register class is updated to include proper
// 16-bit SGPRs instead of 32-bit ones.
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
Old.setSubReg(AMDGPU::NoSubRegister);
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
Old.setIsUndef(New->isUndef());
return true;
@@ -947,9 +954,15 @@ void SIFoldOperandsImpl::foldOperand(
return;

// FIXME: Fold operands with subregs.
if (UseOp->isReg() && OpToFold.isReg() &&
(UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
return;
if (UseOp->isReg() && OpToFold.isReg()) {
if (UseOp->isImplicit())
return;
// Allow folding from SGPRs to 16-bit VGPRs.
if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
(UseOp->getSubReg() != AMDGPU::lo16 ||
!TRI->isSGPRReg(*MRI, OpToFold.getReg())))
return;
}

// Special case for REG_SEQUENCE: We can't fold literals into
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1040,6 +1053,14 @@ void SIFoldOperandsImpl::foldOperand(
}
}

// Allow immediates COPYd into sgpr_lo16 to be further folded while
// still being legal if not further folded

Contributor: Is COPYd intentional?

Contributor: Yes, to indicate the use of the COPY instruction. But if it's confusing I'd be happy to change it.

if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
assert(ST->useRealTrue16Insts());
MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
Contributor: You should not modify the register class of the value until fully committing the rewrite of the instruction. This is speculatively modifying the class, so we may change the class and then not perform the fold based on the conditions below.

DestRC = &AMDGPU::SGPR_32RegClass;
}

// In order to fold immediates into copies, we need to change the
// copy to a MOV.

@@ -1073,9 +1094,43 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
unsigned SubRegIdx = OpToFold.getSubReg();
// Hack to allow 32-bit SGPRs to be folded into True16 instructions
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
// VS_16RegClass
//
// Excerpt from AMDGPUGenRegisterInfo.inc
// NoSubRegister, //0
// hi16, // 1
// lo16, // 2
// sub0, // 3
// ...
// sub1, // 11
// sub1_hi16, // 12
// sub1_lo16, // 13
static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
Collaborator: Is there no way to avoid the hardcoded value 12 here? These fields are autogenerated and they are bound to change.

Contributor: The point of the hardcoded value here is as an alarm that something has changed. The folding code below will not work if the indices change, and it is difficult to predict how they would change. In that sense being brittle is a feature, rather than a bug. That said, the code has existed for almost 2 years downstream, and these values haven't changed in that time.

Collaborator: I see your point, and I don't have a better solution to recommend here. But it looks like any sub-reg layout change in the future will need a fixup to this hardcoded value.

Contributor: I don't see how the code below depends on the exact values cited in this comment. Is it just the condition SubRegIdx > AMDGPU::sub1, which could be written as SubRegIdx == AMDGPU::sub1_hi16 || SubRegIdx == AMDGPU::sub1_lo16?

Contributor: No, SubRegIdx can be lo16, sub0_lo16, sub1_lo16, sub2_lo16, sub3_lo16, sub4_lo16, etc. I'm pretty sure SubRegIdx is always a lo16 variant. It also requires that the order goes ... subX, subX_hi16, subX_lo16 ..., so we add subX_hi16 to subX_lo16 to form M.
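
As a rough illustration of that last point, here is a small standalone sketch of the lane-mask step. It is not LLVM API: the lane numbering (the hi16 lane sitting one below the lo16 lane) and the mask values are assumptions chosen only to mimic the generated ... subX, subX_hi16, subX_lo16 ... layout, while the enum values for sub1/sub1_hi16/sub1_lo16 follow the excerpt above.

// Toy model of the covering-subreg computation; lane masks here are made up.
#include <cassert>
#include <cstdint>

enum SubReg : unsigned {
  NoSubRegister = 0,
  sub1 = 11,
  sub1_hi16 = 12, // lane 2 in this toy numbering
  sub1_lo16 = 13  // lane 3 in this toy numbering
};

// Lane mask for each index: a 32-bit subreg covers both of its 16-bit lanes.
static uint32_t laneMask(SubReg Idx) {
  switch (Idx) {
  case sub1_hi16: return 0b0100;
  case sub1_lo16: return 0b1000;
  case sub1:      return 0b1100;
  default:        return 0;
  }
}

int main() {
  SubReg SubRegIdx = sub1_lo16;
  uint32_t M = laneMask(SubRegIdx);
  // Mirrors M |= M.getLane(M.getHighestLane() - 1): OR in the hi16 lane that
  // sits one below the single lo16 lane, so M now covers the 32-bit subreg.
  M |= M >> 1;
  assert(M == laneMask(sub1) && "mask covers exactly one 32-bit subreg");
  return 0;
}

In the actual pass, getCoveringSubRegIndexes then maps that widened mask back to the single 32-bit subregister index (sub1 in this example).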

if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isSGPRReg(*MRI, UseReg)) {
// Produce the 32 bit subregister index to which the 16-bit subregister
// is aligned.
if (SubRegIdx > AMDGPU::sub1) {
LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
M |= M.getLane(M.getHighestLane() - 1);
SmallVector<unsigned, 4> Indexes;
TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
Indexes);
assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
SubRegIdx = Indexes[0];
// 32-bit registers do not have a sub0 index
} else if (TII->getOpSize(*UseMI, 1) == 4)
SubRegIdx = 0;
else
SubRegIdx = AMDGPU::sub0;
}
UseMI->getOperand(1).setSubReg(SubRegIdx);
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
@@ -1713,6 +1768,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
return false;

// True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
// Can remove this code if proper 16-bit SGPRs are implemented
// Example: Pre-peephole-opt
// %29:sgpr_lo16 = COPY %16.lo16:sreg_32
// %32:sreg_32 = COPY %29:sgpr_lo16
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
// Post-peephole-opt and DCE
// %32:sreg_32 = COPY %16.lo16:sreg_32
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
// After this transform
// %32:sreg_32 = COPY %16:sreg_32
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
// After the fold operands pass
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
OpToFold.getSubReg()) {
const TargetRegisterClass *DstRC =
MRI->getRegClass(MI.getOperand(0).getReg());
if (DstRC == &AMDGPU::SReg_32RegClass &&
DstRC == MRI->getRegClass(OpToFold.getReg())) {
assert(OpToFold.getSubReg() == AMDGPU::lo16);
OpToFold.setSubReg(0);
}
}

// Prevent folding operands backwards in the function. For example,
// the COPY opcode must not be replaced by 1 in this example:
//
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
[], /*VOP1Only=*/ 1>;
let isAsCheapAsAMove = 1 in
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
67 changes: 29 additions & 38 deletions llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38819,16 +38819,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX11TRUE16-LABEL: s_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
@@ -38936,19 +38934,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40655,30 +40651,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16: