Skip to content

Commit ac193bc

Browse files
authored
[AMDGPU][True16][CodeGen] S_PACK_XX_B32_B16 lowering for true16 mode (llvm#162389)
S_PACK_XX_B32_B16 requires special lowering in true16 mode when it is lowered to VALU in the fix-sgpr-copies pass. Added test cases in fix-sgpr-copies-f16-true16.mir.
1 parent 7be2d75 commit ac193bc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+52538
-27539
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9072,6 +9072,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
90729072
MachineOperand &Src1 = Inst.getOperand(2);
90739073
const DebugLoc &DL = Inst.getDebugLoc();
90749074

9075+
if (ST.useRealTrue16Insts()) {
9076+
Register SrcReg0, SrcReg1;
9077+
if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9078+
SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9079+
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9080+
} else {
9081+
SrcReg0 = Src0.getReg();
9082+
}
9083+
9084+
if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9085+
SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9086+
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9087+
} else {
9088+
SrcReg1 = Src1.getReg();
9089+
}
9090+
9091+
bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9092+
bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9093+
9094+
auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9095+
switch (Inst.getOpcode()) {
9096+
case AMDGPU::S_PACK_LL_B32_B16:
9097+
NewMI
9098+
.addReg(SrcReg0, 0,
9099+
isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9100+
.addImm(AMDGPU::lo16)
9101+
.addReg(SrcReg1, 0,
9102+
isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9103+
.addImm(AMDGPU::hi16);
9104+
break;
9105+
case AMDGPU::S_PACK_LH_B32_B16:
9106+
NewMI
9107+
.addReg(SrcReg0, 0,
9108+
isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9109+
.addImm(AMDGPU::lo16)
9110+
.addReg(SrcReg1, 0, AMDGPU::hi16)
9111+
.addImm(AMDGPU::hi16);
9112+
break;
9113+
case AMDGPU::S_PACK_HL_B32_B16:
9114+
NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9115+
.addImm(AMDGPU::lo16)
9116+
.addReg(SrcReg1, 0,
9117+
isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9118+
.addImm(AMDGPU::hi16);
9119+
break;
9120+
case AMDGPU::S_PACK_HH_B32_B16:
9121+
NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9122+
.addImm(AMDGPU::lo16)
9123+
.addReg(SrcReg1, 0, AMDGPU::hi16)
9124+
.addImm(AMDGPU::hi16);
9125+
break;
9126+
default:
9127+
llvm_unreachable("unhandled s_pack_* instruction");
9128+
}
9129+
9130+
MachineOperand &Dest = Inst.getOperand(0);
9131+
MRI.replaceRegWith(Dest.getReg(), ResultReg);
9132+
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9133+
return;
9134+
}
9135+
90759136
switch (Inst.getOpcode()) {
90769137
case AMDGPU::S_PACK_LL_B32_B16: {
90779138
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

llvm/test/CodeGen/AMDGPU/add.v2i16.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,7 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
780780
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
781781
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
782782
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
783-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
783+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
784784
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
785785
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
786786
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -790,11 +790,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
790790
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
791791
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
792792
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
793-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
794-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
795-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
796-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
797-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
793+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
794+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
795+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
798796
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
799797
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
800798
; GFX11-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 14415 additions & 7927 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Lines changed: 1536 additions & 820 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll

Lines changed: 3865 additions & 2029 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)