Skip to content

Commit 4454152

Browse files
authored
AMDGPU: Replace copy-to-mov-imm folding logic with class compat checks (#154501)
This strengthens the check to ensure the new mov's source class is compatible with the source register. This avoids using the register sized based checks in getMovOpcode, which don't quite understand AV superclasses correctly. As a side effect it also enables more folds into true16 movs. getMovOpcode should probably be deleted, or at least replaced with class check based logic. In this particular case other legality checks need to be mixed in with attempted IR changes, so I didn't try to push all of that into the opcode selection.
1 parent d85069c commit 4454152

File tree

7 files changed

+574
-291
lines changed

7 files changed

+574
-291
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 51 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,6 +1248,7 @@ void SIFoldOperandsImpl::foldOperand(
12481248
if (FoldingImmLike && UseMI->isCopy()) {
12491249
Register DestReg = UseMI->getOperand(0).getReg();
12501250
Register SrcReg = UseMI->getOperand(1).getReg();
1251+
unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
12511252
assert(SrcReg.isVirtual());
12521253

12531254
const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
@@ -1278,44 +1279,60 @@ void SIFoldOperandsImpl::foldOperand(
12781279
DestRC = &AMDGPU::SGPR_32RegClass;
12791280
}
12801281

1281-
// In order to fold immediates into copies, we need to change the
1282-
// copy to a MOV.
1282+
// In order to fold immediates into copies, we need to change the copy to a
1283+
// MOV. Find a compatible mov instruction with the value.
1284+
for (unsigned MovOp :
1285+
{AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1286+
AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64}) {
1287+
const MCInstrDesc &MovDesc = TII->get(MovOp);
1288+
assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
1289+
1290+
const TargetRegisterClass *MovDstRC =
1291+
TRI->getRegClass(MovDesc.operands()[0].RegClass);
1292+
1293+
// Fold if the destination register class of the MOV instruction (ResRC)
1294+
// is a superclass of (or equal to) the destination register class of the
1295+
// COPY (DestRC). If this condition fails, folding would be illegal.
1296+
if (!DestRC->hasSuperClassEq(MovDstRC))
1297+
continue;
12831298

1284-
unsigned MovOp = TII->getMovOpcode(DestRC);
1285-
if (MovOp == AMDGPU::COPY)
1286-
return;
1299+
const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1300+
const TargetRegisterClass *MovSrcRC =
1301+
TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
12871302

1288-
// Fold if the destination register class of the MOV instruction (ResRC)
1289-
// is a superclass of (or equal to) the destination register class of the
1290-
// COPY (DestRC). If this condition fails, folding would be illegal.
1291-
const MCInstrDesc &MovDesc = TII->get(MovOp);
1292-
assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
1293-
const TargetRegisterClass *ResRC =
1294-
TRI->getRegClass(MovDesc.operands()[0].RegClass);
1295-
if (!DestRC->hasSuperClassEq(ResRC))
1296-
return;
1303+
if (UseSubReg)
1304+
MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1305+
if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1306+
break;
12971307

1298-
MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
1299-
MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
1300-
while (ImpOpI != ImpOpE) {
1301-
MachineInstr::mop_iterator Tmp = ImpOpI;
1302-
ImpOpI++;
1303-
UseMI->removeOperand(UseMI->getOperandNo(Tmp));
1304-
}
1305-
UseMI->setDesc(TII->get(MovOp));
1306-
1307-
if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1308-
const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1309-
MachineOperand NewSrcOp(SrcOp);
1310-
MachineFunction *MF = UseMI->getParent()->getParent();
1311-
UseMI->removeOperand(1);
1312-
UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1313-
UseMI->addOperand(NewSrcOp); // src0
1314-
UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1315-
UseOpIdx = 2;
1316-
UseOp = &UseMI->getOperand(UseOpIdx);
1308+
MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
1309+
MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
1310+
while (ImpOpI != ImpOpE) {
1311+
MachineInstr::mop_iterator Tmp = ImpOpI;
1312+
ImpOpI++;
1313+
UseMI->removeOperand(UseMI->getOperandNo(Tmp));
1314+
}
1315+
UseMI->setDesc(MovDesc);
1316+
1317+
if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1318+
const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1319+
MachineOperand NewSrcOp(SrcOp);
1320+
MachineFunction *MF = UseMI->getParent()->getParent();
1321+
UseMI->removeOperand(1);
1322+
UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1323+
UseMI->addOperand(NewSrcOp); // src0
1324+
UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1325+
UseOpIdx = SrcIdx;
1326+
UseOp = &UseMI->getOperand(UseOpIdx);
1327+
}
1328+
CopiesToReplace.push_back(UseMI);
1329+
break;
13171330
}
1318-
CopiesToReplace.push_back(UseMI);
1331+
1332+
// We failed to replace the copy, so give up.
1333+
if (UseMI->getOpcode() == AMDGPU::COPY)
1334+
return;
1335+
13191336
} else {
13201337
if (UseMI->isCopy() && OpToFold.isReg() &&
13211338
UseMI->getOperand(0).getReg().isVirtual() &&

llvm/test/CodeGen/AMDGPU/br_cc.f16.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
197197
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l
198198
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_2
199199
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
200-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
200+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3800
201201
; GFX11-TRUE16-NEXT: .LBB1_2: ; %two
202202
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
203203
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
@@ -303,7 +303,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
303303
; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l
304304
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_2
305305
; GFX11-TRUE16-NEXT: ; %bb.1: ; %two
306-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
306+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3800
307307
; GFX11-TRUE16-NEXT: .LBB2_2: ; %one
308308
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
309309
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7

llvm/test/CodeGen/AMDGPU/call-argument-types.ll

Lines changed: 63 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -426,16 +426,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
426426
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
427427
; GFX9-NEXT: s_endpgm
428428
;
429-
; GFX11-LABEL: test_call_external_void_func_i8_imm:
430-
; GFX11: ; %bb.0:
431-
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
432-
; GFX11-NEXT: s_getpc_b64 s[2:3]
433-
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
434-
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
435-
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
436-
; GFX11-NEXT: s_mov_b32 s32, 0
437-
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
438-
; GFX11-NEXT: s_endpgm
429+
; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_imm:
430+
; GFX11-TRUE16: ; %bb.0:
431+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
432+
; GFX11-TRUE16-NEXT: s_getpc_b64 s[2:3]
433+
; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
434+
; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
435+
; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[0:1]
436+
; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0
437+
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
438+
; GFX11-TRUE16-NEXT: s_endpgm
439+
;
440+
; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_imm:
441+
; GFX11-FAKE16: ; %bb.0:
442+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
443+
; GFX11-FAKE16-NEXT: s_getpc_b64 s[2:3]
444+
; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
445+
; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
446+
; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[0:1]
447+
; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0
448+
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
449+
; GFX11-FAKE16-NEXT: s_endpgm
439450
;
440451
; HSA-LABEL: test_call_external_void_func_i8_imm:
441452
; HSA: ; %bb.0:
@@ -723,16 +734,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
723734
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
724735
; GFX9-NEXT: s_endpgm
725736
;
726-
; GFX11-LABEL: test_call_external_void_func_i16_imm:
727-
; GFX11: ; %bb.0:
728-
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
729-
; GFX11-NEXT: s_getpc_b64 s[2:3]
730-
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
731-
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
732-
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
733-
; GFX11-NEXT: s_mov_b32 s32, 0
734-
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
735-
; GFX11-NEXT: s_endpgm
737+
; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_imm:
738+
; GFX11-TRUE16: ; %bb.0:
739+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
740+
; GFX11-TRUE16-NEXT: s_getpc_b64 s[2:3]
741+
; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
742+
; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
743+
; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[0:1]
744+
; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0
745+
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
746+
; GFX11-TRUE16-NEXT: s_endpgm
747+
;
748+
; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_imm:
749+
; GFX11-FAKE16: ; %bb.0:
750+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
751+
; GFX11-FAKE16-NEXT: s_getpc_b64 s[2:3]
752+
; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
753+
; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
754+
; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[0:1]
755+
; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0
756+
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
757+
; GFX11-FAKE16-NEXT: s_endpgm
736758
;
737759
; HSA-LABEL: test_call_external_void_func_i16_imm:
738760
; HSA: ; %bb.0:
@@ -1642,16 +1664,27 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
16421664
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
16431665
; GFX9-NEXT: s_endpgm
16441666
;
1645-
; GFX11-LABEL: test_call_external_void_func_f16_imm:
1646-
; GFX11: ; %bb.0:
1647-
; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400
1648-
; GFX11-NEXT: s_getpc_b64 s[2:3]
1649-
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
1650-
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
1651-
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
1652-
; GFX11-NEXT: s_mov_b32 s32, 0
1653-
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
1654-
; GFX11-NEXT: s_endpgm
1667+
; GFX11-TRUE16-LABEL: test_call_external_void_func_f16_imm:
1668+
; GFX11-TRUE16: ; %bb.0:
1669+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4400
1670+
; GFX11-TRUE16-NEXT: s_getpc_b64 s[2:3]
1671+
; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
1672+
; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
1673+
; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[0:1]
1674+
; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0
1675+
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
1676+
; GFX11-TRUE16-NEXT: s_endpgm
1677+
;
1678+
; GFX11-FAKE16-LABEL: test_call_external_void_func_f16_imm:
1679+
; GFX11-FAKE16: ; %bb.0:
1680+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x4400
1681+
; GFX11-FAKE16-NEXT: s_getpc_b64 s[2:3]
1682+
; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
1683+
; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
1684+
; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[0:1]
1685+
; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0
1686+
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
1687+
; GFX11-FAKE16-NEXT: s_endpgm
16551688
;
16561689
; HSA-LABEL: test_call_external_void_func_f16_imm:
16571690
; HSA: ; %bb.0:

0 commit comments

Comments
 (0)