
Commit 5341a84

[AMDGPU][SIFoldOperand] Hoist readlane through some instructions
SIFoldOperands version of llvm#129037. This handles a limited number of opcodes because going from VALU to SALU isn't trivial and we don't have a helper for it. I looked at our test suite and added all opcodes that were eligible and appeared as v_read(first)lane operands.
1 parent a557861 commit 5341a84

14 files changed: +1200 −859 lines
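The effect of the fold is easiest to see in MIR. Below is a minimal before/after sketch using a right shift as the example VALU op; the register names and classes are illustrative, and any opcode handled by the new getScalarizedReadLaneSrcOpc table behaves the same way:

Before:
  %0:vgpr_32 = V_LSHRREV_B32_e64 8, %x:vgpr_32, implicit $exec
  %1:sreg_32 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec

After:
  %0:sreg_32 = V_READFIRSTLANE_B32 %x:vgpr_32, implicit $exec
  %1:sreg_32 = S_LSHR_B32 %0:sreg_32, 8, implicit-def $scc

The ALU work moves from a per-lane vector instruction to a single scalar instruction, and the intermediate value no longer occupies a VGPR.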

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 148 additions & 0 deletions
@@ -126,6 +126,7 @@ class SIFoldOperandsImpl {
   std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryScalarizeReadLaneSrc(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

@@ -1407,6 +1408,148 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
   return true;
 }
 
+static unsigned
+getScalarizedReadLaneSrcOpc(const GCNSubtarget &ST, unsigned Opc,
+                            SmallVectorImpl<MachineOperand *> &Ops) {
+  // Opcodes here are added as-needed because there are hundreds of
+  // instructions we could convert, but realistically we only need
+  // the most frequent ones to make an impact.
+  //
+  // The InstCombine version of this transform will do the heavy
+  // lifting, this is just a cleanup for the readlanes added during
+  // lowering.
+  switch (Opc) {
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64:
+    return AMDGPU::S_OR_B32;
+  case AMDGPU::V_MUL_HI_U32_e64:
+    if (ST.getGeneration() >= GCNSubtarget::GFX9)
+      return AMDGPU::S_MUL_HI_U32;
+    break;
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+    return AMDGPU::S_AND_B32;
+  case AMDGPU::V_LSHRREV_B32_e32: // dst = S1 >> S0
+  case AMDGPU::V_LSHRREV_B32_e64:
+    std::swap(Ops[0], Ops[1]); // dst = S0 >> S1 (!)
+    return AMDGPU::S_LSHR_B32;
+  case AMDGPU::V_CVT_U32_F32_e32:
+  case AMDGPU::V_CVT_U32_F32_e64:
+    if (ST.hasSALUFloatInsts())
+      return AMDGPU::S_CVT_U32_F32;
+    break;
+  case AMDGPU::V_MIN_U32_e32:
+  case AMDGPU::V_MIN_U32_e64:
+    return AMDGPU::S_MIN_U32;
+  case AMDGPU::V_MIN_I32_e32:
+  case AMDGPU::V_MIN_I32_e64:
+    return AMDGPU::S_MIN_I32;
+  case AMDGPU::V_MAX_U32_e32:
+  case AMDGPU::V_MAX_U32_e64:
+    return AMDGPU::S_MAX_U32;
+  case AMDGPU::V_MAX_I32_e32:
+  case AMDGPU::V_MAX_I32_e64:
+    return AMDGPU::S_MAX_I32;
+  default:
+    break;
+  }
+
+  return -1;
+}
+
+// Try to transform
+//   %0:vgpr = (valu op) %x:vgpr
+//   %1:sgpr = v_readfirstlane %0
+// Into
+//   %0:sgpr = v_readfirstlane %x:vgpr
+//   %1:sgpr = (salu op) %0
+bool SIFoldOperandsImpl::tryScalarizeReadLaneSrc(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  if (Opc != AMDGPU::V_READFIRSTLANE_B32 && Opc != AMDGPU::V_READLANE_B32)
+    return false;
+
+  const auto VSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  const Register VSrc = MI.getOperand(VSrcIdx).getReg();
+
+  if (!MRI->hasOneNonDBGUse(VSrc))
+    return false;
+
+  MachineInstr *VSrcDef = MRI->getVRegDef(VSrc);
+  // Need a unary or binary VALU instruction as operand.
+  if (!VSrcDef || (VSrcDef->getParent() != MI.getParent()) ||
+      !TII->isVALU(*VSrcDef) || VSrcDef->getNumExplicitOperands() > 3 ||
+      execMayBeModifiedBeforeUse(*MRI, VSrc, *VSrcDef, MI))
+    return false;
+
+  const bool IsReadLane = (Opc == AMDGPU::V_READLANE_B32);
+  if (IsReadLane) {
+    MachineOperand &LaneOp = MI.getOperand(2);
+    if (LaneOp.isReg()) { // Can the lane be an imm?
+      Register LaneReg = LaneOp.getReg();
+      for (auto It = VSrcDef->getIterator(); It != MI.getIterator(); ++It) {
+        if (It->modifiesRegister(LaneReg, TRI))
+          return false;
+      }
+    }
+  }
+
+  SmallVector<MachineOperand *, 2> Ops;
+  MachineOperand *TargetOp = nullptr;
+  for (MachineOperand &SrcOp : VSrcDef->operands()) {
+    if (SrcOp.isReg()) {
+      if (SrcOp.isImplicit() || SrcOp.isDef())
+        continue;
+
+      Ops.push_back(&SrcOp);
+
+      Register Reg = SrcOp.getReg();
+      if (TRI->isVectorRegister(*MRI, Reg)) {
+        // This only works if we have one VGPR src.
+        if (TargetOp)
+          return false;
+        TargetOp = &SrcOp;
+      }
+    } else {
+      Ops.push_back(&SrcOp); // also collect imms
+    }
+  }
+  if (!TargetOp)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "tryScalarizeReadLaneSrc:\n\treadlane: " << MI
+                    << "\tsrc: " << *VSrcDef << "\top: " << *TargetOp << "\n");
+
+  const unsigned ScalarOp =
+      getScalarizedReadLaneSrcOpc(*ST, VSrcDef->getOpcode(), Ops);
+  if (ScalarOp == unsigned(-1))
+    return false;
+
+  // We only support unary/binary ops.
+  assert(Ops.size() <= 2);
+
+  MachineBasicBlock *MBB = VSrcDef->getParent();
+  auto InsertBefore = VSrcDef->getIterator();
+  const DebugLoc &DL = VSrcDef->getDebugLoc();
+  Register SDst = MI.getOperand(0).getReg();
+
+  Register STargetOp = MRI->createVirtualRegister(MRI->getRegClass(SDst));
+  auto NewMI = BuildMI(*MBB, InsertBefore, DL, MI.getDesc(), STargetOp)
+                   .addReg(TargetOp->getReg());
+  if (IsReadLane)
+    NewMI.add(MI.getOperand(2)); // lane index
+  auto ScalarMI = BuildMI(*MBB, InsertBefore, DL, TII->get(ScalarOp), SDst);
+  for (MachineOperand *Op : Ops) {
+    if (Op == TargetOp)
+      ScalarMI.addReg(STargetOp);
+    else
+      ScalarMI.add(*Op);
+  }
+
+  VSrcDef->eraseFromParent();
+  MI.eraseFromParent();
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
       MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -2353,6 +2496,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
     for (auto &MI : make_early_inc_range(*MBB)) {
       Changed |= tryFoldCndMask(MI);
 
+      if (tryScalarizeReadLaneSrc(MI)) {
+        Changed = true;
+        continue;
+      }
+
       if (tryFoldZeroHighBits(MI)) {
         Changed = true;
         continue;
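One subtlety in the opcode table above: v_lshrrev takes its shift amount in src0 (dst = src1 >> src0), while s_lshr_b32 uses the conventional operand order (dst = src0 >> src1), which is why that case swaps the two collected operands before returning. A small sketch of the equivalence, with illustrative registers:

  v_lshrrev_b32_e32 v0, 8, v1    ; VALU: v0 = v1 >> 8 (shift amount in src0)
  s_lshr_b32 s0, s1, 8           ; SALU: s0 = s1 >> 8 (value in src0)

Without the swap, the scalarized form would compute the shift amount shifted by the value instead.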

llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll

Lines changed: 10 additions & 10 deletions
@@ -730,18 +730,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
 ; GFX6-NEXT:    s_not_b32 s5, s3
-; GFX6-NEXT:    s_min_u32 s4, s5, s4
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX6-NEXT:    s_min_u32 s4, s5, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 24
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s0, s2, 16
-; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 24
-; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX6-NEXT:    s_or_b32 s0, s1, s0
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_v4i8:
@@ -1020,8 +1020,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0 clamp
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_i24:
@@ -1030,17 +1030,17 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_uaddsat_i24:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v0, s0, s1 clamp
-; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
   ret i24 %result
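For context, the s_uaddsat_i24 checks above cover a test of roughly this shape (reconstructed from the hunk header and trailing context lines, so treat it as a sketch rather than the full file):

define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
  %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
  ret i24 %result
}

Both inputs are uniform (inreg), but the saturating add is only available on the VALU (via the clamp modifier), so codegen shifts the values up, clamps on the VALU, and reads the result back with v_readfirstlane. The new fold moves the trailing logical shift right from the VALU to the SALU side of that read, as the diffs above show.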

llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll

Lines changed: 10 additions & 10 deletions
@@ -714,18 +714,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
-; GFX6-NEXT:    s_min_u32 s4, s3, s4
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX6-NEXT:    s_min_u32 s4, s3, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 24
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s0, s2, 16
-; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 24
-; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX6-NEXT:    s_or_b32 s0, s1, s0
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v4i8:
@@ -1002,8 +1002,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_i24:
@@ -1012,17 +1012,17 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_i24:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s1 clamp
-; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
   ret i24 %result
