@@ -126,6 +126,7 @@ class SIFoldOperandsImpl {
   std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryScalarizeReadLaneSrc(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
@@ -1407,6 +1408,148 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
   return true;
 }
 
+static unsigned
+getScalarizedReadLaneSrcOpc(const GCNSubtarget &ST, unsigned Opc,
+                            SmallVectorImpl<MachineOperand *> &Ops) {
+  // Opcodes here are added as-needed because there are hundreds of
+  // instructions we could convert, but realistically we only need
+  // the most frequent ones to make an impact.
+  //
+  // The InstCombine version of this transform will do the heavy
+  // lifting, this is just a cleanup for the readlanes added during
+  // lowering.
+  switch (Opc) {
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64:
+    return AMDGPU::S_OR_B32;
+  case AMDGPU::V_MUL_HI_U32_e64:
+    if (ST.getGeneration() >= GCNSubtarget::GFX9)
+      return AMDGPU::S_MUL_HI_U32;
+    break;
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+    return AMDGPU::S_AND_B32;
+  case AMDGPU::V_LSHRREV_B32_e32: // dst = S1 >> S0
+  case AMDGPU::V_LSHRREV_B32_e64:
+    std::swap(Ops[0], Ops[1]); // dst = S0 >> S1 (!)
+    return AMDGPU::S_LSHR_B32;
+  case AMDGPU::V_CVT_U32_F32_e32:
+  case AMDGPU::V_CVT_U32_F32_e64:
+    if (ST.hasSALUFloatInsts())
+      return AMDGPU::S_CVT_U32_F32;
+    break;
+  case AMDGPU::V_MIN_U32_e32:
+  case AMDGPU::V_MIN_U32_e64:
+    return AMDGPU::S_MIN_U32;
+  case AMDGPU::V_MIN_I32_e32:
+  case AMDGPU::V_MIN_I32_e64:
+    return AMDGPU::S_MIN_I32;
+  case AMDGPU::V_MAX_U32_e32:
+  case AMDGPU::V_MAX_U32_e64:
+    return AMDGPU::S_MAX_U32;
+  case AMDGPU::V_MAX_I32_e32:
+  case AMDGPU::V_MAX_I32_e64:
+    return AMDGPU::S_MAX_I32;
+  default:
+    break;
+  }
+
+  return -1;
+}
+
+// Try to transform
+//   %0:vgpr = (valu op) %x:vgpr
+//   %1:sgpr = v_readfirstlane %0
+// Into
+//   %0:sgpr = v_readfirstlane %x:vgpr
+//   %1:sgpr = (salu op) %0
+bool SIFoldOperandsImpl::tryScalarizeReadLaneSrc(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  if (Opc != AMDGPU::V_READFIRSTLANE_B32 && Opc != AMDGPU::V_READLANE_B32)
+    return false;
+
+  const auto VSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  const Register VSrc = MI.getOperand(VSrcIdx).getReg();
+
+  if (!MRI->hasOneNonDBGUse(VSrc))
+    return false;
+
+  MachineInstr *VSrcDef = MRI->getVRegDef(VSrc);
+  // Need a unary or binary VALU instruction as operand.
+  if (!VSrcDef || (VSrcDef->getParent() != MI.getParent()) ||
+      !TII->isVALU(*VSrcDef) || VSrcDef->getNumExplicitOperands() > 3 ||
+      execMayBeModifiedBeforeUse(*MRI, VSrc, *VSrcDef, MI))
+    return false;
+
+  const bool IsReadLane = (Opc == AMDGPU::V_READLANE_B32);
+  if (IsReadLane) {
+    MachineOperand &LaneOp = MI.getOperand(2);
+    if (LaneOp.isReg()) { // Can the lane be an imm?
+      Register LaneReg = LaneOp.getReg();
+      for (auto It = VSrcDef->getIterator(); It != MI.getIterator(); ++It) {
+        if (It->modifiesRegister(LaneReg, TRI))
+          return false;
+      }
+    }
+  }
+
+  SmallVector<MachineOperand *, 2> Ops;
+  MachineOperand *TargetOp = nullptr;
+  for (MachineOperand &SrcOp : VSrcDef->operands()) {
+    if (SrcOp.isReg()) {
+      if (SrcOp.isImplicit() || SrcOp.isDef())
+        continue;
+
+      Ops.push_back(&SrcOp);
+
+      Register Reg = SrcOp.getReg();
+      if (TRI->isVectorRegister(*MRI, Reg)) {
+        // This only works if we have one VGPR src.
+        if (TargetOp)
+          return false;
+        TargetOp = &SrcOp;
+      }
+    } else {
+      Ops.push_back(&SrcOp); // also collect imms
+    }
+  }
+  if (!TargetOp)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "tryScalarizeReadLaneSrc:\n\treadlane: " << MI
+                    << "\tsrc: " << *VSrcDef << "\top: " << *TargetOp << "\n");
+
+  const unsigned ScalarOp =
+      getScalarizedReadLaneSrcOpc(*ST, VSrcDef->getOpcode(), Ops);
+  if (ScalarOp == unsigned(-1))
+    return false;
+
+  // We only support unary/binary ops.
+  assert(Ops.size() <= 2);
+
+  MachineBasicBlock *MBB = VSrcDef->getParent();
+  auto InsertBefore = VSrcDef->getIterator();
+  const DebugLoc &DL = VSrcDef->getDebugLoc();
+  Register SDst = MI.getOperand(0).getReg();
+
+  Register STargetOp = MRI->createVirtualRegister(MRI->getRegClass(SDst));
+  auto NewMI = BuildMI(*MBB, InsertBefore, DL, MI.getDesc(), STargetOp)
+                   .addReg(TargetOp->getReg());
+  if (IsReadLane)
+    NewMI.add(MI.getOperand(2)); // lane index
+  auto ScalarMI = BuildMI(*MBB, InsertBefore, DL, TII->get(ScalarOp), SDst);
+  for (MachineOperand *Op : Ops) {
+    if (Op == TargetOp)
+      ScalarMI.addReg(STargetOp);
+    else
+      ScalarMI.add(*Op);
+  }
+
+  VSrcDef->eraseFromParent();
+  MI.eraseFromParent();
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
       MI.getOpcode() != AMDGPU::V_AND_B32_e32)
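To make the new fold concrete, here is a minimal before/after sketch of the rewrite described in the comment above tryScalarizeReadLaneSrc, written as hypothetical MIR (virtual register numbers, register classes, and implicit operands are illustrative only, not taken from a test in this patch):

    ; before: a single-use VALU op feeds a readfirstlane
    %0:vgpr_32 = V_AND_B32_e64 %x:vgpr_32, 15, implicit $exec
    %1:sgpr_32 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec

    ; after: read the lone VGPR input first, then do the op on the SALU
    %2:sgpr_32 = V_READFIRSTLANE_B32 %x:vgpr_32, implicit $exec
    %1:sgpr_32 = S_AND_B32 %2:sgpr_32, 15, implicit-def dead $scc

For V_READLANE_B32 the shape is the same except the lane-index operand is carried over to the new readlane, which is why the code above also checks that nothing between the VALU op and the readlane clobbers the lane register.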
@@ -2353,6 +2496,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
     for (auto &MI : make_early_inc_range(*MBB)) {
       Changed |= tryFoldCndMask(MI);
 
+      if (tryScalarizeReadLaneSrc(MI)) {
+        Changed = true;
+        continue;
+      }
+
       if (tryFoldZeroHighBits(MI)) {
         Changed = true;
         continue;
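One subtlety in getScalarizedReadLaneSrcOpc is worth spelling out: V_LSHRREV_B32 shifts src1 right by src0, while S_LSHR_B32 shifts src0 right by src1, which is why that case swaps Ops before returning its opcode. A hedged sketch on hypothetical registers of how the rewritten sequence keeps the same semantics:

    ; V_LSHRREV_B32: dst = src1 >> src0 (the shift amount comes first)
    %0:vgpr_32 = V_LSHRREV_B32_e64 16, %x:vgpr_32, implicit $exec
    %1:sgpr_32 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec

    ; S_LSHR_B32: dst = src0 >> src1, so the collected operands are swapped
    %2:sgpr_32 = V_READFIRSTLANE_B32 %x:vgpr_32, implicit $exec
    %1:sgpr_32 = S_LSHR_B32 %2:sgpr_32, 16, implicit-def dead $scc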