@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
                            MachineOperand *&CurrentKnownM0Val) const;
 
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
       UseMI->getOperand(0).getReg().isVirtual() &&
       !UseMI->getOperand(1).getSubReg()) {
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
-    unsigned Size = TII->getOpSize(*UseMI, 1);
     Register UseReg = OpToFold.getReg();
     UseMI->getOperand(1).setReg(UseReg);
     UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1022,83 +1023,8 @@ void SIFoldOperandsImpl::foldOperand(
 
     // Remove kill flags as kills may now be out of order with uses.
     MRI->clearKillFlags(OpToFold.getReg());
-
-    // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
-    // can only accept VGPR or inline immediate. Recreate a reg_sequence with
-    // its initializers right here, so we will rematerialize immediates and
-    // avoid copies via different reg classes.
-    SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
-    if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-        getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      const DebugLoc &DL = UseMI->getDebugLoc();
-      MachineBasicBlock &MBB = *UseMI->getParent();
-
-      UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
-      for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
-        UseMI->removeOperand(I);
-
-      MachineInstrBuilder B(*MBB.getParent(), UseMI);
-      DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-      SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-      for (unsigned I = 0; I < Size / 4; ++I) {
-        MachineOperand *Def = Defs[I].first;
-        TargetInstrInfo::RegSubRegPair CopyToVGPR;
-        if (Def->isImm() &&
-            TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-          int64_t Imm = Def->getImm();
-
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
-          B.addReg(Tmp);
-        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-          auto Src = getRegSubRegPair(*Def);
-          Def->setIsKill(false);
-          if (!SeenAGPRs.insert(Src)) {
-            // We cannot build a reg_sequence out of the same registers, they
-            // must be copied. Better do it here before copyPhysReg() created
-            // several reads to do the AGPR->VGPR->AGPR copy.
-            CopyToVGPR = Src;
-          } else {
-            B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
-                     Src.SubReg);
-          }
-        } else {
-          assert(Def->isReg());
-          Def->setIsKill(false);
-          auto Src = getRegSubRegPair(*Def);
-
-          // Direct copy from SGPR to AGPR is not possible. To avoid creation
-          // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-          // create a copy here and track if we already have such a copy.
-          if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-            CopyToVGPR = Src;
-          } else {
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-            B.addReg(Tmp);
-          }
-        }
-
-        if (CopyToVGPR.Reg) {
-          auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-          Register &Vgpr = It->second;
-          if (Inserted) {
-            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-          }
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
-          B.addReg(Tmp);
-        }
-
-        B.addImm(Defs[I].second);
-      }
-      LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-    }
-
-    return;
+    if (foldCopyToAGPRRegSequence(UseMI))
+      return;
   }
 
   unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,92 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   return true;
 }
 
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
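+///
+/// A sketch of the shape of the fold, with hypothetical virtual registers
+/// (illustrative only, not taken from an actual test):
+///   %v:vgpr_32 = V_MOV_B32_e32 0
+///   %rs:vreg_64 = REG_SEQUENCE %v, %subreg.sub0, %v, %subreg.sub1
+///   %a:areg_64 = COPY %rs
+/// can become
+///   %a0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0
+///   %a1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0
+///   %a:areg_64 = REG_SEQUENCE %a0, %subreg.sub0, %a1, %subreg.sub1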
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+  // initializers right here, so we will rematerialize immediates and avoid
+  // copies via different reg classes.
+  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+    return false;
+  unsigned Size = TII->getOpSize(*CopyMI, 1);
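+  // A value no wider than 32 bits has no reg_sequence to rebuild; a plain
+  // copy is assumed to suffice there, so bail out.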
+  if (Size <= 4)
+    return false;
+
+  Register UseReg = CopyMI->getOperand(1).getReg();
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+    return false;
+
+  const DebugLoc &DL = CopyMI->getDebugLoc();
+  MachineBasicBlock &MBB = *CopyMI->getParent();
+
+  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+    CopyMI->removeOperand(I);
+
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
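+  // VGPRCopies caches one VGPR copy per distinct source, so an SGPR (or a
+  // repeated AGPR) input is routed through a VGPR only once; SeenAGPRs
+  // records AGPR inputs already placed directly into the reg_sequence.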
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+  for (unsigned I = 0; I < Size / 4; ++I) {
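+    // Each Defs entry pairs the operand defining one 32-bit lane with the
+    // subregister index that lane occupies in the result.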
+    MachineOperand *Def = Defs[I].first;
+    TargetInstrInfo::RegSubRegPair CopyToVGPR;
+    if (Def->isImm() &&
+        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      int64_t Imm = Def->getImm();
+
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addImm(Imm);
+      B.addReg(Tmp);
+    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+      auto Src = getRegSubRegPair(*Def);
+      Def->setIsKill(false);
+      if (!SeenAGPRs.insert(Src)) {
+        // We cannot build a reg_sequence out of the same registers, they
+        // must be copied. Better do it here before copyPhysReg() created
+        // several reads to do the AGPR->VGPR->AGPR copy.
+        CopyToVGPR = Src;
+      } else {
+        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+      }
+    } else {
+      assert(Def->isReg());
+      Def->setIsKill(false);
+      auto Src = getRegSubRegPair(*Def);
+
+      // Direct copy from SGPR to AGPR is not possible. To avoid creation
+      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+      // create a copy here and track if we already have such a copy.
+      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+        CopyToVGPR = Src;
+      } else {
+        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+        B.addReg(Tmp);
+      }
+    }
+
+    if (CopyToVGPR.Reg) {
+      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+      Register &Vgpr = It->second;
+      if (Inserted) {
+        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+      }
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addReg(Vgpr);
+      B.addReg(Tmp);
+    }
+
+    B.addImm(Defs[I].second);
+  }
+  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
   Register DstReg = MI.getOperand(0).getReg();