@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
                            MachineOperand *&CurrentKnownM0Val) const;
 
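The new `foldCopyToAGPRRegSequence` hook factors out the AGPR copy rewrite that `foldOperand` previously performed inline, as the hunks below show. As an illustrative sketch of the transformation it performs (hypothetical MIR; register numbers and classes invented for the example, splat-zero input assumed):

```
; Before: a VGPR tuple built from v_mov_b32 0 is copied into an AGPR tuple.
  %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
  %1:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
  %2:areg_64 = COPY %1

; After: the COPY becomes a REG_SEQUENCE of per-lane AGPR writes, which
; rematerializes the inline immediate instead of copying across reg classes.
  %3:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
  %4:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
  %2:areg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
```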
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
       UseMI->getOperand(0).getReg().isVirtual() &&
       !UseMI->getOperand(1).getSubReg()) {
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
-    unsigned Size = TII->getOpSize(*UseMI, 1);
     Register UseReg = OpToFold.getReg();
     UseMI->getOperand(1).setReg(UseReg);
     UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1021,84 +1022,9 @@ void SIFoldOperandsImpl::foldOperand(
     OpToFold.setIsKill(false);
 
     // Remove kill flags as kills may now be out of order with uses.
-    MRI->clearKillFlags(OpToFold.getReg());
-
-    // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
-    // can only accept VGPR or inline immediate. Recreate a reg_sequence with
-    // its initializers right here, so we will rematerialize immediates and
-    // avoid copies via different reg classes.
-    SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
-    if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-        getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      const DebugLoc &DL = UseMI->getDebugLoc();
-      MachineBasicBlock &MBB = *UseMI->getParent();
-
-      UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
-      for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
-        UseMI->removeOperand(I);
-
-      MachineInstrBuilder B(*MBB.getParent(), UseMI);
-      DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-      SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-      for (unsigned I = 0; I < Size / 4; ++I) {
-        MachineOperand *Def = Defs[I].first;
-        TargetInstrInfo::RegSubRegPair CopyToVGPR;
-        if (Def->isImm() &&
-            TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-          int64_t Imm = Def->getImm();
-
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
-          B.addReg(Tmp);
-        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-          auto Src = getRegSubRegPair(*Def);
-          Def->setIsKill(false);
-          if (!SeenAGPRs.insert(Src)) {
-            // We cannot build a reg_sequence out of the same registers, they
-            // must be copied. Better do it here before copyPhysReg() created
-            // several reads to do the AGPR->VGPR->AGPR copy.
-            CopyToVGPR = Src;
-          } else {
-            B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
-                     Src.SubReg);
-          }
-        } else {
-          assert(Def->isReg());
-          Def->setIsKill(false);
-          auto Src = getRegSubRegPair(*Def);
-
-          // Direct copy from SGPR to AGPR is not possible. To avoid creation
-          // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-          // create a copy here and track if we already have such a copy.
-          if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-            CopyToVGPR = Src;
-          } else {
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-            B.addReg(Tmp);
-          }
-        }
-
-        if (CopyToVGPR.Reg) {
-          auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-          Register &Vgpr = It->second;
-          if (Inserted) {
-            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-          }
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
-          B.addReg(Tmp);
-        }
-
-        B.addImm(Defs[I].second);
-      }
-      LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-    }
-
-    return;
+    MRI->clearKillFlags(UseReg);
+    if (foldCopyToAGPRRegSequence(UseMI))
+      return;
   }
 
   unsigned UseOpc = UseMI->getOpcode();
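Before delegating, this path has already folded the copy's source operand directly into the using COPY. Roughly (hypothetical MIR; registers invented), a chain like

```
  %1:vreg_64 = COPY %0:vreg_64
  %2:areg_64 = COPY %1
```

becomes `%2:areg_64 = COPY %0`, reading `%0` directly. That is why the kill flags on `UseReg` must be cleared: the new use of `%0` may now sit after an instruction that previously killed it. Only then does `foldCopyToAGPRRegSequence` get a chance to expand the remaining AGPR copy.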
@@ -1558,6 +1484,88 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   return true;
 }
 
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+  // initializers right here, so we will rematerialize immediates and avoid
+  // copies via different reg classes.
+  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+    return false;
+  Register UseReg = CopyMI->getOperand(1).getReg();
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+    return false;
+
+  const DebugLoc &DL = CopyMI->getDebugLoc();
+  MachineBasicBlock &MBB = *CopyMI->getParent();
+
+  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+    CopyMI->removeOperand(I);
+
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
+    MachineOperand *Def = Defs[I].first;
+    TargetInstrInfo::RegSubRegPair CopyToVGPR;
+    if (Def->isImm() &&
+        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      int64_t Imm = Def->getImm();
+
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addImm(Imm);
+      B.addReg(Tmp);
+    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+      auto Src = getRegSubRegPair(*Def);
+      Def->setIsKill(false);
+      if (!SeenAGPRs.insert(Src)) {
+        // We cannot build a reg_sequence out of the same registers; they
+        // must be copied. Better do it here before copyPhysReg() creates
+        // several reads to do the AGPR->VGPR->AGPR copy.
+        CopyToVGPR = Src;
+      } else {
+        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+      }
+    } else {
+      assert(Def->isReg());
+      Def->setIsKill(false);
+      auto Src = getRegSubRegPair(*Def);
+
+      // Direct copy from SGPR to AGPR is not possible. To avoid creation
+      // of exploded copies SGPR->VGPR->AGPR in copyPhysReg() later,
+      // create a copy here and track if we already have such a copy.
+      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+        CopyToVGPR = Src;
+      } else {
+        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+        B.addReg(Tmp);
+      }
+    }
+
+    if (CopyToVGPR.Reg) {
+      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+      Register &Vgpr = It->second;
+      if (Inserted) {
+        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+      }
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addReg(Vgpr);
+      B.addReg(Tmp);
+    }
+
+    B.addImm(Defs[I].second);
+  }
+  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
   Register DstReg = MI.getOperand(0).getReg();
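The interplay of `SeenAGPRs` and `VGPRCopies` in the new function is easiest to see on a mixed input. A hedged sketch (hypothetical MIR, invented registers), assuming a 64-bit tuple whose two lanes both come from one SGPR value: since there is no direct SGPR-to-AGPR copy, the value is copied to a VGPR once, cached in `VGPRCopies`, and each lane then gets its own `V_ACCVGPR_WRITE_B32_e64`:

```
; Input: both lanes of the AGPR tuple come from the same SGPR.
  %1:vreg_64 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %0, %subreg.sub1
  %2:areg_64 = COPY %1

; After the fold: one cached SGPR->VGPR copy feeds a per-lane AGPR write.
  %3:vgpr_32 = COPY %0
  %4:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %3, implicit $exec
  %5:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %3, implicit $exec
  %2:areg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
```

Doing the VGPR detour here, once, is cheaper than letting copyPhysReg() later expand each lane into its own SGPR->VGPR->AGPR chain, which is the rationale the in-code comments give for both caches.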