From 1fa4482ee6d61d02c76c2955fb6077262c178cce Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 27 Feb 2025 09:12:16 +0700
Subject: [PATCH 1/3] AMDGPU: Factor agpr reg_sequence folding into a function

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 168 ++++++++++++----------
 1 file changed, 90 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3a019dbaad02c..0e41a78c2c8ae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
                            MachineOperand *&CurrentKnownM0Val) const;
 
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
         UseMI->getOperand(0).getReg().isVirtual() &&
         !UseMI->getOperand(1).getSubReg()) {
       LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
-      unsigned Size = TII->getOpSize(*UseMI, 1);
       Register UseReg = OpToFold.getReg();
       UseMI->getOperand(1).setReg(UseReg);
       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1022,83 +1023,8 @@ void SIFoldOperandsImpl::foldOperand(
 
       // Remove kill flags as kills may now be out of order with uses.
       MRI->clearKillFlags(OpToFold.getReg());
-
-      // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
-      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
-      // its initializers right here, so we will rematerialize immediates and
-      // avoid copies via different reg classes.
-      SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
-      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-        const DebugLoc &DL = UseMI->getDebugLoc();
-        MachineBasicBlock &MBB = *UseMI->getParent();
-
-        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
-        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
-          UseMI->removeOperand(I);
-
-        MachineInstrBuilder B(*MBB.getParent(), UseMI);
-        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-        for (unsigned I = 0; I < Size / 4; ++I) {
-          MachineOperand *Def = Defs[I].first;
-          TargetInstrInfo::RegSubRegPair CopyToVGPR;
-          if (Def->isImm() &&
-              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-            int64_t Imm = Def->getImm();
-
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL,
-                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
-            B.addReg(Tmp);
-          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-            auto Src = getRegSubRegPair(*Def);
-            Def->setIsKill(false);
-            if (!SeenAGPRs.insert(Src)) {
-              // We cannot build a reg_sequence out of the same registers, they
-              // must be copied. Better do it here before copyPhysReg() created
-              // several reads to do the AGPR->VGPR->AGPR copy.
-              CopyToVGPR = Src;
-            } else {
-              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
-                       Src.SubReg);
-            }
-          } else {
-            assert(Def->isReg());
-            Def->setIsKill(false);
-            auto Src = getRegSubRegPair(*Def);
-
-            // Direct copy from SGPR to AGPR is not possible. To avoid creation
-            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-            // create a copy here and track if we already have such a copy.
-            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-              CopyToVGPR = Src;
-            } else {
-              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-              B.addReg(Tmp);
-            }
-          }
-
-          if (CopyToVGPR.Reg) {
-            auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-            Register &Vgpr = It->second;
-            if (Inserted) {
-              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-            }
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL,
-                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
-            B.addReg(Tmp);
-          }
-
-          B.addImm(Defs[I].second);
-        }
-        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-      }
-
-      return;
+      if (foldCopyToAGPRRegSequence(UseMI))
+        return;
     }
 
     unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,92 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   return true;
 }
 
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+  // initializers right here, so we will rematerialize immediates and avoid
+  // copies via different reg classes.
+  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+    return false;
+  unsigned Size = TII->getOpSize(*CopyMI, 1);
+  if (Size <= 4)
+    return false;
+
+  Register UseReg = CopyMI->getOperand(1).getReg();
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+    return false;
+
+  const DebugLoc &DL = CopyMI->getDebugLoc();
+  MachineBasicBlock &MBB = *CopyMI->getParent();
+
+  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+    CopyMI->removeOperand(I);
+
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+  for (unsigned I = 0; I < Size / 4; ++I) {
+    MachineOperand *Def = Defs[I].first;
+    TargetInstrInfo::RegSubRegPair CopyToVGPR;
+    if (Def->isImm() &&
+        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      int64_t Imm = Def->getImm();
+
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addImm(Imm);
+      B.addReg(Tmp);
+    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+      auto Src = getRegSubRegPair(*Def);
+      Def->setIsKill(false);
+      if (!SeenAGPRs.insert(Src)) {
+        // We cannot build a reg_sequence out of the same registers, they
+        // must be copied. Better do it here before copyPhysReg() created
+        // several reads to do the AGPR->VGPR->AGPR copy.
+        CopyToVGPR = Src;
+      } else {
+        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+      }
+    } else {
+      assert(Def->isReg());
+      Def->setIsKill(false);
+      auto Src = getRegSubRegPair(*Def);
+
+      // Direct copy from SGPR to AGPR is not possible. To avoid creation
+      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+      // create a copy here and track if we already have such a copy.
+      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+        CopyToVGPR = Src;
+      } else {
+        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+        B.addReg(Tmp);
+      }
+    }
+
+    if (CopyToVGPR.Reg) {
+      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+      Register &Vgpr = It->second;
+      if (Inserted) {
+        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+      }
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addReg(Vgpr);
+      B.addReg(Tmp);
+    }
+
+    B.addImm(Defs[I].second);
+  }
+  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
   Register DstReg = MI.getOperand(0).getReg();

From c78690f7db23a78c02a1cb2533a80549af8ad634 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 27 Feb 2025 09:43:14 +0700
Subject: [PATCH 2/3] Remove unnecessary check for register size

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0e41a78c2c8ae..85a1c5d83c3c2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1493,10 +1493,6 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
   // copies via different reg classes.
   if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
     return false;
-  unsigned Size = TII->getOpSize(*CopyMI, 1);
-  if (Size <= 4)
-    return false;
-
   Register UseReg = CopyMI->getOperand(1).getReg();
   SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
   if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
@@ -1512,7 +1508,7 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
   MachineInstrBuilder B(*MBB.getParent(), CopyMI);
   DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
   SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-  for (unsigned I = 0; I < Size / 4; ++I) {
+  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
     MachineOperand *Def = Defs[I].first;
     TargetInstrInfo::RegSubRegPair CopyToVGPR;
     if (Def->isImm() &&

From d91f9722a38ef5125bb37b3740902abba60f6e92 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 27 Feb 2025 10:12:35 +0700
Subject: [PATCH 3/3] Use UseReg consistently

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 85a1c5d83c3c2..f1ba199fbae3f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1022,7 +1022,7 @@ void SIFoldOperandsImpl::foldOperand(
       OpToFold.setIsKill(false);
 
       // Remove kill flags as kills may now be out of order with uses.
-      MRI->clearKillFlags(OpToFold.getReg());
+      MRI->clearKillFlags(UseReg);
       if (foldCopyToAGPRRegSequence(UseMI))
         return;
     }