From 5e3960409142e376ecee1f53fd64ec4a944731a3 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta@amd.com>
Date: Thu, 30 Oct 2025 20:41:02 -0500
Subject: [PATCH] revert old opt to fold rfls across BB

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  9 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 94 +++----------------
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  4 -
 .../fold-redundant-sgpr-vgpr-rw-across-bb.ll  | 17 +---
 4 files changed, 20 insertions(+), 104 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index bdc32594c13a3..b6e012f621785 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -23,11 +23,6 @@
 #define DEBUG_TYPE "si-fold-operands"
 using namespace llvm;
 
-static cl::opt<int> SIFoldOperandsPreheaderThreshold(
-    "amdgpu-si-fold-operands-preheader-threshold", cl::init(1000),
-    cl::desc("Threshold for operand folding hazard check. "
-             "Defaults to 1000 MIs, upper limit 10000."));
-
 namespace {
 
 /// Track a value we may want to fold into downstream users, applying
@@ -1458,9 +1453,9 @@ void SIFoldOperandsImpl::foldOperand(
       }
 
       if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
-        if (checkIfExecMayBeModifiedBeforeUseAcrossBB(
+        if (execMayBeModifiedBeforeUse(
                 *MRI, UseMI->getOperand(UseOpIdx).getReg(),
-                *OpToFold.DefMI, *UseMI, SIFoldOperandsPreheaderThreshold))
+                *OpToFold.DefMI, *UseMI))
           return;
 
         // %vgpr = COPY %sgpr0
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 06ca84e951487..d2eddd20ec475 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10246,82 +10246,6 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
   return nullptr;
 }
 
-// helper function to checkIfExecMayBeModifiedBeforeUseAcrossBB and
-// execMayBeModifiedBeforeUse. This checks possible EXEC register modifications
-// for a straight-line sequence of instructions between BeginIterator and
-// EndIterator (both inclusive) upto a pre-defined limit MaxInstScan
-bool execMayBeModifiedBeforeUseUtil(
-    const TargetRegisterInfo *TRI,
-    const MachineInstrBundleIterator<const MachineInstr> BeginIterator,
-    const MachineInstrBundleIterator<const MachineInstr> EndIterator,
-    const int MaxInstScan) {
-
-  int NumInst = 0;
-  for (auto I = BeginIterator; I != EndIterator; ++I) {
-    if (I->isMetaInstruction())
-      continue;
-
-    if (++NumInst > MaxInstScan)
-      return true;
-
-    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
-      return true;
-  }
-  return false;
-}
-
-// Variant of execMayBeModifiedBeforeUse(), where DefMI and UseMI belong to
-// different basic blocks. Current code is limited to a very simple case: DefMI
-// in the predecessor BB of the single BB loop where UseMI resides.
-bool llvm::checkIfExecMayBeModifiedBeforeUseAcrossBB(
-    const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
-    const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold) {
-
-  assert(MRI.isSSA() && "Must be run on SSA");
-  auto *TRI = MRI.getTargetRegisterInfo();
-  auto *DefBB = DefMI.getParent();
-  const int MaxInstScan = (SIFoldOperandsPreheaderThreshold > 10000)
-                              ? 10000
-                              : SIFoldOperandsPreheaderThreshold;
-
-  // Check whether EXEC is modified along all possible control flow between
-  // DefMI and UseMI, which may include loop backedge
-  // 1. UseBB is the only successor of DefBB
-  // 2. UseBB is a single basic block loop (only two predecessor blocks: DefBB
-  // and UseBB)
-  // 3. check if EXEC is modified
-  auto *UseBB = UseMI.getParent();
-  if (UseBB != DefBB) {
-    if (!(DefBB->isSuccessor(UseBB) && (DefBB->succ_size() == 1)))
-      return true;
-
-    if (!((UseBB->pred_size() == 2) && UseBB->isPredecessor(UseBB) &&
-          UseBB->isPredecessor(DefBB)))
-      return true;
-
-    bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
-        TRI, UseBB->begin(), UseBB->end(), MaxInstScan);
-    if (canExecBeModifiedBeforeUse)
-      return true;
-
-    // Stop scan at the end of the DEF basic block.
-    // If we are here, we know for sure that the instructions in focus are in
-    // the same basic block. Scan them to be safe.
-    canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
-        TRI, std::next(DefMI.getIterator()), DefBB->end(), MaxInstScan);
-    if (canExecBeModifiedBeforeUse)
-      return true;
-
-  } else {
-      // Stop scan at the use.
-    bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
-        TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
-    if (canExecBeModifiedBeforeUse)
-      return true;
-  }
-  return false;
-}
-
 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                       Register VReg,
                                       const MachineInstr &DefMI,
@@ -10337,12 +10261,20 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
     return true;
 
   const int MaxInstScan = 20;
+  int NumInst = 0;
 
   // Stop scan at the use.
-  bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
-      TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
-  if (canExecBeModifiedBeforeUse)
-    return true;
+  auto E = UseMI.getIterator();
+  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+    if (I->isDebugInstr())
+      continue;
+
+    if (++NumInst > MaxInstScan)
+      return true;
+
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+      return true;
+  }
 
   return false;
 }
@@ -10379,7 +10311,7 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
   for (auto I = std::next(DefMI.getIterator()); ; ++I) {
     assert(I != DefBB->end());
 
-    if (I->isMetaInstruction())
+    if (I->isDebugInstr())
       continue;
 
     if (++NumInst > MaxInstScan)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 211642d7c4460..42e5c69773176 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1714,10 +1714,6 @@ bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                 const MachineInstr &DefMI,
                                 const MachineInstr &UseMI);
 
-bool checkIfExecMayBeModifiedBeforeUseAcrossBB(
-    const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
-    const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold);
-    
 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
 /// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
 /// track between blocks.
diff --git a/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll b/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll
index 2ad9c0e71c5f3..8fbe2199a08a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll
@@ -1,20 +1,13 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=CHECK1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-si-fold-operands-preheader-threshold=10 < %s | FileCheck --check-prefix=CHECK2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=CHECK %s
 
 define protected amdgpu_kernel void @main(ptr addrspace(1) noundef %args.coerce, ptr addrspace(1) noundef %args.coerce2, ptr addrspace(1) noundef %args.coerce4, i32 noundef %args12) {
-; CHECK1-LABEL: main:
+; CHECK-LABEL: main:
 ; check that non-redundant readfirstlanes are not removed
-; CHECK1:      v_readfirstlane_b32
+; CHECK:      v_readfirstlane_b32
 ; check that all redundant readfirstlanes are removed
-; CHECK1-NOT:  v_readfirstlane_b32
-; CHECK1:      s_endpgm
+; CHECK-NOT:  v_readfirstlane_b32
+; CHECK:      s_endpgm
 
-; CHECK2-LABEL: main:
-; CHECK2:      v_readfirstlane_b32
-; check that all redundant readfirstlanes across basic blocks persist
-; CHECK2:      v_readfirstlane_b32
-; CHECK2:      v_readfirstlane_b32
-; CHECK2:      s_endpgm
 entry:
     %wid = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
     %div1 = lshr i32 %wid, 6