From 5e3960409142e376ecee1f53fd64ec4a944731a3 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 30 Oct 2025 20:41:02 -0500 Subject: [PATCH] revert old opt to fold rfls across BB --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 9 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 94 +++---------------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 - .../fold-redundant-sgpr-vgpr-rw-across-bb.ll | 17 +--- 4 files changed, 20 insertions(+), 104 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index bdc32594c13a3..b6e012f621785 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -23,11 +23,6 @@ #define DEBUG_TYPE "si-fold-operands" using namespace llvm; -static cl::opt SIFoldOperandsPreheaderThreshold( - "amdgpu-si-fold-operands-preheader-threshold", cl::init(1000), - cl::desc("Threshold for operand folding hazard check. " - "Defaults to 1000 MIs, upper limit 10000.")); - namespace { /// Track a value we may want to fold into downstream users, applying @@ -1458,9 +1453,9 @@ void SIFoldOperandsImpl::foldOperand( } if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) { - if (checkIfExecMayBeModifiedBeforeUseAcrossBB( + if (execMayBeModifiedBeforeUse( *MRI, UseMI->getOperand(UseOpIdx).getReg(), - *OpToFold.DefMI, *UseMI, SIFoldOperandsPreheaderThreshold)) + *OpToFold.DefMI, *UseMI)) return; // %vgpr = COPY %sgpr0 diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 06ca84e951487..d2eddd20ec475 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10246,82 +10246,6 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, return nullptr; } -// helper function to checkIfExecMayBeModifiedBeforeUseAcrossBB and -// execMayBeModifiedBeforeUse. This checks possible EXEC register modifications -// for a straight-line sequence of instructions between BeginIterator and -// EndIterator (both inclusive) upto a pre-defined limit MaxInstScan -bool execMayBeModifiedBeforeUseUtil( - const TargetRegisterInfo *TRI, - const MachineInstrBundleIterator BeginIterator, - const MachineInstrBundleIterator EndIterator, - const int MaxInstScan) { - - int NumInst = 0; - for (auto I = BeginIterator; I != EndIterator; ++I) { - if (I->isMetaInstruction()) - continue; - - if (++NumInst > MaxInstScan) - return true; - - if (I->modifiesRegister(AMDGPU::EXEC, TRI)) - return true; - } - return false; -} - -// Variant of execMayBeModifiedBeforeUse(), where DefMI and UseMI belong to -// different basic blocks. Current code is limited to a very simple case: DefMI -// in the predecessor BB of the single BB loop where UseMI resides. -bool llvm::checkIfExecMayBeModifiedBeforeUseAcrossBB( - const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, - const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold) { - - assert(MRI.isSSA() && "Must be run on SSA"); - auto *TRI = MRI.getTargetRegisterInfo(); - auto *DefBB = DefMI.getParent(); - const int MaxInstScan = (SIFoldOperandsPreheaderThreshold > 10000) - ? 10000 - : SIFoldOperandsPreheaderThreshold; - - // Check whether EXEC is modified along all possible control flow between - // DefMI and UseMI, which may include loop backedge - // 1. UseBB is the only successor of DefBB - // 2. UseBB is a single basic block loop (only two predecessor blocks: DefBB - // and UseBB) - // 3. check if EXEC is modified - auto *UseBB = UseMI.getParent(); - if (UseBB != DefBB) { - if (!(DefBB->isSuccessor(UseBB) && (DefBB->succ_size() == 1))) - return true; - - if (!((UseBB->pred_size() == 2) && UseBB->isPredecessor(UseBB) && - UseBB->isPredecessor(DefBB))) - return true; - - bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil( - TRI, UseBB->begin(), UseBB->end(), MaxInstScan); - if (canExecBeModifiedBeforeUse) - return true; - - // Stop scan at the end of the DEF basic block. - // If we are here, we know for sure that the instructions in focus are in - // the same basic block. Scan them to be safe. - canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil( - TRI, std::next(DefMI.getIterator()), DefBB->end(), MaxInstScan); - if (canExecBeModifiedBeforeUse) - return true; - - } else { - // Stop scan at the use. - bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil( - TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan); - if (canExecBeModifiedBeforeUse) - return true; - } - return false; -} - bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, @@ -10337,12 +10261,20 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, return true; const int MaxInstScan = 20; + int NumInst = 0; // Stop scan at the use. - bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil( - TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan); - if (canExecBeModifiedBeforeUse) - return true; + auto E = UseMI.getIterator(); + for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { + if (I->isDebugInstr()) + continue; + + if (++NumInst > MaxInstScan) + return true; + + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + return true; + } return false; } @@ -10379,7 +10311,7 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, for (auto I = std::next(DefMI.getIterator()); ; ++I) { assert(I != DefBB->end()); - if (I->isMetaInstruction()) + if (I->isDebugInstr()) continue; if (++NumInst > MaxInstScan) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 211642d7c4460..42e5c69773176 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1714,10 +1714,6 @@ bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, const MachineInstr &DefMI, const MachineInstr &UseMI); -bool checkIfExecMayBeModifiedBeforeUseAcrossBB( - const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, - const MachineInstr &UseMI, const int SIFoldOperandsPreheaderThreshold); - /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and all its uses. Should be run on SSA. Currently does not attempt to /// track between blocks. diff --git a/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll b/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll index 2ad9c0e71c5f3..8fbe2199a08a5 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll @@ -1,20 +1,13 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=CHECK1 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-si-fold-operands-preheader-threshold=10 < %s | FileCheck --check-prefix=CHECK2 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=CHECK %s define protected amdgpu_kernel void @main(ptr addrspace(1) noundef %args.coerce, ptr addrspace(1) noundef %args.coerce2, ptr addrspace(1) noundef %args.coerce4, i32 noundef %args12) { -; CHECK1-LABEL: main: +; CHECK-LABEL: main: ; check that non-redundant readfirstlanes are not removed -; CHECK1: v_readfirstlane_b32 +; CHECK: v_readfirstlane_b32 ; check that all redundant readfirstlanes are removed -; CHECK1-NOT: v_readfirstlane_b32 -; CHECK1: s_endpgm +; CHECK-NOT: v_readfirstlane_b32 +; CHECK: s_endpgm -; CHECK2-LABEL: main: -; CHECK2: v_readfirstlane_b32 -; check that all redundant readfirstlanes across basic blocks persist -; CHECK2: v_readfirstlane_b32 -; CHECK2: v_readfirstlane_b32 -; CHECK2: s_endpgm entry: %wid = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() %div1 = lshr i32 %wid, 6