Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 8 additions & 122 deletions llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
Expand Down Expand Up @@ -129,7 +128,6 @@ struct PredInfo {
using BBPredicates = DenseMap<BasicBlock *, PredInfo>;
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
using Val2BBMap = DenseMap<Value *, BasicBlock *>;

// A traits type that is intended to be used in graph algorithms. The graph
// traits starts at an entry node, and traverses the RegionNodes that are in
Expand Down Expand Up @@ -281,7 +279,7 @@ class StructurizeCFG {
ConstantInt *BoolTrue;
ConstantInt *BoolFalse;
Value *BoolPoison;
const TargetTransformInfo *TTI;

Function *Func;
Region *ParentRegion;

Expand All @@ -303,12 +301,8 @@ class StructurizeCFG {
PredMap LoopPreds;
BranchVector LoopConds;

Val2BBMap HoistedValues;

RegionNode *PrevNode;

void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB);

void orderNodes();

void analyzeLoops(RegionNode *N);
Expand Down Expand Up @@ -338,8 +332,6 @@ class StructurizeCFG {

void simplifyAffectedPhis();

void simplifyHoistedPhis();

DebugLoc killTerminator(BasicBlock *BB);

void changeExit(RegionNode *Node, BasicBlock *NewExit,
Expand Down Expand Up @@ -367,7 +359,7 @@ class StructurizeCFG {

public:
void init(Region *R);
bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI);
bool run(Region *R, DominatorTree *DT);
bool makeUniformRegion(Region *R, UniformityInfo &UA);
};

Expand All @@ -393,21 +385,16 @@ class StructurizeCFGLegacyPass : public RegionPass {
if (SCFG.makeUniformRegion(R, UA))
return false;
}
Function *F = R->getEntry()->getParent();
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
return SCFG.run(R, DT, TTI);
return SCFG.run(R, DT);
}

StringRef getPassName() const override { return "Structurize control flow"; }

void getAnalysisUsage(AnalysisUsage &AU) const override {
if (SkipUniformRegions)
AU.addRequired<UniformityInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();

AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
Expand All @@ -416,34 +403,6 @@ class StructurizeCFGLegacyPass : public RegionPass {

} // end anonymous namespace

/// Returns true if \p I is a zero-cost instruction of \p BB that may be
/// hoisted out of \p BB.
///
/// To qualify, \p I must
///  - be located in \p BB,
///  - not be a PHI node,
///  - have a latency cost of zero according to \p TTI (an invalid cost is
///    treated as TCC_Expensive, i.e. not hoistable), and
///  - not use any instruction that is itself defined in \p BB.
///
/// \param I   Instruction to test.
/// \param BB  Block the instruction is expected to live in.
/// \param TTI Cost model used to query the latency of \p I.
static bool isHoistableInstruction(Instruction *I, BasicBlock *BB,
                                   const TargetTransformInfo *TTI) {
  // PHI nodes have to stay grouped at the top of their own block and their
  // incoming edges are tied to that block's predecessors, so a PHI must never
  // be re-inserted in front of some other block's terminator.
  if (I->getParent() != BB || isa<PHINode>(I))
    return false;

  // If the instruction is not a zero cost instruction, return false.
  auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
  InstructionCost::CostType CostVal =
      Cost.isValid() ? Cost.getValue()
                     : static_cast<InstructionCost::CostType>(
                           TargetTransformInfo::TCC_Expensive);
  if (CostVal != 0)
    return false;

  // Reject the instruction if any operand is an instruction defined in the
  // same block: moving I alone would leave that operand behind.
  for (auto &Op : I->operands()) {
    if (auto *OpI = dyn_cast<Instruction>(Op)) {
      if (OpI->getParent() == BB)
        return false;
    }
  }

  return true;
}

char StructurizeCFGLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
Expand All @@ -454,39 +413,6 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)

/// Structurization can introduce unnecessary VGPR copies due to register
/// coalescing interference. For example, if the Else block has a zero-cost
/// instruction and the Then block modifies the VGPR value, only one value is
/// live at a time in the merge block before structurization. After
/// structurization, the coalescer may incorrectly treat the Then value as live
/// in the Else block (via the path Then → Flow → Else), leading to unnecessary
/// VGPR copies.
///
/// This function examines the phi nodes in the single successor of \p ElseBB
/// whose incoming value for \p ElseBB is a zero-cost instruction defined in
/// \p ElseBB. Each such value that can be safely hoisted is moved in front of
/// the terminator of the nearest common dominator of \p ElseBB and \p ThenBB
/// and recorded in HoistedValues. A follow-up step that runs after the phi
/// values have been set assigns the hoisted value to poison phi entries along
/// the if→flow edge, aiding register coalescing and minimizing unnecessary
/// live ranges.
///
/// Nothing is done when \p ElseBB has no single successor or when no common
/// dominator exists.
void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB,
                                                     BasicBlock *ThenBB) {
  BasicBlock *ElseSucc = ElseBB->getSingleSuccessor();
  BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB);

  if (!ElseSucc || !CommonDominator)
    return;

  // The hoisted instruction ends up right before the common dominator's
  // terminator, so every instruction operand must already be available there.
  // An operand defined in a block between the common dominator and ElseBB
  // would otherwise stop dominating its use.
  auto OperandsAvailableAtDest = [&](Instruction *Candidate) {
    for (auto &Op : Candidate->operands()) {
      if (auto *OpI = dyn_cast<Instruction>(Op)) {
        if (!DT->dominates(OpI->getParent(), CommonDominator))
          return false;
      }
    }
    return true;
  };

  Instruction *Term = CommonDominator->getTerminator();
  for (PHINode &Phi : ElseSucc->phis()) {
    Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
    auto *Inst = dyn_cast<Instruction>(ElseVal);
    if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI))
      continue;
    if (!OperandsAvailableAtDest(Inst))
      continue;
    Inst->removeFromParent();
    Inst->insertInto(CommonDominator, Term->getIterator());
    // Remember where the value now lives for the later phi clean-up.
    HoistedValues[Inst] = CommonDominator;
  }
}

/// Build up the general order of nodes, by performing a topological sort of the
/// parent region's nodes, while ensuring that there is no outer cycle node
/// between any two inner cycle nodes.
Expand Down Expand Up @@ -609,7 +535,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
BasicBlock *Other = Term->getSuccessor(!i);
if (Visited.count(Other) && !Loops.count(Other) &&
!Pred.count(Other) && !Pred.count(P)) {
hoistZeroCostElseBlockPhiValues(Succ, Other);

Pred[Other] = {BoolFalse, std::nullopt};
Pred[P] = {BoolTrue, std::nullopt};
continue;
Expand Down Expand Up @@ -965,44 +891,6 @@ void StructurizeCFG::setPhiValues() {
AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
}

/// Patches up PHI nodes once zero-cost instructions have been hoisted.
///
/// Only two-input PHIs from AffectedPhis are considered. When one input is a
/// value recorded in HoistedValues and the other input is itself a PHI, the
/// first poison entry of that other PHI is replaced by the hoisted value,
/// provided the block the value was hoisted into dominates the incoming block
/// of that poison entry. The two-input PHI is then rewired to take the other
/// PHI on the edge that previously carried the hoisted value.
void StructurizeCFG::simplifyHoistedPhis() {
  for (WeakVH Handle : AffectedPhis) {
    auto *Candidate = dyn_cast_or_null<PHINode>(Handle);
    if (!Candidate || Candidate->getNumIncomingValues() != 2)
      continue;

    for (unsigned Slot = 0; Slot != 2; ++Slot) {
      Value *Incoming = Candidate->getIncomingValue(Slot);
      auto HoistedIt = HoistedValues.find(Incoming);
      if (HoistedIt == HoistedValues.end())
        continue;

      // The remaining input must be a PHI for the rewrite to apply.
      Value *Partner = Candidate->getIncomingValue(1 - Slot);
      auto *PartnerPhi = dyn_cast<PHINode>(Partner);
      if (!PartnerPhi)
        continue;

      // Locate the first poison entry of the partner PHI, if there is one.
      const unsigned NumInputs = PartnerPhi->getNumIncomingValues();
      unsigned PoisonSlot = 0;
      while (PoisonSlot != NumInputs &&
             !isa<PoisonValue>(PartnerPhi->getIncomingValue(PoisonSlot)))
        ++PoisonSlot;
      if (PoisonSlot == NumInputs)
        continue;

      // The hoisted value is only usable on that edge if its new home block
      // dominates the edge's source block.
      if (!DT->dominates(HoistedIt->second,
                         PartnerPhi->getIncomingBlock(PoisonSlot)))
        continue;

      PartnerPhi->setIncomingValue(PoisonSlot, Incoming);
      Candidate->setIncomingValue(Slot, Partner);
    }
  }
}

void StructurizeCFG::simplifyAffectedPhis() {
bool Changed;
do {
Expand Down Expand Up @@ -1395,13 +1283,12 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
}

/// Run the transformation for each region found
bool StructurizeCFG::run(Region *R, DominatorTree *DT,
const TargetTransformInfo *TTI) {
bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
if (R->isTopLevelRegion())
return false;

this->DT = DT;
this->TTI = TTI;

Func = R->getEntry()->getParent();
assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");

Expand All @@ -1413,7 +1300,6 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT,
insertConditions(false);
insertConditions(true);
setPhiValues();
simplifyHoistedPhis();
simplifyConditions();
simplifyAffectedPhis();
rebuildSSA();
Expand Down Expand Up @@ -1463,7 +1349,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
bool Changed = false;
DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto &RI = AM.getResult<RegionInfoAnalysis>(F);
TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);

UniformityInfo *UI = nullptr;
if (SkipUniformRegions)
UI = &AM.getResult<UniformityInfoAnalysis>(F);
Expand All @@ -1482,7 +1368,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
continue;
}

Changed |= SCFG.run(R, DT, TTI);
Changed |= SCFG.run(R, DT);
}
if (!Changed)
return PreservedAnalyses::all();
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9851,8 +9851,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; CHECK-NEXT: s_movk_i32 s4, 0xf800
; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
Expand Down Expand Up @@ -11167,8 +11167,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
Expand Down Expand Up @@ -12381,8 +12381,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
Expand Down
Loading
Loading