Skip to content

Commit 6bfa659

Browse files
author
git apple-llvm automerger
committed
Merge commit '8d3f497eb834' from llvm.org/main into next
2 parents 7fd8d49 + 8d3f497 commit 6bfa659

File tree

4 files changed

+466
-11
lines changed

4 files changed

+466
-11
lines changed

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 122 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "llvm/Analysis/RegionInfo.h"
2020
#include "llvm/Analysis/RegionIterator.h"
2121
#include "llvm/Analysis/RegionPass.h"
22+
#include "llvm/Analysis/TargetTransformInfo.h"
2223
#include "llvm/Analysis/UniformityAnalysis.h"
2324
#include "llvm/IR/BasicBlock.h"
2425
#include "llvm/IR/CFG.h"
@@ -128,6 +129,7 @@ struct PredInfo {
128129
using BBPredicates = DenseMap<BasicBlock *, PredInfo>;
129130
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
130131
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
132+
using Val2BBMap = DenseMap<Value *, BasicBlock *>;
131133

132134
// A traits type that is intended to be used in graph algorithms. The graph
133135
// traits starts at an entry node, and traverses the RegionNodes that are in
@@ -279,7 +281,7 @@ class StructurizeCFG {
279281
ConstantInt *BoolTrue;
280282
ConstantInt *BoolFalse;
281283
Value *BoolPoison;
282-
284+
const TargetTransformInfo *TTI;
283285
Function *Func;
284286
Region *ParentRegion;
285287

@@ -301,8 +303,12 @@ class StructurizeCFG {
301303
PredMap LoopPreds;
302304
BranchVector LoopConds;
303305

306+
Val2BBMap HoistedValues;
307+
304308
RegionNode *PrevNode;
305309

310+
void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB);
311+
306312
void orderNodes();
307313

308314
void analyzeLoops(RegionNode *N);
@@ -332,6 +338,8 @@ class StructurizeCFG {
332338

333339
void simplifyAffectedPhis();
334340

341+
void simplifyHoistedPhis();
342+
335343
DebugLoc killTerminator(BasicBlock *BB);
336344

337345
void changeExit(RegionNode *Node, BasicBlock *NewExit,
@@ -359,7 +367,7 @@ class StructurizeCFG {
359367

360368
public:
361369
void init(Region *R);
362-
bool run(Region *R, DominatorTree *DT);
370+
bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI);
363371
bool makeUniformRegion(Region *R, UniformityInfo &UA);
364372
};
365373

@@ -385,16 +393,21 @@ class StructurizeCFGLegacyPass : public RegionPass {
385393
if (SCFG.makeUniformRegion(R, UA))
386394
return false;
387395
}
396+
Function *F = R->getEntry()->getParent();
397+
const TargetTransformInfo *TTI =
398+
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
388399
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
389-
return SCFG.run(R, DT);
400+
return SCFG.run(R, DT, TTI);
390401
}
391402

392403
StringRef getPassName() const override { return "Structurize control flow"; }
393404

394405
void getAnalysisUsage(AnalysisUsage &AU) const override {
395406
if (SkipUniformRegions)
396407
AU.addRequired<UniformityInfoWrapperPass>();
408+
AU.addRequired<TargetTransformInfoWrapperPass>();
397409
AU.addRequired<DominatorTreeWrapperPass>();
410+
AU.addRequired<TargetTransformInfoWrapperPass>();
398411

399412
AU.addPreserved<DominatorTreeWrapperPass>();
400413
RegionPass::getAnalysisUsage(AU);
@@ -403,6 +416,34 @@ class StructurizeCFGLegacyPass : public RegionPass {
403416

404417
} // end anonymous namespace
405418

419+
/// Checks whether an instruction is zero cost instruction and checks if the
420+
/// operands are from different BB. If so, this instruction can be coalesced
421+
/// if its hoisted to predecessor block. So, this returns true.
422+
static bool isHoistableInstruction(Instruction *I, BasicBlock *BB,
423+
const TargetTransformInfo *TTI) {
424+
if (I->getParent() != BB)
425+
return false;
426+
427+
// If the instruction is not a zero cost instruction, return false.
428+
auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
429+
InstructionCost::CostType CostVal =
430+
Cost.isValid()
431+
? Cost.getValue()
432+
: (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive;
433+
if (CostVal != 0)
434+
return false;
435+
436+
// Check if any operands are instructions defined in the same block.
437+
for (auto &Op : I->operands()) {
438+
if (auto *OpI = dyn_cast<Instruction>(Op)) {
439+
if (OpI->getParent() == BB)
440+
return false;
441+
}
442+
}
443+
444+
return true;
445+
}
446+
406447
char StructurizeCFGLegacyPass::ID = 0;
407448

408449
INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
@@ -413,6 +454,39 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
413454
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
414455
"Structurize the CFG", false, false)
415456

457+
/// Structurization can introduce unnecessary VGPR copies due to register
458+
/// coalescing interference. For example, if the Else block has a zero-cost
459+
/// instruction and the Then block modifies the VGPR value, only one value is
460+
/// live at a time in merge block before structurization. After structurization,
461+
/// the coalescer may incorrectly treat the Then value as live in the Else block
462+
/// (via the path Then → Flow → Else), leading to unnecessary VGPR copies.
463+
///
464+
/// This function examines phi nodes whose incoming values are zero-cost
465+
/// instructions in the Else block. It identifies such values that can be safely
466+
/// hoisted and moves them to the nearest common dominator of Then and Else
467+
/// blocks. A follow-up function after setting PhiNodes assigns the hoisted
468+
/// value to poison phi nodes along the if→flow edge, aiding register coalescing
469+
/// and minimizing unnecessary live ranges.
470+
void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB,
471+
BasicBlock *ThenBB) {
472+
473+
BasicBlock *ElseSucc = ElseBB->getSingleSuccessor();
474+
BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB);
475+
476+
if (!ElseSucc || !CommonDominator)
477+
return;
478+
Instruction *Term = CommonDominator->getTerminator();
479+
for (PHINode &Phi : ElseSucc->phis()) {
480+
Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
481+
auto *Inst = dyn_cast<Instruction>(ElseVal);
482+
if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI))
483+
continue;
484+
Inst->removeFromParent();
485+
Inst->insertInto(CommonDominator, Term->getIterator());
486+
HoistedValues[Inst] = CommonDominator;
487+
}
488+
}
489+
416490
/// Build up the general order of nodes, by performing a topological sort of the
417491
/// parent region's nodes, while ensuring that there is no outer cycle node
418492
/// between any two inner cycle nodes.
@@ -535,7 +609,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
535609
BasicBlock *Other = Term->getSuccessor(!i);
536610
if (Visited.count(Other) && !Loops.count(Other) &&
537611
!Pred.count(Other) && !Pred.count(P)) {
538-
612+
hoistZeroCostElseBlockPhiValues(Succ, Other);
539613
Pred[Other] = {BoolFalse, std::nullopt};
540614
Pred[P] = {BoolTrue, std::nullopt};
541615
continue;
@@ -891,6 +965,44 @@ void StructurizeCFG::setPhiValues() {
891965
AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
892966
}
893967

968+
/// Updates PHI nodes after hoisted zero cost instructions by replacing poison
969+
/// entries on Flow nodes with the appropriate hoisted values
970+
void StructurizeCFG::simplifyHoistedPhis() {
971+
for (WeakVH VH : AffectedPhis) {
972+
PHINode *Phi = dyn_cast_or_null<PHINode>(VH);
973+
if (!Phi || Phi->getNumIncomingValues() != 2)
974+
continue;
975+
976+
for (int i = 0; i < 2; i++) {
977+
Value *V = Phi->getIncomingValue(i);
978+
auto BBIt = HoistedValues.find(V);
979+
980+
if (BBIt == HoistedValues.end())
981+
continue;
982+
983+
Value *OtherV = Phi->getIncomingValue(!i);
984+
PHINode *OtherPhi = dyn_cast<PHINode>(OtherV);
985+
if (!OtherPhi)
986+
continue;
987+
988+
int PoisonValBBIdx = -1;
989+
for (size_t i = 0; i < OtherPhi->getNumIncomingValues(); i++) {
990+
if (!isa<PoisonValue>(OtherPhi->getIncomingValue(i)))
991+
continue;
992+
PoisonValBBIdx = i;
993+
break;
994+
}
995+
if (PoisonValBBIdx == -1 ||
996+
!DT->dominates(BBIt->second,
997+
OtherPhi->getIncomingBlock(PoisonValBBIdx)))
998+
continue;
999+
1000+
OtherPhi->setIncomingValue(PoisonValBBIdx, V);
1001+
Phi->setIncomingValue(i, OtherV);
1002+
}
1003+
}
1004+
}
1005+
8941006
void StructurizeCFG::simplifyAffectedPhis() {
8951007
bool Changed;
8961008
do {
@@ -1283,12 +1395,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
12831395
}
12841396

12851397
/// Run the transformation for each region found
1286-
bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
1398+
bool StructurizeCFG::run(Region *R, DominatorTree *DT,
1399+
const TargetTransformInfo *TTI) {
12871400
if (R->isTopLevelRegion())
12881401
return false;
12891402

12901403
this->DT = DT;
1291-
1404+
this->TTI = TTI;
12921405
Func = R->getEntry()->getParent();
12931406
assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
12941407

@@ -1300,6 +1413,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
13001413
insertConditions(false);
13011414
insertConditions(true);
13021415
setPhiValues();
1416+
simplifyHoistedPhis();
13031417
simplifyConditions();
13041418
simplifyAffectedPhis();
13051419
rebuildSSA();
@@ -1349,7 +1463,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
13491463
bool Changed = false;
13501464
DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
13511465
auto &RI = AM.getResult<RegionInfoAnalysis>(F);
1352-
1466+
TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
13531467
UniformityInfo *UI = nullptr;
13541468
if (SkipUniformRegions)
13551469
UI = &AM.getResult<UniformityInfoAnalysis>(F);
@@ -1368,7 +1482,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
13681482
continue;
13691483
}
13701484

1371-
Changed |= SCFG.run(R, DT);
1485+
Changed |= SCFG.run(R, DT, TTI);
13721486
}
13731487
if (!Changed)
13741488
return PreservedAnalyses::all();

llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9851,8 +9851,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
98519851
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
98529852
; CHECK-NEXT: s_cbranch_execz .LBB8_6
98539853
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
9854-
; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
98559854
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
9855+
; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
98569856
; CHECK-NEXT: s_movk_i32 s4, 0xf800
98579857
; CHECK-NEXT: s_mov_b32 s5, -1
98589858
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -11167,8 +11167,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
1116711167
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
1116811168
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
1116911169
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
11170-
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
1117111170
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
11171+
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
1117211172
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
1117311173
; ALIGNED-NEXT: s_mov_b32 s5, -1
1117411174
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -12381,8 +12381,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
1238112381
; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
1238212382
; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
1238312383
; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
12384-
; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
1238512384
; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
12385+
; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
1238612386
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
1238712387
; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
1238812388
; UNROLL3-NEXT: s_waitcnt vmcnt(2)

0 commit comments

Comments
 (0)